In [26]:
import pandas as pd

In [27]:
# Load the cleaned phage table; index_col=0 uses the first CSV column as the row index.
df = pd.read_csv("../data/processed/phage_clean_data.csv", index_col=0)
# Preview the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,Accession,staining,Genome Length (bp),Jumbophage,molGC (%),Molecule,Number CDS,Positive Strand (%),Negative Strand (%),Coding Capacity (%),tRNAs
0,MN335248,negative,7045,False,60.298,ss-DNA,13,84.615385,15.384615,88.828957,0
1,MK250029,negative,540217,True,25.796,DNA,830,47.108434,52.891566,68.324951,30
2,MK250028,negative,550053,True,26.012,DNA,859,52.270081,47.729919,69.188424,29
3,MK250027,negative,551627,True,26.022,DNA,860,53.023256,46.976744,69.318761,33
4,MK250026,negative,550702,True,26.02,DNA,859,53.201397,46.798603,69.363285,33


## 1. One-hot encoding for molecule type

In [28]:
# List the distinct molecule types to see which categories will be encoded.
df['Molecule'].unique()

array(['ss-DNA', 'DNA', 'RNA', 'ss-RNA', 'cRNA'], dtype=object)

In [29]:
# Show every row labelled cRNA (the RNA transcript of a cDNA, which itself
# derives from RNA); only a single entry carries this label.
df.loc[df['Molecule'].eq("cRNA")]

Unnamed: 0,Accession,staining,Genome Length (bp),Jumbophage,molGC (%),Molecule,Number CDS,Positive Strand (%),Negative Strand (%),Coding Capacity (%),tRNAs
6958,MT366568,positive,45168,False,33.287,cRNA,68,89.705882,10.294118,93.152232,0


In [30]:
# Fold the cRNA label into the RNA category.
df['Molecule'] = df['Molecule'].replace('cRNA', 'RNA')
# Sanity check: no cRNA rows should remain, so this displays an empty frame.
df.loc[df['Molecule'].eq("cRNA")]

Unnamed: 0,Accession,staining,Genome Length (bp),Jumbophage,molGC (%),Molecule,Number CDS,Positive Strand (%),Negative Strand (%),Coding Capacity (%),tRNAs


One-hot encode the `Molecule` column: each molecule type becomes its own `Molecule_<type>` indicator column.

In [31]:
# Replace 'Molecule' with one boolean indicator column per category
# (Molecule_DNA, Molecule_RNA, Molecule_ss-DNA, Molecule_ss-RNA).
# NOTE(review): this rebinds df without the 'Molecule' column, so the cell
# presumably cannot be re-run without reloading the data first.
df = pd.get_dummies(df, columns=['Molecule'])
df

Unnamed: 0,Accession,staining,Genome Length (bp),Jumbophage,molGC (%),Number CDS,Positive Strand (%),Negative Strand (%),Coding Capacity (%),tRNAs,Molecule_DNA,Molecule_RNA,Molecule_ss-DNA,Molecule_ss-RNA
0,MN335248,negative,7045,False,60.298,13,84.615385,15.384615,88.828957,0,False,False,True,False
1,MK250029,negative,540217,True,25.796,830,47.108434,52.891566,68.324951,30,True,False,False,False
2,MK250028,negative,550053,True,26.012,859,52.270081,47.729919,69.188424,29,True,False,False,False
3,MK250027,negative,551627,True,26.022,860,53.023256,46.976744,69.318761,33,True,False,False,False
4,MK250026,negative,550702,True,26.020,859,53.201397,46.798603,69.363285,33,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11151,EF380009,negative,5386,False,44.653,6,100.000000,0.000000,65.837356,0,False,False,True,False
11152,DQ490056,positive,26537,False,37.099,48,72.916667,27.083333,92.892942,0,True,False,False,False
11153,AJ414696,negative,32308,False,25.269,47,53.191489,46.808511,81.425653,0,True,False,False,False
11154,AJ344259,negative,35450,False,25.199,54,55.555556,44.444444,78.854725,0,True,False,False,False


In [34]:
# Indicator columns that still hold booleans: the Jumbophage flag and the
# one-hot Molecule_* columns produced by pd.get_dummies.
columns_to_convert = ['Jumbophage', 'Molecule_ss-DNA', 'Molecule_DNA', 'Molecule_RNA', 'Molecule_ss-RNA']

# Convert True/False to 1/0 for every listed column in a single pass.
# 'Jumbophage' is part of the list, so it needs no separate conversion.
df[columns_to_convert] = df[columns_to_convert].astype(int)

df

Unnamed: 0,Accession,staining,Genome Length (bp),Jumbophage,molGC (%),Number CDS,Positive Strand (%),Negative Strand (%),Coding Capacity (%),tRNAs,Molecule_DNA,Molecule_RNA,Molecule_ss-DNA,Molecule_ss-RNA
0,MN335248,negative,7045,0,60.298,13,84.615385,15.384615,88.828957,0,0,0,1,0
1,MK250029,negative,540217,1,25.796,830,47.108434,52.891566,68.324951,30,1,0,0,0
2,MK250028,negative,550053,1,26.012,859,52.270081,47.729919,69.188424,29,1,0,0,0
3,MK250027,negative,551627,1,26.022,860,53.023256,46.976744,69.318761,33,1,0,0,0
4,MK250026,negative,550702,1,26.020,859,53.201397,46.798603,69.363285,33,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11151,EF380009,negative,5386,0,44.653,6,100.000000,0.000000,65.837356,0,0,0,1,0
11152,DQ490056,positive,26537,0,37.099,48,72.916667,27.083333,92.892942,0,1,0,0,0
11153,AJ414696,negative,32308,0,25.269,47,53.191489,46.808511,81.425653,0,1,0,0,0
11154,AJ344259,negative,35450,0,25.199,54,55.555556,44.444444,78.854725,0,1,0,0,0


In [36]:
# Write the encoded table for modelling; index=False leaves the row index
# (read from the first CSV column at load time) out of the output file.
df.to_csv("../data/processed/model_data.csv", index=False)