# DeepTropism Notebook 1 - Data Wrangling
On this notebook we are going to start organizing our HIV-1 V3 loop Dataset to develop the Deep Learning model.<br>
The goal of our model is to define the tropism of the virus solely based on the aminoacid sequence of virus V3 loop.

In [16]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import torchvision
import matplotlib.pyplot as plt
import os

In [130]:
cd datasets

/home/gabriel/Documents/Repos/DeepTropism/datasets


## Creating one Dataframe with all the sequences from the different datasets

In [23]:
df_newdb = pd.read_csv('datasets/processed_tsv/newdb_all.tsv', sep='\t',
                       names=['seq_name', 'dataset', 'label', 'sequence'])
df_webpssm = pd.read_csv('datasets/processed_tsv/webpssm_all.tsv',sep='\t',
                       names=['seq_name', 'dataset', 'label', 'sequence'])
df_hivcopred = pd.read_csv('datasets/processed_tsv/hivcopred_all.tsv',sep='\t',
                       names=['seq_name', 'dataset', 'label', 'sequence'])

In [24]:
df_newdb.head()

Unnamed: 0,seq_name,dataset,label,sequence
0,RAB014775,newdb,CCR5,CTRPSNNTRTGITIGPGQVWYRTGDIIGDIRKAYC
1,RAB014776,newdb,CCR5,CTRPSNNTRTSITIGPGQVWYRTGDIIGDIRQAYC
2,RAB014778,newdb,CCR5,CTRPSNNTRTSITIGPGQVWYRTGDIIGDIRKAYC
3,RAB014781,newdb,CCR5,CTRPSNNTRTSVTIGPGQVWYRTGDIIGDIRQAYC
4,RAB014834,newdb,CCR5,CTRPSNNTRTSITIGPGQVWYRTGDIIGNIRKAYC


In [25]:
df_webpssm.head()

Unnamed: 0,seq_name,dataset,label,sequence
0,95ZW84_ZW_C_NSI_u20_BATRA_(2000),webpssm,CCR5,CTRPNNNTRKSMRIGPGQTFYATGDIIGDIRQAHC
1,95ZW295_ZW_C_NSI_u21_BATRA_(2000),webpssm,CCR5,CTRPNNNTRKSMRIGPGQVFYATDGIIGDIRQAHC
2,95ZW377_ZW_C_NSI_u22_BATRA_(2000),webpssm,CCR5,CTRPSNNTRKSIRIGPGQTFYATNDIIGDIRQAHC
3,95ZW530_ZW_C_NSI_u23_BATRA_(2000),webpssm,CCR5,CTRPGNNTRKSIRIGPGQAFFATGDIIGDIRQAHC
4,95ZW560_ZW_C_NSI_u24_BATRA_(2000),webpssm,CCR5,CTRPGNNTRKSIRIGPGQTFYAANGIIGDIRQAHC


In [26]:
df_hivcopred.head()

Unnamed: 0,seq_name,dataset,label,sequence
0,RFJ977091,hivcopred,CCR5,CARPGNNTKKSVRIGPGQTFYATGDIIGDIRQAHC
1,RFJ977094,hivcopred,CCR5,CARPGNNTRKSVRIGPGQAFYATGDIIGDIRQAHC
2,RDQ382364,hivcopred,CCR5,CARPGNNTRKSVRIGPGQTFFATGDIIGDIRKAHC
3,RFJ376003,hivcopred,CCR5,CARPGNNTRKSXRIGPGQSFHATGEIIGNIREAHC
4,RDQ382371,hivcopred,CCR5,CARPGNNTRRSVRIGPGQAFYATGEIIGDIRKAHC


These three datasets were separated in different list based on the tropism classification already. <br>
The datasets Geno2pheno and CM the classification was defined on the sequence name, so need to extract it.


In [27]:
df_cm = pd.read_csv('datasets/processed_tsv/cm.tsv', sep='\t',
                       names=['seq_name', 'dataset', 'sequence'])
df_g2p = pd.read_csv('datasets/processed_tsv/g2p_str.tsv',sep='\t',
                       names=['seq_name', 'dataset', 'sequence'])

In [28]:
df_cm.head()

Unnamed: 0,seq_name,dataset,sequence
0,-.HM246206.A.CCR5,cm,CVRPNNNTKKSVIGPGQTYANNIIGDIRKAC
1,ACH142.HQ644967.B.CCR5,cm,CTRPNNNTRKSIHIGPGRAFYATGDIIGDIRKAHC
2,TH020.U08754.01_AE.CCR5,cm,CTRPFNNTRTSLTIGPGQVFYRTGDIIGDIRKAYC
3,CW012.AJ418502.B.CCR5,cm,CTRLNNNTRKSIHMGPGRAFYTTGEIIGDIRQAHC
4,BP00069.JN687773.B.CCR5,cm,CTRPYNNTRRSIPIGPGRAFYATGEVIGNIRKAYC


In [29]:
df_g2p.head()

Unnamed: 0,seq_name,dataset,sequence
0,CCR5_1471_29187_CN_2003_B,geno2pheno,CTQTQQQYK-K-KYTSR-------TRASMVCNR---RNNRR---YK...
1,CCR5_AM262114_21502_FR_1995_O,geno2pheno,CVRPGSN-S-V-QEIKI---GP---MAWYSMQL---EQDGKRANAR...
2,CCR5_BCF02_13870_FR_1990_O,geno2pheno,CQRPGHQ-T-V-QEIRI---GP---MAWYS-MG---LAAGNGSESR...
3,CCR5_CA9_357_CM_1993_O,geno2pheno,CERPGNH-T-V-QEIRI---GP---LAWYS-MGIEKNSKNS---SR...
4,CCR5_BCF01_572_FR_1990_O,geno2pheno,CHRPGNL-S-V-QEMKI---GP---LSWYS-MG---LAANSSIKSR...


To make it easier to process the two remaining Datasets we are going to concatenate them.

In [30]:
df_g2p_cm = pd.concat([df_cm, df_g2p])

In [31]:
# Print sizes
print(df_cm.shape)
print(df_g2p.shape)
print(df_g2p_cm.shape)

(2679, 3)
(1188, 3)
(3867, 3)


In [32]:
df_g2p_cm.head(10)

Unnamed: 0,seq_name,dataset,sequence
0,-.HM246206.A.CCR5,cm,CVRPNNNTKKSVIGPGQTYANNIIGDIRKAC
1,ACH142.HQ644967.B.CCR5,cm,CTRPNNNTRKSIHIGPGRAFYATGDIIGDIRKAHC
2,TH020.U08754.01_AE.CCR5,cm,CTRPFNNTRTSLTIGPGQVFYRTGDIIGDIRKAYC
3,CW012.AJ418502.B.CCR5,cm,CTRLNNNTRKSIHMGPGRAFYTTGEIIGDIRQAHC
4,BP00069.JN687773.B.CCR5,cm,CTRPYNNTRRSIPIGPGRAFYATGEVIGNIRKAYC
5,I.DQ061525.B.CCR5,cm,CIRPNNNTRKSIHIGPGRAFYATGEIIGDIRQAHC
6,500.HQ377462.B.CCR5,cm,CTRPNNNTRKSISMGPGRAFYATGGIIGNIRQAHC
7,Pat1.AF541040.B.CCR5,cm,CTRPNNNTRKSIHIGPGRAFYTTGEIIGDIRQAHC
8,CMP013.JX140646.02_AG.CCR5,cm,CMRPNNNTRESVRIGPGQAFYATGEIIGDIRQAHC
9,U.DQ061827.B.CCR5,cm,CTRPNNNTRKGIHMGPGKVFYATGQIIGDIRQAHC


Check if there are labels 'CCR5' or 'CXCR4' on every row of the df_g2p_cm Dataframe.

In [33]:
df_g2p_cm[~((df_g2p_cm.seq_name.str.contains('CCR5'))|
          (df_g2p_cm.seq_name.str.contains('CXCR4')))]

Unnamed: 0,seq_name,dataset,sequence


In [36]:
def get_label(row):
    """
    Function to return co-receptor type based on seq_name
    
    Parameters
     - row (Series): A row of a Dataframe containing information for sample
    
    return (string): A type of co-receptor: 'R5X4', 'CCR5', 'CXCR4'
    
    """
    if 'CCR5' in row['seq_name'] and 'CXCR4' in row['seq_name']:
        return 'R5X4'
    elif 'CCR5' in row['seq_name']:
        return 'CCR5'
    elif 'CXCR4' in row['seq_name']:
        return 'CXCR4'

Apply get_label to df_g2p_cm

In [37]:
df_g2p_cm['label'] =  df_g2p_cm.apply(get_label, axis=1)

# Reorder columns
df_g2p_cm = df_g2p_cm[['seq_name', 'dataset', 'label', 'sequence']]

In [38]:
df_g2p_cm.head(10)

Unnamed: 0,seq_name,dataset,label,sequence
0,-.HM246206.A.CCR5,cm,CCR5,CVRPNNNTKKSVIGPGQTYANNIIGDIRKAC
1,ACH142.HQ644967.B.CCR5,cm,CCR5,CTRPNNNTRKSIHIGPGRAFYATGDIIGDIRKAHC
2,TH020.U08754.01_AE.CCR5,cm,CCR5,CTRPFNNTRTSLTIGPGQVFYRTGDIIGDIRKAYC
3,CW012.AJ418502.B.CCR5,cm,CCR5,CTRLNNNTRKSIHMGPGRAFYTTGEIIGDIRQAHC
4,BP00069.JN687773.B.CCR5,cm,CCR5,CTRPYNNTRRSIPIGPGRAFYATGEVIGNIRKAYC
5,I.DQ061525.B.CCR5,cm,CCR5,CIRPNNNTRKSIHIGPGRAFYATGEIIGDIRQAHC
6,500.HQ377462.B.CCR5,cm,CCR5,CTRPNNNTRKSISMGPGRAFYATGGIIGNIRQAHC
7,Pat1.AF541040.B.CCR5,cm,CCR5,CTRPNNNTRKSIHIGPGRAFYTTGEIIGDIRQAHC
8,CMP013.JX140646.02_AG.CCR5,cm,CCR5,CMRPNNNTRESVRIGPGQAFYATGEIIGDIRQAHC
9,U.DQ061827.B.CCR5,cm,CCR5,CTRPNNNTRKGIHMGPGKVFYATGQIIGDIRQAHC


Now that all Dataframes have labels, we concatenate them into one main Dataframe.

In [72]:
df_datasets = pd.concat([df_newdb,df_webpssm,df_hivcopred, df_g2p_cm])

In [73]:
df_datasets.shape

(9550, 4)

In [74]:
df_datasets.head(10)

Unnamed: 0,seq_name,dataset,label,sequence
0,RAB014775,newdb,CCR5,CTRPSNNTRTGITIGPGQVWYRTGDIIGDIRKAYC
1,RAB014776,newdb,CCR5,CTRPSNNTRTSITIGPGQVWYRTGDIIGDIRQAYC
2,RAB014778,newdb,CCR5,CTRPSNNTRTSITIGPGQVWYRTGDIIGDIRKAYC
3,RAB014781,newdb,CCR5,CTRPSNNTRTSVTIGPGQVWYRTGDIIGDIRQAYC
4,RAB014834,newdb,CCR5,CTRPSNNTRTSITIGPGQVWYRTGDIIGNIRKAYC
5,RAB023804,newdb,CCR5,CTRPNNNTRKSIRIGPGQTFYATGDIIGDIRQAHC
6,RAB287376,newdb,CCR5,CVRPNNNTRTSVRIGPGQTFYATGEIIGDIRQAFC
7,RAB553911,newdb,CCR5,CERPNNNTRRSIQIGPGRAWFEAEDIIGDIRKAHC
8,RAB553912,newdb,CCR5,CTRPNDNTRKSINIAPGRAFYATGDIIGDIRQAHC
9,RAB553913,newdb,CCR5,CTRPNNNTRKGIHMGPGRAIYTTDIIGDIRQAHC


In [75]:
df_datasets_validation = df_datasets[df_datasets.label == 'validation']
df_datasets_validation.head(10)

Unnamed: 0,seq_name,dataset,label,sequence
279,C.ZM.89.ZM20__phen_SI,webpssm,validation,CARPGNNTRKSIRIGPGQTFFATGAIIGDIRQAHC
280,C.ZW.01.TC28_2__phen_SI,webpssm,validation,CGRPNNHRIKGLRIGPGRAFFAMGAIGGEIRQAHC
281,C.ZW.01.TC03_1__phen_SI,webpssm,validation,CIRPGNNTSKSIRIGQRRPVYVH-KIIGDIRQAHC
282,C.ET.97.PHD79C1__phen_SI,webpssm,validation,CIRPNNNTRKSVRIGPGQAFYATGDIIGDIRQAHC
283,C.ZW.01.TC28_1__phen_SI,webpssm,validation,CMRPNNNTRKSVRIGPGQTFFATGAIIGNIRQAHC
284,AC.RW.92.92RW009_di1sCD__phen_SI,webpssm,validation,CPRPNNNTRKSVHIGPGQAFYATGDVIGDIRQAYC
285,AC.RW.92.92RW009_1gCR_AC.RW.92.92RW009_1gER_AC...,webpssm,validation,CSRPNNNTRKSVHIGPGQAFYATGDVIGDIRQAYC
286,C.ZW.01.TC22__phen_SI,webpssm,validation,CTRPGNKTRQSIRIGRGQSFHATGAIIGDIRKAYC
287,C.ZW.01.TC30__phen_SI,webpssm,validation,CTRPGNNT-----IGPGRTFYATDRIIGDIRQAHC
288,C.ZW.01.TC29__phen_SI,webpssm,validation,CTRPGNNTRKGLRIGPGRTIYATEVTVGDIRQAYC


In [76]:
df_datasets_validation.shape

(71, 4)

In [77]:
def label_validation_dataset(row):
    """
    Function to return co-receptor type based on seq_name for Webpssm 
    
    Parameters
     - row (Series): A row of a Dataframe containing information for sample
    
    return (string): A type of co-receptor: 'CCR5', 'CXCR4'
    
    """
    if 'NSI' in row.seq_name:
        return 'CCR5'
    elif 'SI' in row.seq_name:
        return 'CXCR4'

In [78]:
df_datasets_validation['label'] = df_datasets_validation.apply(label_validation_dataset, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [79]:
df_datasets_validation

Unnamed: 0,seq_name,dataset,label,sequence
279,C.ZM.89.ZM20__phen_SI,webpssm,CXCR4,CARPGNNTRKSIRIGPGQTFFATGAIIGDIRQAHC
280,C.ZW.01.TC28_2__phen_SI,webpssm,CXCR4,CGRPNNHRIKGLRIGPGRAFFAMGAIGGEIRQAHC
281,C.ZW.01.TC03_1__phen_SI,webpssm,CXCR4,CIRPGNNTSKSIRIGQRRPVYVH-KIIGDIRQAHC
282,C.ET.97.PHD79C1__phen_SI,webpssm,CXCR4,CIRPNNNTRKSVRIGPGQAFYATGDIIGDIRQAHC
283,C.ZW.01.TC28_1__phen_SI,webpssm,CXCR4,CMRPNNNTRKSVRIGPGQTFFATGAIIGNIRQAHC
...,...,...,...,...
345,C.ZW.01.TC33__phen_NSI,webpssm,CCR5,CTRPNNNTRTSVRIGPGQAFYATGDIIGDIRQAHC
346,C.FR.93.FRMP37__phen_NSI,webpssm,CCR5,CTRPSNNTRKSIRIGPGQAFYATNGIIGDIRAAHC
347,C.ZW.01.TC32__phen_NSI,webpssm,CCR5,CTRPSNNTRKSVWLGPGRAFYT-NKVIGNIRKAHC
348,C.FR.91.FRMP197__phen_NSI,webpssm,CCR5,CTRPYNNTRQSIRIGPGQTFYATGDIIGDIRKAHC


In [80]:
# Save validation dataset to TSV
df_datasets_validation.to_csv('webpssm_validation_labeled.tsv', sep='\t')

In [95]:
# Now concatenate the parsed df_datasets_validation to df_datasets
df_datasets_final = pd.concat([df_datasets[df_datasets.label != 'validation'], df_datasets_validation])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [96]:
df_datasets_final.head()

Unnamed: 0,dataset,index,label,seq_name,sequence
0,newdb,0.0,CCR5,RAB014775,CTRPSNNTRTGITIGPGQVWYRTGDIIGDIRKAYC
1,newdb,1.0,CCR5,RAB014776,CTRPSNNTRTSITIGPGQVWYRTGDIIGDIRQAYC
2,newdb,2.0,CCR5,RAB014778,CTRPSNNTRTSITIGPGQVWYRTGDIIGDIRKAYC
3,newdb,3.0,CCR5,RAB014781,CTRPSNNTRTSVTIGPGQVWYRTGDIIGDIRQAYC
4,newdb,4.0,CCR5,RAB014834,CTRPSNNTRTSITIGPGQVWYRTGDIIGNIRKAYC


In [97]:
df_datasets_final.shape

(9550, 5)

Now we replace the '-' on sequences to remove duplicated ones.<br>
First we check the number of sequences with '-' on the Dataframe.

In [98]:
df_datasets_final[df_datasets_final.sequence.str.contains('-')].shape

(1225, 5)

In [99]:
df_datasets_final[df_datasets_final.sequence.str.contains('-')].head(10)

Unnamed: 0,dataset,index,label,seq_name,sequence
3106,webpssm,108.0,CCR5,TV013_ZA_C_NSI/CCR5_u125_TREURNICHT_(2002),CTRPNNNTRRSIRIGPGQAFY-TNDIIGDIRQAHC
3125,webpssm,127.0,CCR5,98TZ013_TZ_C_CCR5_u144_RODENBURG_(2001),CTRPGNNTRKSVRIGPGQTFY-TNDIIGDIRQAYC
3143,webpssm,145.0,CCR5,S018_MW_C_CCR5_u162_PING_(1999),CVRPNNNTRKSIRIGPGQTFYA-NDIIGDIRQAHC
3151,webpssm,153.0,CCR5,S031_MW_C_CCR5_u170_PING_(1999),CTRPNNNTRKSIRIGPGQTFYA-NDIIGDIRQAHC
3154,webpssm,156.0,CCR5,S180_MW_C_CCR5_u173_PING_(1999),CTRPGNNTRTSIRIGPGQTFFANN-IIGDIRQAHC
3169,webpssm,171.0,CCR5,DU179MAY99U-R5_ZA_C_CCR5_u19_NICD_(UNPUBL),CTRPGNNTRKSIRIGPGQAFY-TNHIIGDIRQAYC
3201,webpssm,203.0,CCR5,TM3__ZA_C_NSI/CCR5_u194_CHOGE_(IN_PRESS),CTRPGNNTRKSIRIGPGQTFYA-NDIIGDIRQAYC
3205,webpssm,207.0,CCR5,TM10__ZA_C_NSI/CCR5_u198_CHOGE_(IN_PRESS),CTRPNNNTRKSIRIGPGQTFYATN-IIGDIRQAYC
3214,webpssm,216.0,CCR5,TM31__ZA_C_NSI/CCR5_u207_CHOGE_(IN_PRESS),CTRPGSNTRRSIRIGPGQAFY-TQDIIGDIRQAHC
3226,webpssm,228.0,CXCR4,95ZW748_ZW_C_SI_u1_BATRA_(2000),CTRPNNNVRKHIRIGIGKVFYA-NDIIGDIRQARC


In [100]:
df_datasets_final['sequence'] = df_datasets_final['sequence'].str.replace('-', '', regex=False)

Check if the replace worked:

In [101]:
df_datasets_final[df_datasets_final.sequence.str.contains('-')].shape

(0, 5)

Now that our sequences don't have '-' we can drop the duplicated sequences to avoid repetitive data on our trainning set.

In [102]:
df_datasets_final.shape

(9550, 5)

In [103]:
df_datasets_final.head(20)

Unnamed: 0,dataset,index,label,seq_name,sequence
0,newdb,0.0,CCR5,RAB014775,CTRPSNNTRTGITIGPGQVWYRTGDIIGDIRKAYC
1,newdb,1.0,CCR5,RAB014776,CTRPSNNTRTSITIGPGQVWYRTGDIIGDIRQAYC
2,newdb,2.0,CCR5,RAB014778,CTRPSNNTRTSITIGPGQVWYRTGDIIGDIRKAYC
3,newdb,3.0,CCR5,RAB014781,CTRPSNNTRTSVTIGPGQVWYRTGDIIGDIRQAYC
4,newdb,4.0,CCR5,RAB014834,CTRPSNNTRTSITIGPGQVWYRTGDIIGNIRKAYC
5,newdb,5.0,CCR5,RAB023804,CTRPNNNTRKSIRIGPGQTFYATGDIIGDIRQAHC
6,newdb,6.0,CCR5,RAB287376,CVRPNNNTRTSVRIGPGQTFYATGEIIGDIRQAFC
7,newdb,7.0,CCR5,RAB553911,CERPNNNTRRSIQIGPGRAWFEAEDIIGDIRKAHC
8,newdb,8.0,CCR5,RAB553912,CTRPNDNTRKSINIAPGRAFYATGDIIGDIRQAHC
9,newdb,9.0,CCR5,RAB553913,CTRPNNNTRKGIHMGPGRAIYTTDIIGDIRQAHC


In [110]:
df_datasets_final = df_datasets_final.reset_index(drop=True)
df_datasets_final

Unnamed: 0,dataset,index,label,seq_name,sequence
0,newdb,0.0,CCR5,RAB014775,CTRPSNNTRTGITIGPGQVWYRTGDIIGDIRKAYC
1,newdb,1.0,CCR5,RAB014776,CTRPSNNTRTSITIGPGQVWYRTGDIIGDIRQAYC
2,newdb,2.0,CCR5,RAB014778,CTRPSNNTRTSITIGPGQVWYRTGDIIGDIRKAYC
3,newdb,3.0,CCR5,RAB014781,CTRPSNNTRTSVTIGPGQVWYRTGDIIGDIRQAYC
4,newdb,4.0,CCR5,RAB014834,CTRPSNNTRTSITIGPGQVWYRTGDIIGNIRKAYC
...,...,...,...,...,...
9545,webpssm,,CCR5,C.ZW.01.TC33__phen_NSI,CTRPNNNTRTSVRIGPGQAFYATGDIIGDIRQAHC
9546,webpssm,,CCR5,C.FR.93.FRMP37__phen_NSI,CTRPSNNTRKSIRIGPGQAFYATNGIIGDIRAAHC
9547,webpssm,,CCR5,C.ZW.01.TC32__phen_NSI,CTRPSNNTRKSVWLGPGRAFYTNKVIGNIRKAHC
9548,webpssm,,CCR5,C.FR.91.FRMP197__phen_NSI,CTRPYNNTRQSIRIGPGQTFYATGDIIGDIRKAHC


In [116]:
#df_datasets_final.drop(['index'], axis=1, inplace=True)
df_datasets_final = df_datasets_final[['seq_name', 'dataset', 'label', 'sequence']]
df_datasets_final

Unnamed: 0,seq_name,dataset,label,sequence
0,RAB014775,newdb,CCR5,CTRPSNNTRTGITIGPGQVWYRTGDIIGDIRKAYC
1,RAB014776,newdb,CCR5,CTRPSNNTRTSITIGPGQVWYRTGDIIGDIRQAYC
2,RAB014778,newdb,CCR5,CTRPSNNTRTSITIGPGQVWYRTGDIIGDIRKAYC
3,RAB014781,newdb,CCR5,CTRPSNNTRTSVTIGPGQVWYRTGDIIGDIRQAYC
4,RAB014834,newdb,CCR5,CTRPSNNTRTSITIGPGQVWYRTGDIIGNIRKAYC
...,...,...,...,...
9545,C.ZW.01.TC33__phen_NSI,webpssm,CCR5,CTRPNNNTRTSVRIGPGQAFYATGDIIGDIRQAHC
9546,C.FR.93.FRMP37__phen_NSI,webpssm,CCR5,CTRPSNNTRKSIRIGPGQAFYATNGIIGDIRAAHC
9547,C.ZW.01.TC32__phen_NSI,webpssm,CCR5,CTRPSNNTRKSVWLGPGRAFYTNKVIGNIRKAHC
9548,C.FR.91.FRMP197__phen_NSI,webpssm,CCR5,CTRPYNNTRQSIRIGPGQTFYATGDIIGDIRKAHC


In [129]:
# Create TSV file from df_datasets
df_datasets_final.to_csv('all_datasets_raw.tsv', sep='\t')

# Create fasta file from the df_unique_seqs
with open('dataset_all_seqs.fasta', 'w') as f:
    for index, row in df_datasets_final.iterrows():
        f.write(f'>{row.seq_name}|{row.dataset}|{row.label}\n')
        f.write(f'{row.sequence}\n')    

In [120]:
# Check number of duplicated sequences
df_datasets_final.duplicated(subset='sequence', keep=False).sum()

8765

In [127]:
df_datasets_final[df_datasets_final.sequence.str.contains('-')]

Unnamed: 0,seq_name,dataset,label,sequence


In [135]:
# Get diversity of lenghts of sequences on df_datasets_final
set(df_datasets_final['sequence'].apply(len))

{21, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39}

In [136]:
len(set(df_datasets_final.sequence.to_list()))

3608

In [137]:
# Create a Dataframe with unique sequences
df_unique_seqs = df_datasets_final.drop_duplicates(subset='sequence', keep='first')
df_unique_seqs.shape

(3608, 4)

In [138]:
df_datasets_final[df_datasets_final.label == 'validation'].head()

Unnamed: 0,seq_name,dataset,label,sequence


As we can see, there are 3608 unique sequences on our Dataset.<br>
We are going to use these unique sequences to do the alignment and split data into trainning, validation and test sets.<br><br>
To execute the alignment we are going to create a fasta file out of the Dataframe.


In [139]:
# Create fasta file from the df_unique_seqs
with open('dataset_unique_seqs.fasta', 'w') as f:
    for index, row in df_unique_seqs.iterrows():
        f.write(f'>{row.seq_name}|{row.dataset}|{row.label}\n')
        f.write(f'{row.sequence}\n')        

# Align sequences using Muscle aligner
To get an alignment for the sequences on our dataset we are going to use [Muscle Aligner](https://www.drive5.com/muscle/).

In [45]:
os.system('/home/gabriel/Documents/Bioinformatics/muscle3.8.31_i86linux64 -in datasets/dataset_unique_seqs.fasta -out datasets/dataset_unique_seqs_aligned.fasta -gapopen -15')

In [152]:
# Creating Dataframe from Muscle aligned output
df_aligned = pd.read_csv('/home/gabriel/Documents/Repos/DeepTropism/datasets/dataset_unique_seqs_aligned_gapopen15_MAIN.tsv',sep='\t',
                       names=['seq_name', 'dataset', 'label', 'sequence_aligned'])

Now that we have a Dataframe with all the unique sequences aligned and labeled we are going to separate ou train set, validation set and test set on a proportion of 80/10/10.

# Using the df_unique_seqs
This dataset was generated by the alignment of the original dataset, and the resulted alignment consists of a 60 position string for each sequence with '-' representing gaps.

In [153]:
df_aligned.head()

Unnamed: 0,seq_name,dataset,label,sequence_aligned
0,CCR5_1471_29187_CN_2003_B,geno2pheno,CCR5,CTQTQQQY-KKKY---T----------SRTR-ASM-----V-CNRR...
1,CCR5/CXCR4/CCR1/CCR2b/CCR3/CCR4_MVP5180_67_CM_...,geno2pheno,R5X4,CIREGIAE-VQDI---Y--T--G-P-----M-RWRSMTLKR-SNNT...
2,RKF859742,newdb,CCR5,CERPTMDI-QDIH------I--G-P-----M-AWYSTYIER-QAKG...
3,RAF009608,hivcopred,CCR5,CSRPEMDV-QEIR---N-----G-P-----M-AWYSMALAK-GGTT...
4,RKF859743,newdb,CCR5,CRRPAMKV-QEMR---I----------G--PMAWY-----S-MALE...


In [155]:
# Check sizes for sequence column
set(df_aligned['sequence_aligned'].apply(len))

{60}

## Append aligned sequence to df_datasets_final
In order to create a Dataset with all the sequences and respective alignments, we are going to set on the df_datasets_final the respective aligned sequenced fetched on df_aligned

In [158]:
df_datasets_final.head(10)

Unnamed: 0,seq_name,dataset,label,sequence
0,RAB014775,newdb,CCR5,CTRPSNNTRTGITIGPGQVWYRTGDIIGDIRKAYC
1,RAB014776,newdb,CCR5,CTRPSNNTRTSITIGPGQVWYRTGDIIGDIRQAYC
2,RAB014778,newdb,CCR5,CTRPSNNTRTSITIGPGQVWYRTGDIIGDIRKAYC
3,RAB014781,newdb,CCR5,CTRPSNNTRTSVTIGPGQVWYRTGDIIGDIRQAYC
4,RAB014834,newdb,CCR5,CTRPSNNTRTSITIGPGQVWYRTGDIIGNIRKAYC
5,RAB023804,newdb,CCR5,CTRPNNNTRKSIRIGPGQTFYATGDIIGDIRQAHC
6,RAB287376,newdb,CCR5,CVRPNNNTRTSVRIGPGQTFYATGEIIGDIRQAFC
7,RAB553911,newdb,CCR5,CERPNNNTRRSIQIGPGRAWFEAEDIIGDIRKAHC
8,RAB553912,newdb,CCR5,CTRPNDNTRKSINIAPGRAFYATGDIIGDIRQAHC
9,RAB553913,newdb,CCR5,CTRPNNNTRKGIHMGPGRAIYTTDIIGDIRQAHC


In [159]:
df_datasets.shape

(9550, 4)

In [164]:
# Create a column with raw sequence on df_aligned to use as reference
df_aligned['raw_sequence'] = df_aligned['sequence_aligned'].str.replace('-', '', regex=False)

In [176]:
# Iterate over df_datasets_final to set sequence_aligned
for index, row in df_datasets_final.iterrows():
    #if str(df[df.sequence_no_space == row.sequence ].sequence.values[0]):
    
    try:
        df_datasets_final.at[index, 'sequence_aligned'] = str(df_aligned[
                                     df_aligned.raw_sequence == row.sequence ].sequence_aligned.values[0])
    
    except IndexError:
        try:
            df_datasets_final.at[index, 'sequence_aligned'] = str(df_aligned[
                                     df_aligned.seq_name == row.seq_name ].sequence_aligned.values[0])        
        except IndexError:
            print(index, row.seq_name)
            print(index, f'Row sequence:{row.sequence}')
            df_datasets_final.at[index, 'sequence_aligned'] = 'Error'

In [178]:
# Check if there were rows with Errors
df_datasets_final[df_datasets_final.sequence_aligned == 'Error']

Unnamed: 0,seq_name,dataset,label,sequence,sequence_aligned


# Check if all the rows have sequence and sequence_aligned matching

In [179]:
df_datasets_final[df_datasets_final.sequence != df_datasets_final.sequence_aligned.str.replace('-', '', regex=True)]

Unnamed: 0,seq_name,dataset,label,sequence,sequence_aligned
4423,RFJ375975,hivcopred,CCR5,CTRPNNNTRKSJRIGPGQAFYATGDIIGDIREAHC,CTRPNNNT-RKSX---R--I--G-P--G--Q-AFY---A-T--GDI...
5064,DGQ401718,hivcopred,R5X4,CIRPGNNTRTSVXJGPGXTFYATGDIIGDIRQAHC,CIRPGNNT-RTSV---X--X--G-P--G--X-TFY---A-T--GDI...
5341,DFJ376010,hivcopred,R5X4,CTRPXXSXRRXIRJGPGXVXYXXXXIGDIRQAXC,CTRPXXSX-RRXI---R--X--G-P--G--X-VXY-----X-XXXI...


As we can see, three rows showed an inconsistency between original and aligned sequence.<br>
This was caused by the aligner that do not consider the 'J' as it is a dubious representation of Leucine or Isoleucine.<br>
We decided to manually edit it to correspond to the original sequence.

In [184]:
dict_manual_edit = {'RFJ375975':[4423,'CTRPNNNT-RKSJ---R--I--G-P--G--Q-AFY---A-T--GDI-I--GDIREAHC--'],
                    'DGQ401718':[5064, 'CIRPGNNT-RTSV---X--J--G-P--G--X-TFY---A-T--GDI-I--GDIRQAHC--'],
                    'DFJ376010':[5341, 'CTRPXXSX-RRXI---R--J--G-P--G--X-VXY-----X-XXXI----GDIRQAXC--']}

In [187]:
# Replace the sequences with new values from dict_manual_edit
for key, value in dict_manual_edit.items():
    df_datasets_final.at[value[0], 'sequence_aligned'] = value[1]    

In [188]:
# Check if the changes were correctly made
df_datasets_final[df_datasets_final.sequence != df_datasets_final.sequence_aligned.str.replace('-', '', regex=True)]

Unnamed: 0,seq_name,dataset,label,sequence,sequence_aligned


# Defining the label as numeric 
* 'CCR5' = 0 
* 'CXCR4' = 1 
* 'R5X4' = 1

In [205]:
# Function to call labels
def tropism_label(row):
    """
    Define numeric label, 'CCR5' as 0 
    and 'CXCR4' or 'R5X4' as 1
    """
    # For CCR5
    if str(row.label).strip() == 'CCR5':
        return 0
    # For CXCR4
    elif str(row.label).strip() == 'CXCR4':
        return 1
    # For R5X4
    elif str(row.label).strip() == 'R5X4':
        return 1

In [208]:
df_datasets_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9550 entries, 0 to 9549
Data columns (total 6 columns):
seq_name            9550 non-null object
dataset             9550 non-null object
label               9550 non-null object
sequence            9550 non-null object
sequence_aligned    9550 non-null object
label_numeric       9479 non-null float64
dtypes: float64(1), object(5)
memory usage: 447.8+ KB


In [213]:
df_datasets_final['label_numeric'] = df_datasets_final.apply(tropism_label, axis=1)
df_datasets_final['label_numeric'] = df_datasets_final['label_numeric'].astype(int)


In [214]:
df_datasets_final.head()

Unnamed: 0,seq_name,dataset,label,sequence,sequence_aligned,label_numeric
0,RAB014775,newdb,CCR5,CTRPSNNTRTGITIGPGQVWYRTGDIIGDIRKAYC,CTRPSNNT-RTGI---T--I--G-P--G--Q-VWY---R-T--GDI...,0
1,RAB014776,newdb,CCR5,CTRPSNNTRTSITIGPGQVWYRTGDIIGDIRQAYC,CTRPSNNT-RTSI---T--I--G-P--G--Q-VWY---R-T--GDI...,0
2,RAB014778,newdb,CCR5,CTRPSNNTRTSITIGPGQVWYRTGDIIGDIRKAYC,CTRPSNNT-RTSI---T--I--G-P--G--Q-VWY---R-T--GDI...,0
3,RAB014781,newdb,CCR5,CTRPSNNTRTSVTIGPGQVWYRTGDIIGDIRQAYC,CTRPSNNT-RTSV---T--I--G-P--G--Q-VWY---R-T--GDI...,0
4,RAB014834,newdb,CCR5,CTRPSNNTRTSITIGPGQVWYRTGDIIGNIRKAYC,CTRPSNNT-RTSI---T--I--G-P--G--Q-VWY---R-T--GDI...,0


In [216]:
df_datasets_final.label.value_counts()

CCR5     7705
CXCR4     937
R5X4      908
Name: label, dtype: int64

In [217]:
df_datasets_final.label_numeric.value_counts()

0    7705
1    1845
Name: label_numeric, dtype: int64

In [218]:
df_datasets_final[df_datasets_final.sequence_aligned.str.len() != 60]

Unnamed: 0,seq_name,dataset,label,sequence,sequence_aligned,label_numeric


In [223]:
# Write down the 
df_datasets_final.to_csv('deeptropism_full_curated_dataset.tsv', sep='\t' , index=False)

In [221]:
set(df_datasets_final['sequence_aligned'].apply(len))

{60}

In [222]:
df_datasets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9550 entries, 0 to 9549
Data columns (total 4 columns):
seq_name    9550 non-null object
dataset     9550 non-null object
label       9550 non-null object
sequence    9550 non-null object
dtypes: object(4)
memory usage: 298.6+ KB
