In [13]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import torchvision
import matplotlib.pyplot as plt
import os

In [14]:
ls datasets

41598_2019_46420_MOESM2_ESM.xlsx   newdb_all_hxb2_aligned.fasta
aminoacids_oneletter_code.csv      newdb_all_hxb2.fasta
cm_aligned.fasta                   newdb_and_hivcopred_aligned.fasta
cm_aligned.tsv                     newdb_and_hivcopred_aligned_refined.fasta
cm.fasta                           newdb_and_hivcopred_aligned.tsv
dataset_chen.csv                   newdb_and_hivcopred.fasta
dataset_chen.fasta                 newdb_ccr5.fasta
g2p_str.fasta                      newdb_cxcr4.fasta
hivcopred_all.fasta                newdb_dualtropic.fasta
hivcopred_ccr5.fasta               newdb_wrangled.tsv
hivcopred_cxcr4.fasta              [0m[01;34mprocessed_tsv[0m/
hivcopred_r5x4.fasta               [01;34msrep21280[0m/
muscle_aligner_test_aligned.fasta  teste_1seq_to_all_aligned.fasta
muscle_aligner_test.fasta          teste_1seq_to_all.fasta
newdb_aligned_all_labels.tsv       train_mnist.csv
newdb_aligned.csv                  webpssm_ccr5.fasta
newdb_aligned_muscle.fasta        

## Creating one Dataframe with all the sequences from the different datasets

In [15]:
df_newdb = pd.read_csv('datasets/processed_tsv/newdb_all.tsv', sep='\t',
                       names=['seq_name', 'dataset', 'label', 'sequence'])
df_webpssm = pd.read_csv('datasets/processed_tsv/webpssm_all.tsv',sep='\t',
                       names=['seq_name', 'dataset', 'label', 'sequence'])
df_hivcopred = pd.read_csv('datasets/processed_tsv/hivcopred_all.tsv',sep='\t',
                       names=['seq_name', 'dataset', 'label', 'sequence'])

In [16]:
df_newdb.head()

Unnamed: 0,seq_name,dataset,label,sequence
0,RAB014775,newdb,CCR5,CTRPSNNTRTGITIGPGQVWYRTGDIIGDIRKAYC
1,RAB014776,newdb,CCR5,CTRPSNNTRTSITIGPGQVWYRTGDIIGDIRQAYC
2,RAB014778,newdb,CCR5,CTRPSNNTRTSITIGPGQVWYRTGDIIGDIRKAYC
3,RAB014781,newdb,CCR5,CTRPSNNTRTSVTIGPGQVWYRTGDIIGDIRQAYC
4,RAB014834,newdb,CCR5,CTRPSNNTRTSITIGPGQVWYRTGDIIGNIRKAYC


In [17]:
df_webpssm.head()

Unnamed: 0,seq_name,dataset,label,sequence
0,95ZW84_ZW_C_NSI_u20_BATRA_(2000),webpssm,CCR5,CTRPNNNTRKSMRIGPGQTFYATGDIIGDIRQAHC
1,95ZW295_ZW_C_NSI_u21_BATRA_(2000),webpssm,CCR5,CTRPNNNTRKSMRIGPGQVFYATDGIIGDIRQAHC
2,95ZW377_ZW_C_NSI_u22_BATRA_(2000),webpssm,CCR5,CTRPSNNTRKSIRIGPGQTFYATNDIIGDIRQAHC
3,95ZW530_ZW_C_NSI_u23_BATRA_(2000),webpssm,CCR5,CTRPGNNTRKSIRIGPGQAFFATGDIIGDIRQAHC
4,95ZW560_ZW_C_NSI_u24_BATRA_(2000),webpssm,CCR5,CTRPGNNTRKSIRIGPGQTFYAANGIIGDIRQAHC


In [18]:
df_hivcopred.head()

Unnamed: 0,seq_name,dataset,label,sequence
0,RFJ977091,hivcopred,CCR5,CARPGNNTKKSVRIGPGQTFYATGDIIGDIRQAHC
1,RFJ977094,hivcopred,CCR5,CARPGNNTRKSVRIGPGQAFYATGDIIGDIRQAHC
2,RDQ382364,hivcopred,CCR5,CARPGNNTRKSVRIGPGQTFFATGDIIGDIRKAHC
3,RFJ376003,hivcopred,CCR5,CARPGNNTRKSXRIGPGQSFHATGEIIGNIREAHC
4,RDQ382371,hivcopred,CCR5,CARPGNNTRRSVRIGPGQAFYATGEIIGDIRKAHC


These three datasets were separated in different list based on the tropism classification already. <br>
The datasets Geno2pheno and CM the classification was defined on the sequence name, so need to extract it.


In [19]:
df_cm = pd.read_csv('datasets/processed_tsv/cm.tsv', sep='\t',
                       names=['seq_name', 'dataset', 'sequence'])
df_g2p = pd.read_csv('datasets/processed_tsv/g2p_str.tsv',sep='\t',
                       names=['seq_name', 'dataset', 'sequence'])

In [20]:
df_cm.head()

Unnamed: 0,seq_name,dataset,sequence
0,-.HM246206.A.CCR5,cm,CVRPNNNTKKSVIGPGQTYANNIIGDIRKAC
1,ACH142.HQ644967.B.CCR5,cm,CTRPNNNTRKSIHIGPGRAFYATGDIIGDIRKAHC
2,TH020.U08754.01_AE.CCR5,cm,CTRPFNNTRTSLTIGPGQVFYRTGDIIGDIRKAYC
3,CW012.AJ418502.B.CCR5,cm,CTRLNNNTRKSIHMGPGRAFYTTGEIIGDIRQAHC
4,BP00069.JN687773.B.CCR5,cm,CTRPYNNTRRSIPIGPGRAFYATGEVIGNIRKAYC


In [21]:
df_g2p.head()

Unnamed: 0,seq_name,dataset,sequence
0,CCR5_1471_29187_CN_2003_B,geno2pheno,CTQTQQQYK-K-KYTSR-------TRASMVCNR---RNNRR---YK...
1,CCR5_AM262114_21502_FR_1995_O,geno2pheno,CVRPGSN-S-V-QEIKI---GP---MAWYSMQL---EQDGKRANAR...
2,CCR5_BCF02_13870_FR_1990_O,geno2pheno,CQRPGHQ-T-V-QEIRI---GP---MAWYS-MG---LAAGNGSESR...
3,CCR5_CA9_357_CM_1993_O,geno2pheno,CERPGNH-T-V-QEIRI---GP---LAWYS-MGIEKNSKNS---SR...
4,CCR5_BCF01_572_FR_1990_O,geno2pheno,CHRPGNL-S-V-QEMKI---GP---LSWYS-MG---LAANSSIKSR...


To make it easier to process the two remaining Datasets we are going to concatenate them.

In [22]:
df_g2p_cm = pd.concat([df_cm, df_g2p])

In [23]:
# Print sizes
print(df_cm.shape)
print(df_g2p.shape)
print(df_g2p_cm.shape)

(2679, 3)
(1188, 3)
(3867, 3)


In [24]:
df_g2p_cm.head(10)

Unnamed: 0,seq_name,dataset,sequence
0,-.HM246206.A.CCR5,cm,CVRPNNNTKKSVIGPGQTYANNIIGDIRKAC
1,ACH142.HQ644967.B.CCR5,cm,CTRPNNNTRKSIHIGPGRAFYATGDIIGDIRKAHC
2,TH020.U08754.01_AE.CCR5,cm,CTRPFNNTRTSLTIGPGQVFYRTGDIIGDIRKAYC
3,CW012.AJ418502.B.CCR5,cm,CTRLNNNTRKSIHMGPGRAFYTTGEIIGDIRQAHC
4,BP00069.JN687773.B.CCR5,cm,CTRPYNNTRRSIPIGPGRAFYATGEVIGNIRKAYC
5,I.DQ061525.B.CCR5,cm,CIRPNNNTRKSIHIGPGRAFYATGEIIGDIRQAHC
6,500.HQ377462.B.CCR5,cm,CTRPNNNTRKSISMGPGRAFYATGGIIGNIRQAHC
7,Pat1.AF541040.B.CCR5,cm,CTRPNNNTRKSIHIGPGRAFYTTGEIIGDIRQAHC
8,CMP013.JX140646.02_AG.CCR5,cm,CMRPNNNTRESVRIGPGQAFYATGEIIGDIRQAHC
9,U.DQ061827.B.CCR5,cm,CTRPNNNTRKGIHMGPGKVFYATGQIIGDIRQAHC


Check if there are labels 'CCR5' or 'CXCR4' on every row of the df_g2p_cm Dataframe.

In [25]:
df_g2p_cm[~((df_g2p_cm.seq_name.str.contains('CCR5'))|
          (df_g2p_cm.seq_name.str.contains('CXCR4')))]

Unnamed: 0,seq_name,dataset,sequence


In [26]:
def get_label(row):
    if 'CCR5' in row['seq_name'] and 'CXCR4' in row['seq_name']:
        return 'R5X4'
    elif 'CCR5' in row['seq_name']:
        return 'CCR5'
    elif 'CXCR4' in row['seq_name']:
        return 'CXCR4'

Apply get_label to df_g2p_cm

In [27]:
df_g2p_cm['label'] =  df_g2p_cm.apply(get_label, axis=1)

# Reorder columns
df_g2p_cm = df_g2p_cm[['seq_name', 'dataset', 'label', 'sequence']]

In [28]:
df_g2p_cm.head(10)

Unnamed: 0,seq_name,dataset,label,sequence
0,-.HM246206.A.CCR5,cm,CCR5,CVRPNNNTKKSVIGPGQTYANNIIGDIRKAC
1,ACH142.HQ644967.B.CCR5,cm,CCR5,CTRPNNNTRKSIHIGPGRAFYATGDIIGDIRKAHC
2,TH020.U08754.01_AE.CCR5,cm,CCR5,CTRPFNNTRTSLTIGPGQVFYRTGDIIGDIRKAYC
3,CW012.AJ418502.B.CCR5,cm,CCR5,CTRLNNNTRKSIHMGPGRAFYTTGEIIGDIRQAHC
4,BP00069.JN687773.B.CCR5,cm,CCR5,CTRPYNNTRRSIPIGPGRAFYATGEVIGNIRKAYC
5,I.DQ061525.B.CCR5,cm,CCR5,CIRPNNNTRKSIHIGPGRAFYATGEIIGDIRQAHC
6,500.HQ377462.B.CCR5,cm,CCR5,CTRPNNNTRKSISMGPGRAFYATGGIIGNIRQAHC
7,Pat1.AF541040.B.CCR5,cm,CCR5,CTRPNNNTRKSIHIGPGRAFYTTGEIIGDIRQAHC
8,CMP013.JX140646.02_AG.CCR5,cm,CCR5,CMRPNNNTRESVRIGPGQAFYATGEIIGDIRQAHC
9,U.DQ061827.B.CCR5,cm,CCR5,CTRPNNNTRKGIHMGPGKVFYATGQIIGDIRQAHC


Now that all Dataframes have labels, we concatenate them into one main Dataframe.

In [29]:
df_datasets = pd.concat([df_newdb,df_webpssm,df_hivcopred, df_g2p_cm])

In [30]:
df_datasets.shape

(9550, 4)

In [31]:
df_datasets.head(10)

Unnamed: 0,seq_name,dataset,label,sequence
0,RAB014775,newdb,CCR5,CTRPSNNTRTGITIGPGQVWYRTGDIIGDIRKAYC
1,RAB014776,newdb,CCR5,CTRPSNNTRTSITIGPGQVWYRTGDIIGDIRQAYC
2,RAB014778,newdb,CCR5,CTRPSNNTRTSITIGPGQVWYRTGDIIGDIRKAYC
3,RAB014781,newdb,CCR5,CTRPSNNTRTSVTIGPGQVWYRTGDIIGDIRQAYC
4,RAB014834,newdb,CCR5,CTRPSNNTRTSITIGPGQVWYRTGDIIGNIRKAYC
5,RAB023804,newdb,CCR5,CTRPNNNTRKSIRIGPGQTFYATGDIIGDIRQAHC
6,RAB287376,newdb,CCR5,CVRPNNNTRTSVRIGPGQTFYATGEIIGDIRQAFC
7,RAB553911,newdb,CCR5,CERPNNNTRRSIQIGPGRAWFEAEDIIGDIRKAHC
8,RAB553912,newdb,CCR5,CTRPNDNTRKSINIAPGRAFYATGDIIGDIRQAHC
9,RAB553913,newdb,CCR5,CTRPNNNTRKGIHMGPGRAIYTTDIIGDIRQAHC


In [285]:
df_datasets_validation = df_datasets[df_datasets.label == 'validation']
df_datasets_validation.head(10)

Unnamed: 0,seq_name,dataset,label,sequence
279,C.ZM.89.ZM20__phen_SI,webpssm,validation,CARPGNNTRKSIRIGPGQTFFATGAIIGDIRQAHC
280,C.ZW.01.TC28_2__phen_SI,webpssm,validation,CGRPNNHRIKGLRIGPGRAFFAMGAIGGEIRQAHC
281,C.ZW.01.TC03_1__phen_SI,webpssm,validation,CIRPGNNTSKSIRIGQRRPVYVHKIIGDIRQAHC
282,C.ET.97.PHD79C1__phen_SI,webpssm,validation,CIRPNNNTRKSVRIGPGQAFYATGDIIGDIRQAHC
283,C.ZW.01.TC28_1__phen_SI,webpssm,validation,CMRPNNNTRKSVRIGPGQTFFATGAIIGNIRQAHC
284,AC.RW.92.92RW009_di1sCD__phen_SI,webpssm,validation,CPRPNNNTRKSVHIGPGQAFYATGDVIGDIRQAYC
285,AC.RW.92.92RW009_1gCR_AC.RW.92.92RW009_1gER_AC...,webpssm,validation,CSRPNNNTRKSVHIGPGQAFYATGDVIGDIRQAYC
286,C.ZW.01.TC22__phen_SI,webpssm,validation,CTRPGNKTRQSIRIGRGQSFHATGAIIGDIRKAYC
287,C.ZW.01.TC30__phen_SI,webpssm,validation,CTRPGNNTIGPGRTFYATDRIIGDIRQAHC
288,C.ZW.01.TC29__phen_SI,webpssm,validation,CTRPGNNTRKGLRIGPGRTIYATEVTVGDIRQAYC


In [286]:
df_datasets_validation.shape

(71, 4)

In [287]:
def label_validation_dataset(row):
    if 'NSI' in row.seq_name:
        return 'CCR5'
    elif 'SI' in row.seq_name:
        return 'CXCR4'

In [288]:
df_datasets_validation['label'] = df_datasets_validation.apply(label_validation_dataset, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [289]:
df_datasets_validation

Unnamed: 0,seq_name,dataset,label,sequence
279,C.ZM.89.ZM20__phen_SI,webpssm,CXCR4,CARPGNNTRKSIRIGPGQTFFATGAIIGDIRQAHC
280,C.ZW.01.TC28_2__phen_SI,webpssm,CXCR4,CGRPNNHRIKGLRIGPGRAFFAMGAIGGEIRQAHC
281,C.ZW.01.TC03_1__phen_SI,webpssm,CXCR4,CIRPGNNTSKSIRIGQRRPVYVHKIIGDIRQAHC
282,C.ET.97.PHD79C1__phen_SI,webpssm,CXCR4,CIRPNNNTRKSVRIGPGQAFYATGDIIGDIRQAHC
283,C.ZW.01.TC28_1__phen_SI,webpssm,CXCR4,CMRPNNNTRKSVRIGPGQTFFATGAIIGNIRQAHC
...,...,...,...,...
345,C.ZW.01.TC33__phen_NSI,webpssm,CCR5,CTRPNNNTRTSVRIGPGQAFYATGDIIGDIRQAHC
346,C.FR.93.FRMP37__phen_NSI,webpssm,CCR5,CTRPSNNTRKSIRIGPGQAFYATNGIIGDIRAAHC
347,C.ZW.01.TC32__phen_NSI,webpssm,CCR5,CTRPSNNTRKSVWLGPGRAFYTNKVIGNIRKAHC
348,C.FR.91.FRMP197__phen_NSI,webpssm,CCR5,CTRPYNNTRQSIRIGPGQTFYATGDIIGDIRKAHC


In [290]:
# Save validation dataset to TSV
df_datasets_validation.to_csv('webpssm_validation_labeled.tsv', sep='\t')

In [292]:
# Now concatenate the parsed df_datasets_validation to df_datasets
df_datasets = pd.concat([df_datasets[df_datasets.label != 'validation'], df_datasets_validation])

In [295]:
# Save the full Dataset TSV
df_datasets.to_csv('datasets_concat_raw.tsv', sep='\t')

In [297]:
df_datasets.head()

Unnamed: 0,seq_name,dataset,label,sequence
0,RAB014775,newdb,CCR5,CTRPSNNTRTGITIGPGQVWYRTGDIIGDIRKAYC
1,RAB014776,newdb,CCR5,CTRPSNNTRTSITIGPGQVWYRTGDIIGDIRQAYC
2,RAB014778,newdb,CCR5,CTRPSNNTRTSITIGPGQVWYRTGDIIGDIRKAYC
3,RAB014781,newdb,CCR5,CTRPSNNTRTSVTIGPGQVWYRTGDIIGDIRQAYC
4,RAB014834,newdb,CCR5,CTRPSNNTRTSITIGPGQVWYRTGDIIGNIRKAYC


Now we replace the '-' on sequences to remove duplicated ones.<br>
First we check the number of sequences with '-' on the Dataframe.

In [32]:
df_datasets[df_datasets.sequence.str.contains('-')].shape

(1225, 4)

In [33]:
df_datasets[df_datasets.sequence.str.contains('-')].head(10)

Unnamed: 0,seq_name,dataset,label,sequence
108,TV013_ZA_C_NSI/CCR5_u125_TREURNICHT_(2002),webpssm,CCR5,CTRPNNNTRRSIRIGPGQAFY-TNDIIGDIRQAHC
127,98TZ013_TZ_C_CCR5_u144_RODENBURG_(2001),webpssm,CCR5,CTRPGNNTRKSVRIGPGQTFY-TNDIIGDIRQAYC
145,S018_MW_C_CCR5_u162_PING_(1999),webpssm,CCR5,CVRPNNNTRKSIRIGPGQTFYA-NDIIGDIRQAHC
153,S031_MW_C_CCR5_u170_PING_(1999),webpssm,CCR5,CTRPNNNTRKSIRIGPGQTFYA-NDIIGDIRQAHC
156,S180_MW_C_CCR5_u173_PING_(1999),webpssm,CCR5,CTRPGNNTRTSIRIGPGQTFFANN-IIGDIRQAHC
171,DU179MAY99U-R5_ZA_C_CCR5_u19_NICD_(UNPUBL),webpssm,CCR5,CTRPGNNTRKSIRIGPGQAFY-TNHIIGDIRQAYC
203,TM3__ZA_C_NSI/CCR5_u194_CHOGE_(IN_PRESS),webpssm,CCR5,CTRPGNNTRKSIRIGPGQTFYA-NDIIGDIRQAYC
207,TM10__ZA_C_NSI/CCR5_u198_CHOGE_(IN_PRESS),webpssm,CCR5,CTRPNNNTRKSIRIGPGQTFYATN-IIGDIRQAYC
216,TM31__ZA_C_NSI/CCR5_u207_CHOGE_(IN_PRESS),webpssm,CCR5,CTRPGSNTRRSIRIGPGQAFY-TQDIIGDIRQAHC
228,95ZW748_ZW_C_SI_u1_BATRA_(2000),webpssm,CXCR4,CTRPNNNVRKHIRIGIGKVFYA-NDIIGDIRQARC


In [34]:
df_datasets['sequence'] = df_datasets['sequence'].str.replace('-', '', regex=False)

Check if the replace worked:

In [35]:
df_datasets[df_datasets.sequence.str.contains('-')].shape

(0, 4)

Now that our sequences don't have '-' we can drop the duplicated sequences to avoid repetitive data on our trainning set.

In [36]:
df_datasets.shape

(9550, 4)

In [37]:
df_datasets.to_csv('all_datasets_raw.tsv', sep='\t')

In [38]:
df_datasets.duplicated(subset='sequence', keep=False).sum()

8765

In [39]:
set(df_datasets['sequence'].apply(len))

{21, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39}

In [40]:
df_datasets

Unnamed: 0,seq_name,dataset,label,sequence
0,RAB014775,newdb,CCR5,CTRPSNNTRTGITIGPGQVWYRTGDIIGDIRKAYC
1,RAB014776,newdb,CCR5,CTRPSNNTRTSITIGPGQVWYRTGDIIGDIRQAYC
2,RAB014778,newdb,CCR5,CTRPSNNTRTSITIGPGQVWYRTGDIIGDIRKAYC
3,RAB014781,newdb,CCR5,CTRPSNNTRTSVTIGPGQVWYRTGDIIGDIRQAYC
4,RAB014834,newdb,CCR5,CTRPSNNTRTSITIGPGQVWYRTGDIIGNIRKAYC
...,...,...,...,...
1183,CCR5_338MPc01_29448_UG_1997_D,geno2pheno,CCR5,CTRPSNNTRQSTHIGPGQALFTTNVIGNIRQAHC
1184,CCR5_94UG114_147_UG_1994_D,geno2pheno,CCR5,CIRPYNNTRQSTRIGPGQALFTTKVIGDIRQAHC
1185,CCR5_338FIc01_29465_UG_1997_D,geno2pheno,CCR5,CTRPSNNTRQSTRIGPGQALFTTKVIGDIRQAYC
1186,CCR5/CXCR4_DM3_16_28506_UG_-_D,geno2pheno,R5X4,CTRPYNNTRQSTHIGPGQSLFTTKVIGDIRQAYC


In [41]:
len(set(df_datasets.sequence.to_list()))

3608

In [42]:
# Create a Dataframe with unique sequences
df_unique_seqs = df_datasets.drop_duplicates(subset='sequence', keep='first')
df_unique_seqs.shape

(3608, 4)

In [44]:
df_unique_seqs[df_unique_seqs.label == 'validation'].head()

Unnamed: 0,seq_name,dataset,label,sequence
280,C.ZW.01.TC28_2__phen_SI,webpssm,validation,CGRPNNHRIKGLRIGPGRAFFAMGAIGGEIRQAHC
286,C.ZW.01.TC22__phen_SI,webpssm,validation,CTRPGNKTRQSIRIGRGQSFHATGAIIGDIRKAYC
287,C.ZW.01.TC30__phen_SI,webpssm,validation,CTRPGNNTIGPGRTFYATDRIIGDIRQAHC
293,C.ZA.99.ZASW7__phen_SI,webpssm,validation,CTRPGSNKQRNIRIGPGRAFHTNGVIGDIRKAYC
294,C.ZW.01.TC13__phen_SI,webpssm,validation,CTRPNNTRKSVGIGPGKTFYAHGEVIGNIRQAHC


As we can see, there are 3608 unique sequences on our Dataset.<br>
We are going to use these unique sequences to do the alignment and split data into trainning, validation and test sets.<br><br>
To execute the alignment we are going to create a fasta file out of the Dataframe.


In [93]:
# Create fasta file from the df_unique_seqs
with open('dataset_unique_seqs.fasta', 'w') as f:
    for index, row in df_unique_seqs.iterrows():
        f.write(f'>{row.seq_name}|{row.dataset}|{row.label}\n')
        f.write(f'{row.sequence}\n')        

In [45]:
# Align sequences using Muscle
#os.system('/home/gabriel/Documents/Bioinformatics/muscle3.8.31_i86linux64 -in dataset_unique_seqs.fasta -out dataset_unique_seqs_aligned.fasta -gapopen 15')

In [96]:
# Creating Dataframe from Muscle aligned output
df_aligned = pd.read_csv('datasets/',sep='\t',
                       names=['seq_name', 'dataset', 'label', 'sequence'])

KeyboardInterrupt: 

Now that we have a Dataframe with all the unique sequences aligned and labeled we are going to separate ou train set, validation set and test set on a proportion of 80/10/10.

## Old approach

In [170]:
df = pd.read_csv('/home/gabriel/Documents/Repos/DeepTropism/dataset_unique_seqs_aligned_gapopen_15_old.tsv', sep='\t', names=['name', 'dataset', 'label','sequence'])

In [171]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3608 entries, 0 to 3607
Data columns (total 4 columns):
name        3608 non-null object
dataset     3608 non-null object
label       3608 non-null object
sequence    3608 non-null object
dtypes: object(4)
memory usage: 112.9+ KB


In [172]:
df.head(10)

Unnamed: 0,name,dataset,label,sequence
0,CCR5_1471_29187_CN_2003_B,geno2pheno,CCR5,CTQTQQQY-KKKY---T----------SRTR-ASM-----V-CNRR...
1,CCR5/CXCR4/CCR1/CCR2b/CCR3/CCR4_MVP5180_67_CM_...,geno2pheno,R5X4,CIREGIAE-VQDI---Y--T--G-P-----M-RWRSMTLKR-SNNT...
2,RKF859742,newdb,CCR5,CERPTMDI-QDIH------I--G-P-----M-AWYSTYIER-QAKG...
3,RAF009608,hivcopred,CCR5,CSRPEMDV-QEIR---N-----G-P-----M-AWYSMALAK-GGTT...
4,RKF859743,newdb,CCR5,CRRPAMKV-QEMR---I----------G--PMAWY-----S-MALE...
5,RAF009610,hivcopred,CCR5,CSRPAMEV--QEM---R--I--G-P-----M-AWYSMALER-GGTT...
6,RAM262120,newdb,CCR5,CVRPGNNS-VQEM---R--V--G-P-----M-AWYSMELEK-NGSR...
7,RAM262127,newdb,CCR5,CVRPGDNS-VKEM---R--A--G-P-----M-AWYSMELER-NGSR...
8,RAM262126,newdb,CCR5,CVRPGNNS-VKEM---R--V--G-P-----M-ALYSMELER-NGSR...
9,RAM262125,newdb,CCR5,CVRPGNNT-VKEM---R--V--G-P-----M-AWYSMELER-NGSR...


In [173]:
df.drop_duplicates(subset='sequence', keep='first', inplace=True)

In [174]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3608 entries, 0 to 3607
Data columns (total 4 columns):
name        3608 non-null object
dataset     3608 non-null object
label       3608 non-null object
sequence    3608 non-null object
dtypes: object(4)
memory usage: 140.9+ KB


In [175]:
seq = df.loc[0, 'sequence']
seq

'CTQTQQQY-KKKY---T----------SRTR-ASM-----V-CNRR----NNRRYKTSTL'

In [188]:
# Function to call labels
def tropism_label(row):
    # For CCR5
    if row.label == 'CCR5':
        return 0
    # For CXCR4
    elif row.label == 'CXCR4':
        return 1
    # For R5X4
    elif row.label == 'R5X4':
        return 1

In [189]:
df_validation = df[df.label == 'validation']
df_validation.head()

Unnamed: 0,name,dataset,label,sequence
93,C.ZW.01.TC19__phen_NSI,webpssm,validation,CIRPGNNT-RQSV---R--I--G-P--G--Q-TFY---A-T--GDI...
130,C.ZW.01.TC34__phen_NSI,webpssm,validation,CTRPNNNT-RKSI---R--I--G-P--G--Q-VFY---A-T--GEI...
131,C.ZW.01.TC27__phen_NSI,webpssm,validation,CTRPNNNT-RKSI---R--I--G-P--G--Q-VFY---A-P--GDI...
143,C.ZW.01.TC12__phen_NSI,webpssm,validation,CTRPNNNT-RESV---R--I--G-P--G--Q-VFY---A-T--GDI...
147,C.FR.90.FRMP19__phen_NSI,webpssm,validation,CTRPGNNT-RESI---R--I--G-P--G--Q-TFY---A-T--GDI...


In [179]:
df_validation.shape

(36, 4)

In [None]:
def label_validation_samples(row):
    if 

## Define label_numeric to df

In [190]:
df['label_numeric'] = df.apply(tropism_label, axis=1)

In [191]:
df.label.value_counts()

CCR5          2757
R5X4           484
CXCR4          331
validation      36
Name: label, dtype: int64

In [192]:
df = df[df.label != 'validation']

In [193]:
## Remove validation rows
df = df[df.label != 'validation']

In [194]:
df.label.value_counts()

CCR5     2757
R5X4      484
CXCR4     331
Name: label, dtype: int64

In [195]:
df.label_numeric.value_counts()

0.0    2757
1.0     815
Name: label_numeric, dtype: int64

In [196]:
df.shape

(3572, 5)

In [197]:
df.head()

Unnamed: 0,name,dataset,label,sequence,label_numeric
0,CCR5_1471_29187_CN_2003_B,geno2pheno,CCR5,CTQTQQQY-KKKY---T----------SRTR-ASM-----V-CNRR...,0.0
1,CCR5/CXCR4/CCR1/CCR2b/CCR3/CCR4_MVP5180_67_CM_...,geno2pheno,R5X4,CIREGIAE-VQDI---Y--T--G-P-----M-RWRSMTLKR-SNNT...,1.0
2,RKF859742,newdb,CCR5,CERPTMDI-QDIH------I--G-P-----M-AWYSTYIER-QAKG...,0.0
3,RAF009608,hivcopred,CCR5,CSRPEMDV-QEIR---N-----G-P-----M-AWYSMALAK-GGTT...,0.0
4,RKF859743,newdb,CCR5,CRRPAMKV-QEMR---I----------G--PMAWY-----S-MALE...,0.0


In [198]:
# Check of len of sequence is the same for all rows
set(df['sequence'].apply(len))

{60}

In [199]:
# Save the Newdb processed dataset into TSV
df.to_csv('newdb_wrangled.tsv', sep='\t')

## Converting Protein Sequence to Vectors

In [200]:
df_aa = pd.read_csv('datasets/aminoacids_oneletter_code.csv', sep='\t')

In [201]:
df_aa

Unnamed: 0,Pos_array,3-letters-code,1-letter-code,Aminoacid
0,1.0,Ala,A,Alanine
1,2.0,Asn,N,Asparagine
2,3.0,Asp,D,Aspartic acid
3,4.0,Cys,C,Cysteine
4,5.0,Gln,Q,Glutamine
5,6.0,Glu,E,Glutamic acid
6,7.0,Gly,G,Glycine
7,8.0,His,H,Histidine
8,9.0,Ile,I,Isoleucine
9,10.0,Leu,L,Leucine


In [202]:
def get_array_from_sequence(protein_sequence):
    """
    Function to convert a protein sequence into a tensor.
    Each amino acid is represented by an numpy array of zeros of size 26,
    and the dict_aa_pos defines the position to be converted to 1.
    
    The function iterates over the protein sequences and stacks the arrays.
    At the end the arrays are linearized and converted to a tensor of size
    n x 26, with n the size of the protein.
    
    If the character is not present on the dict_aa_pos (eg. '-') the respective
    array is formed by zeros, and represents a missing value.
    """
    dict_aa_pos = {
    'A':1, 'R':2, 'N':3, 'D':4, 'C':5, 'Q':6, 'E':7, 'G':8,
    'H':9, 'I':10, 'L':11, 'K':12, 'M':13, 'F':14, 'P':15, 
    'O':16, 'S':17, 'U':18, 'T':19, 'W':20, 'Y':21, 'V':22, 
    'B':23, 'Z':24, 'J':25, 'X':0}
    
    f_array = np.zeros(26)
    for aa in protein_sequence:
        arr = np.zeros(26)
        if dict_aa_pos.get(aa):
            arr[dict_aa_pos.get(aa)] = 1
        f_array = np.vstack((f_array, arr))
    f_array = np.delete(f_array, 0,0)
    
    #return torch.from_numpy((f_array.flatten()).astype(float))
    return f_array.flatten().astype(float)
    

In [203]:
ex = get_array_from_sequence('CSRP-GNN-TR-TSI---PI--GP-GR--A-WF---AT--G----D--V-TGDPRKAHC')

In [204]:
sum(ex)

35.0

In [205]:
df.head(20)

Unnamed: 0,name,dataset,label,sequence,label_numeric
0,CCR5_1471_29187_CN_2003_B,geno2pheno,CCR5,CTQTQQQY-KKKY---T----------SRTR-ASM-----V-CNRR...,0.0
1,CCR5/CXCR4/CCR1/CCR2b/CCR3/CCR4_MVP5180_67_CM_...,geno2pheno,R5X4,CIREGIAE-VQDI---Y--T--G-P-----M-RWRSMTLKR-SNNT...,1.0
2,RKF859742,newdb,CCR5,CERPTMDI-QDIH------I--G-P-----M-AWYSTYIER-QAKG...,0.0
3,RAF009608,hivcopred,CCR5,CSRPEMDV-QEIR---N-----G-P-----M-AWYSMALAK-GGTT...,0.0
4,RKF859743,newdb,CCR5,CRRPAMKV-QEMR---I----------G--PMAWY-----S-MALE...,0.0
5,RAF009610,hivcopred,CCR5,CSRPAMEV--QEM---R--I--G-P-----M-AWYSMALER-GGTT...,0.0
6,RAM262120,newdb,CCR5,CVRPGNNS-VQEM---R--V--G-P-----M-AWYSMELEK-NGSR...,0.0
7,RAM262127,newdb,CCR5,CVRPGDNS-VKEM---R--A--G-P-----M-AWYSMELER-NGSR...,0.0
8,RAM262126,newdb,CCR5,CVRPGNNS-VKEM---R--V--G-P-----M-ALYSMELER-NGSR...,0.0
9,RAM262125,newdb,CCR5,CVRPGNNT-VKEM---R--V--G-P-----M-AWYSMELER-NGSR...,0.0


In [206]:
# Create list to append data from the df
list_data = []
list_labels = []

# Convert the sequences and labels to arrays to use as data on pytorch
for index, row in df.iterrows():
    list_data.append(get_array_from_sequence(str(row.sequence)))
    list_labels.append(int(row.label_numeric))
    

In [208]:
len(list_labels)

3572

In [209]:
len(list_labels)

3572

In [135]:
batch_size = 32
validation_split = .2
shuffle_dataset = True
random_seed= 5

# Creating data indices for training and validation splits:
dataset_size = len(list_data)
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))

In [136]:
split

714

In [137]:
if shuffle_dataset :
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]

In [138]:
len(val_indices)

714

In [211]:
len(train_indices)

2858

In [140]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

# To transform list_data and list_labels on trainloader

# For training
train_data = []
for i in train_indices:
    train_data.append([list_data[i], list_labels[i]])

train_loader = torch.utils.data.DataLoader(train_data, shuffle=True, batch_size=64)

# For validation
validation_data = []
for j in val_indices:
    test_data.append([list_data[j], list_labels[j]])

test_loader = torch.utils.data.DataLoader(test_data, shuffle=True, batch_size=64)


## New approach to create Dataloader

In [226]:
# For training
train_data = []
train_label = []
for i in train_indices:
    train_data.append(list_data[i])
    train_label.append(np.array(list_labels[i]))
    

#train_loader = torch.utils.data.DataLoader(train_data, shuffle=True, batch_size=64)

# For validation
validation_data = []
validation_label = []
for j in val_indices:
    validation_data.append(list_data[i])
    validation_label.append(np.array(list_labels[i]))

#test_loader = torch.utils.data.DataLoader(test_data, shuffle=True, batch_size=64)


In [227]:
#train_label = [np.array(x) for x in train_label]

In [229]:
# To transform list_data and list_labels on trainloader
train_tensor_x = torch.stack([torch.from_numpy(i) for i in train_data]) # transform to torch tensors
train_tensor_y = torch.stack([torch.from_numpy(i) for i in train_label])

train_dataset = torch.utils.data.TensorDataset(train_tensor_x,train_tensor_y) # create your datset
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=64) # create your dataloader

# The same for validation
validation_tensor_x = torch.stack([torch.from_numpy(i) for i in validation_data]) # transform to torch tensors
validation_tensor_y = torch.stack([torch.from_numpy(i) for i in validation_label])

validation_dataset = torch.utils.data.TensorDataset(validation_tensor_x,validation_tensor_y) # create your datset
validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=64) # create your dataloader


In [92]:
# For training
train_data = []
for i in train_indices:
    train_data.append([list_data[i], list_labels[i]])

train_loader = torch.utils.data.DataLoader(train_data, shuffle=True, batch_size=32)

# For validation
test_data = []
for j in val_indices:
    test_data.append([list_data[j], list_labels[j]])

test_loader = torch.utils.data.DataLoader(test_data, shuffle=True, batch_size=32)

In [230]:
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.linear1 = nn.Linear(1560,250)
        self.linear2 = nn.Linear(250,100)
        self.linear3 = nn.Linear(100,2)
    
    def forward(self,X):
        X = F.relu(self.linear1(X))
        X = F.relu(self.linear2(X))
        X = self.linear3(X)
        return F.log_softmax(X, dim=1)
 
#mlp = MLP()
#print(mlp)
model = MLP()
model = model.float()

In [253]:
import torch.nn as nn
import torch.nn.functional as F

# Define the class Net
class Net(nn.Module):
    def __init__(self):    
        # Define all the parameters of the net
        super(Net, self).__init__()
        self.fc1 = nn.Linear(1560, 200)
        self.fc2 = nn.Linear(200, 2)

    def forward(self, x):   
        # Do the forward pass
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x
model = Net()
model = model.float()

In [42]:
class AminoNet(nn.Module):
    def __init__(self):
        super(AminoNet, self).__init__()
        #self.conv1 = nn.Conv2d(3, 6, 5)
        #self.pool = nn.MaxPool2d(2, 2)
        #self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(30, 150)
        self.fc2 = nn.Linear(150, 50)
        self.fc3 = nn.Linear(50, 2)
        self.conv1 = nn.Conv1d(1560, 30, 52, 26)

    def forward(self, x):
        #x = self.pool(F.relu(self.conv1(x)))
        #x = self.pool(F.relu(self.conv2(x)))
        #x = x.view(-1, 16 * 5 * 5)
        #x = F.relu(self.conv1(self.fc1(x)))
        x = F.relu(self.conv1(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

model = AminoNet()
model = model.float()

In [234]:
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        #self.conv1 = nn.Conv2d(3, 6, 5)
        #self.pool = nn.MaxPool2d(2, 2)
        #self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(1560, 780)
        self.fc2 = nn.Linear(780, 300)
        self.fc3 = nn.Linear(300, 600)
        self.fc4 = nn.Linear(600, 200)
        self.fc5 = nn.Linear(200, 50)
        self.fc6 = nn.Linear(50, 2)
        self.Drop1 = nn.Dropout(p=0.2)
        self.Drop2 = nn.Dropout(p=0.4)

    def forward(self, x):
        #x = self.pool(F.relu(self.conv1(x)))
        #x = self.pool(F.relu(self.conv2(x)))
        #x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.Drop1(x)
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = self.Drop2(x)
        x = F.relu(self.fc5(x))
        x = self.fc6(x)
        return x


model = Net()
model = model.float()

In [231]:
model

MLP(
  (linear1): Linear(in_features=1560, out_features=250, bias=True)
  (linear2): Linear(in_features=250, out_features=100, bias=True)
  (linear3): Linear(in_features=100, out_features=2, bias=True)
)

In [234]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.002, momentum=0.9)

In [235]:
for epoch in range(100):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(train_dataloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs.float())
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        #print(running_loss)
        if i % 3239 == 0:    # print every 3239 mini-batches
            print('[%d, %5d] loss: %.5f' %
                  (epoch + 1, i + 1, running_loss / 50))
            running_loss = 0.0

print('Finished Training')

[1,     1] loss: 0.00444
[2,     1] loss: 0.00426
[3,     1] loss: 0.00428
[4,     1] loss: 0.00430
[5,     1] loss: 0.00431
[6,     1] loss: 0.00431
[7,     1] loss: 0.00431
[8,     1] loss: 0.00431
[9,     1] loss: 0.00431
[10,     1] loss: 0.00431
[11,     1] loss: 0.00431
[12,     1] loss: 0.00430
[13,     1] loss: 0.00430
[14,     1] loss: 0.00430
[15,     1] loss: 0.00429
[16,     1] loss: 0.00429
[17,     1] loss: 0.00428
[18,     1] loss: 0.00428
[19,     1] loss: 0.00428
[20,     1] loss: 0.00427
[21,     1] loss: 0.00427
[22,     1] loss: 0.00426
[23,     1] loss: 0.00426
[24,     1] loss: 0.00425
[25,     1] loss: 0.00424
[26,     1] loss: 0.00424
[27,     1] loss: 0.00423
[28,     1] loss: 0.00423
[29,     1] loss: 0.00422
[30,     1] loss: 0.00422
[31,     1] loss: 0.00421
[32,     1] loss: 0.00420
[33,     1] loss: 0.00419
[34,     1] loss: 0.00418
[35,     1] loss: 0.00417
[36,     1] loss: 0.00417
[37,     1] loss: 0.00416
[38,     1] loss: 0.00414
[39,     1] loss: 0.0

In [280]:
correct = 0
total = 0
error = 0
labels_array = np.empty([0])
predict_array = np.empty([0])

with torch.no_grad():
    for data in validation_dataloader:
        images, labels = data
        outputs = model(images.float())
        _, predicted = torch.max(outputs.data, 1)
        
        labels_array = np.concatenate([labels_array, labels])
        predict_array = np.concatenate([predict_array, predicted])
        
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        error += (predicted != labels).sum().item()

print('Accuracy of the network on the 10000 test images: %d %%' % (
    100 * correct / total))

Accuracy of the network on the 10000 test images: 100 %


In [281]:
def show_metrics(y_true, y_score):
    # True positive
    tp = np.sum(y_true * y_score)
    # False positive
    fp = np.sum((y_true == 0) * y_score)
    # True negative
    tn = np.sum((y_true==0) * (y_score==0))
    # False negative
    fn = np.sum(y_true * (y_score==0))

    # True positive rate (sensitivity or recall)
    tpr = tp / (tp + fn)
    # False positive rate (fall-out)
    fpr = fp / (fp + tn)
    # Precision
    precision = tp / (tp + fp)
    # True negatvie tate (specificity)
    tnr = 1 - fpr
    # F1 score
    f1 = 2*tp / (2*tp + fp + fn)
    # ROC-AUC for binary classification
    auc = (tpr+tnr) / 2
    # MCC
    mcc = (tp * tn - fp * fn) / np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

    print("True positive: ", tp)
    print("False positive: ", fp)
    print("True negative: ", tn)
    print("False negative: ", fn)

    print("True positive rate (recall): ", tpr)
    print("False positive rate: ", fpr)
    print("Precision: ", precision)
    print("True negative rate: ", tnr)
    print("F1: ", f1)
    print("ROC-AUC: ", auc)
    print("MCC: ", mcc)

In [282]:
show_metrics(labels_array, predict_array)

True positive:  0.0
False positive:  0.0
True negative:  714
False negative:  0.0
True positive rate (recall):  nan
False positive rate:  0.0
Precision:  nan
True negative rate:  1.0
F1:  nan
ROC-AUC:  nan
MCC:  nan


  if sys.path[0] == '':
  app.launch_new_instance()


# Processing CM dataset

In [164]:
df_cm = pd.read_csv('datasets/cm_aligned.tsv', sep='\t', names=['name','sequence'])

In [167]:
df_cm.head(10)

Unnamed: 0,name,sequence
0,1432.KF859742.O.CCR5,C-ERPTMD--I-QD---I-H--IGP----MA-WY--STYIER-QAK...
1,MD47.KF859744.O.CCR5,C-QRPGHQ-EI-QE---I-K--TGP-----LAWY--SMYLKE-NTT...
2,BCF02.U24562.O.CCR5,C-QRPGHQ-TV-QE---I-R--IGP-----MAWY--SM---G-LAA...
3,152.KF859743.O.CCR5,C-RRPAMK--V-QE---M-R--IGP----MA-WY--SMALEE-GNN...
4,DUR.X84327.O.CCR5,C-VRPGNN-SV-QE---I-K--IGP-----MAWY--SM-----QIE...
5,DUR.AM262121.O.CCR5,C-VRPGNN-SV-QE---M-R--VGP--M--A-WY--SM-----ELE...
6,DUR.AM262130.O.CCR5,C-VRPGNN-SV-KE---M-R--VGP--M--A-LY--SM-----ELE...
7,DUR.AM262127.O.CCR5,C-VRPGDN-SV-KE---M-R--AGP-----MAWY--SM-----ELE...
8,CA9.X96522.O.CCR5,C-ERPGNH-TV-QE---I-R--IGP-LA----WY--SM---G-IEK...
9,BCF01.U24566.O.CCR5,C-HRPGNL-SV-QE---M-K--IGP--LS---WY--SM---G-LAA...


In [186]:
df_cm.name

0             1432.KF859742.O.CCR5
1             MD47.KF859744.O.CCR5
2              BCF02.U24562.O.CCR5
3              152.KF859743.O.CCR5
4                DUR.X84327.O.CCR5
                   ...            
2674    H13988_DS2.JF508074.B.CCR5
2675    H13988_DS2.JF508043.B.CCR5
2676            39.AF022258.B.CCR5
2677           122.DQ002264.B.CCR5
2678          Pat1.AF541016.B.CCR5
Name: name, Length: 2679, dtype: object

In [187]:
df.loc[0]['name']

'RKF859742'

In [193]:
df_cm['label'] = df_cm.apply(get_label, axis=1)

In [194]:
df_cm['label_numeric'] = df_cm.apply(tropism_label, axis=1)

In [196]:
df_cm.head(20)

Unnamed: 0,name,sequence,label,label_numeric
0,1432.KF859742.O.CCR5,C-ERPTMD--I-QD---I-H--IGP----MA-WY--STYIER-QAK...,CCR5,0
1,MD47.KF859744.O.CCR5,C-QRPGHQ-EI-QE---I-K--TGP-----LAWY--SMYLKE-NTT...,CCR5,0
2,BCF02.U24562.O.CCR5,C-QRPGHQ-TV-QE---I-R--IGP-----MAWY--SM---G-LAA...,CCR5,0
3,152.KF859743.O.CCR5,C-RRPAMK--V-QE---M-R--IGP----MA-WY--SMALEE-GNN...,CCR5,0
4,DUR.X84327.O.CCR5,C-VRPGNN-SV-QE---I-K--IGP-----MAWY--SM-----QIE...,CCR5,0
5,DUR.AM262121.O.CCR5,C-VRPGNN-SV-QE---M-R--VGP--M--A-WY--SM-----ELE...,CCR5,0
6,DUR.AM262130.O.CCR5,C-VRPGNN-SV-KE---M-R--VGP--M--A-LY--SM-----ELE...,CCR5,0
7,DUR.AM262127.O.CCR5,C-VRPGDN-SV-KE---M-R--AGP-----MAWY--SM-----ELE...,CCR5,0
8,CA9.X96522.O.CCR5,C-ERPGNH-TV-QE---I-R--IGP-LA----WY--SM---G-IEK...,CCR5,0
9,BCF01.U24566.O.CCR5,C-HRPGNL-SV-QE---M-K--IGP--LS---WY--SM---G-LAA...,CCR5,0


## Testing DeepSea architecture
https://github.com/PuYuQian/PyDeepSEAm

### Testing Conv1d
https://medium.com/@santi.pdp/how-pytorch-transposed-convs1d-work-a7adac63c4a5

In [173]:
import torch.nn as nn
import torch.nn.functional as F

In [176]:
x = torch.ones(1,1,7)
conv = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=3, stride=1, padding=0, bias=False)

In [179]:
conv.weight.data = torch.ones(1,1,3)
y = conv(x)

In [180]:
y

tensor([[[3., 3., 3., 3., 3.]]], grad_fn=<SqueezeBackward1>)

In [226]:
class DeepSEA(nn.Module):
    def __init__(self, ):
        super(DeepSEA, self).__init__()
        self.Conv1 = nn.Conv1d(in_channels=1, out_channels=320, kernel_size=8)
        self.Conv2 = nn.Conv1d(in_channels=320, out_channels=480, kernel_size=8)
        self.Conv3 = nn.Conv1d(in_channels=480, out_channels=960, kernel_size=8)
        self.Maxpool = nn.MaxPool1d(kernel_size=4, stride=4)
        self.Drop1 = nn.Dropout(p=0.2)
        self.Drop2 = nn.Dropout(p=0.5)
        self.Linear1 = nn.Linear(53*960, 925)
        self.Linear2 = nn.Linear(925, 2)

    def forward(self, input):
        x = self.Conv1(input)
        x = F.relu(x)
        x = self.Maxpool(x)
        x = self.Drop1(x)
        x = self.Conv2(x)
        x = F.relu(x)
        x = self.Maxpool(x)
        x = self.Drop1(x)
        x = self.Conv3(x)
        x = F.relu(x)
        x = self.Drop2(x)
        x = x.view(-1, 53*960)
        x = self.Linear1(x)
        x = F.relu(x)
        x = self.Linear2(x)
        return x

model = DeepSEA()
model = model.float()

In [227]:
import torch.optim as optim
#criterion = nn.CrossEntropyLoss()
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

In [228]:
for epoch in range(20):  # loop over the dataset multiple times

    running_loss = 0.0
    #for i, data in enumerate(train_data_loader):
    for i, data in enumerate(my_dataloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs.float())
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        #print(running_loss)
        if i % 1000 == 0:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 50))
            running_loss = 0.0

print('Finished Training')

RuntimeError: Expected 3-dimensional input for 3-dimensional weight 320 1, but got 2-dimensional input of size [64, 1560] instead

In [259]:
t1

tensor([1, 0, 1])

In [261]:
t1 = np.concatenate([t1, t2])

In [262]:
t1

array([1, 0, 1, 1, 0, 1])