In [44]:
import pandas as pd
import numpy as np
import torch

In [5]:
# Load the aligned sequences from a tab-separated file. Because `names` is
# given without `header`, pandas treats the file as having no header row and
# labels its two columns 'name' and 'sequence'.
df = pd.read_csv('newdb_aligned.csv', sep='\t', names=['name', 'sequence'])

In [7]:
df.head()

Unnamed: 0,name,sequence
0,1432|KF859742|O|CCR5,C-ERPTMD--I-QD---I-H--IGP----MA-WY--STYIER-QAK...
1,MD47|KF859744|O|CCR5,C-QRPGHQ-EI-QE---I-K--TGP-----LAWY--SMYLKE-NTT...
2,BCF02|U24562|O|CCR5,C-QRPGHQ-TV-QE---I-R--IGP-----MAWY--SM---G-LAA...
3,152|KF859743|O|CCR5,C-RRPAMK--V-QE---M-R--IGP----MA-WY--SMALEE-GNN...
4,DUR|X84327|O|CCR5,C-VRPGNN-SV-QE---I-K--IGP-----MAWY--SM-----QIE...


In [14]:
# Define labels: one boolean column per co-receptor, True when the
# co-receptor's name occurs in the sequence identifier.
df['ccr5'] = df['name'].str.contains('CCR5')    # CCR5 flag
df['cxcr4'] = df['name'].str.contains('CXCR4')  # CXCR4 flag
df.head(10)

Unnamed: 0,name,sequence,ccr5,cxcr4
0,1432|KF859742|O|CCR5,C-ERPTMD--I-QD---I-H--IGP----MA-WY--STYIER-QAK...,True,False
1,MD47|KF859744|O|CCR5,C-QRPGHQ-EI-QE---I-K--TGP-----LAWY--SMYLKE-NTT...,True,False
2,BCF02|U24562|O|CCR5,C-QRPGHQ-TV-QE---I-R--IGP-----MAWY--SM---G-LAA...,True,False
3,152|KF859743|O|CCR5,C-RRPAMK--V-QE---M-R--IGP----MA-WY--SMALEE-GNN...,True,False
4,DUR|X84327|O|CCR5,C-VRPGNN-SV-QE---I-K--IGP-----MAWY--SM-----QIE...,True,False
5,DUR|AM262121|O|CCR5,C-VRPGNN-SV-QE---M-R--VGP--M--A-WY--SM-----ELE...,True,False
6,DUR|AM262130|O|CCR5,C-VRPGNN-SV-KE---M-R--VGP--M--A-LY--SM-----ELE...,True,False
7,DUR|AM262127|O|CCR5,C-VRPGDN-SV-KE---M-R--AGP-----MAWY--SM-----ELE...,True,False
8,CA9|X96522|O|CCR5,C-ERPGNH-TV-QE---I-R--IGP-LA----WY--SM---G-IEK...,True,False
9,BCF01|U24566|O|CCR5,C-HRPGNL-SV-QE---M-K--IGP--LS---WY--SM---G-LAA...,True,False


In [15]:
# Function to call labels
def tropism_label(row):
    """Return the tropism label for one row.

    `row` must expose truthy/falsy `ccr5` and `cxcr4` attributes.

    Returns 'dual_tropic' when both flags are set, 'CCR5' or 'CXCR4' when
    only that flag is set, and None when neither is set.
    """
    uses_ccr5 = row.ccr5
    uses_cxcr4 = row.cxcr4

    if uses_ccr5 and uses_cxcr4:
        return 'dual_tropic'
    if uses_ccr5:
        return 'CCR5'
    if uses_cxcr4:
        return 'CXCR4'
    # Neither co-receptor flagged: no label.
    return None

In [16]:
# Call tropism_label once per row (axis=1) and store the result in a new
# 'label' column.
df['label'] = df.apply(tropism_label, axis=1)

In [21]:
df.label.value_counts()

CCR5           2354
CXCR4           277
dual_tropic      48
Name: label, dtype: int64

In [24]:
# Count the rows where at least one of the two flags is False,
# i.e. the rows that are not flagged for both co-receptors.
int((df['ccr5'].eq(False) | df['cxcr4'].eq(False)).sum())

2631

In [25]:
df.shape

(2679, 5)

In [None]:
#df[df.label == 'dual_tropic']

In [32]:
# Check that the sequence length is the same for all rows:
# a single-element set means every sequence has that one length.
set(df['sequence'].apply(len))

{60}

## Converting Protein Sequence to Vectors

In [37]:
# Load the tab-separated amino-acid code table; the '1-letter-code' column
# is read from it further down.
df_aa = pd.read_csv('aminoacids_oneletter_code.csv', sep='\t')

In [38]:
df_aa

Unnamed: 0,3-letters-code,1-letter-code,Aminoacid
0,Ala,A,Alanine
1,Arg,R,Arginine
2,Asn,N,Asparagine
3,Asp,D,Aspartic acid
4,Cys,C,Cysteine
5,Gln,Q,Glutamine
6,Glu,E,Glutamic acid
7,Gly,G,Glycine
8,His,H,Histidine
9,Ile,I,Isoleucine


In [41]:
df_aa['1-letter-code'].to_list()

['A',
 'R',
 'N',
 'D',
 'C',
 'Q',
 'E',
 'G',
 'H',
 'I',
 'L',
 'K',
 'M',
 'F',
 'P',
 'O',
 'S',
 'U',
 'T',
 'W',
 'Y',
 'V',
 'B',
 'Z',
 'X',
 'J',
 '-']

In [49]:
# Map each one-letter amino-acid code to a 1-based position (1..26), in the
# order of the alphabet string below. The alignment gap '-' has no position
# and maps to None.
dict_aa_pos = {
    code: position
    for position, code in enumerate('ARNDCQEGHILKMFPOSUTWYVBZJX', start=1)
}
dict_aa_pos['-'] = None

In [None]:
def get_array_from_sequence(seq, aa_index=None):
    """One-hot encode a protein sequence, one row per residue.

    Every character of `seq` becomes a row with a single 1.0 in column
    `position - 1`, where `position` is the 1-based value stored for that
    character in the lookup table. Characters mapped to None (the alignment
    gap '-' in the notebook's table) become an all-zero row.

    Parameters
    ----------
    seq : str
        Sequence of one-letter codes (may contain gap characters).
    aa_index : dict, optional
        Mapping from character to 1-based position, or None for "no column".
        Defaults to the notebook-level `dict_aa_pos`.

    Returns
    -------
    numpy.ndarray
        Float array of shape (len(seq), width), where width is the largest
        position in the lookup table (26 for `dict_aa_pos`). Call `.ravel()`
        on the result to get a single flat feature vector.

    Raises
    ------
    KeyError
        If `seq` contains a character that is missing from the lookup table.
    """
    if aa_index is None:
        # Fall back to the lookup table defined in the notebook.
        aa_index = dict_aa_pos

    # Width of each one-hot row: the largest position present in the table.
    width = max((pos for pos in aa_index.values() if pos is not None), default=0)

    encoded = np.zeros((len(seq), width))
    for row, aa in enumerate(seq):
        if aa not in aa_index:
            raise KeyError(f"unknown residue {aa!r} at sequence position {row}")
        pos = aa_index[aa]
        if pos is not None:
            # Positions are 1-based, array columns are 0-based.
            encoded[row, pos - 1] = 1
    return encoded

In [46]:
# Scratch: a length-5 vector of zeros to try out one-hot encoding by hand.
arr = np.zeros(5)

In [47]:
# Switch on a single position (index 2) of the zero vector.
arr[2] = 1

In [48]:
arr

array([0., 0., 1., 0., 0.])

In [50]:
# Scratch: a second length-5 zero vector, left all zeros.
arr2 = np.zeros(5)

In [52]:
arr2

array([0., 0., 0., 0., 0.])

In [55]:
# Join the two vectors end to end into one flat vector.
arr3 = np.concatenate([arr, arr2])

In [56]:
arr3

array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0.])

In [57]:
# Scratch: small integer array for experimenting.
a = np.array([1, 2, 3])

In [58]:
a

array([1, 2, 3])

In [59]:
# Scratch: second small integer array for experimenting.
b = np.array([4, 5, 6])

In [61]:
# Stack the two vectors as rows of a 2-D array (one row per input vector).
np.vstack((arr, arr2))

array([[0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0.]])