In [1]:
!pip install propy3

Collecting propy3
  Downloading propy3-1.1.1-py3-none-any.whl.metadata (5.6 kB)
Downloading propy3-1.1.1-py3-none-any.whl (290 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.3/290.3 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: propy3
Successfully installed propy3-1.1.1


In [3]:
aa=['A','R','N','D','C','E','Q','G','H','I','L','K','M','F','P','S','T','W','Y','V']

sequence based features

AAC Amino Acid Composition https://www.sciencedirect.com/topics/immunology-and-microbiology/amino-acid-composition \
DPC dipeptide composition http://www.csbio.sjtu.edu.cn/bioinf/PseAA/dipeptide.htm

In [17]:
from propy.AAComposition import CalculateAAComposition
from propy.AAComposition import CalculateAADipeptideComposition
from propy.GetProteinFromUniprot import GetProteinSequence
from propy.QuasiSequenceOrder import GetQuasiSequenceOrder
import pandas as pd
import numpy as np

In [27]:
#sequence based features
def aac(example):
  #Amino acid composition
  res=CalculateAAComposition(example)
  return(res)

def aacount(example):
  #Amino acid count
  res=0
  for elem in example:
    res+=example.count(elem)
  res/=len(example)
  return(res)

def dpc(example):
  #Dipeptide composition
  diPeptides = [aa1 + aa2 for aa1 in aa for aa2 in aa]
  res=CalculateAADipeptideComposition(example)
  kkeys=list(res.keys())
  for key in kkeys:
    if(key not in diPeptides):
      res.pop(key)
  return res

def qso(example):
  #Quasi Sequence composition
  res=GetQuasiSequenceOrder(example)
  return res




#psychochemical features
def psychochemical_features(example, typer='hydrophobicity', mode='hydrophobicity'):
  main_dicter={}
  main_dicter['hydrophobicity']={'polar':['R', 'K', 'E', 'D', 'Q', 'N'],
                                 'neutral':['G', 'A', 'S', 'T', 'P', 'H', 'Y'],
                                 'hydrophobicity':['C', 'L', 'V', 'I', 'M', 'F', 'W']}
  main_dicter['vanDerWaals']={'0–2.78':['G', 'A', 'S', 'T', 'P', 'D', 'C'],
                              '2.95–4.0': ['N', 'V', 'E', 'Q', 'I', 'L'],
                              '4.03–8.08': ['M', 'H', 'K', 'F', 'R', 'Y', 'W']}
  main_dicter['polarity']={'4.9–6.2': ['L', 'I', 'F', 'W', 'C', 'M', 'V', 'Y'],
                           '8.0–9.2': ['P', 'A', 'T', 'G', 'S'],
                           '10.4–13.0': ['H', 'Q', 'R', 'K', 'N', 'E', 'D']}
  main_dicter['polarizability']={'0–0.108': ['G', 'A', 'S', 'D', 'T'],
                                 '0.128–0.186': ['C', 'P', 'N', 'V', 'E', 'Q', 'I', 'L'],
                                 '0.219–0.409': ['K', 'M', 'H', 'F', 'R', 'Y', 'W']}
  main_dicter['charge']={'positive': ['K', 'R'],
                         'neutral': ['A', 'N', 'C', 'Q', 'G', 'H', 'I', 'L', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'],
                         'negative': ['D', 'E']}
  main_dicter['secondaryStructure']={'helix': ['E', 'A', 'L', 'M', 'Q', 'K', 'R', 'H'],
                                     'strand': ['V', 'I', 'Y', 'C', 'W', 'F', 'T'],
                                     'coil': ['G', 'N', 'P', 'S', 'D']}
  main_dicter['solventAccessibility']={'buried': ['A', 'L', 'F', 'C', 'G', 'I', 'V', 'W'],
                                       'exposed': ['R', 'K', 'Q', 'E', 'N', 'D'],
                                       'intermediate': ['M', 'S', 'P', 'T', 'H', 'Y']}
  res=0
  for elem in main_dicter[typer][mode]:
    res+=example.count(elem)
  res/=len(example)
  return res

def ctdt(example):
  main_dicter={}
  main_dicter['hydrophobicity']={'polar':['R', 'K', 'E', 'D', 'Q', 'N'],
                                'neutral':['G', 'A', 'S', 'T', 'P', 'H', 'Y'],
                                'hydrophobicity':['C', 'L', 'V', 'I', 'M', 'F', 'W']}
  main_dicter['vanDerWaals']={'0–2.78':['G', 'A', 'S', 'T', 'P', 'D', 'C'],
                              '2.95–4.0': ['N', 'V', 'E', 'Q', 'I', 'L'],
                              '4.03–8.08': ['M', 'H', 'K', 'F', 'R', 'Y', 'W']}
  main_dicter['polarity']={'4.9–6.2': ['L', 'I', 'F', 'W', 'C', 'M', 'V', 'Y'],
                          '8.0–9.2': ['P', 'A', 'T', 'G', 'S'],
                          '10.4–13.0': ['H', 'Q', 'R', 'K', 'N', 'E', 'D']}
  main_dicter['polarizability']={'0–0.108': ['G', 'A', 'S', 'D', 'T'],
                                '0.128–0.186': ['C', 'P', 'N', 'V', 'E', 'Q', 'I', 'L'],
                                '0.219–0.409': ['K', 'M', 'H', 'F', 'R', 'Y', 'W']}
  main_dicter['charge']={'positive': ['K', 'R'],
                        'neutral': ['A', 'N', 'C', 'Q', 'G', 'H', 'I', 'L', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'],
                        'negative': ['D', 'E']}
  main_dicter['secondaryStructure']={'helix': ['E', 'A', 'L', 'M', 'Q', 'K', 'R', 'H'],
                                    'strand': ['V', 'I', 'Y', 'C', 'W', 'F', 'T'],
                                    'coil': ['G', 'N', 'P', 'S', 'D']}
  main_dicter['solventAccessibility']={'buried': ['A', 'L', 'F', 'C', 'G', 'I', 'V', 'W'],
                                      'exposed': ['R', 'K', 'Q', 'E', 'N', 'D'],
                                      'intermediate': ['M', 'S', 'P', 'T', 'H', 'Y']}

  res={}
  for typer in main_dicter.keys():
    midres=0
    for ama in main_dicter[typer][list(main_dicter[typer].keys())[0]]:
      for amb in main_dicter[typer][list(main_dicter[typer].keys())[1]]:
        midres+=(example.count(ama+amb)+example.count(amb+ama))
    midres/=(len(example)-1)
    res[typer+'.'+list(main_dicter[typer].keys())[0]+'/'+list(main_dicter[typer].keys())[1]]=midres

    midres=0
    for amc in main_dicter[typer][list(main_dicter[typer].keys())[2]]:
      for amb in main_dicter[typer][list(main_dicter[typer].keys())[1]]:
        midres+=(example.count(amc+amb)+example.count(amb+amc))
    midres/=(len(example)-1)
    res[typer+'.'+list(main_dicter[typer].keys())[1]+'/'+list(main_dicter[typer].keys())[2]]=midres

    midres=0
    for ama in main_dicter[typer][list(main_dicter[typer].keys())[0]]:
      for amc in main_dicter[typer][list(main_dicter[typer].keys())[2]]:
        midres+=(example.count(amc+ama)+example.count(ama+amc))
    midres/=(len(example)-1)
    res[typer+'.'+list(main_dicter[typer].keys())[0]+'/'+list(main_dicter[typer].keys())[2]]=midres

  return res

def get_features(example, name):
  result={'name':name, 'hydrophobicity.hydrophobicity':psychochemical_features(example, 'hydrophobicity', 'hydrophobicity'),
          'hydrophobicity.polar':psychochemical_features(example, 'hydrophobicity', 'polar'),'hydrophobicity.neutral':psychochemical_features(example, 'hydrophobicity', 'neutral'),
          'vanDerWaals.0–2.78':psychochemical_features(example, 'vanDerWaals', '0–2.78'),'vanDerWaals.2.95–4.0':psychochemical_features(example, 'vanDerWaals', '2.95–4.0'),
          'vanDerWaals.4.03–8.08':psychochemical_features(example, 'vanDerWaals', '4.03–8.08'), 'polarity.4.9–6.2':psychochemical_features(example, 'polarity', '4.9–6.2'),
          'polarity.8.0–9.2':psychochemical_features(example, 'polarity', '8.0–9.2'), 'polarity.10.4–13.0':psychochemical_features(example, 'polarity', '10.4–13.0'),
          'polarizability.0–0.108':psychochemical_features(example, 'polarizability', '0–0.108'),'polarizability.0.128–0.186':psychochemical_features(example, 'polarizability', '0.128–0.186'),
          'polarizability.0.219–0.409':psychochemical_features(example, 'polarizability', '0.219–0.409'), 'charge.positive':psychochemical_features(example, 'charge', 'positive'),
          'charge.neutral':psychochemical_features(example, 'charge', 'neutral'), 'charge.negative':psychochemical_features(example, 'charge', 'negative'),
          'secondaryStructure.helix':psychochemical_features(example, 'secondaryStructure', 'helix'),'secondaryStructure.strand':psychochemical_features(example, 'secondaryStructure', 'strand'),
          'secondaryStructure.coil':psychochemical_features(example, 'secondaryStructure', 'coil'), 'solventAccessibility.buried':psychochemical_features(example, 'solventAccessibility', 'buried'),
          'solventAccessibility.exposed':psychochemical_features(example, 'solventAccessibility', 'exposed'),'solventAccessibility.intermediate':psychochemical_features(example, 'solventAccessibility', 'intermediate')}

  ctdtC=ctdt(example)
  for key in ctdtC.keys():
    result['CTDT.'+key]=ctdtC[key]

  aacC=aac(example)
  for key in aacC.keys():
    result['AAC.'+key]=aacC[key]

  dpcC=dpc(example)
  for key in dpcC.keys():
    result['DPC.'+key]=dpcC[key]

  qsoC=qso(example)
  for key in qsoC.keys():
    result['QSO.'+key]=qsoC[key]
  return result

In [32]:
dicter={}
name=''
pos_count=0
with open('T6SE_Training_Pos_138.fasta', 'r') as file:
  lines=file.readlines()
  for i, line in enumerate(lines):
      print(f'{i}/{len(lines)}')
      if(line[0]!='>'):
        pos_count+=1
        features=get_features(line.replace('\n',''), name)
        if(len(dicter.keys())==0):
          for key in features.keys():
            dicter[key]=[features[key]]
        else:
          for key in features.keys():
            dicter[key].append(features[key])
      else:
        name=line.replace('\n','')

positive=[1 for i in range(pos_count)]

0/276
1/276
2/276
3/276
4/276
5/276
6/276
7/276
8/276
9/276
10/276
11/276
12/276
13/276
14/276
15/276
16/276
17/276
18/276
19/276
20/276
21/276
22/276
23/276
24/276
25/276
26/276
27/276
28/276
29/276
30/276
31/276
32/276
33/276
34/276
35/276
36/276
37/276
38/276
39/276
40/276
41/276
42/276
43/276
44/276
45/276
46/276
47/276
48/276
49/276
50/276
51/276
52/276
53/276
54/276
55/276
56/276
57/276
58/276
59/276
60/276
61/276
62/276
63/276
64/276
65/276
66/276
67/276
68/276
69/276
70/276
71/276
72/276
73/276
74/276
75/276
76/276
77/276
78/276
79/276
80/276
81/276
82/276
83/276
84/276
85/276
86/276
87/276
88/276
89/276
90/276
91/276
92/276
93/276
94/276
95/276
96/276
97/276
98/276
99/276
100/276
101/276
102/276
103/276
104/276
105/276
106/276
107/276
108/276
109/276
110/276
111/276
112/276
113/276
114/276
115/276
116/276
117/276
118/276
119/276
120/276
121/276
122/276
123/276
124/276
125/276
126/276
127/276
128/276
129/276
130/276
131/276
132/276
133/276
134/276
135/276
136/276
137/276
138/27

In [33]:
neg_count=0
with open('T6SE_Training_Neg_1112.fasta', 'r') as file:
  lines=file.readlines()
  for i, line in enumerate(lines):
      print(f'{i}/{len(lines)}')
      if(line[0]!='>'):
        neg_count+=1
        features=get_features(line.replace('\n',''), name)
        if(len(dicter.keys())==0):
          for key in features.keys():
            dicter[key]=[features[key]]
        else:
          for key in features.keys():
            dicter[key].append(features[key])
      else:
        name=line.replace('\n','')

positive=positive+[0 for i in range(neg_count)]
dicter['positive']=positive

0/2224
1/2224
2/2224
3/2224
4/2224
5/2224
6/2224
7/2224
8/2224
9/2224
10/2224
11/2224
12/2224
13/2224
14/2224
15/2224
16/2224
17/2224
18/2224
19/2224
20/2224
21/2224
22/2224
23/2224
24/2224
25/2224
26/2224
27/2224
28/2224
29/2224
30/2224
31/2224
32/2224
33/2224
34/2224
35/2224
36/2224
37/2224
38/2224
39/2224
40/2224
41/2224
42/2224
43/2224
44/2224
45/2224
46/2224
47/2224
48/2224
49/2224
50/2224
51/2224
52/2224
53/2224
54/2224
55/2224
56/2224
57/2224
58/2224
59/2224
60/2224
61/2224
62/2224
63/2224
64/2224
65/2224
66/2224
67/2224
68/2224
69/2224
70/2224
71/2224
72/2224
73/2224
74/2224
75/2224
76/2224
77/2224
78/2224
79/2224
80/2224
81/2224
82/2224
83/2224
84/2224
85/2224
86/2224
87/2224
88/2224
89/2224
90/2224
91/2224
92/2224
93/2224
94/2224
95/2224
96/2224
97/2224
98/2224
99/2224
100/2224
101/2224
102/2224
103/2224
104/2224
105/2224
106/2224
107/2224
108/2224
109/2224
110/2224
111/2224
112/2224
113/2224
114/2224
115/2224
116/2224
117/2224
118/2224
119/2224
120/2224
121/2224
122/2224
123

In [34]:
df_features=pd.DataFrame(dicter)
df_features.to_csv('features_test.csv')

In [35]:
df_features.head()

Unnamed: 0,name,hydrophobicity.hydrophobicity,hydrophobicity.polar,hydrophobicity.neutral,vanDerWaals.0–2.78,vanDerWaals.2.95–4.0,vanDerWaals.4.03–8.08,polarity.4.9–6.2,polarity.8.0–9.2,polarity.10.4–13.0,...,QSO.QSOgrant42,QSO.QSOgrant43,QSO.QSOgrant44,QSO.QSOgrant45,QSO.QSOgrant46,QSO.QSOgrant47,QSO.QSOgrant48,QSO.QSOgrant49,QSO.QSOgrant50,positive
0,>gi|77358963|ref|YP_338391.1|hypothetical prot...,0.260355,0.313609,0.426036,0.402367,0.331361,0.266272,0.301775,0.349112,0.349112,...,0.033805,0.029148,0.031191,0.033745,0.027529,0.029104,0.026548,0.027906,0.028787,1
1,>gi|9948776|gb|AAG06090.1|hypothetical protein...,0.246835,0.329114,0.424051,0.449367,0.278481,0.272152,0.303797,0.348101,0.348101,...,0.030881,0.028522,0.032185,0.033735,0.029191,0.027493,0.03054,0.029234,0.027712,1
2,>gi|32261790|gb|AAP76840.1|conserved hypotheti...,0.278107,0.313609,0.408284,0.39645,0.390533,0.213018,0.307692,0.343195,0.349112,...,0.033638,0.030436,0.03125,0.033124,0.031773,0.027635,0.028957,0.026723,0.025335,1
3,">gi|49080696|gb|AAT50048.1|PA0085, partial [sy...",0.276074,0.319018,0.404908,0.423313,0.343558,0.233129,0.306748,0.355828,0.337423,...,0.030688,0.031369,0.029408,0.033504,0.028853,0.028564,0.026577,0.028818,0.031626,1
4,>gi|32261789|gb|AAP76839.1|hypothetical protei...,0.273605,0.321888,0.404506,0.386266,0.417382,0.196352,0.313305,0.327253,0.359442,...,0.032351,0.032338,0.031225,0.034109,0.031757,0.033706,0.034148,0.033961,0.030573,1
