# Amino acid characteristics

Let's add some features for each element (amino acid) of the n-gram.

1. Class (aliphatic, basic, amide, acid, sulfur-containing, aromatic, basic aromatic, cyclic, hydroxyl-containing)
2. Side chain polarity
3. Side chain charge
4. Hydropathy index
5. Molecular Weight

In [1]:
import sqlite3
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from matplotlib import pyplot as plt
import seaborn as sns
import nltk
import sklearn
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import math

In [2]:
# Per-residue physico-chemical properties, keyed by one-letter amino acid code.
# Every entry lists the same five keys in the same order:
#   side_chain_class, side_chain_polarity, side_chain_charge,
#   hydropathy_index, molecular_weight
# NOTE(review): hydropathy values look like the Kyte-Doolittle scale and the
# molecular weights like the free amino acid in g/mol -- confirm the source.
amino_acid_dict = {
    "A":{
        "side_chain_class":"aliphatic",
        "side_chain_polarity":"nonpolar",
        "side_chain_charge":"neutral",
        "hydropathy_index":1.8,
        "molecular_weight":89.094
    },
    "R":{
        "side_chain_class":"basic",
        "side_chain_polarity":"basic polar",
        "side_chain_charge":"positive",
        "hydropathy_index":-4.5,
        "molecular_weight":174.203
    },
    "N":{
        "side_chain_class":"amide",
        "side_chain_polarity":"polar",
        "side_chain_charge":"neutral",
        "hydropathy_index":-3.5,
        "molecular_weight":132.119
    },
    "D":{
        "side_chain_class":"acid",
        "side_chain_polarity":"acidic polar",
        "side_chain_charge":"negative",
        "hydropathy_index":-3.5,
        "molecular_weight":133.104
    },
    "C":{
        "side_chain_class":"sulfur-containing",
        "side_chain_polarity":"nonpolar",
        "side_chain_charge":"neutral",
        "hydropathy_index":2.5,
        "molecular_weight":121.154
    },
    # Glutamic acid is an acidic residue like aspartic acid ("D"): its side
    # chain is polar and negatively charged, not nonpolar/neutral.
    "E":{
        "side_chain_class":"acid",
        "side_chain_polarity":"acidic polar",
        "side_chain_charge":"negative",
        "hydropathy_index":-3.5,
        "molecular_weight":147.131
    },
    "Q":{
        "side_chain_class":"amide",
        "side_chain_polarity":"polar",
        "side_chain_charge":"neutral",
        "hydropathy_index":-3.5,
        "molecular_weight":146.146
    },
    "G":{
        "side_chain_class":"aliphatic",
        "side_chain_polarity":"nonpolar",
        "side_chain_charge":"neutral",
        "hydropathy_index":-0.4,
        "molecular_weight":75.067
    },
    "H":{
        "side_chain_class":"basic aromatic",
        "side_chain_polarity":"basic polar",
        "side_chain_charge":"neutral",
        "hydropathy_index":-3.2,
        "molecular_weight":155.156
    },
    "I":{
        "side_chain_class":"aliphatic",
        "side_chain_polarity":"nonpolar",
        "side_chain_charge":"neutral",
        "hydropathy_index":4.5,
        "molecular_weight":131.175
    },
    "L":{
        "side_chain_class":"aliphatic",
        "side_chain_polarity":"nonpolar",
        "side_chain_charge":"neutral",
        "hydropathy_index":3.8,
        "molecular_weight":131.175
    },
    "K":{
        "side_chain_class":"basic",
        "side_chain_polarity":"basic polar",
        "side_chain_charge":"positive",
        "hydropathy_index":-3.9,
        "molecular_weight":146.189
    },
    "M":{
        "side_chain_class":"sulfur-containing",
        "side_chain_polarity":"nonpolar",
        "side_chain_charge":"neutral",
        "hydropathy_index":1.9,
        "molecular_weight":149.208
    },
    "F":{
        "side_chain_class":"aromatic",
        "side_chain_polarity":"nonpolar",
        "side_chain_charge":"neutral",
        "hydropathy_index":2.8,
        "molecular_weight":165.192
    },
    "P":{
        "side_chain_class":"cyclic",
        "side_chain_polarity":"nonpolar",
        "side_chain_charge":"neutral",
        "hydropathy_index":-1.6,
        "molecular_weight":115.132
    },
    "S":{
        "side_chain_class":"hydroxyl-containing",
        "side_chain_polarity":"polar",
        "side_chain_charge":"neutral",
        "hydropathy_index":-0.8,
        "molecular_weight":105.093
    },
    "T":{
        "side_chain_class":"hydroxyl-containing",
        "side_chain_polarity":"polar",
        "side_chain_charge":"neutral",
        "hydropathy_index":-0.7,
        "molecular_weight":119.119
    },
    "W":{
        "side_chain_class":"aromatic",
        "side_chain_polarity":"nonpolar",
        "side_chain_charge":"neutral",
        "hydropathy_index":-0.9,
        "molecular_weight":204.228
    },
    "Y":{
        "side_chain_class":"aromatic",
        "side_chain_polarity":"polar",
        "side_chain_charge":"neutral",
        "hydropathy_index":-1.3,
        "molecular_weight":181.191
    },
    "V":{
        "side_chain_class":"aliphatic",
        "side_chain_polarity":"nonpolar",
        "side_chain_charge":"neutral",
        "hydropathy_index":4.2,
        "molecular_weight":117.148
    }
}

In [3]:
# Load every 5-gram (protein id, position, and the five residue letters) from
# the protein_ngram table into memory.
conn = sqlite3.connect("protein_training.db")
try:
    ngrams_df = pd.read_sql(
        'SELECT protein, gram_num, gram_1, gram_2, gram_3, gram_4, gram_5 from protein_ngram',
        con=conn,
    )
finally:
    # Release the database handle even if the query fails.
    conn.close()
ngrams_df.sample(5)

Unnamed: 0,protein,gram_num,gram_1,gram_2,gram_3,gram_4,gram_5
9983039,PLCL1_HUMAN,922,S,L,K,Q,C
9090249,DIP2B_HUMAN,949,Q,P,G,V,G
11203698,NU205_HUMAN,1593,E,T,D,P,Q
3146096,CAN15_HUMAN,228,R,V,P,P,F
4026919,GUC2F_HUMAN,204,R,V,A,S,A


In [4]:
# Distinct residue letters that occur in any of the five gram columns.
id_columns = ['protein','gram_num']
gram_columns = ['gram_1','gram_2','gram_3','gram_4','gram_5']

aa_vals = (
    ngrams_df[id_columns + gram_columns]
    .melt(id_vars=id_columns, value_vars=gram_columns)
    ["value"]
    .unique()
)

In [5]:
# Print every residue letter that has no entry in amino_acid_dict.
# "U" is looked up under "C", matching what ngramMapper does.
for residue in aa_vals:
    lookup = "C" if residue == "U" else residue
    if lookup not in amino_acid_dict:
        print(lookup)

In [6]:
def ngramMapper(grams):
    """Flatten the properties of each residue in ``grams`` into one list.

    For every residue letter, the five properties stored in
    ``amino_acid_dict`` are appended in a fixed order (class, polarity,
    charge, hydropathy index, molecular weight), so the result has
    ``5 * len(grams)`` items.

    Parameters
    ----------
    grams : iterable of str
        One-letter residue codes. "U" is looked up as "C".

    Returns
    -------
    list
        Property values, grouped per residue in input order.

    Raises
    ------
    KeyError
        If a residue (after the U -> C substitution) is not a key of
        ``amino_acid_dict``.
    """
    # Read the properties by name instead of relying on every inner dict
    # having been written with the same key insertion order.
    feature_keys = (
        "side_chain_class",
        "side_chain_polarity",
        "side_chain_charge",
        "hydropathy_index",
        "molecular_weight",
    )
    return_arr = []
    for gram in grams:
        if gram == "U":
            gram = "C"
        properties = amino_acid_dict[gram]
        return_arr.extend(properties[key] for key in feature_keys)
    return return_arr

In [8]:
# Split the row range of ngrams_df into half-open [start, end) pairs of at
# most chunk_size rows each, so the frame can be processed piece by piece.
chunk_size = 10000
n_rows = ngrams_df.shape[0]
chunk_list = list(range(0, n_rows, chunk_size))
# Clamp each end to n_rows so the last (possibly shorter) chunk is produced by
# the same expression; this also works when the frame has fewer than
# chunk_size rows, or none at all.
chunk_pairs = [[start, min(start + chunk_size, n_rows)] for start in chunk_list]
print(chunk_pairs[:5],"...",chunk_pairs[-5:])

[[0, 10000], [10000, 20000], [20000, 30000], [30000, 40000], [40000, 50000]] ... [[11270000, 11280000], [11280000, 11290000], [11290000, 11300000], [11300000, 11310000], [11310000, 11319554]]


In [8]:
# sanity check: the first chunk should hold more rows than the last one, and
# the last chunk should end exactly at the final row of ngrams_df
first_start, first_end = chunk_pairs[0]
last_start, last_end = chunk_pairs[-1]
chunk_0_shape = len(ngrams_df.iloc[first_start:first_end])
chunk_neg1_shape = len(ngrams_df.iloc[last_start:last_end])

print(chunk_0_shape > chunk_neg1_shape)
ngrams_df.shape[0] == chunk_pairs[-1][1]

True


True

In [9]:
# Output column names: five feature columns for each of the five gram
# positions, grouped by position.
feature_suffixes = (
    "side_class",
    "side_polarity",
    "side_charge",
    "hydropathy_index",
    "mol_weight",
)
gram_val_cols = [
    "gram_{}_{}".format(position, suffix)
    for position in range(1, 6)
    for suffix in feature_suffixes
]

print(gram_val_cols)

['gram_1_side_class', 'gram_1_side_polarity', 'gram_1_side_charge', 'gram_1_hydropathy_index', 'gram_1_mol_weight', 'gram_2_side_class', 'gram_2_side_polarity', 'gram_2_side_charge', 'gram_2_hydropathy_index', 'gram_2_mol_weight', 'gram_3_side_class', 'gram_3_side_polarity', 'gram_3_side_charge', 'gram_3_hydropathy_index', 'gram_3_mol_weight', 'gram_4_side_class', 'gram_4_side_polarity', 'gram_4_side_charge', 'gram_4_hydropathy_index', 'gram_4_mol_weight', 'gram_5_side_class', 'gram_5_side_polarity', 'gram_5_side_charge', 'gram_5_hydropathy_index', 'gram_5_mol_weight']


In [10]:
# Expand every n-gram row into its per-residue feature columns, one chunk at a
# time, and append each chunk to the protein_subngram_features table.
# NOTE: if_exists="append" adds to whatever is already in the table, so
# re-running this cell after a partial run writes the early chunks again.
gram_cols = ['gram_1','gram_2','gram_3','gram_4','gram_5']
conn = sqlite3.connect("protein_training.db")

try:
    for chunk in tqdm_notebook(chunk_pairs):
        start = chunk[0]
        end = chunk[1]

        ngram_df_chunk = ngrams_df.iloc[start:end,:]

        # Loop over the raw ndarray rows instead of DataFrame.apply(axis=1):
        # same list-of-lists result without building a Series per row.
        chunk_vals = [
            ngramMapper(row) for row in ngram_df_chunk[gram_cols].values
        ]

        # Reuse the slice's own index so the concat below lines the rows up
        # whatever index ngrams_df carries.
        chunk_df = pd.DataFrame(
            chunk_vals, columns=gram_val_cols, index=ngram_df_chunk.index
        )

        write_df = pd.concat(
            [ngram_df_chunk[['protein','gram_num']],chunk_df],
            axis=1,
            sort=False
        )

        write_df.to_sql("protein_subngram_features", con=conn, index=False,
                        if_exists="append")
finally:
    # Close the connection even when the loop is interrupted or fails.
    conn.close()

HBox(children=(IntProgress(value=0, max=1132), HTML(value='')))




KeyboardInterrupt: 

In [11]:
# Make sure the SQLite connection opened for the write loop is closed, e.g.
# after the loop was interrupted part-way through.
conn.close()