# N-Gram Strategy

Break my sequences down into n-grams. Then;
1. merge/map the protein-level sequence features into the n-gram row
2. merge/map the amino acid-level features into the n-gram row
3. build the final feature matrix
4. do some feature selection, dimension reduction
5. train the model
6. evaluate

In [48]:
import sqlite3
import nltk
import pandas as pd
import os
from tqdm import tqdm_notebook

In [52]:
conn = sqlite3.connect("human_protein.db")
prot_df = pd.read_sql('select protein, accession, sequence from protein',con=conn)
amy_df = pd.read_sql('select * from amyloid_prion_id_mapped', con=conn)
ignore_prots = pd.read_sql('select protein, uniprot_id from amy_ignore', con=conn)
conn.close()
display(prot_df.head()), display(amy_df.head()), display(ignore_prots)

Unnamed: 0,protein,accession,sequence
0,RL37A_HUMAN,"P61513,P12751,Q6FGF5",MAKRTKKVGIVGKYGTRYGASLRKMVKKIEISQHAKYTCSFCGKTK...
1,PYRG1_HUMAN,"P17812,B4DR64,D3DPW1,Q5VW67,Q96GK6",MKYILVTGGVISGIGKGIIASSVGTILKSCGLHVTSIKIDPYINID...
2,RL6_HUMAN,"Q02878,Q2M3Q3,Q8WW97",MAGEKVEKPDTKEKKPEAKKVDAGGKVKKGNLKAKKPKKGKPHCSR...
3,RAB10_HUMAN,"P61026,D6W538,O88386,Q6IA52,Q9D7X6,Q9H0T3",MAKKTYDLLFKLLLIGDSGVGKTCVLFRFSDDAFNTTFISTIGIDF...
4,RAB30_HUMAN,"Q15771,Q6FGK1,Q6MZH2,Q96CI8",MSMEDYDFLFKIVLIGNAGVGKTCLVRRFTQGLFPPGQGATIGVDF...


Unnamed: 0,uniprot_id,begin,end
0,P01236,35,62
1,P01236,71,85
2,P01258,90,95
3,P01258,99,104
4,P02647,70,83


Unnamed: 0,protein,uniprot_id
0,TADBP_HUMAN,Q13148
1,BGH3_HUMAN,Q15582
2,APOC3_HUMAN,P02656
3,ITM2B_HUMAN,Q9Y287
4,ITM2B_HUMAN,Q9Y287
5,GELS_HUMAN,P06396
6,NFH_HUMAN,P12036
7,APOA2_HUMAN,P02652


(None, None, None)

In [54]:
# Drop ignore rows
prot_df = prot_df[prot_df['protein'].isin(ignore_prots['protein'].values.tolist())==False]
amy_df = amy_df[amy_df['uniprot_id'].isin(ignore_prots['uniprot_id'].values.tolist())==False]

In [55]:
def nGrammer(protein, seq, n):
    return [[protein, i+1]+list(x) for i, x in enumerate(nltk.ngrams(seq, n))]

def dfNGrammer(protein_df, num_grams):
    gram_cols = ["protein","gram_num"]+["gram_"+str(i+1) for i in range(num_grams)]
    seq_grams = []
    seq_grams.extend([val for sublist in protein_df.apply(lambda row: nGrammer(row['protein'], row['sequence'], num_grams),axis=1) for val in sublist])
    gram_df = pd.DataFrame(seq_grams, columns=gram_cols)
    return gram_df

In [56]:
n = 5
ngram_df = dfNGrammer(prot_df,n).merge(prot_df[['protein','accession']],on='protein',how='left')

In [57]:
print(ngram_df.shape)
ngram_df.head()

(11318554, 8)


Unnamed: 0,protein,gram_num,gram_1,gram_2,gram_3,gram_4,gram_5,accession
0,RL37A_HUMAN,1,M,A,K,R,T,"P61513,P12751,Q6FGF5"
1,RL37A_HUMAN,2,A,K,R,T,K,"P61513,P12751,Q6FGF5"
2,RL37A_HUMAN,3,K,R,T,K,K,"P61513,P12751,Q6FGF5"
3,RL37A_HUMAN,4,R,T,K,K,V,"P61513,P12751,Q6FGF5"
4,RL37A_HUMAN,5,T,K,K,V,G,"P61513,P12751,Q6FGF5"


In [103]:
# reference dict for sequence annotations
seq_dict = dict()

for prot_id in tqdm_notebook(amy_df.uniprot_id.unique()):
    vals = amy_df[amy_df['uniprot_id']==prot_id][['begin','end']].values.tolist()
    range_vals = []
    for val in vals:
        range_vals.extend(range(val[0], val[1]+1))
    seq_dict[prot_id] = range_vals
    
 #   print(prot_id)

HBox(children=(IntProgress(value=0, max=111), HTML(value='')))




In [166]:
def amyloidSeqMapper(protein, gram_num, grams, accession, amyloid_ref):
    n = len(grams)
    acc_ids = accession.split(',')
    id_matches = [x for x in acc_ids if x in seq_dict.keys()]
    # If none of the accession keys are in my amyloid annotation dict
    if len(id_matches)==0:
        result = [0]*n
    else:
        # Get the values from the dict to check against
        tag_id = id_matches[0]
        prot_vals = seq_dict[tag_id]
        range_start = gram_num
        range_end = gram_num+n
        gram_range = list(range(range_start,range_end))
        result = [0 if x not in prot_vals else 1 for x in gram_range]
    return pd.Series(result)
        
#def chunkWriter(con, )
    
chunk = 11300
chunk_list = list(range(0, ngram_df.shape[0],chunk))
chunk_pairs = [[chunk_list[i-1], chunk_list[i]] for i, x in enumerate(chunk_list) if i != 0]
# Add last little chunk
chunk_pairs.append([chunk_pairs[-1][-1]+1, ngram_df.shape[0]-1])
print(ngram_df.shape[0])
print(chunk_pairs[:5],"...",chunk_pairs[-5:])

11318554
[[0, 11300], [11300, 22600], [22600, 33900], [33900, 45200], [45200, 56500]] ... [[11266100, 11277400], [11277400, 11288700], [11288700, 11300000], [11300000, 11311300], [11311301, 11318553]]


In [167]:
amyloid_cols = ['amyloid_'+str(a) for a in range(1, n+1)]
for a_col in amyloid_cols:
    ngram_df.loc[:,a_col] = 0

In [168]:
conn = sqlite3.connect("protein_training.db")

for i in tqdm_notebook(chunk_pairs):
    start = i[0]
    end = i[1]
    ngram_slice = ngram_df.loc[start:end,:]
    ngram_slice.loc[:,amyloid_cols] = ngram_slice.apply(
        lambda row: amyloidSeqMapper(
            row['protein'], 
            row['gram_num'],
            row[['gram_'+str(i) for i in range(1,n+1)]].values.tolist(),
            row['accession'], 
            amy_df
        ), axis=1).values.tolist()
    if start == 0:
        ngram_slice.to_sql('protein_ngram', con=conn, index=False, if_exists='replace')
    else:
        ngram_slice.to_sql('protein_ngram', con=conn, index=False, if_exists='append')
        
conn.close()

HBox(children=(IntProgress(value=0, max=1002), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [170]:
conn = sqlite3.connect("protein_training.db")

test_df = pd.read_sql('select * from protein_ngram limit 10', con=conn)

conn.close()
test_df.head()

Unnamed: 0,protein,gram_num,gram_1,gram_2,gram_3,gram_4,gram_5,accession,amyloid_1,amyloid_2,amyloid_3,amyloid_4,amyloid_5
0,RL37A_HUMAN,1,M,A,K,R,T,"P61513,P12751,Q6FGF5",0,0,0,0,0
1,RL37A_HUMAN,2,A,K,R,T,K,"P61513,P12751,Q6FGF5",0,0,0,0,0
2,RL37A_HUMAN,3,K,R,T,K,K,"P61513,P12751,Q6FGF5",0,0,0,0,0
3,RL37A_HUMAN,4,R,T,K,K,V,"P61513,P12751,Q6FGF5",0,0,0,0,0
4,RL37A_HUMAN,5,T,K,K,V,G,"P61513,P12751,Q6FGF5",0,0,0,0,0
