# libraries

In [54]:
import re
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.grid_search import GridSearchCV 
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn import preprocessing

In [None]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [4]:
%matplotlib inline
sns.plotting_context('poster')
sns.set_style('whitegrid')

# data

#### training datasets
variants and text files

In [7]:
train_v = pd.read_csv('training_variants/training_variants')

In [14]:
train_t = pd.read_csv('training_text/training_text', sep='\|\|',skiprows=1, engine='python', names=["ID","text"])

In [10]:
train = pd.merge(train_v,train_t, on = ['ID'])

In [29]:
train['data'] = 'train'

#### testing datasets
variant and text files

In [11]:
test_v = pd.read_csv('test_variants/test_variants')

In [15]:
test_t = pd.read_csv('test_text/test_text', sep='\|\|',skiprows=1, engine='python', names=["ID","text"])

In [16]:
test = pd.merge(test_v,test_t, on = ['ID'])

In [30]:
test['data'] = 'test'

#### combine training and testing datasets

In [34]:
# Stack train and test. Their 'data' tags differ, so the previous outer merge on
# every shared column (including the long 'text') could never match a row: it
# acted as a costly concatenation whose row order depended on how the merge
# ordered its keys. concat keeps all train rows first, then test, on a fresh
# 0..n-1 index, which positional code further down relies on.
df = pd.concat([train, test], ignore_index=True)

# Features

## variants
Classifies variations into common types and standardizes their names. The order of the commands in the function matters because deletions and insertions can happen simultaneously, but their coding is inconsistent. Testing: the function codes variations accurately. We might want to change the Fusion expression ("fs") because it could be problematic with new data, but the hacky version works for now.

In [40]:
def var_recode(data, colname):
    """Add a ``<colname>_recoded`` column that collapses variant names into common types.

    ``data`` is modified in place and nothing is returned. Values that match
    none of the rules keep their original text; missing values stay missing.
    Rule order matters: a later rule overwrites an earlier one (for example a
    name containing both 'del' and 'ins' ends up as 'InDel').

    data: DataFrame holding the variant names.
    colname: name of the column with the raw variant names.
    """
    recoded_colname = colname + '_recoded'
    variants = data[colname]

    def has(pattern):
        # Case-insensitive regex search; missing values never match.
        return variants.str.contains(pattern, case=False, na=False)

    # Start from the raw names, then overwrite every recognised type.
    data[recoded_colname] = variants.copy()

    # Substitutions: one capital letter (or 'null'), digits, then a capital
    # letter or '*'. A boolean regex mask is used instead of str.replace, whose
    # default switched to literal (non-regex) matching in pandas 2.0 and would
    # silently stop recoding anything.
    is_substitution = variants.str.contains(r'^[A-Z]\d+[A-Z\*]$|^null\d+[A-Z\*]$', na=False)
    data.loc[is_substitution, recoded_colname] = 'Substitution'
    #recode deletions
    data.loc[has('del|silencing|hypermethylation'), recoded_colname] = 'Deletion'
    #recode insertions
    data.loc[has('ins'), recoded_colname] = 'Insertion'
    #recode deletions/insertions (must come after the two single rules)
    data.loc[has('del') & has('ins'), recoded_colname] = 'InDel'
    #recode truncations
    data.loc[has('trunc'), recoded_colname] = 'Truncation'
    #recode duplications
    data.loc[has('dup'), recoded_colname] = 'Duplication'
    #recode fusions
    # NOTE(review): any name containing 'fs' is labelled 'Fusion' here; the
    # notebook text already flags this pattern as provisional.
    data.loc[has('fusion|fs'), recoded_colname] = 'Fusion'

In [41]:
# Adds the 'Variation_recoded' column to df in place (var_recode returns nothing).
var_recode(df, 'Variation')

In [42]:
# Preview the first rows to spot-check the new Variation_recoded column.
df.head()

Unnamed: 0,ID,Gene,Variation,Class,text,data,Variation_recoded
0,0,FAM58A,Truncating Mutations,1.0,Cyclin-dependent kinases (CDKs) regulate a var...,train,Truncation
1,1,CBL,W802*,2.0,Abstract Background Non-small cell lung canc...,train,Substitution
2,2,CBL,Q249E,2.0,Abstract Background Non-small cell lung canc...,train,Substitution
3,3,CBL,N454D,3.0,Recent evidence has demonstrated that acquired...,train,Substitution
4,4,CBL,L399V,4.0,Oncogenic mutations in the monomeric Casitas B...,train,Substitution


Classify the type of substitution: neg to pos, pos to neg, neg to neutral, etc. 

In [44]:
# One-letter amino-acid code -> charge class used to label substitutions.
#   'Pos': R (arginine), H (histidine), K (lysine)
#   'Neg': D (aspartic acid), E (glutamic acid)
#   'Neu': every other listed code, including the ambiguity codes
#          B (asparagine|aspartic acid) and Z (glutamine|glutamic acid)
sub_dict = {
    code: 'Pos' if code in 'RHK' else 'Neg' if code in 'DE' else 'Neu'
    for code in 'ARNDBCEQZGHILKMFPSTWYV'
}

In [45]:
def sub_recode(data, colname, charge_map=None):
    """Refine simple substitutions in ``<colname>_recoded`` into charge transitions.

    For every value of ``data[colname]`` shaped like one capital letter, digits,
    one capital letter (e.g. ``Q249E``), the first and last letters are
    translated through ``charge_map`` and concatenated (e.g. ``NeuNeg``); the
    result overwrites that row of ``<colname>_recoded``. All other rows are
    left untouched. ``data`` is modified in place and nothing is returned.

    data: DataFrame holding the variant names.
    colname: name of the column with the raw variant names.
    charge_map: dict from one-letter code to label. Defaults to the
        notebook-level ``sub_dict``. Letters missing from the map are kept
        unchanged.
    """
    if charge_map is None:
        charge_map = sub_dict
    recoded_colname = colname + '_recoded'

    # na=False: a missing variant name is simply not a substitution.
    is_simple_sub = data[colname].str.match(r'^[A-Z]\d+[A-Z]$', na=False)
    names = data.loc[is_simple_sub, colname]

    def to_charge(letters):
        # Translate each letter, falling back to the letter itself.
        return letters.map(lambda code: charge_map.get(code, code))

    # Local Series replace the former temporary 's1'/'s2'/'Sub_type' columns, so
    # same-named columns in ``data`` are never clobbered and no in-place
    # replace on a column selection is needed.
    transitions = to_charge(names.str[:1]) + to_charge(names.str[-1:])
    data.loc[is_simple_sub, recoded_colname] = transitions

In [46]:
# Overwrites 'Variation_recoded' in place for plain letter-digits-letter substitutions.
sub_recode(df, 'Variation')

In [47]:
# Preview the first rows to spot-check the charge-transition labels.
df.head()

Unnamed: 0,ID,Gene,Variation,Class,text,data,Variation_recoded
0,0,FAM58A,Truncating Mutations,1.0,Cyclin-dependent kinases (CDKs) regulate a var...,train,Truncation
1,1,CBL,W802*,2.0,Abstract Background Non-small cell lung canc...,train,Substitution
2,2,CBL,Q249E,2.0,Abstract Background Non-small cell lung canc...,train,NeuNeg
3,3,CBL,N454D,3.0,Recent evidence has demonstrated that acquired...,train,NeuNeg
4,4,CBL,L399V,4.0,Oncogenic mutations in the monomeric Casitas B...,train,NeuNeu


## genes
quick function to calculate gene frequency for a grouping variable 1) count of unique genes 2) total number of genes per group (similar to group frequency) 3) most frequent gene in each group 4) proportional frequency of gene accounting for size of group 5) reset index for easier plotting

In [49]:
def genefreq(data, group):
    """Summarise how genes are distributed within each level of ``group``.

    Returns a DataFrame with one row per group value and the columns:
    ``group`` itself, 'Gene' (number of distinct genes), 'Gene_total'
    (non-null gene entries), 'Gene_most_frequent' and
    'Gene_most_frequent_count' (the top gene and how often it occurs),
    'Gene_p_unique' (distinct / total) and 'Gene_p_most_frequent'
    (top count / total).
    """
    grouped = data.groupby(group)
    genes = grouped['Gene']

    def top_gene(values):
        return values.value_counts().idxmax()

    def top_gene_count(values):
        return values.value_counts().max()

    summary = grouped.agg({'Gene': pd.Series.nunique})
    summary['Gene_total'] = genes.count()
    summary['Gene_most_frequent'] = genes.agg(top_gene)
    summary['Gene_most_frequent_count'] = genes.agg(top_gene_count)
    # Proportions are relative to the group size, so groups are comparable.
    summary['Gene_p_unique'] = summary['Gene'] / summary['Gene_total']
    summary['Gene_p_most_frequent'] = summary['Gene_most_frequent_count'] / summary['Gene_total']
    # Move the group labels out of the index for easier plotting.
    return summary.reset_index()

In [50]:
# Gene counts and top gene for each recoded variation type.
genefreq(df, 'Variation_recoded')

Unnamed: 0,Variation_recoded,Gene,Gene_total,Gene_most_frequent,Gene_most_frequent_count,Gene_p_unique,Gene_p_most_frequent
0,596_619splice,1,1,HLA-A,1,1.0,1.0
1,963_D1010splice,1,1,MET,1,1.0,1.0
2,981_1028splice,1,1,MET,1,1.0,1.0
3,A113_splice,1,1,EIF1AX,1,1.0,1.0
4,AR-V7,1,1,AR,1,1.0,1.0
5,ARv567es,1,1,AR,1,1.0,1.0
6,Amplification,79,79,KMT2A,1,1.0,0.012658
7,C1385,1,1,EP300,1,1.0,1.0
8,CASP8L,1,1,CASP8,1,1.0,1.0
9,Copy Number Loss,1,1,FBXW7,1,1.0,1.0


## text

#### word count per entry

In [52]:
# Whitespace-delimited token count per entry. A missing text counts as 0 words
# instead of raising on a non-string value.
df['Word_count'] = df['text'].fillna('').str.split().str.len()

#### top word frequencies

Conduct stop-word removal and stemming to build the bag-of-words model.

In [None]:
# Build the stop-word set and the stemmer once: both are loop-invariant, and
# rebuilding the set for every token made this cell needlessly slow.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []
# NOTE(review): 3321 is hard-coded; presumably the number of training rows — confirm.
for i in range(0, 3321):
    # Keep letters only, lower-case, and split into tokens.
    tokens = re.sub('[^a-zA-Z]', ' ', df['text'][i]).lower().split()
    # Drop stop words, stem what remains, and re-join into one document string.
    review = ' '.join(ps.stem(word) for word in tokens if word not in stop_words)
    corpus.append(review)

find frequent words

In [None]:
word_vectorizer = CountVectorizer(ngram_range=(1,1), analyzer='word')
sparse_matrix = word_vectorizer.fit_transform(corpus)
# Column totals in a single sparse reduction; the builtin sum() added the
# matrix row by row, creating one intermediate sparse matrix per document.
frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); kept as-is to match the scikit-learn release this
# notebook imports from (sklearn.grid_search).
h = (
    pd.DataFrame(frequencies, index=word_vectorizer.get_feature_names(), columns=['frequency'])
    .reset_index()
    .sort_values('frequency', ascending=False)
)

In [None]:
# Keywords (some deliberately truncated to stems, e.g. 'mutat', 'express') whose
# per-entry occurrence counts become feature columns.
# Two misspellings that could never match the text were corrected:
# 'phospharylation' -> 'phosphorylation' and 'deleterous' -> 'deleterious'.
# NOTE(review): 'sp3b1' may be intended as the gene name 'sf3b1' — confirm before changing.
words = (
    'mutat, cancer, patient, protein, express, tumor, variant, kinase, domain, brca1, '
    'egfr, activation, ras, p53, exon, growth, clinical, signaling, function, pten, '
    'phosphorylation, residue, resistance, raf, pathway, alk, disease, receptor, missense, breast, '
    'braf, inhibit, lung, tyrosine, acid, induce, survival, proliferation, akt, imatinib, '
    'fusion, oncogenic, transcription, deleterious, flt3, melanoma, somatic, gefitinib, brct, myc, '
    'amplification, genomic, pathogenic, benign, p21, phosphatase, sp3b1, bat3'
).split(', ')


find frequency of top words in each entry

In [None]:
def word_count(df, words):
    """Add one column per entry of ``words`` with its occurrence count in ``df['text']``.

    Matching is case-insensitive on the text side and substring-based
    (``str.count``), so 'raf' is also counted inside 'braf'. Rows whose text is
    not a string (e.g. a missing value) get a count of 0. ``df`` is modified in
    place and is also returned.

    df: DataFrame with a 'text' column.
    words: iterable of substrings to count; each becomes a column name.
    """
    # Lower-case every document once instead of once per (word, row) pair.
    lowered = [row.lower() if isinstance(row, str) else '' for row in df['text']]
    for word in words:
        df[word] = [text.count(word) for text in lowered]
    return df

In [None]:
# word_count adds the keyword columns to df itself and returns that same
# object, so merged_text is another name for df, not a copy.
merged_text = word_count(df, words)

find frequency of top words in each class

In [None]:
#load the training dataset
training_variants = pd.read_csv('training_variants.csv')
training_text = pd.read_csv('training_text.csv',sep='\|\|', skiprows=1, engine='python',names=["ID","text"])

In [None]:
#combine the two training datasets together
merged_variants_text = pd.merge(training_variants, training_text, on = ['ID'])

In [None]:
#group the merged dataset by class and count
text_group_count = merged_variants_text.groupby("Class")['ID'].count()
text_group_count.sort_values(ascending=False, inplace = True)

In [None]:
# check the class count results
text_group_count

In [None]:
# create a list that contains all the text classified in Class 1 to 9
# (one Series of texts per class, in class order)
class_txt_assembly = [
    merged_variants_text.loc[merged_variants_text['Class'] == clas, 'text']
    for clas in range(1, 10)
]

In [None]:
# define a function to calculate frequency of words to filter keywords
def classCorpus(corpus):
    """Return total unigram counts over ``corpus``, most frequent first.

    corpus: iterable of text documents, passed to CountVectorizer.fit_transform.
    Returns a DataFrame indexed by vocabulary term with a single 'frequency'
    column, sorted in descending order of frequency.
    """
    word_vectorizer = CountVectorizer(ngram_range=(1,1), analyzer='word')
    sparse_matrix = word_vectorizer.fit_transform(corpus)
    # Column totals in a single sparse reduction; the builtin sum() added the
    # matrix row by row, creating one intermediate sparse matrix per document.
    frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()
    # Local name chosen so it does not shadow the notebook-level `df`.
    freq_table = pd.DataFrame(frequencies, index=word_vectorizer.get_feature_names(), columns=['frequency'])
    return freq_table.sort_values('frequency', ascending=False)

In [None]:
# get the result of frequency of words in each class
# A comprehension keeps its loop variable local. The previous loop variable was
# named `corpus` and overwrote the stemmed `corpus` list built earlier in the
# notebook with the last class's raw texts.
result = [classCorpus(class_docs) for class_docs in class_txt_assembly]

In [None]:
# check the result of class 1 to 9
# A notebook cell renders only its final bare expression, so listing nine
# expressions showed just the last table; display each one explicitly.
from IPython.display import display

for class_result in result:
    display(class_result[0:200])