In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [34]:
df_train_txt = pd.read_csv("data/training_text", sep='\|\|', header=None, skiprows=1, names=["ID","Text"])
df_train_var = pd.read_csv("data/training_variants")
df_train = pd.merge(df_train_var, df_train_txt, how='left', on='ID')

df_train.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,ID,Gene,Variation,Class,Text
0,0,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,CBL,W802*,2,Abstract Background Non-small cell lung canc...
2,2,CBL,Q249E,2,Abstract Background Non-small cell lung canc...
3,3,CBL,N454D,3,Recent evidence has demonstrated that acquired...
4,4,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...


In [6]:
#function to remove punctuations, capital letter and whitespaces 
import re
def remove_punctuation(text):
    """
    Args:
        text (str): A string.
    """
    return re.sub(r'[^a-zA-Z0-9\s]', '', text).strip().lower()

In [8]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/nikacheh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
#delete stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')

stopwords=stopwords.words('english')

def stopword(list):
    a=[]
    for word in list:
        if word not in stopwords:
            a.append(word)
    return a

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nikacheh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
df_train_txt.head()

Unnamed: 0,ID,Text,Text_num_words,Text_num_chars
0,0,"[cyclindependent, kinases, cdks, regulate, a, ...",6089,39672
1,1,"[abstract, background, nonsmall, cell, lung, c...",5756,36691
2,2,"[abstract, background, nonsmall, cell, lung, c...",5756,36691
3,3,"[recent, evidence, has, demonstrated, that, ac...",5572,36238
4,4,"[oncogenic, mutations, in, the, monomeric, cas...",6202,41308


In [12]:
#let's remove punctuation from text, tokenize and remove all stop words
df_train_txt["Text"] = df_train_txt["Text"].apply(lambda x: remove_punctuation(str(x)))
df_train_txt["Text"] = df_train_txt["Text"].apply(lambda x: nltk.word_tokenize(str(x)))
df_train_txt["Text"]=df_train_txt["Text"].apply(lambda x: stopword(x) )
df_train_txt["Text_num_words"] = df_train_txt["Text"].apply(lambda x: len(str(x).split()) )
df_train_txt["Text_num_chars"] = df_train_txt["Text"].apply(lambda x: len(str(x)) )

In [32]:
#transform categorical veector in numerical
def vectorize(column):
    unique_vector=column.unique()
    dictionary=dict(enumerate(unique_vector))
    num=[]
    for i in column:
        for j in dictionary.items():
            if i==j[1]:
                num.append(j[0])
    return np.asarray(num)  

In [33]:
df_train_txt['Gene']=vectorize(df_train.Gene)
df_train_txt['Variation']=vectorize(df_train.Variation)

In [31]:
df_train_txt.head()

Unnamed: 0,ID,Text,Text_num_words,Text_num_chars,Gene,Variation
0,0,"[cyclindependent, kinases, cdks, regulate, var...",4000,42527,0,0
1,1,"[abstract, background, nonsmall, cell, lung, c...",3743,38699,1,1
2,2,"[abstract, background, nonsmall, cell, lung, c...",3743,38699,1,2
3,3,"[recent, evidence, demonstrated, acquired, uni...",3665,38917,1,3
4,4,"[oncogenic, mutations, monomeric, casitas, bli...",4015,43517,1,4


In [25]:
y=df_train["Class"].values
features=['Text_num_words','Text_num_chars','Gene','Variation']
X=df_train_txt[features].values

In [26]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

logreg = LogisticRegression()
logreg.fit(X, y)

logreg.score(X,y)


0.29659741041854865

In [28]:
r=logreg.predict_proba(X)

In [29]:
d = {'ID': df_train_txt["ID"], 'class1': r[:,0], 'class2': r[:,1], 'class3': r[:,2], 'class4': r[:,3], 'class5': r[:,4]
   , 'class6': r[:,5], 'class7': r[:,6], 'class8': r[:,7], 'class9': r[:,8]}

df_res = pd.DataFrame(data=d)
submit = df_res.iloc[1:987,]
submit.head()

submit.to_csv('out3.csv', index = False)