In [88]:
import pandas as pd 
import numpy as np 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
import re
from collections import defaultdict
import os

In [48]:
stopwords=defaultdict(int)
with open('StopWords.txt','r') as f:
    for line in f:
        line= line.strip()
        stopwords[line]=1

In [55]:
df=pd.read_table('ICHI2016-TrainData.tsv')
testdf = pd.read_table('new_ICHI2016-TestData_label.tsv')

In [56]:
df['Question']=df['Title']+ ' ' + df['Question']

In [58]:
df['Question']=df['Question'].apply(lambda x : x.lower())
df['Question']=df['Question'].apply(lambda x : re.sub(r'https?://(.*?) ', '', x))
df['Question']=df['Question'].apply(lambda x : re.sub('[^a-z]', ' ', x))
df['Question']=df['Question'].apply(lambda x : re.sub(' +',' ', x))


In [59]:
df['Question']=df['Question'].apply(lambda key :' '.join([x for x in key.split() if stopwords[x]!=1]).strip() )


In [60]:
df.head(5)

Unnamed: 0,Category,Title,Question
0,SOCL,lump on my wrist,lump wrist lump wrist thumb feels pinched nerv...
1,PREG,12 wks pg with twins and having odd (non-painf...,wks pg twins odd painful jabs abdomen pg twins...
2,GOAL,Severe pain in left eye,severe pain left eye real quick mri ct scan si...
3,SOCL,Man faces charges for reading wife's e-mail,man faces charges reading wife mail wife passw...
4,TRMT,What's Clear Jello,jello hey busy wait doctors office leave messa...


In [138]:
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),('clf', MultinomialNB()), ])

In [139]:
text_clf = text_clf.fit(df['Question'], df['Category'])

In [140]:
predicted_NB = text_clf.predict(testdf['Question'])

In [141]:
Actual = testdf['Category']

In [142]:
np.mean(predicted_NB == Actual)

0.64500000000000002

In [143]:
text_clf_svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),('clf', SGDClassifier(loss='hinge',alpha=1e-3, n_iter=5,random_state=42)),])

In [144]:
text_clf_svm.fit(df['Question'] ,df['Category'])

Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        st...     penalty='l2', power_t=0.5, random_state=42, shuffle=True, verbose=0,
       warm_start=False))])

In [145]:
predicted_svm = text_clf_svm.predict(testdf['Question'])

In [146]:
np.mean(predicted_svm == testdf['Category'])

0.64200000000000002

### Semi Supervised Dataset

In [92]:
Title = []
Question = []
for fil in os.listdir('contains_all/'):
    fil = 'contains_all/'+ fil
    with open(fil) as f:
        for line in f:
            line=line.split('<<->>')
            Title.append(line[0])
            Question.append(line[1])


In [123]:
sdf= pd.DataFrame()

In [124]:
sdf['Title']=Title
sdf['Question']=Question

In [125]:
sdf['Question']=sdf['Title']+ ' ' + sdf['Question']
sdf['Question']=sdf['Question'].apply(lambda x : x.lower())
sdf['Question']=sdf['Question'].apply(lambda x : re.sub(r'https?://(.*?) ', '', x))
sdf['Question']=sdf['Question'].apply(lambda x : re.sub('[^a-z]', ' ', x))
sdf['Question']=sdf['Question'].apply(lambda x : re.sub(' +',' ', x))

In [126]:
sdf['Question']=sdf['Question'].apply(lambda key :' '.join([x for x in key.split() if stopwords[x]!=1]).strip() )

In [130]:
predicted = text_clf.predict(sdf['Question'])

In [131]:
sdf['Category']=predicted

In [133]:
sdf.head()

Unnamed: 0,Title,Question,Category
0,What should i do?,mom stroke worse diabetes type renal failure s...,FAML
1,Angina?,angina male pounds monday arm pains burning se...,DISE
2,sudden lightning shock feeling through left chest,sudden lightning shock feeling left chest walk...,DISE
3,Scared and Worried,scared worried scared worried point enjoy life...,DISE
4,Chest pain or Nuerophy side effects,chest pain nuerophy effects diagnosed years au...,DISE


In [134]:
frames = [sdf,df]
result = pd.concat(frames)

In [135]:
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),('clf', MultinomialNB()), ])
text_clf = text_clf.fit(result['Question'], result['Category'])

### Accuracy with semi-supervised dataset

In [136]:
predicted = text_clf.predict(testdf['Question'])
Actual = testdf['Category']
np.mean(predicted == Actual)

0.40133333333333332

### Ensemble of NB, SVM and CNN

In [159]:
predicted_CNN=[]
with open('output.txt') as f:
        st = f.read()
        st = st.lstrip('[')
        st = st.rstrip(']')
        st=st.split(',')
        for x in st:
            x=x.strip()
            x=x.lstrip('\'')
            x=x.rstrip('\'')
            predicted_CNN.append(x)

In [169]:
Ensemble_predict = []

for i in range(3000):
    dicti={}
    try :
        dicti[predicted_CNN[i]] +=1
    except:
        dicti[predicted_CNN[i]]=1
    try :
        dicti[predicted_NB[i]] +=1
    except:
        dicti[predicted_NB[i]]=1
    try :
        dicti[predicted_svm[i]] +=1
    except:
        dicti[predicted_svm[i]]=1
    ma = 0
    for key,value in dicti.items():
        if value > ma:
            ma= value
            index = key
    if ma == 1:
        Ensemble_predict.append(predicted_NB[i])
    else :
        Ensemble_predict.append(index)
    

In [171]:
Actual = testdf['Category']

In [172]:
np.mean(Ensemble_predict == Actual)

0.66000000000000003