# Exploratory Data Analysis

## Import libraries

In [66]:
import pandas as pd
import numpy as np
import fasttext
import fasttext.util
from sklearn.model_selection import train_test_split

## Load data

In [67]:
# Train.csv is semicolon-delimited; load it and preview the first rows.
df = pd.read_csv('Train.csv', delimiter=';')
df.head()

Unnamed: 0,name,condition,opinion,rate,rate1
0,Zegerid,GERD,"""Using it as a replacement for Nexium, since i...",10,high
1,Ethosuximide,Seizures,"""This medicine is very good at controlling me ...",10,high
2,Tri-Sprintec,Birth Control,"""I just started taking Tri Sprintec after my l...",9,high
3,Levaquin,Pneumonia,"""This medicine made me feel absolutely horribl...",5,medium
4,Methylphenidate,ADHD,"""I&#039;ve been taking Concerta since 2003. Fo...",9,high


In [68]:
# Number of rows loaded.
df.shape[0]

150000

In [69]:
# Reviews per rating value (counts the non-null `name` entries in each group).
imbalance = df.groupby('rate')['name'].count()

In [70]:
# Turn per-rating counts into integer weights: the rarest rating gets
# ceil(21 / 2) = 11 and more frequent ratings get progressively smaller values.
# The counts are recomputed from `df` here so that re-running this cell yields
# the same weights instead of re-transforming its own previous output.
rate_counts = df.groupby(by='rate').name.count()
imbalance = np.ceil(21 / np.ceil((rate_counts / rate_counts.min()) * 2))

In [71]:
# Character length of each raw opinion (measured before any cleaning).
df['op_len'] = df.opinion.str.len()

# Data cleaning

In [72]:
from html import unescape

Remove HTML escaping (e.g. `&#039;` → `'`)

In [73]:
# Only rows containing '&' can hold HTML entities; decode just those rows.
has_entity = df['opinion'].str.contains('&')
df.loc[has_entity, 'opinion'] = df.loc[has_entity, 'opinion'].apply(unescape)
df['opinion'].head()

0    "Using it as a replacement for Nexium, since i...
1    "This medicine is very good at controlling me ...
2    "I just started taking Tri Sprintec after my l...
3    "This medicine made me feel absolutely horribl...
4    "I've been taking Concerta since 2003. For me ...
Name: opinion, dtype: object

Remove the leading and trailing `"` that wrap each opinion

In [74]:
# Drop the first and last character of every opinion (the wrapping quotes).
df['opinion'] = df['opinion'].str.slice(1, -1)
df['opinion'].head()

0    Using it as a replacement for Nexium, since in...
1    This medicine is very good at controlling me s...
2    I just started taking Tri Sprintec after my la...
3    This medicine made me feel absolutely horrible...
4    I've been taking Concerta since 2003. For me i...
Name: opinion, dtype: object

In [75]:
# Normalise case so that identical words share one token.
df['opinion'] = df.opinion.str.lower()
df['opinion'].head()

0    using it as a replacement for nexium, since in...
1    this medicine is very good at controlling me s...
2    i just started taking tri sprintec after my la...
3    this medicine made me feel absolutely horrible...
4    i've been taking concerta since 2003. for me i...
Name: opinion, dtype: object

In [76]:
# Keep only opinions longer than two characters. `.copy()` makes the result an
# independent frame, so the column assignments in later cells do not write to
# a slice of the original (SettingWithCopyWarning).
df = df[df['opinion'].str.len() > 2].copy()

In [77]:
# Lower-case the drug name and the condition. `.str.lower()` propagates
# missing values, and a `.dropna()` before assignment is undone by index
# alignment, so it is omitted. `assign` returns a new frame rather than
# writing into the filtered slice.
df = df.assign(
    name=df['name'].str.lower(),
    condition=df['condition'].str.lower(),
)

In [78]:
# Shuffle all rows. The fixed seed makes the order, and therefore every
# downstream split, reproducible across kernel restarts.
df = df.sample(frac=1.0, random_state=42)

In [79]:
# Clean the data up
import re
# Raw strings keep the regex escapes from being read as (invalid) string escapes.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;:]')
BAD_SYMBOLS_RE = re.compile(r'[^a-z #+_]')
UTF_CODES = re.compile(r'#\d+')

def clean_text(text):
    """
        Normalise a piece of free text for tokenisation.

        text: a string

        return: the lower-cased string with separator punctuation replaced by
        spaces, numeric character-code residue (e.g. '#039') removed, and every
        character other than a-z, space, '#', '+' and '_' dropped.
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # separators such as / ( ) , ; : become spaces
    # Strip '#<digits>' BEFORE the bad-symbol pass: that pass deletes all digits,
    # so running this afterwards could never match anything.
    text = UTF_CODES.sub('', text)
    text = BAD_SYMBOLS_RE.sub('', text) # drop everything outside [a-z #+_]
    # Stop words are deliberately kept: words such as 'not' carry sentiment.
    return text
# Run every opinion through clean_text.
df['opinion'] = df['opinion'].map(clean_text)

In [80]:
# Clean the two categorical text columns the same way. Missing values are
# dropped before clean_text runs and come back as missing when the result is
# aligned on assignment.
for column in ('condition', 'name'):
    df[column] = df[column].astype('string').dropna().apply(clean_text)

In [81]:
# Hold out 10% of the rows for evaluation. The fixed seed keeps the split
# identical between runs so scores from different cells are comparable.
train, test = train_test_split(df, test_size=0.1, random_state=42)


In [104]:
import csv
import re

location_train_fasttext = "opinion.train.ft" #will be created
location_test_fasttext = "opinion.test.ft" #will be created

# NOTE(review): the only calls to to_fasttext visible in this notebook are in
# the K-Fold cell; the training cells below need these files written first.
def to_fasttext(df, location_output_file, test=False):
    """Write `df` as a fastText supervised-learning text file.

    One line is written per row, in the form ``__label__<rate> <opinion>``.

    df: DataFrame with an 'opinion' column (str) and a 'rate' column.
    location_output_file: path of the file to create or overwrite.
    test: unused; kept so existing calls that pass it keep working.
    """
    # fastText reads its input as UTF-8, so do not rely on the platform default.
    with open(location_output_file, "w", encoding="utf-8") as outfile:
        # Iterate the two needed columns directly; this avoids building a
        # Series per row and the dtype upcasting that iterrows can introduce.
        for rate, phrase in zip(df['rate'], df['opinion']):
            outfile.write('__label__' + str(rate) + ' ' + phrase + "\n")

## 65.8% — word bigrams (`wordNgrams=2`)

In [29]:
# Baseline: word bigrams, 25 epochs, learning rate 1.0.
model = fasttext.train_supervised(
    input=location_train_fasttext,
    lr=1.0,
    epoch=25,
    wordNgrams=2,
)

In [30]:
# Evaluate on the held-out file; per the fastText docs the displayed tuple is
# (number of examples, precision@1, recall@1).
model.test(location_test_fasttext)

(29998, 0.6586105740382692, 0.6586105740382692)

## 71.3% — `wordNgrams=5`, followed by further hyperparameter trials

In [18]:
# Same settings as the baseline but with n-grams up to length 5.
model = fasttext.train_supervised(
    input=location_train_fasttext,
    lr=1.0,
    epoch=25,
    wordNgrams=5,
)

In [19]:
# Evaluate on the held-out file; per the fastText docs the displayed tuple is
# (number of examples, precision@1, recall@1).
model.test(location_test_fasttext)

(14999, 0.7133142209480632, 0.7133142209480632)

In [20]:
# Trial: n-grams up to length 7, lr=1.0, 25 epochs.
model = fasttext.train_supervised(
    input=location_train_fasttext,
    lr=1.0,
    epoch=25,
    wordNgrams=7,
)
model.test(location_test_fasttext)

(14999, 0.7146476431762118, 0.7146476431762118)

In [21]:
# Trial: n-grams up to length 7 with the learning rate halved to 0.5.
model = fasttext.train_supervised(
    input=location_train_fasttext,
    lr=0.5,
    epoch=25,
    wordNgrams=7,
)
model.test(location_test_fasttext)

(14999, 0.7140476031735449, 0.7140476031735449)

In [22]:
# Trial: n-grams up to length 7, lr=0.1, 30 epochs.
model = fasttext.train_supervised(
    input=location_train_fasttext,
    lr=0.1,
    epoch=30,
    wordNgrams=7,
)
model.test(location_test_fasttext)

(14999, 0.701580105340356, 0.701580105340356)

In [23]:
# Trial: n-grams up to length 5, lr=0.3, 30 epochs.
model = fasttext.train_supervised(
    input=location_train_fasttext,
    lr=0.3,
    epoch=30,
    wordNgrams=5,
)
model.test(location_test_fasttext)

(14999, 0.7115807720514701, 0.7115807720514701)

In [24]:
# Trial: n-grams up to length 5, lr=0.3, 30 epochs, with neg=10.
model = fasttext.train_supervised(
    input=location_train_fasttext,
    lr=0.3,
    epoch=30,
    wordNgrams=5,
    neg=10,
)
model.test(location_test_fasttext)

(14999, 0.7122474831655443, 0.7122474831655443)

In [25]:
# Trial: very small learning rate (0.01) with 50 epochs, n-grams up to length 5.
model = fasttext.train_supervised(
    input=location_train_fasttext,
    lr=0.01,
    epoch=50,
    wordNgrams=5,
)
model.test(location_test_fasttext)

(14999, 0.46529768651243414, 0.46529768651243414)

In [28]:
# Trial: learning rate 0.01 again, but with 200 epochs.
model = fasttext.train_supervised(
    input=location_train_fasttext,
    lr=0.01,
    epoch=200,
    wordNgrams=5,
)
model.test(location_test_fasttext)

In [19]:
# Evaluate on the held-out file; per the fastText docs the displayed tuple is
# (number of examples, precision@1, recall@1).
model.test(location_test_fasttext)

(14999, 0.4523634908993933, 0.4523634908993933)

## Best model: 80% (`wordNgrams=4`, `ws=7`, `dim=300`, `epoch=20`)

In [83]:
# 300-dimensional vectors, window size 7, n-grams up to length 5, 25 epochs.
model = fasttext.train_supervised(
    input=location_train_fasttext,
    lr=1.0,
    epoch=25,
    wordNgrams=5,
    ws=7,
    dim=300,
)
model.test(location_test_fasttext)

(14999, 0.7111807453830256, 0.7111807453830256)

In [42]:
# 300-dimensional vectors, window size 7, n-grams up to length 4, 20 epochs.
model = fasttext.train_supervised(
    input=location_train_fasttext,
    lr=1.0,
    epoch=20,
    wordNgrams=4,
    ws=7,
    dim=300,
)
model.test(location_test_fasttext)

(14999, 0.8054536969131275, 0.8054536969131275)

In [84]:
# Predict a label for every held-out opinion; each element is whatever
# model.predict returns for that text.
preds = test['opinion'].apply(lambda text: model.predict(text))

In [85]:
# Share of held-out reviews whose top predicted label equals the true rating.
# The whole numeric suffix of the label is parsed ('__label__10' -> 10) rather
# than only its last character, so no special case is needed for rating 10.
label_prefix = '__label__'
c = sum(
    int(pred[0][0][len(label_prefix):]) == int(rate)
    for pred, rate in zip(preds, test['rate'])
)
print(c/len(test))

0.7111807453830256


# K-Fold CV

In [88]:
from sklearn.model_selection import KFold

In [103]:
# 10-fold cross-validation of the 5-gram / ws=7 / dim=300 configuration.
kf = KFold(n_splits=10)
label_prefix = '__label__'
fold_accuracies = []
for train_index, test_index in kf.split(df['opinion']):
    # Select the needed columns by name; positional slices such as 2:4 silently
    # pick different columns whenever the frame's column layout changes.
    train = df.iloc[train_index][['opinion', 'rate']]
    test = df.iloc[test_index][['opinion', 'rate']]
    to_fasttext(train, location_train_fasttext)
    to_fasttext(test, location_test_fasttext, test=True)
    model = fasttext.train_supervised(input=location_train_fasttext, lr=1.0, epoch=25, wordNgrams=5, ws=7, dim=300)
    preds = test['opinion'].apply(model.predict)
    # Parse the whole numeric suffix of the top label ('__label__10' -> 10)
    # instead of its last character only.
    c = sum(
        int(pred[0][0][len(label_prefix):]) == int(rate)
        for pred, rate in zip(preds, test['rate'])
    )
    fold_accuracies.append(c/len(test))
    print(c/len(test))
# Summarise the folds so the cross-validated score is a single number.
print('mean accuracy:', sum(fold_accuracies)/len(fold_accuracies))

opinion  rate
83567  i started using enbrel two years ago i can sti...    10
67584  i love zonegran  i have no seizures while on t...     9
67779  so  ive had the patch for  years and a half no...    10
64045  i have had the implanon since december    i am...     8
97861  seeing all the positive reviews its upsetting ...     1                                                   opinion  rate
16162   ive been on aviane for about  weeks im on my t...     4
133562  ive been using desmopressin acetate injection ...    10
143450  my second round of aldara i am hpv +    the mo...     8
113607  the only antidepressant that has actually give...     9
23893   this is my second day of taking phentermine  t...    10
                                                  opinion  rate
16162   ive been on aviane for about  weeks im on my t...     4
133562  ive been using desmopressin acetate injection ...    10
143450  my second round of aldara i am hpv +    the mo...     8
113607  the only antidepressant

In [210]:
# Load the space-separated predictions file ("<pred> <row>" per line) and key
# it by the row id.
results = (
    pd.read_csv('opinion.preds.txt', sep=' ', names=['pred', 'row'])
    .set_index('row')
)
results

Unnamed: 0_level_0,pred
row,Unnamed: 1_level_1
137633,6
6492,4
98406,4
67886,4
139546,7
...,...
7442,2
35877,9
148681,1
99621,2


In [211]:
# Recover the true rating and the row id from the part of each line that
# precedes the first '|': the first space-separated token is the rating, and
# the third token (minus its first character) is the row id.
vw_prefix = pd.read_csv('opinion.test.vw', sep='|', header=None).iloc[:, 0]
original_rate = vw_prefix.apply(lambda line: int(line.split(' ')[0]))
original_index = vw_prefix.apply(lambda line: int(line.split(' ')[2][1:]))
original = (
    pd.DataFrame({'index': original_index, 'rate': original_rate})
    .set_index('index')['rate']
)
original

index
137633     2
6492       4
98406      8
67886     10
139546     3
          ..
7442       8
35877      6
148681     8
99621      9
75075     10
Name: rate, Length: 29998, dtype: int64