In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.proportion import proportions_chisquare
from scipy.stats import chisquare
import pickle
from bs4 import BeautifulSoup
from collections import defaultdict
import requests
%matplotlib inline

import nltk
from nltk.tokenize import word_tokenize
import string
from nltk.stem.snowball import SnowballStemmer
import re
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [19]:
## only need to remove punctuation and stemize
stemmer = SnowballStemmer('english')

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    text = "".join([ch for ch in text if ch not in string.punctuation])
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

In [20]:
df = pd.read_csv("web_scrap.csv",index_col=0)
df.columns = ['first','Overall rating','Effectiveness','Side effects','Condition','Dosage','Other conditions','Other drugs taken','Benefits','Detailed Side effects','Comments']
tem = df['first'].str.split(" ",expand=True)[[0,4,7]]
tem.columns = ['name','age','gender']
df = pd.concat([tem,df],1).drop('first',1)

In [21]:
df.groupby('Side effects').size()

Side effects
 Extremely Severe Side Effects     500
 Mild Side Effects                2559
 Moderate Side Effects            1637
 No Side Effects                  2330
 Severe Side Effects               932
dtype: int64

In [22]:
df['Side effects Yes/No'] = (df['Side effects'] != ' No Side Effects').astype(int)

In [23]:
train, test = train_test_split(df, test_size=0.25)

In [24]:
con_vec = TfidfVectorizer(stop_words='english',tokenizer=tokenize)
X_train = con_vec.fit_transform(train['Comments'])
y_train = train['Side effects Yes/No']

X_test = con_vec.transform(test['Comments'])
y_test = test['Side effects Yes/No']

  'stop_words.' % sorted(inconsistent))


In [25]:
lr = LogisticRegression(penalty='l2')
lr_cv_score = cross_val_score(lr,X_train,y_train,scoring='accuracy',cv=3,n_jobs=-1)
lr_cv_score

array([0.73768844, 0.74610357, 0.7340372 ])

In [26]:
svm_lin = SVC(kernel='linear')
svm_lin_cv_score = cross_val_score(svm_lin,X_train,y_train,scoring='accuracy',cv=3,n_jobs=-1)
svm_lin_cv_score

array([0.7678392 , 0.76973353, 0.75465058])

In [27]:
svm_lin = SVC(kernel='rbf')
svm_lin_cv_score = cross_val_score(svm_lin,X_train,y_train,scoring='accuracy',cv=3,n_jobs=-1)
svm_lin_cv_score

array([0.70603015, 0.70638512, 0.70638512])

In [30]:
rfc = RandomForestClassifier(n_estimators=800,n_jobs=-1)
rfc_cv_score = cross_val_score(rfc,X_train,y_train,scoring='accuracy',cv=3,n_jobs=-1)
rfc_cv_score

array([0.84623116, 0.84464555, 0.83660131])

In [34]:
rfc = RandomForestClassifier(n_estimators=800,n_jobs=-1)
rfc.fit(X_train,y_train)
rfc.feature_importances_

array([8.77195175e-07, 7.99636998e-06, 3.52125225e-06, ...,
       3.12009485e-06, 3.49801123e-06, 2.53551888e-06])

In [41]:
importance = pd.DataFrame(rfc.feature_importances_,con_vec.get_feature_names())
importance[0].sort_values(ascending=False)

day                 0.012572
onc                 0.010308
tablet              0.009960
pill                0.009482
daili               0.009462
morn                0.009191
1                   0.006762
took                0.005866
time                0.005693
twice               0.005627
treatment           0.005612
hour                0.005436
taken               0.005352
befor               0.005281
need                0.005278
everi               0.005064
year                0.004677
mg                  0.004666
effect              0.004657
week                0.004527
night               0.004270
dosag               0.004089
just                0.004061
month               0.003830
stop                0.003483
use                 0.003435
bed                 0.003416
drug                0.003403
thyroid             0.003389
onli                0.003292
                      ...   
aarp                0.000000
meunfunct           0.000000
tentat              0.000000
metoprolol    

In [29]:
gbc = GradientBoostingClassifier(n_estimators=100)
gbc_cv_score = cross_val_score(gbc,X_train,y_train,scoring='accuracy',cv=3,n_jobs=-1)
gbc_cv_score

array([0.72110553, 0.72900955, 0.71644042])