In [12]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from lightgbm import LGBMModel, LGBMClassifier, plot_importance

from sklearn.metrics import *

import warnings
warnings.simplefilter('ignore')


In [2]:
df=pd.read_csv('drug_data_with_sentiment.csv')
df.head()

Unnamed: 0,drugName,condition,review,rating,date,usefulCount,sentiment,sentiment_label
0,L-methylfolate,Depression,"""I have taken anti-depressants for years, with...",10,"March 9, 2017",54,0.275,Positive
1,Sertraline,Depression,"""1 week on Zoloft for anxiety and mood swings....",8,"May 7, 2011",3,0.166667,Positive
2,Venlafaxine,Depression,"""my gp started me on Venlafaxine yesterday to ...",4,"April 27, 2016",3,-0.136508,Negative
3,Dulaglutide,"Diabetes, Type 2","""Hey Guys, It&#039;s been 4 months since my l...",10,"October 24, 2017",24,0.103571,Positive
4,Effexor XR,Depression,"""This medicine saved my life. I was at my wits...",10,"June 20, 2013",166,0.168194,Positive


In [3]:
a=['drugName', 'rating', 'date', 'usefulCount','sentiment', 'sentiment_label']
df.drop(a,inplace=True,axis=1)
df.head()

Unnamed: 0,condition,review
0,Depression,"""I have taken anti-depressants for years, with..."
1,Depression,"""1 week on Zoloft for anxiety and mood swings...."
2,Depression,"""my gp started me on Venlafaxine yesterday to ..."
3,"Diabetes, Type 2","""Hey Guys, It&#039;s been 4 months since my l..."
4,Depression,"""This medicine saved my life. I was at my wits..."


In [4]:
# Pre-process data
stop_words=set(stopwords.words('english'))
stemmer= SnowballStemmer('english')

In [5]:
def preprocess_text(text):
    #Remove Punctuation and special characters
    text=re.sub(r"[^a-zA-Z0-9]", " ",text.lower())
    
    # Tokenize The text
    tokens=nltk.word_tokenize(text)
    
    #Remove the stopwords
    tokens=[token for token in tokens if token not in stop_words]
    
    #stem the tokens
    stemmed_tokens=[stemmer.stem(token) for token in tokens]
    return " ".join(stemmed_tokens)

df['review_text']=df["review"].apply(preprocess_text)
df.head()

Unnamed: 0,condition,review,review_text
0,Depression,"""I have taken anti-depressants for years, with...",taken anti depress year improv most moder seve...
1,Depression,"""1 week on Zoloft for anxiety and mood swings....",1 week zoloft anxieti mood swing take 50mg mor...
2,Depression,"""my gp started me on Venlafaxine yesterday to ...",gp start venlafaxin yesterday help depress cha...
3,"Diabetes, Type 2","""Hey Guys, It&#039;s been 4 months since my l...",hey guy 039 4 month sinc last post want give m...
4,Depression,"""This medicine saved my life. I was at my wits...",medicin save life wit end anti depress readi g...


In [14]:
# Convert the text data into numerical features using TF-ID
X = df["review"]

y = df["condition"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)
print('X_train:',X_train.shape)
print('y_train:',y_train.shape)
print('X_test:',X_test.shape)
print('X_test:',X_test.shape)

X_train: (11155,)
y_train: (11155,)
X_test: (2789,)
X_test: (2789,)


In [15]:
tfidf = TfidfVectorizer(ngram_range=(1,1))
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)

In [20]:
X_train_tf

<11155x13485 sparse matrix of type '<class 'numpy.float64'>'
	with 651947 stored elements in Compressed Sparse Row format>

In [34]:
model= LGBMClassifier(
        n_estimators=10000,
        learning_rate=0.10,
        num_leaves=30,
        subsample=.9,
        max_depth=7,
        reg_alpha=.1,
        reg_lambda=.1,
        min_split_gain=.01,
        min_child_weight=2,
        silent=-1,
        verbose=-1,
        )
model.fit(X_train_tf,y_train)

In [35]:
y_pred=model.predict(X_test_tf)
accuracy=accuracy_score(y_test,y_pred)
print("Accuracy:",accuracy)

Accuracy: 0.959483685908928


In [36]:
import string
def predict_condition(text):
    text=re.sub(r"[^a-zA-Z0-9]", " ",text.lower())
    text=" ".join([word for word in text.split() if word not in (stopwords.words('english'))])
    text=re.sub('@[A-Za-z0-9]+','',text)
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub("[0-9" "]+"," ",text)
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('#','',text) 
    text = re.sub('RT[\s]+','',text)
    text = re.sub('@[\w]*','',text) 
    text = re.sub('https?:\/\/\S+', '', text) 
    text = re.sub('\n',' ',text)
    
    # Tokenize The text
    tokens=nltk.word_tokenize(text)
    
    #Remove the stopwords
    tokens=[token for token in tokens if token not in stop_words]
    
    #stem the tokens
    stemmed_tokens=[stemmer.stem(token) for token in tokens]
    text= " ".join(stemmed_tokens)
    
    
    review_v = tfidf.transform([text]) 
    
    
    #model=SVC(kernel='linear')
    pred=model.predict(review_v)
    return pred

In [25]:
review=" I am feel so depressed right now."
condition=predict_condition(review)
print('Predicted condition:',condition)

Predicted condition: ['Depression']


In [26]:
review="I&#039;m a 27 year old white male in good shape but have always had high blood pressure...I&#039;m guessing genetic. My blood pressure would run anywhere from 130-160 over 85-105. The doctor tried a diuretic which did nothing, and then a beta blocker which had horrible side effects. They finally tried 10mg of lisinopril and wow how great! It almost immediately brought my blood pressure to 118/80&#039;s. I actually cut my dose to 2.5 mg in the morning and 2.5 mg at lunch and my blood pressure stays perfect all day and night. I have had no side effects whatsoever and wouldn&#039;t even know I was taking any medication if it weren&#039;t for my lowered blood pressure. Highly recommend!"
condition=predict_condition(review)
print('Predicted condition:',condition)

Predicted condition: ['High Blood Pressure']


In [38]:
review="I have been taking Saxenda since July 2016.  I had severe nausea for about a month once I got up to the 2.6 dosage.  It has since subsided and the only side effect I notice now is the dry mouth.  I make sure to drink  2.5 litres of water a day (about 10 glasses).  This helps with the weight loss as well as the constipation.  I have been reducing my dose to find a comfortable spot where I am still losing weight but don&#039;t feel like I am over medicating.  For me, 1.8 is working very well.  I also feel wearing a Fitbit has really helped.  I can track my food, water, exercise and steps - it keeps me moving more.  When this started I could barely walk the length of myself without getting winded - I have lost 58 lbs so far."
condition=predict_condition(review)
print('Predicted condition:',condition)

Predicted condition: ['Diabetes, Type 2']


In [37]:
review=" Telmikind-H 40Mg Tablet is used to reduce the risk of heart complications like coronary artery disease and stroke in the elderly population."
condition=predict_condition(review)
print('Predicted condition:',condition)

Predicted condition: ['High Blood Pressure']


In [42]:
review="Spring of 2008 I was hospitalized with pnuemonia and diagnosed with Lyme diease and full blown AIDS with CD4 count of &quot;11&quot; viral load some number so high in the millions I could never remember. I was taking Combivir and Kaletra with Dapsone for the 1st year then it stopped working. I started Kaletra with the Dapsone my CD4 count is now 209 and rising. For a few weeks I was very aggressive and broke all my dishes in the house LOL. I take vitamin supplements and drink a boost pluz every day. LIfe is good now!"
condition=predict_condition(review)
print('Predicted condition:',condition)

Predicted condition: ['Depression']
