In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer

from sklearn.model_selection import train_test_split
from lightgbm import *
from sklearn.metrics import *


import warnings
warnings.simplefilter('ignore')


In [14]:
df=pd.read_csv('drug_data_with_sentiment.csv')
df.head()

Unnamed: 0,drugName,condition,review,rating,date,usefulCount,sentiment,sentiment_label
0,L-methylfolate,Depression,"""I have taken anti-depressants for years, with...",10,"March 9, 2017",54,0.275,Positive
1,Sertraline,Depression,"""1 week on Zoloft for anxiety and mood swings....",8,"May 7, 2011",3,0.166667,Positive
2,Venlafaxine,Depression,"""my gp started me on Venlafaxine yesterday to ...",4,"April 27, 2016",3,-0.136508,Negative
3,Dulaglutide,"Diabetes, Type 2","""Hey Guys, It&#039;s been 4 months since my l...",10,"October 24, 2017",24,0.103571,Positive
4,Effexor XR,Depression,"""This medicine saved my life. I was at my wits...",10,"June 20, 2013",166,0.168194,Positive


In [15]:
a=['drugName', 'rating', 'date', 'usefulCount','sentiment', 'sentiment_label']
df.drop(a,inplace=True,axis=1)
df.head()

Unnamed: 0,condition,review
0,Depression,"""I have taken anti-depressants for years, with..."
1,Depression,"""1 week on Zoloft for anxiety and mood swings...."
2,Depression,"""my gp started me on Venlafaxine yesterday to ..."
3,"Diabetes, Type 2","""Hey Guys, It&#039;s been 4 months since my l..."
4,Depression,"""This medicine saved my life. I was at my wits..."


In [16]:
# Pre-process data
stop_words=set(stopwords.words('english'))
stemmer= SnowballStemmer('english')

In [17]:
def preprocess_text(text):
    #Remove Punctuation and special characters
    text=re.sub(r"[^a-zA-Z0-9]", " ",text.lower())
    
    # Tokenize The text
    tokens=nltk.word_tokenize(text)
    
    #Remove the stopwords
    tokens=[token for token in tokens if token not in stop_words]
    
    #stem the tokens
    stemmed_tokens=[stemmer.stem(token) for token in tokens]
    return " ".join(stemmed_tokens)

df['review_text']=df["review"].apply(preprocess_text)
df.head()

Unnamed: 0,condition,review,review_text
0,Depression,"""I have taken anti-depressants for years, with...",taken anti depress year improv most moder seve...
1,Depression,"""1 week on Zoloft for anxiety and mood swings....",1 week zoloft anxieti mood swing take 50mg mor...
2,Depression,"""my gp started me on Venlafaxine yesterday to ...",gp start venlafaxin yesterday help depress cha...
3,"Diabetes, Type 2","""Hey Guys, It&#039;s been 4 months since my l...",hey guy 039 4 month sinc last post want give m...
4,Depression,"""This medicine saved my life. I was at my wits...",medicin save life wit end anti depress readi g...


In [18]:
# Convert the text data into numerical features using TF-ID
X = df["review"]
y = df["condition"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)
print('X_train:',X_train.shape)
print('y_train:',y_train.shape)
print('X_test:',X_test.shape)
print('X_test:',X_test.shape)

X_train: (11155,)
y_train: (11155,)
X_test: (2789,)
X_test: (2789,)


In [19]:
tfidf = TfidfVectorizer(ngram_range=(1,1))
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)

In [20]:
model= LGBMClassifier(
        n_estimators=10000,
        learning_rate=0.10,
        num_leaves=30,
        subsample=.9,
        max_depth=7,
        reg_alpha=.1,
        reg_lambda=.1,
        min_split_gain=.01,
        min_child_weight=2,
        silent=-1,
        verbose=-1,
        )
model.fit(X_train_tf,y_train)

In [21]:
y_pred=model.predict(X_test_tf)
accuracy=accuracy_score(y_test,y_pred)
print("Accuracy:",accuracy)

Accuracy: 0.9587665830046612


In [24]:
#gnerat classification report
report=classification_report(y_test,y_pred)
print(report)

                     precision    recall  f1-score   support

         Depression       0.96      0.99      0.97      1778
   Diabetes, Type 2       0.97      0.92      0.95       538
High Blood Pressure       0.95      0.88      0.91       473

           accuracy                           0.96      2789
          macro avg       0.96      0.93      0.94      2789
       weighted avg       0.96      0.96      0.96      2789



In [29]:
from pickle import dump


AttributeError: 'LGBMClassifier' object has no attribute 'save_model'