# Drug Review Sentiment Analysis

<br>


**Sentiment Analysis also known as Opinion Mining refers to the use of natural language processing, text analysis to systematically identify, extract, quantify, and study affective states and subjective information.**

**Sentiment analysis is widely applied to reviews and survey responses, online and social media, and healthcare materials for applications that range from marketing to customer service to clinical medicine.**

**In this project, we aim to perform Sentiment Analysis of Drug reviews. Data used in this project are online product reviews collected from “amazon.com”. We expect to do review-level categorization of review data with promising outcomes.**

# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

# Dataset Loader

In [2]:
path1 ="E:\Downlload\drugsComTest_raw.tsv"
class DataFrame_Loader():

    
    def __init__(self,error_bad_lines,sep):
        self.error_bad_lines = error_bad_lines
        self.sep = sep
        
        print("Loadind DataFrame")
        
    def load_json_files(self,path1):
        dftrain = pd.read_csv(path1,error_bad_lines=True,sep='\t')
        return dftrain

In [3]:
load = DataFrame_Loader(True,'\t')

Loadind DataFrame


In [4]:
df = load.load_json_files(path1)
df.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10.0,"February 28, 2012",22
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done ...",8.0,"May 17, 2009",17
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9.0,"September 29, 2017",3
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9.0,"March 5, 2017",35
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9.0,"October 22, 2015",4


# Text Preprocessing

In [5]:
from sklearn.model_selection import train_test_split
class DataFrame_Preprocessor():

    
    def __init__(self):
        
        
        print("Preprocessor object created")
        
        
    def preprocess(self,df):
        
        df['Sentiment'] = np.where(df['rating'] > 6, 1, 0)
        
        df= df[['review','Sentiment']]
        
        x = df['review']
        
        y = df['Sentiment']
        
        return train_test_split(x,y,test_size=0.1, random_state=0)

In [6]:
PR = DataFrame_Preprocessor()

Preprocessor object created


In [7]:
X_train, X_test, y_train, y_test = PR.preprocess(df)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((48389,), (5377,), (48389,), (5377,))

## Feature Engineering with Keras Tokenization and Pad Sequences

In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import text
from keras.utils import np_utils
from keras.models import Sequential
class Keras_Tokenizer():

    
    def __init__(self,max_features):
        
        self.max_features =6000
        
        
        print("Tokenizer object created")
        
        
    def __label_encoding(self,y_train):
        """
        Encode the given list of class labels
        :y_train_enc: returns list of encoded classes
        :labels: actual class labels
        """
        lbl_enc = LabelEncoder()

        y_train_enc = lbl_enc.fit_transform(y_train)
        labels = lbl_enc.classes_

        return y_train_enc, labels
    
    
    
    def __word_embedding(self,train, test, max_features, max_len=200):
        
        
        try:
            """ Keras Tokenizer class object """
            tokenizer = text.Tokenizer(num_words=max_features)
            tokenizer.fit_on_texts(train)

            train_data = tokenizer.texts_to_sequences(train)
            test_data = tokenizer.texts_to_sequences(test)

            """ Get the max_len """
            vocab_size = len(tokenizer.word_index) + 1

            """ Padd the sequence based on the max-length """
            x_train = sequence.pad_sequences(train_data, maxlen=max_len, padding='post')
            x_test = sequence.pad_sequences(test_data, maxlen=max_len, padding='post')
            """ Return train, test and vocab size """
            return tokenizer, x_train, x_test, vocab_size
        except ValueError as ve:
            raise(ValueError("Error in word embedding {}".format(ve)))
            
            
    def preprocess(self,X_train, X_test):
        
    
        return self.__word_embedding(X_train, X_test, self.max_features)

In [17]:
KT = Keras_Tokenizer(6000)

Tokenizer object created


In [18]:
tokenizer, x_pad_train, x_pad_valid, vocab_size = KT.preprocess(X_train, X_test)

In [25]:
x_pad_train.shape,x_pad_valid.shape,vocab_size

((48389, 200), (5377, 200), 33068)

# Modelling RNN Birectional lstm Architecture

In [114]:
from tensorflow import keras
class RNN_Bidirectional_lstm_Build_Pack():

    
    def __init__(self,
                 input_length,
                 output_length,
                 vocab_size,
                 optimizer,
                 loss,
                 metrics,
                 batch_size,
                 epochs,
                 verbose):
        
        self.input_length =200
        self.output_length= 200
        self.vocab_size = 33068
        self.optimizer = 'adam'
        self.loss = 'binary_crossentropy'
        self.metrics = ['acc']
        self.batch_size = 256
        self.epochs = 20
        self.verbose = 1
        
        
        
        print("Tokenizer object created")
        
    
    def build_rnn(self,vocab_size,output_dim, input_dim):

        model = Sequential([
            keras.layers.Embedding(self.vocab_size,output_dim = self.output_length,
                                  input_length = self.input_length),
            keras.layers.BatchNormalization(),
            keras.layers.Bidirectional(keras.layers.LSTM(256,return_sequences=True)),
            keras.layers.GlobalMaxPool1D(),
            keras.layers.Dense(225),
            keras.layers.Dropout(0.3),
            keras.layers.Dense(155),
            keras.layers.Dropout(0.3),
            keras.layers.Dense(150),
            keras.layers.Dropout(0.2),
            keras.layers.Dense(125),
            keras.layers.Dropout(0.2),
            keras.layers.Dense(95),
            keras.layers.Dropout(0.2),
            keras.layers.Dense(64),
            keras.layers.Dropout(0.1),
            keras.layers.Dense(34),
            keras.layers.Dropout(0.1),
            keras.layers.Dense(32),
            keras.layers.Dense(output_dim, activation='sigmoid')
        ])

        return model
    
    
    def Compile_and_Fit(self,rnn_model):
    
        rnn_model.compile(optimizer=self.optimizer, loss=self.loss, metrics=self.metrics)


        rnn_model.fit(x_pad_train, 
                                y_train,
                                batch_size=self.batch_size,
                               epochs=self.epochs,
                               verbose= self.verbose)
        
        score = rnn_model.evaluate(x_pad_valid, y_test, verbose=1)
        
        print("Loss:%.3f Accuracy: %.3f" % (score[0], score[1]))
        
        return rnn_model

In [115]:
Rnn_Model = RNN_Bidirectional_lstm_Build_Pack(200,200,33068,'adam','binary_crossentropy',['acc'],256,10,1)

Tokenizer object created


In [116]:
rnn_model = Rnn_Model.build_rnn(vocab_size,1,200)
rnn_model.summary()

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 200, 200)          6613600   
_________________________________________________________________
batch_normalization_12 (Batc (None, 200, 200)          800       
_________________________________________________________________
bidirectional_12 (Bidirectio (None, 200, 512)          935936    
_________________________________________________________________
global_max_pooling1d_12 (Glo (None, 512)               0         
_________________________________________________________________
dense_53 (Dense)             (None, 225)               115425    
_________________________________________________________________
dropout_35 (Dropout)         (None, 225)               0         
_________________________________________________________________
dense_54 (Dense)             (None, 155)             

In [118]:
rnn_model = Rnn_Model.Compile_and_Fit(rnn_model)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Loss:0.972 Accuracy: 0.859


# Prediction

In [121]:
y_preds = rnn_model.predict(x_pad_valid)

print("y_preds Shape ::",y_preds.shape)


for arr in y_preds:
    for i in range(len(arr)):
        if arr[i]>0.5:
            arr[i] = 1
        else:
            arr[i] = 0

            
y_preds = y_preds.astype('int32')

pred_df = pd.DataFrame(y_preds, columns=['pred'])

print(pred_df.shape)
pred_df.head()

y_preds Shape :: (5377, 1)
(5377, 1)


Unnamed: 0,pred
0,1
1,1
2,0
3,1
4,1


In [125]:
pred_df.value_counts()

pred
1       3637
0       1740
dtype: int64

# Metrics

In [126]:
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

print(metrics.accuracy_score(y_test, pred_df))
        
print(metrics.confusion_matrix(y_test, pred_df))
        
print(metrics.classification_report(y_test, pred_df))

0.8590291984377906
[[1424  442]
 [ 316 3195]]
              precision    recall  f1-score   support

           0       0.82      0.76      0.79      1866
           1       0.88      0.91      0.89      3511

    accuracy                           0.86      5377
   macro avg       0.85      0.84      0.84      5377
weighted avg       0.86      0.86      0.86      5377



# Model Serialization

In [108]:
model_json = rnn_model.to_json()
with open("rnn_model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
rnn_model.save("rnn_model.h5", overwrite=False)

# Tokenizer Serialization

In [109]:
import pickle
with open('test_tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)