In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
#from bs4 import BeautifulSoup
from collections import defaultdict
#import requests
%matplotlib inline
from sklearn.svm import SVC
from nltk.stem.snowball import SnowballStemmer

## remove special symbol
def rm_sym(df):
    """Decode apostrophe entities in ``review`` and bucket ``rating`` into classes.

    The frame is modified in place and the same object is returned.

    Args:
        df: DataFrame with a text ``review`` column and a numeric ``rating``
            column.

    Returns:
        ``df`` with ``review`` updated and a new ``rating_cate`` column set to
        'high' (rating >= 7), 'low' (rating <= 4), 'medium' (4 < rating < 7),
        or '' when none of the rules match (e.g. a missing rating).
    """
    # regex=False: "&#039;" is a literal HTML entity, not a pattern, and the
    # default for `regex` differs between pandas versions.
    df['review'] = df['review'].str.replace("&#039;", "'", regex=False)

    df['rating_cate'] = ''
    df.loc[df['rating'] >= 7, 'rating_cate'] = 'high'
    df.loc[df['rating'] <= 4, 'rating_cate'] = 'low'
    df.loc[(df['rating'] > 4) & (df['rating'] < 7), 'rating_cate'] = 'medium'
    return df

def clean_text(df_tem3):
    r"""Normalise the ``review`` column for tokenisation, then stem each token.

    Steps, in order:
      1. remove double quotes and lower-case the text;
      2. delete punctuation, digits (together with one following whitespace
         character), "&#039;" entities and the two-character texts
         ``\r``, ``\n``, ``\t``, ``\f`` (a literal backslash plus a letter);
      3. replace "$", "-", remaining backslashes and runs of two or more
         whitespace characters with a single space;
      4. Snowball-stem (English) every space-separated token.

    The frame is modified in place and the same object is returned.

    Args:
        df_tem3: DataFrame with a string ``review`` column.

    Returns:
        ``df_tem3`` with ``review`` replaced by the cleaned, stemmed text.
    """
    # Patterns are unchanged from the original pipeline.
    drop_pattern = r"(\\r)|(\\n)|(\\t)|(\\f)|(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(&#039;)|(\d\s)|(\d)|(\/)"
    space_pattern = r"(\$)|(\-)|(\\)|(\s{2,})"

    reviews = df_tem3['review'].str.replace('"', '', regex=False).str.lower()
    # regex=True must be explicit: since pandas 2.0 str.replace treats the
    # pattern as a literal string by default, which would silently turn both
    # substitutions into no-ops.
    reviews = reviews.str.replace(drop_pattern, "", regex=True)
    reviews = reviews.str.replace(space_pattern, " ", regex=True)

    stemmer = SnowballStemmer('english')
    df_tem3['review'] = reviews.apply(
        lambda text: ' '.join(stemmer.stem(word) for word in text.split(" "))
    )
    return df_tem3

In [2]:
# Draw a 20,000-row training sample; random_state pins the subset so that
# re-running the notebook trains on the same reviews.
df = pd.read_csv('drugsCom_raw/drugsComTrain_raw.tsv', sep='\t', index_col=0).sample(20000, random_state=42)
df = rm_sym(df)
df_tem3 = df  # alias, not a copy: clean_text below also rewrites df['review']

# The test split is used in full.
test = pd.read_csv("drugsCom_raw/drugsComTest_raw.tsv", sep='\t', index_col=0)
test = rm_sym(test)

df_tem3 = clean_text(df_tem3)
test = clean_text(test)

In [3]:
# Class balance: number of sampled training reviews in each rating bucket.
reviews_by_bucket = df_tem3.groupby('rating_cate')
reviews_by_bucket.size()

rating_cate
high      13369
low        4839
medium     1792
dtype: int64

In [4]:
# (rows, columns) of the cleaned training sample, then of the test set.
for frame in (df_tem3, test):
    print(frame.shape)

(20000, 7)
(53766, 7)


In [7]:
import tensorflow as tf
import tensorflow 

#from tensorflow import tensorflow.keras

#from keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer

# fix random seed for reproducibility
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

MAX_NB_WORDS = 3000       # vocabulary cap handed to the Tokenizer (num_words)
max_review_length = 500   # length every review sequence is padded/truncated to
EMBEDDING_DIM = 160       # size of each embedding vector


In [8]:
# Tokenize the data
tokenizer = Tokenizer(
    num_words=MAX_NB_WORDS,
    # "\\]" spells the backslash explicitly; the previous "\]" was an invalid
    # escape sequence. The filter characters themselves are unchanged.
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
    lower=True, split=' ', char_level=False,
    oov_token=None, document_count=0)

# The vocabulary is fitted on the training reviews only and then reused to
# encode both splits.
tokenizer.fit_on_texts(df_tem3['review'])
train_sequences = tokenizer.texts_to_sequences(df_tem3['review'])
test_sequences = tokenizer.texts_to_sequences(test['review'])


In [9]:
# Bring every encoded review to exactly max_review_length entries
# (longer ones are truncated, shorter ones padded).
X_train, X_test = (
    sequence.pad_sequences(encoded, maxlen=max_review_length)
    for encoded in (train_sequences, test_sequences)
)


In [10]:
# One-hot encode the rating buckets. dtype is pinned because newer pandas
# returns boolean dummies by default.
y_train = pd.get_dummies(df_tem3['rating_cate'], dtype='uint8')

# Encode the test labels with exactly the training columns, in the same order:
# a bucket missing from one split would otherwise shift the class positions.
# Rows whose bucket does not exist in the training columns become all zeros.
y_test = (
    pd.get_dummies(test['rating_cate'], dtype='uint8')
    .reindex(columns=y_train.columns, fill_value=0)
)

word_index = tokenizer.word_index
y_train.head()

Unnamed: 0,high,low,medium
167209,0,1,0
125031,0,1,0
81662,1,0,0
157486,1,0,0
113069,0,1,0


In [11]:
# Report the padded array shapes and the size of the fitted vocabulary.
shape_report = (
    (X_train.shape, '<-- shape of train_data ready for val/train split.'),
    (X_test.shape, '<-- shape of final_test_data ready for fedding to network.'),
    (len(tokenizer.word_index), '<-- Length of Word Index'),
)
for value, caption in shape_report:
    print(value, caption)

(20000, 500) <-- shape of train_data ready for val/train split.
(53766, 500) <-- shape of final_test_data ready for fedding to network.
18430 <-- Length of Word Index


In [12]:
# Split Training & Validation Data
from sklearn.model_selection import train_test_split


print('creating train and validation data by dividing train_data in 80:20 ratio')
######################################################

# random_state makes the split repeatable. stratify keeps the proportions of
# the three rating buckets (which are imbalanced) the same in both parts; the
# class id of a row is the position of the 1 in its one-hot label.
X_train_t, X_train_val, Y_train_t, y_train_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train.values.argmax(axis=1),
)

######################################################
print('train data shape:', X_train_t.shape)
print('validation data shape:', X_train_val.shape)
print('Data is ready for training!!')

creating train and validation data by dividing train_data in 80:20 ratio
train data shape: (16000, 500)
validation data shape: (4000, 500)
Data is ready for training!!


In [13]:
# Rows of the embedding table. Word indices produced by the Tokenizer start at
# 1 and 0 is the padding value, so a vocabulary smaller than the cap needs
# len(word_index) + 1 rows; with the cap in effect indices stay below
# MAX_NB_WORDS.
nb_words = min(MAX_NB_WORDS, len(word_index) + 1)
lstm_out = max_review_length  # not used by the model below; kept in case other cells read it

# Embedding -> single LSTM layer (50 units) -> softmax over the 3 rating buckets.
model = Sequential()
model.add(Embedding(nb_words, EMBEDDING_DIM, input_length=max_review_length))
model.add(LSTM(50))
model.add(Dense(3, activation='softmax'))
# The targets are one-hot vectors over three mutually exclusive classes, so the
# matching loss is categorical cross-entropy. binary_crossentropy scores each
# output unit as an independent yes/no problem and can report inflated accuracy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [14]:
# Print the layers of the compiled model with their output shapes and parameter counts.
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 160)          480000    
_________________________________________________________________
lstm (LSTM)                  (None, 50)                42200     
_________________________________________________________________
dense (Dense)                (None, 3)                 153       
Total params: 522,353
Trainable params: 522,353
Non-trainable params: 0
_________________________________________________________________


In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint


# Run identification, used for the printed stamp and the checkpoint file name.
training_cycle = 1
batch = 32
notebookname = "Drug_Data_"
variant = "LSTM_w_stopwords_"
version = "1.0_"
title = notebookname + variant + version


stamp = '{}trainging_cycle{}batchsize_{}'.format(title, training_cycle, batch)
print(stamp)
# stamp already begins with title; prefixing title again doubled the run name
# in the checkpoint file name.
best_model_path = stamp + 'best.h5'

# Stop after 4 epochs without a better validation loss and roll the in-memory
# model back to its best epoch, so later evaluation does not use the weights
# of the last (worse) epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)
model_checkpoint = ModelCheckpoint(best_model_path, monitor='val_loss', save_best_only=True)


# Run LSTM Model
epoch = 40
# Note: despite its name, LSTM_model holds the value returned by fit (the
# training history), not the network itself.
LSTM_model = model.fit(X_train_t, Y_train_t, batch_size=batch, epochs=epoch,
                       validation_data=(X_train_val, y_train_val), shuffle=True,
                       callbacks=[early_stopping, model_checkpoint], verbose=0)
best_score = min(LSTM_model.history['val_loss'])

Drug_Data_LSTM_w_stopwords_1.0_trainging_cycle1batchsize_32


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [None]:
# Show the 'val_accuracy' series stored in the history returned by model.fit.
LSTM_model.history['val_accuracy']

In [None]:
## 1000
# NOTE(review): these look like validation accuracies pasted in from an earlier
# run; what "1000" refers to is not recorded in this notebook — TODO confirm.
a = [
    0.7633334, 0.765, 0.7733334,
    0.7766667, 0.79, 0.78333336,
    0.77833337, 0.77833337, 0.78666663,
]



In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Loss and accuracy on the test set as computed by Keras.
accr = model.evaluate(X_test, y_test, batch_size=100)

# `prediction` was used without ever being assigned; compute the class
# probabilities for the test set here.
prediction = model.predict(X_test, batch_size=100)

# Class ids are column positions: argmax over the one-hot labels and over the
# predicted probabilities.
y_true = y_test.values.argmax(axis=1)
y_pred = prediction.argmax(axis=1)

# Only the last expression of a cell is displayed, so the accuracy is printed
# explicitly instead of being silently discarded.
print('test accuracy:', accuracy_score(y_true, y_pred))
confusion_matrix(y_true, y_pred)