<a href="https://colab.research.google.com/github/BrittonWinterrose/AB-Demo/blob/master/Drug_Data_NLP_Attention_Network_w_Glove.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
!pip install -U -q PyDrive
from tqdm import tqdm
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

import re
import csv
import codecs
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.models import load_model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

import sys

from keras import backend as K
from keras.engine.topology import Layer
#from keras import initializations
from keras import initializers, regularizers, constraints



Using TensorFlow backend.


In [0]:
class Attention(Layer):
     # Input shape 3D tensor with shape: `(samples, steps, features)`.
     # Output shape 2D tensor with shape: `(samples, features)`.

    def __init__(self, step_dim,W_regulizer = None,b_regulizer = None,
                 W_constraint = None, b_constraint = None,bias = True,**kwargs):
        
        self.W_regulizer = W_regulizer
        self.b_regulizer = b_regulizer
        
        self.W_constraint = W_constraint
        self.b_constraint = b_constraint
        
        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        self.init = initializers.get('glorot_uniform')
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3
        # Create a trainable weight variable for this layer.
        self.W = self.add_weight(shape=(input_shape[-1],),
                                      initializer= self.init,
                                      constraint = self.W_constraint,
                                      regularizer = self.W_regulizer,
                                      name = '{}_W'.format(self.name))
        
        self.features_dim = input_shape[-1]
        
        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regulizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None
        super(Attention, self).build(input_shape)  

    
    def call(self, x, mask=None):
      
        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
           
            a *= K.cast(mask, K.floatx())

        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
    
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim

In [3]:
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Get the dataset from the source
!wget http://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip
!unzip drugsCom_raw.zip

# Get the list of custom drugname stopwords. 
downloaded = drive.CreateFile({'id': "1msOnZkWx0qu8RwI9eT3FEQbOoQcGUR7Q"})
downloaded.GetContentFile("drugname_stopwords.csv")

# Map Files
#TRAIN_DATA_FILE='drugsComTrain_raw.tsv'
#TEST_DATA_FILE='drugsComTest_raw.tsv'

# Set Sequencing Variables
MAX_SEQUENCE_LENGTH = 180
MAX_NB_WORDS = 10000
EMBEDDING_DIM = 100 # Set equal to the GLOVE dimension # 300
VALIDATION_SPLIT = 0.1

num_lstm = 100 #300
num_dense = 75 #256
lstm_dropout_rate = 0.25
dense_dropout_rate = 0.25

act = 'relu'
 

--2018-12-13 07:50:17--  http://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.249
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.249|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42989872 (41M) [application/zip]
Saving to: ‘drugsCom_raw.zip’


2018-12-13 07:50:22 (7.81 MB/s) - ‘drugsCom_raw.zip’ saved [42989872/42989872]

Archive:  drugsCom_raw.zip
  inflating: drugsComTest_raw.tsv    
  inflating: drugsComTrain_raw.tsv   


In [4]:
#!wget http://nlp.stanford.edu/data/glove.840B.300d.zip
#!unzip glove.840B.300d.zip

!wget http://nlp.stanford.edu/data/glove.twitter.27B.zip
!unzip glove.twitter.27B.zip

--2018-12-13 07:51:17--  http://nlp.stanford.edu/data/glove.twitter.27B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.twitter.27B.zip [following]
--2018-12-13 07:51:17--  https://nlp.stanford.edu/data/glove.twitter.27B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1520408563 (1.4G) [application/zip]
Saving to: ‘glove.twitter.27B.zip’


2018-12-13 07:52:31 (19.6 MB/s) - ‘glove.twitter.27B.zip’ saved [1520408563/1520408563]

Archive:  glove.twitter.27B.zip
  inflating: glove.twitter.27B.25d.txt  
  inflating: glove.twitter.27B.50d.txt  
  inflating: glove.twitter.27B.100d.txt  
  inflating: glove.twitter.27B.200d.txt  


In [5]:
EMBEDDING_FILE='glove.twitter.27B.100d.txt'
nltk.download('stopwords')

drug_stopwords = pd.read_csv('drugname_stopwords.csv')
drug_stopwords = drug_stopwords["0"].tolist()
print(drug_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
['acetaminophen', 'pseudoephedrine', 'phenylephrine', 'estradiol', 'dextromethorphan', 'codeine', 'hydrochlorothiazide', 'chlorpheniramine', 'guaifenesin', 'metformin', 'ethinyl', 'mucinex', 'hydrocortisone', 'peroxide', 'amlodipine', 'diphenhydramine', 'tylenol', 'tenofovir', 'polymyxin', 'dha', 'hfa', 'benzoyl', 'advil', 'vicks', 'butalbital', 'loestrin', 'emtricitabine', 'lidocaine', 'salicylate', 'salicylic', 'odt', 'naproxen', 'claritin', 'senna', 'hyoscyamine', 'sudafed', 'estrogens', 'valsartan', 'brompheniramine', 'dexamethasone', 'gildess', 'intensol', 'hct', 'junel', 'afrin', 'globulin', 'neomycin', 'simethicone', 'excedrin', 'methenamine', 'depo', 'promethazine', 'microgestin', 'nizoral', 'monistat', 'naloxone', 'respimat', 'norethindrone', 'pyrilamine', 'ellipta', 'atropine', 'olmesartan', 'lamivudine', 'hydroxide', 'oxycodone', 'provera', 'fluticasone', 'levonorge

In [6]:
########################################
## index word vectors.
########################################
print('Indexing word vectors')
embedding_index = {}

with open(EMBEDDING_FILE,'r', encoding='utf8') as f:
  for line in tqdm(f):
      values = line.split()
      word = values[0]
      coefs = np.asarray(values[1:], dtype='float32')
      embedding_index[word] = coefs
  f.close()
print('Indexed the word vectors')   
print('Found %s word vectors.' %len(embedding_index))     

0it [00:00, ?it/s]

Indexing word vectors


1193514it [00:37, 31637.25it/s]

Indexed the word vectors
Found 1193514 word vectors.





In [0]:
# Turn rating into new "binned" classes column
train_df = pd.read_table('drugsComTrain_raw.tsv')
test_df = pd.read_table('drugsComTest_raw.tsv')

def rank_bin(array):
  y_rank = []
  for i in array:
    if i <= 4: # Negative Rating Cut Off (Inclusive)
      y_rank.append('Negative')
    elif i >= 7: # Positive Rating Cut Off (Inclusive)
      y_rank.append('Positive')
    else:
      y_rank.append('Neutral')
  return y_rank

train_df['classes_to_predict'] = rank_bin(train_df['rating'])
test_df['classes_to_predict'] = rank_bin(test_df['rating'])


In [0]:
classes_to_predict = ['Positive','Negative','Neutral']

In [9]:
train_df.classes_to_predict.value_counts() # Check to see the bin sizes. 


Positive    106866
Negative     40075
Neutral      14356
Name: classes_to_predict, dtype: int64

In [10]:
########################################
## Basic preprocessing of text data. 
########################################
print('performing some basic preprocessing on data')

#regex for removing non-alphanumeric characters and spaces
remove_special_char = re.compile('r[^a-z\d]',re.IGNORECASE)

#regex to replace all numerics
replace_numerics = re.compile(r'\d+',re.IGNORECASE)

performing some basic preprocessing on data


In [11]:
##############################################################################################
## fuction for coverting the text & stopword removal and stemming.
##############################################################################################
stop_words = nltk.corpus.stopwords.words('english')
print ("NLTK stopword list length: ",len(stop_words),'words')
stop_words.extend(drug_stopwords)
print ("Added custom stopwords")
print ("New stopword list length: ",len(stop_words),'words')

def preprocess_text(text, remove_stopwords = True, perform_stemming = True):
    #convert text to lowercase and split.
    text = text.lower().split()
    
    #stopword removal(you can use your own set of stopwords, here we are using default from nltk stopwords)
    if(remove_stopwords):
        text = [word for word in text if word not in stop_words]
     
    text = ' '.join(text)   
    
    text = remove_special_char.sub('', text)
    text = replace_numerics.sub('n', text)
        
    if(perform_stemming):
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = ' '.join(stemmed_words)
        
    return text    


NLTK stopword list length:  179 words
Added custom stopwords
New stopword list length:  3057 words


In [12]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount,classes_to_predict
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27,Positive
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,Positive
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,Neutral
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,Positive
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37,Positive


In [13]:
##################################################
## forming sequeces to feed into the network.
################################################## 
import numpy as np
raw_train_comments = train_df['review'].fillna('NA').values
raw_test_comments = test_df['review'].fillna('NA').values
y = pd.get_dummies(train_df['classes_to_predict']).values
np.savetxt("y_train.csv", y, delimiter=",", fmt='%s')
y_test_predicted = pd.get_dummies(test_df['classes_to_predict']).values
np.savetxt("y_test.csv", y_test_predicted, delimiter=",", fmt='%s')
print ('Raw comments: \n',raw_train_comments[1:5])

processed_train_comments = []
for comment in tqdm(raw_train_comments):
    processed_train_comments.append(preprocess_text(comment))

np.savetxt("processed_train_comments.csv", processed_train_comments, delimiter=",", fmt='%s')
    
processed_test_comments = []    
for comment in tqdm(raw_test_comments):
    processed_test_comments.append(preprocess_text(comment))

np.savetxt("processed_test_comments.csv", processed_test_comments, delimiter=",", fmt='%s')
    
print ('Processed comments: \n',processed_train_comments[1:5])

  0%|          | 32/161297 [00:00<08:27, 317.46it/s]

Raw comments: 
 ['"My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.) I called his doctor on Monday morning and she said to stick it out a few days. See how he did at school, and with getting up in the morning. The last two days have been problem free. He is MUCH more agreeable than ever. He is less emotional (a good thing), less cranky. He is remembering all the things he should. Overall his behavior is better. \r\nWe have tried many different medications and so far this is the most effective."'
 '"I used to take another oral contraceptive, which had 21 pill cycle, and was very happy- very light periods, max 5 days, no other side effects. But it contained hormone gestodene, which is not available in US, so I switched to Lybrel, 

100%|██████████| 161297/161297 [07:06<00:00, 378.34it/s]
100%|██████████| 53766/53766 [02:20<00:00, 382.47it/s]


Processed comments: 
 ['"mi son halfway fourth week intuniv. becam concern began last week, start take highest dose on. two days, could hard get bed, cranky, slept near n hour drive home school vacat (veri unusu him.) call doctomonday morn said stick days. see school, get morning. last two day problem free. much agreeabl eve less emot (a good thing), less cranky. rememb thing should. overal behaviobett tri mani differ medic faeffective."', '"i use take anotheor contraceptive, n pill cycle, happy- light periods, max n days, side effects. contain hormon gestodene, avail us, switch lybrel, ingredi simila pill ended, start immediately, first day period, instruct said. period last two weeks. take second pack- two weeks. now, third pack thing got even worse- third period last two week it&#n; end third week- still daili brown discharge. posit side didn&#n;t side effects. idea period free tempting... alas."', '"this first time use form birth control. i&#n;m glad went patch, n months. first dec

In [14]:
#reload processed train comments 
# processed_train_comments = [line.strip() for line in open("processed_train_comments.csv", 'r')]
# processed_test_comments = [line.strip() for line in open("processed_test_comments.csv", 'r')]
# y = [line.strip() for line in open("y_train.csv", 'r')]
# y_test_predicted = [line.strip() for line in open("y_test.csv", 'r')]
# len(processed_train_comments)

ge

'''
['0,0,1', '0,0,1', '0,1,0', '0,0,1', '0,0,1']
['0,0,1', '0,0,1', '0,0,1', '0,0,1', '0,0,1']
['"it side effect, take combin n mg fish oil"', '"mi son halfway fourth week intuniv. becam concern began last week, start take highest dose on. two days, could hard get bed, cranky, slept near n hour drive home school vacat (veri unusu him.) call doctomonday morn said stick days. see school, get morning. last two day problem free. much agreeabl eve less emot (a good thing), less cranky. rememb thing should. overal behaviobett tri mani differ medic faeffective."', '"i use take anotheor contraceptive, n pill cycle, happy- light periods, max n days, side effects. contain hormon gestodene, avail us, switch lybrel, ingredi simila pill ended, start immediately, first day period, instruct said. period last two weeks. take second pack- two weeks. now, third pack thing got even worse- third period last two week it&#n; end third week- still daili brown discharge. posit side didn&#n;t side effects. idea period free tempting... alas."', '"this first time use form birth control. i&#n;m glad went patch, n months. first decreas libido subsided. downsid made period longe(n-n day exact) use period n-n day max also made cramp intens first two day period, nevecramp use birth control. happi patch"', '"suboxon complet turn life around. feel healthi i&#n;m excel job alway money pocket save account. none spent year abus oxycontin. paycheck alreadi spent time got start resort scheme steal fund addiction. history. you&#n;r readi stop, there&#n; good chanc put path great life again. found side-effect minim compar oxycontin. i&#n;m actual sleep bett slight constip me. truli amazing. cost pale comparison spent oxycontin."']
['"i&#n;v tri antidepress year (citalopram, fluoxetine, amitriptyline), none help depression, insomnia &amp; anxiety. doctosuggest chang onto nmg medicin save life. thank side effect especi common - weight gain, i&#n;v actual lost alot weight. still suicid thought save me."', '"mi son crohn&#n; diseas done well asacol. complaint show side effects. taken mani nine tablet peday one time. i&#n;v happi results, reduc bout diarrhea drastically."', '"quick reduct symptoms"', '"contrav combin drug use alcohol, smoking, opioid cessation. peopl lose weight also help control oveeating. doubt obes caus sugacarb addiction, power drug. take five days, good news is, seem go work immediately. feel hungri want food now. realli don&#n;t care eat; it&#n; fill stomach. sinc days, don&#n;t know i&#n;v lost weight (i don&#n;t scale), cloth feel littl loos mayb pound two. i&#n;m hope month medication, develop healthiehabit continu without aid contrave."', '"i birth control one cycle. read review type similabirth control bit apprehens start. im give birth control n n long enough n. falov birth control! side effect minim like im even birth control! experienc mild headach nausea ive feel great! got period cue third day inact pill idea come zero pms! period light bare cramping! unprotect sex first month obvious didn&#n;t get pregnant i&#n;m pleased! high recommend"']
'''

[[0 0 1]
 [0 0 1]
 [0 1 0]
 [0 0 1]
 [0 0 1]]
[[0 0 1]
 [0 0 1]
 [0 0 1]
 [0 0 1]
 [0 0 1]]
['"it side effect, take combin n mg fish oil"', '"mi son halfway fourth week intuniv. becam concern began last week, start take highest dose on. two days, could hard get bed, cranky, slept near n hour drive home school vacat (veri unusu him.) call doctomonday morn said stick days. see school, get morning. last two day problem free. much agreeabl eve less emot (a good thing), less cranky. rememb thing should. overal behaviobett tri mani differ medic faeffective."', '"i use take anotheor contraceptive, n pill cycle, happy- light periods, max n days, side effects. contain hormon gestodene, avail us, switch lybrel, ingredi simila pill ended, start immediately, first day period, instruct said. period last two weeks. take second pack- two weeks. now, third pack thing got even worse- third period last two week it&#n; end third week- still daili brown discharge. posit side didn&#n;t side effects. idea p

'\n[\'0,0,1\', \'0,0,1\', \'0,1,0\', \'0,0,1\', \'0,0,1\']\n[\'0,0,1\', \'0,0,1\', \'0,0,1\', \'0,0,1\', \'0,0,1\']\n[\'"it side effect, take combin n mg fish oil"\', \'"mi son halfway fourth week intuniv. becam concern began last week, start take highest dose on. two days, could hard get bed, cranky, slept near n hour drive home school vacat (veri unusu him.) call doctomonday morn said stick days. see school, get morning. last two day problem free. much agreeabl eve less emot (a good thing), less cranky. rememb thing should. overal behaviobett tri mani differ medic faeffective."\', \'"i use take anotheor contraceptive, n pill cycle, happy- light periods, max n days, side effects. contain hormon gestodene, avail us, switch lybrel, ingredi simila pill ended, start immediately, first day period, instruct said. period last two weeks. take second pack- two weeks. now, third pack thing got even worse- third period last two week it&#n; end third week- still daili brown discharge. posit side 

In [15]:
processed_train_comments[0:5]

['"it side effect, take combin n mg fish oil"',
 '"mi son halfway fourth week intuniv. becam concern began last week, start take highest dose on. two days, could hard get bed, cranky, slept near n hour drive home school vacat (veri unusu him.) call doctomonday morn said stick days. see school, get morning. last two day problem free. much agreeabl eve less emot (a good thing), less cranky. rememb thing should. overal behaviobett tri mani differ medic faeffective."',
 '"i use take anotheor contraceptive, n pill cycle, happy- light periods, max n days, side effects. contain hormon gestodene, avail us, switch lybrel, ingredi simila pill ended, start immediately, first day period, instruct said. period last two weeks. take second pack- two weeks. now, third pack thing got even worse- third period last two week it&#n; end third week- still daili brown discharge. posit side didn&#n;t side effects. idea period free tempting... alas."',
 '"this first time use form birth control. i&#n;m glad wen

In [16]:
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
tokenizer = Tokenizer(num_words = MAX_NB_WORDS, 
                      filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
                      lower=True, split=' ', char_level=False, 
                      oov_token=None, document_count=0)

tokenizer.fit_on_texts(processed_train_comments + processed_test_comments)
train_sequences = tokenizer.texts_to_sequences(processed_train_comments)
test_sequences = tokenizer.texts_to_sequences(processed_test_comments)

print('found %s tokens in text.' %(tokenizer.word_index))

train_data = pad_sequences(train_sequences, maxlen = MAX_SEQUENCE_LENGTH)
final_test_data = pad_sequences(test_sequences, maxlen = MAX_SEQUENCE_LENGTH)

train_shape = train_data.shape
test_shape = final_test_data.shape
y_shape = y.shape





In [19]:
print(train_data.shape, '<-- shape of train_data ready for val/train split.')
print(final_test_data.shape, '<-- shape of final_test_data ready for fedding to network.')
print(y.shape,'<-- shape of label(y)')


(161297, 180) <-- shape of train_data ready for val/train split.
(53766, 180) <-- shape of final_test_data ready for fedding to network.
(161297, 3) <-- shape of label(y)


In [18]:
##################################################
## preparing word embeddings.
##################################################
print('preparing embedding matrix')
word_index = tokenizer.word_index
nb_words  = min(MAX_NB_WORDS, len(word_index)) + 1

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if(i > MAX_NB_WORDS):
        continue
    embedding_vector = embedding_index.get(word)
    if(embedding_vector is not None):
        # words not foundin embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    
print('embedding matrix preparation complete') 
print(len(word_index))


preparing embedding matrix
embedding matrix preparation complete
105244


In [20]:
##################################################
## train and validation split.
##################################################  
print('creating train and validation data by dividing train_data in 80:20 ratio')
permutation = np.random.permutation(len(train_data))
index_train = permutation[:int(len(train_data)*0.8)]
index_validation = permutation[int(len(train_data)*0.2):]

final_train_data = train_data[index_train]
train_data_labels = y[index_train]

final_val_data = train_data[index_validation]
val_data_labels = y[index_validation]

print('train data shape:', final_train_data.shape)
print('validation data shape:', final_val_data.shape)
print('train and validation data are ready!!')

creating train and validation data by dividing train_data in 80:20 ratio
train data shape: (129037, 180)
validation data shape: (129038, 180)
train and validation data are ready!!


In [0]:
############################
## Keras model structure.
############################
embedding_layer = Embedding(nb_words, EMBEDDING_DIM, weights = [embedding_matrix], input_length = MAX_SEQUENCE_LENGTH, trainable = False)
lstm_layer = LSTM(num_lstm, dropout = lstm_dropout_rate, recurrent_dropout = lstm_dropout_rate, return_sequences = True )       

input_comment = Input(shape = (MAX_SEQUENCE_LENGTH,), dtype = 'int32')
embedded_sequence = embedding_layer(input_comment)
x = lstm_layer(embedded_sequence)
x = Dropout(dense_dropout_rate)(x)
merged = Attention(MAX_SEQUENCE_LENGTH)(x)
merged = Dense(num_dense, activation = act)(merged)
merged = Dropout(dense_dropout_rate)(merged)
merged = BatchNormalization()(merged)
preds = Dense(len(classes_to_predict), activation = 'sigmoid')(merged)

In [23]:
#########################
## Compile the model.
#########################
model = Model(inputs = [input_comment], outputs = preds)
model.compile(optimizer = 'rmsprop', loss = 'binary_crossentropy', metrics = ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 180)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 180, 100)          1000100   
_________________________________________________________________
lstm_1 (LSTM)                (None, 180, 100)          80400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 180, 100)          0         
_________________________________________________________________
attention_1 (Attention)      (None, 100)               280       
_________________________________________________________________
dense_1 (Dense)              (None, 75)                7575      
_________________________________________________________________
dropout_2 (Dropout)          (None, 75)                0         
__________

In [24]:
stamp = 'sentiment_with_lstm_and_glove_%.2f_%.2f'%(lstm_dropout_rate,dense_dropout_rate)
print(stamp)
best_model_path = stamp + '.h5'

early_stopping = EarlyStopping(patience = 4)
model_checkpoint = ModelCheckpoint(best_model_path, save_best_only = True)

theModel = model.fit(x = final_train_data, y = train_data_labels,
                     validation_data = (final_val_data, val_data_labels),
                     epochs = 100, batch_size = 256, shuffle = True,
                 callbacks = [early_stopping, model_checkpoint])

best_score = min(theModel.history['val_loss'])

sentiment_with_lstm_and_glove_0.25_0.25
Train on 129037 samples, validate on 129038 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100


In [28]:
#######################################
## time to make prediction!!!
########################################
y_test_predicted = model.predict([final_test_data], batch_size = 1024, verbose = 1)

# Save Training History
df_fit = pd.DataFrame(columns=['loss','acc','val_loss','val_acc'])
df_fit['loss'] = theModel.history['loss']
df_fit['val_loss'] = theModel.history['val_loss']
df_fit['acc'] = theModel.history['acc']
df_fit['val_acc'] = theModel.history['val_acc']
df_fit

# Save Model & Run Data 
save_name = ('stamp_{}'.format(stamp))

#uploaded = drive.CreateFile({'title': '{}.h5'.format(save_name)})
#model.save('{}.h5'.format(save_name))
#uploaded.SetContentFile('{}.h5'.format(save_name))
#uploaded.Upload()
#print('Uploaded file {}.h5'.format(save_name),'with ID {}'.format(uploaded.get('id')))

uploaded = drive.CreateFile({'title': '{}.csv'.format(save_name)})
df_fit.to_csv('{}.csv'.format(save_name))
uploaded.SetContentFile('{}.csv'.format(save_name))
uploaded.Upload()
print('Uploaded file {}.csv'.format(save_name),'with ID {}'.format(uploaded.get('id')))



InvalidConfigError: ignored

In [27]:
df_fit.head()

Unnamed: 0,loss,acc,val_loss,val_acc
0,0.472877,0.786149,0.427231,0.806995
1,0.417372,0.816536,0.395614,0.828417
2,0.395871,0.828917,0.37878,0.836123
3,0.382249,0.835559,0.3636,0.847569
4,0.372271,0.841544,0.364831,0.843431


In [0]:
## Reload the saved model. 
#model = load_model('sentiment_with_lstm_and_glove_0.25_0.25.h5',custom_objects={'Attention': Attention(MAX_SEQUENCE_LENGTH)(x)})


In [0]:
#stamp = 'sentiment_with_lstm_and_glove_%.2f_%.2f'%(lstm_dropout_rate,dense_dropout_rate)
#print(stamp)

In [0]:
#model_path = 'sentiment_with_lstm_and_glove_0.25_0.25.h5'
#keras.models.load_model(model_path, custom_objects=SeqSelfAttention.get_custom_objects())

In [0]:
# Reload by scope? 
#from keras.utils import CustomObjectScope

#with CustomObjectScope({'Attention': Attention}):
#    model = load_model("sentiment_with_lstm_and_glove_0.25_0.25.h5")
