In [105]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pandas as pd
from nltk.corpus import stopwords

import numpy as np
from keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import keras.backend as K
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.layers import GRU, LSTM, Bidirectional, Conv1D
import re

In [107]:
import nltk
nltk.download("stopwords")
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:

def clean(text, remove_stopwords=True, stem_words=True):
    text = str(text)
    text = text.lower()
    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text.split() if not w in stops]
        text = " ".join(text)
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9,.\/'\"]", " ", text)
    text = re.sub(r"[0-9]"," ", text )
    text = re.sub(r'http[s]?://\S+', '<URL>', text)
    text = re.sub(r".aspx"," ", text)
    text = re.sub(r"xap","silverlight application package ", text)
    text = re.sub(r"v=vs.105", " ", text);
    text = re.sub(r"appreslib.dll.xxxx.mui"," ", text)
    #text = re.sub(r"ru","russia", text)
    #text = re.sub(r"ru-ru","russia",text)
    #text = re.sub(r"de","german", text)
    #text = re.sub(r"de-de","german", text)
    text = re.sub(r"wich","which",text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"supportedcultures", "supported cultures", text)
    text = re.sub(r"calenderidentifier", "calender identifier", text)
    text = re.sub(r"futureaccesslist", "future access list", text)
    text = re.sub(r"wp7", "windows phone", text)
    text = re.sub(r"wp8", "windows phone", text)
    text = re.sub(r"'re", " are ", text)
    text = re.sub(r"isn't", " is not ", text)
    text = re.sub(r"doesn't", " does not ", text)
    text = re.sub(r"i'm", " i am ", text)
    text = re.sub(r"there's", "there is ", text)
    text = re.sub(r"can't", " can not ", text)
    text = re.sub(r"webview", "web view ", text)
    text = re.sub(r"mediaplayer", "media player ", text)
    text = re.sub(r"onedrive", "one drive ", text)
    text = re.sub(r"contentdialog", "content dialog ", text)
    text = re.sub(r"mapcontrol", "map control ", text)
    text = re.sub(r"applicationmodel", "application model ", text)
    text = re.sub(r"it's", " it is ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"\'ll", " will ", text)
    
    text = re.sub(r"shouldn't", " should not ", text)
    text = re.sub(r"didn't", " did not ", text)
    text = re.sub(r"couldn't", "could not ", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"autoscaling", "auto scaling", text)
    text = re.sub(r"abillity", "ability ", text)
    text = re.sub(r"photocamera", "photo camera ", text)
    text = re.sub(r"standardtiledata", "standard tile data ", text)
    text = re.sub(r"batterysaver", "battery saver", text)
    
    text = re.sub(r"menuitems", "menu items", text)
    text = re.sub(r"keyup", " key up ", text)
    text = re.sub(r"manualy", "manually", text)
    text = re.sub(r"samplerate", "sample rate", text)
    text = re.sub(r"applicationid", "application id", text)
    text = re.sub(r"isolatedstoragesettings", "isolated storage settings ", text)
    text = re.sub(r"funtionality", "functionality ", text)
    text = re.sub(r"selectedindex", "selected index", text)
    text = re.sub(r"multiscaleimage", "multiscale image", text)
    
    text = re.sub(r"isolatedstorage", "isolated storage ", text)
    text = re.sub(r"andriod", "android ", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"autoscaling", "auto scaling", text)
    text = re.sub(r"xmlnsdefinitionattribute", " xml ns definition attribute ", text)
    text = re.sub(r"scheduledtoastnotification", "scheduled toast notification", text)
    text = re.sub(r"wrapgrid", "wrap grid", text)
    text = re.sub(r"stackpanel", "stack panel", text)
    text = re.sub(r"reader'", "reader", text)
    text = re.sub(r"designcapacityinmilliwatthours", "design capacity in milli watt hours ", text)
    
    text = re.sub(r"designcapacityinmilliwatthours", "full charge capacity in milli watt hours", text)
    text = re.sub(r"featureset", "feature set", text)
    text = re.sub(r"currentappsimulator", "current app simulator", text)
    text = re.sub(r"'old'", "old ", text)
    text = re.sub(r"carrige", "carriage ", text)
    text = re.sub(r"screenmirroring", "screen mirroring", text)
    text = re.sub(r"loggingchannel", "logging channel", text)
    
    text = re.sub(r"actualheight","actual height", text)
    text = re.sub(r"devicetype","device type",text)
    text = re.sub(r"uisplitviewcontroller","user interface split view controller", text)
    text = re.sub(r"phonecallorigin","phone call origin", text)
    text = re.sub(r"onlineidauthenticator","online id authenticator", text)
    text = re.sub(r"credentialprompttype","credential prompt type",text)
    text = re.sub(r"retypecredentials","retype credentials", text)
    text = re.sub(r"backgroundtask","background task", text)    
    text = re.sub(r"scheduledtask","scheduled task", text)
    text = re.sub(r"contactstore","contact store",text)
    text = re.sub(r"resourcedictionary","resource dictionary",text)
    text = re.sub(r"medialibraryextensions","media library extensions", text)
    text = re.sub(r"navigationservice","navigation service",text)
    text = re.sub(r"fontfamily","font family", text)
    text = re.sub(r"frameworkelement","framework element", text)    
    
    text = re.sub(r"connectioninterval"," connection interval", text)
    text = re.sub(r"backgroundtransferrequest","background transfer request",text)
    text = re.sub(r"datasource","data source", text)
    text = re.sub(r"swipecontrol","swipe control", text)
    
    
    
    text = re.sub(r"listview", "list view", text)
    text = re.sub(r"windowsphone","windows phone", text )
    text = re.sub(r"uservoice", "user voice", text)
    text = re.sub(r"treeview","tree view", text)
    text = re.sub(r"datatemplate","data template ", text)
    text = re.sub(r"scrollbar", "scroll bar", text);
    text = re.sub(r"ainputpane","input pane", text)
    text = re.sub(r"splitview","split view", text)
    text = re.sub(r"lockscreen","lock screen",text)
    text = re.sub(r"suggestedstartlocation","suggested start location", text)
    text = re.sub(r"combobox","combo box", text)
    text = re.sub(r"scrollviewer","scroll viewer",text)
    text = re.sub(r"calendarview", "calendar view ", text)
    text = re.sub(r"commandbar", "command bar", text)
    text = re.sub(r"mediaelement", "media element", text)
    text = re.sub(r"gridview", "grid view", text)
    text = re.sub(r"firstname", "first name", text)
    text = re.sub(r"richeditbox", "rich edit box", text)
    text = re.sub(r"winappdriver", " windows application driver ", text)
    text = re.sub(r"itemtemplate", "item template ", text)
    text = re.sub(r"applicationbar", "application bar ", text)
    text = re.sub(r"pixelwidth", "pixel width", text)
    text = re.sub(r"pixelheight", "pixel height ", text)
    text = re.sub(r"webservice", " web service ", text)
    text = re.sub(r"dependencyproperty", "dependency property ", text)
    text = re.sub(r"storagefolder", "storage folder ", text)
    
    text = re.sub(r"onedrive", "one drive ", text)
    text = re.sub(r"contentdialog", "content dialog ", text)
    text = re.sub(r"mapcontrol", "map control ", text)
    text = re.sub(r"applicationmodel", "application model ", text)
    text = re.sub("it's", " it is ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"\'ll", " will ", text)
    
    text = re.sub(r"shouldn't", " should not ", text)
    text = re.sub(r"didn't", " did not ", text)
    text = re.sub(r"couldn't", "could not ", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"autoscaling", "auto scaling", text)
    text = re.sub(r"abillity", "ability ", text)
    text = re.sub(r"photocamera", "photo camera ", text)
    text = re.sub(r"standardtiledata", "standard tile data ", text)
    text = re.sub(r"batterysaver", "battery saver", text)
    
    text = re.sub(r"menuitems", "menu items", text)
    text = re.sub(r"keyup", " key up ", text)
    text = re.sub(r"manualy", "manually", text)
    text = re.sub(r"samplerate", "sample rate", text)
    text = re.sub(r"applicationid", "application id", text)
    text = re.sub(r"isolatedstoragesettings", "isolated storage settings ", text)
    text = re.sub(r"funtionality", "functionality ", text)
    text = re.sub(r"selectedindex", "selected index", text)
    text = re.sub(r"multiscaleimage", "multiscale image", text)
    
    text = re.sub(r"isolatedstorage", "isolated storage ", text)
    text = re.sub(r"andriod", "android ", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"autoscaling", "auto scaling", text)
    text = re.sub(r"xmlnsdefinitionattribute", " xml ns definition attribute ", text)
    text = re.sub(r"scheduledtoastnotification", "scheduled toast notification", text)
    text = re.sub(r"wrapgrid", "wrap grid", text)
    text = re.sub(r"stackpanel", "stack panel", text)
    text = re.sub(r"reader'", "reader", text)
    text = re.sub(r"designcapacityinmilliwatthours", "design capacity in milli watt hours ", text)
    text = re.sub(r"featureset", "feature set", text)
    text = re.sub(r"currentappsimulator", "current app simulator", text)
    text = re.sub(r"'old'", "old ", text)
    text = re.sub(r"carrige", "carriage ", text)
    text = re.sub(r"screenmirroring", "screen mirroring", text)
    text = re.sub(r"loggingchannel", "logging channel", text)
    
    text = re.sub(r"actualheight","actual height", text)
    text = re.sub(r"devicetype","device type",text)
    text = re.sub(r"uisplitviewcontroller","user interface split view controller", text)
    text = re.sub(r"phonecallorigin","phone call origin", text)
    text = re.sub(r"onlineidauthenticator","online id authenticator", text)
    text = re.sub(r"credentialprompttype","credential prompt type",text)
    text = re.sub(r"retypecredentials","retype credentials", text)
    text = re.sub(r"backgroundtask","background task", text)    
    text = re.sub(r"scheduledtask","scheduled task", text)
    text = re.sub(r"contactstore","contact store",text)
    
    
    text = re.sub(r"haven't","have not",text)
    text = re.sub(r"stackoverflow","stack overflow", text)
    text = re.sub(r"hasn't","has not",text)
    text = re.sub(r"async"," asynchronous ", text)
    text = re.sub(r"wouldn't","would not", text)    
    text = re.sub(r"visualstudio"," visual studio", text)
    text = re.sub(r"let's","let us",text)
    text = re.sub(r"that's","that is", text)
    text = re.sub(r"aren't","are not", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"en-us", "english", text)
    text = re.sub(r"\s{2,}", " ", text)
    return(text)

In [0]:
from numpy import array
df = pd.read_csv("/content/drive/My Drive/train - train.csv")
#df['review'] = df['review'].str.replace('[^a-zA-Z.\ ]', '')
docs = df['review'].str.lower()
labels = array(df['label'])

In [0]:
docs = df.apply(lambda x: clean(x.review), axis=1)

In [0]:

max_features = 1
tokenizer = Tokenizer(oov_token=None)
tokenizer.fit_on_texts(docs)
encoded_docs = tokenizer.texts_to_sequences(docs)
pdocs = pad_sequences(encoded_docs, maxlen=100,padding="post")
word_index = tokenizer.word_index



In [136]:
embeddings_index = {}
f = open('/content/drive/My Drive/glove.6B.100d.txt', encoding="utf8")
for line in f:
    try:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    except:
        print(word)
        pass
f.close()
embed_size = 100
embedding_matrix = np.zeros((len(word_index) + 1, embed_size))
absent =[]
absent_words = 0
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        absent_words += 1
        absent.append(word)
print('Total absent words are', absent_words, 'which is', "%0.2f" % (absent_words * 100 / len(word_index)), '% of total words')

Total absent words are 2887 which is 29.90 % of total words


In [0]:
from keras.layers import Embedding
vocab_size = len(tokenizer.word_index)+1
e = Embedding(vocab_size, 100, weights=[embedding_matrix],input_length=100, trainable=True)

In [0]:
from keras import initializers,regularizers,constraints

def dot_product(x, kernel):
    if K.backend() == 'tensorflow':
        return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
    else:
        return K.dot(x, kernel)
    

class AttentionWithContext(Layer):
    def __init__(self,
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True, **kwargs):

        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(AttentionWithContext, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1], input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight((input_shape[-1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)

        self.u = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_u'.format(self.name),
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)

        super(AttentionWithContext, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        return None

    def call(self, x, mask=None):
        uit = dot_product(x, self.W)

        if self.bias:
            uit += self.b

        uit = K.tanh(uit)
        ait = dot_product(uit, self.u)

        a = K.exp(ait)

        if mask is not None:
            a *= K.cast(mask, K.floatx())

        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]

In [139]:
from keras.models import Model
from keras.layers import *
from keras.preprocessing.sequence import pad_sequences
from keras.regularizers import l2 as l2_reg

def create_models():
    maxlen = 100
    inp = Input((None,))
    x = e(inp)
    x = Bidirectional(LSTM(128, return_sequences=True))(x)
    x = Bidirectional(LSTM(64, return_sequences=True))(x)
    x = Dropout(0.4)(x)
    x = AttentionWithContext()(x)
    x = Dense(64, activation="relu")(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

model = create_models()
print(model.summary())

model.fit(pdocs, np.array(labels),validation_split=0.1, epochs = 3,batch_size=100)

#model.save("m.hdf5")
loss, accuracy = model.evaluate(pdocs,labels , verbose=1)
print('Training Accuracy is {}'.format(accuracy*100))

Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 100, 100)          965700    
_________________________________________________________________
bidirectional_13 (Bidirectio (None, 100, 256)          234496    
_________________________________________________________________
bidirectional_14 (Bidirectio (None, 100, 128)          164352    
_________________________________________________________________
dropout_2 (Dropout)          (None, 100, 128)          0         
_________________________________________________________________
attention_with_context_6 (At (None, 128)               16640     
_________________________________________________________________
dense_11 (Dense)             (None, 64)                8256

In [0]:
class_idx = np.argmax(labels[0])
class_output = model.output[:, class_idx]
last_layer = model.get_layer("bidirectional_12")

In [0]:

test = pd.read_csv("/content/drive/My Drive/Colab Notebooks/test - test.csv")
tokenizer = Tokenizer(oov_token=None)
tokenizer.fit_on_texts(test['sentence'])
encoded_Testdocs = tokenizer.texts_to_sequences(test['sentence'])
Testdocs = pad_sequences(encoded_Testdocs, maxlen=100,padding="post")

In [0]:
grads = K.gradients(class_output, last_layer.output)[0]
pooled_grads = K.mean(grads)
iterate = K.function([model.input], [pooled_grads, last_layer.output[0]])
pooled_grads_value, lstm_layer_output_value = iterate([Testdocs])

In [0]:
heatmap = np.mean(lstm_layer_output_value, axis=0)
heatmap = np.maximum(heatmap,0)
heatmap /= np.max(heatmap)#normalise values in the prediction

In [0]:
norm_len = 100/last_layer.output_shape[1] # find the ratio of the text vs the lstm layer length

In [131]:

import math
from IPython.display import HTML
html = ""

for j,i in enumerate(tokenizer.sequences_to_texts(Testdocs)[3].split()):
  html += "<span style='background-color:rgba({},0,150,{})'>{} </span>".format(heatmap[math.floor(j/norm_len)]*255,heatmap[math.floor(j/norm_len)]-0.3,i)
HTML(html)

In [104]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score

yhat_probs = model.predict(pdocs, verbose=0)
yhat_classes = yhat_probs.argmax(axis=-1)
yhat_probs = yhat_probs[:,0]
yhat_classes = yhat_classes[:850]


precision = precision_score(labels[:850], yhat_classes,zero_division=1,average='weighted')
print('Precision: %f' % precision)
recall = recall_score(labels[:850], yhat_classes,average='weighted')
print('Recall: %f' % recall)
f1 = f1_score(labels[:850], yhat_classes,average='weighted')
print('F1 score: %f' % f1)

Precision: 0.845009
Recall: 0.808235
F1 score: 0.722521
