# Deep learning models for sentiment analysis

In [5]:
import operator 
from collections import defaultdict

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
from IPython.display import display
# Show full cell contents when displaying text columns. `None` means "no limit";
# the older `-1` spelling is deprecated and rejected by recent pandas releases.
pd.set_option('display.max_colwidth', None)
from tqdm import tqdm
tqdm.pandas()

import re
from gensim.models import KeyedVectors
from nltk import word_tokenize

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [6]:
class EmbeddingVectorizer(BaseEstimator, TransformerMixin):
    '''Turn raw documents into fixed-size vectors by averaging word embeddings.

    Each document is tokenized with ``word_tokenize``; the embedding vectors of
    the tokens found in ``wv`` are summed (optionally scaled by a per-word idf
    weight learned in ``fit``) and divided by the number of tokens that had an
    embedding.

    Parameters
    ----------
    wv : word-embedding lookup
        Must expose ``vector_size`` and support ``wv[word]``, raising
        ``KeyError`` for unknown words.
    weighted_vec : bool, default True
        If True, fit a ``TfidfVectorizer`` and weight every word vector by the
        word's idf.
    max_df, min_df :
        Passed straight to ``TfidfVectorizer`` when ``weighted_vec`` is True.

    Notes
    -----
    * The weighted sum is divided by the token *count*, not by the sum of the
      weights, so it is not a normalised weighted mean.
    * NOTE(review): TfidfVectorizer uses its own tokenizer/lower-casing while
      ``DocToWordVector`` uses ``word_tokenize`` as-is, so some tokens
      presumably never match a learned idf and fall back to the maximum idf -
      confirm this is intended.
    '''

    def __init__(self, wv, weighted_vec=True, max_df=1., min_df=int(1)):
        self.wv = wv
        self.weighted_vec = weighted_vec
        self.max_df = max_df
        self.min_df = min_df
        self.dim = wv.vector_size

    def fit(self, X, y=None):
        '''Learn idf weights from ``X`` when ``weighted_vec`` is True.

        ``y`` is accepted (and ignored) so the transformer can be used inside
        a scikit-learn ``Pipeline`` or via ``fit_transform(X, y)``.
        Returns ``self``.
        '''
        if self.weighted_vec:
            self.tfidf_vec = TfidfVectorizer(max_df=self.max_df, min_df=self.min_df)
            self.tfidf_vec.fit(X)

            # get_feature_names() was removed from newer scikit-learn releases;
            # prefer the replacement and fall back for older installations.
            if hasattr(self.tfidf_vec, 'get_feature_names_out'):
                terms = self.tfidf_vec.get_feature_names_out()
            else:
                terms = self.tfidf_vec.get_feature_names()

            # if a word was never seen - it must be at least as infrequent
            # as any of the known words - so the default idf is the max of
            # known idf's
            max_idf = max(self.tfidf_vec.idf_)
            self.max_idf_ = max_idf
            self.tfidf_dict = defaultdict(lambda: max_idf, zip(terms, self.tfidf_vec.idf_))
        return self

    def DocToWordVector(self, doc):
        '''Return the ``(1, dim)`` averaged embedding of a single document.

        Tokens without an embedding are skipped; a document with no known
        token yields an all-zero vector.
        '''
        tokens = word_tokenize(doc)
        vec = np.zeros((1, self.dim))
        count = 0.

        for word in tokens:
            if self.weighted_vec:
                # .get() instead of indexing: indexing a defaultdict would
                # insert every unseen token and grow the table without bound.
                weight = self.tfidf_dict.get(word, self.max_idf_)
            else:
                weight = 1
            try:
                vec += self.wv[word].reshape((1, self.dim)) * weight
                count += 1.
            except KeyError:      # handling the case where the token is not
                                  # in the word embedding
                continue

        if count != 0:
            vec = vec / count
        return vec

    def transform(self, X):
        '''Embed every document in ``X``; returns an ``(n_docs, dim)`` array.'''
        doc_vectors = [self.DocToWordVector(doc) for doc in X]
        if not doc_vectors:
            # np.vstack raises on an empty sequence; return an empty matrix
            # with the right number of columns instead.
            return np.zeros((0, self.dim))
        return np.vstack(doc_vectors)
        

In [7]:
def threshold_search(y_true, y_proba):
    """Grid-search the probability cut-off that maximises F1.

    Candidate thresholds are 0.00, 0.01, ..., 0.99; a prediction is positive
    when ``y_proba > threshold``. Returns a dict with the best ``'threshold'``
    and its ``'f1'``. If no candidate scores above 0, both stay at 0.
    """
    best = {'threshold': 0, 'f1': 0}
    for step in range(100):
        candidate = step * 0.01
        f1 = metrics.f1_score(y_true=y_true, y_pred=y_proba > candidate)
        # Strict comparison: on ties the lowest threshold is kept.
        if f1 > best['f1']:
            best = {'threshold': candidate, 'f1': f1}
    return best

## Import data

In [8]:
# Read the train and test CSVs (paths are relative to the working directory).
train = pd.read_csv('../../data/train.csv')
test = pd.read_csv('../../data/test.csv')

In [9]:
# Load the Google News word2vec vectors from their binary file.
# `embeddings_index` is used below both for coverage checks and for embedding documents.
news_path = '../../data/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
embeddings_index = KeyedVectors.load_word2vec_format(news_path, binary=True)

## Preprocessing
Following <a href='https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings'>Dieter</a>'s approach to preprocessing text when a pre-trained word embedding is available: only perform steps that increase the share of the vocabulary covered by the embedding, rather than mindlessly applying general-purpose text preprocessing.

In [10]:
def build_vocab(sentences, verbose=True):
    """
    Count how often each word occurs across all sentences.

    :param sentences: list of list of words
    :param verbose: show a tqdm progress bar when True
    :return: dictionary of words and their count
    """
    counts = {}
    for tokens in tqdm(sentences, disable=not verbose):
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts

In [11]:
# Build vocabulary from training data: whitespace-split every question,
# count the tokens, then print the first five entries as a sanity check.
sentences = train["question_text"].progress_apply(lambda x: x.split()).values
vocab = build_vocab(sentences)
print({k: vocab[k] for k in list(vocab)[:5]})

100%|████████████████████████████████████████████████████████████████████| 1306122/1306122 [00:03<00:00, 351481.33it/s]
100%|████████████████████████████████████████████████████████████████████| 1306122/1306122 [00:03<00:00, 346088.95it/s]


{'How': 261930, 'did': 33489, 'Quebec': 97, 'nationalists': 91, 'see': 9003}


In [12]:
def check_coverage(vocab,embeddings_index):
    """Report how much of ``vocab`` has a vector in ``embeddings_index``.

    Prints two percentages: the share of distinct words covered, and the share
    of all word occurrences covered (each word weighted by its count).

    :param vocab: dict mapping word -> occurrence count
    :param embeddings_index: lookup supporting ``embeddings_index[word]`` and
        raising ``KeyError`` for unknown words
    :return: list of ``(word, count)`` for out-of-vocabulary words, most
        frequent first
    """
    if not vocab:
        # Nothing to measure; avoid dividing by zero below.
        print('Vocabulary is empty - nothing to check')
        return []

    oov = {}
    covered_vocab_size = 0
    covered_word_count = 0
    oov_word_count = 0
    for word in tqdm(vocab):
        try:
            # Only the lookup matters; the vector itself is not kept, since
            # holding every vector just to count them wastes memory.
            embeddings_index[word]
        except KeyError:
            # Only a missing key means "out of vocabulary"; any other error
            # should surface instead of being silently counted as OOV.
            oov[word] = vocab[word]
            oov_word_count += vocab[word]
        else:
            covered_vocab_size += 1
            covered_word_count += vocab[word]

    print('Found embeddings for {:.2%} of vocab'.format(covered_vocab_size / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(covered_word_count / (covered_word_count + oov_word_count)))
    # Ascending sort then reverse (keeps the original ordering of ties).
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [13]:
# Baseline coverage of the raw, whitespace-split vocabulary.
oov = check_coverage(vocab,embeddings_index)

100%|██████████████████████████████████████████████████████████████████████| 508823/508823 [00:01<00:00, 455507.80it/s]


Found embeddings for 24.31% of vocab
Found embeddings for  78.75% of all text


In [14]:
# Ten most frequent out-of-vocabulary tokens.
oov[:10]
# Mostly stop words and words with trailing punctuation; both need to be dealt with.

[('to', 403183),
 ('a', 402682),
 ('of', 330825),
 ('and', 251973),
 ('India?', 16384),
 ('it?', 12900),
 ('do?', 8753),
 ('life?', 7753),
 ('you?', 6295),
 ('me?', 6202)]

In [15]:
# Check whether individual punctuation marks have their own embedding vector.
print('?' in embeddings_index)
print('&' in embeddings_index)
# '&' has a vector but '?' does not, so remove all punctuation except '&'.

False
True


In [16]:
# Translation table applied in a single pass:
#   * '/', '-' and "'" become a space,
#   * '&' is kept but padded with spaces so it becomes its own token,
#   * every other listed punctuation mark is deleted.
_PUNCT_TABLE = str.maketrans({
    **dict.fromkeys('?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’'),
    **dict.fromkeys("/-'", ' '),
    '&': ' & ',
})


def remove_punct(x):
    '''Strip punctuation from ``str(x)``, keeping "&" as a separate token.

    Slashes, hyphens and straight apostrophes are replaced by a space; the
    remaining punctuation (including curly quotes) is removed outright.
    '''
    return str(x).translate(_PUNCT_TABLE)

In [17]:
# Strip punctuation in place, then rebuild the vocabulary from the cleaned text.
# NOTE: this overwrites train["question_text"]; re-running the cell re-applies the cleaning.
train["question_text"] = train["question_text"].progress_apply(lambda x: remove_punct(x))
sentences = train["question_text"].apply(lambda x: x.split())
vocab = build_vocab(sentences)

100%|████████████████████████████████████████████████████████████████████| 1306122/1306122 [00:09<00:00, 131869.04it/s]
100%|████████████████████████████████████████████████████████████████████| 1306122/1306122 [00:03<00:00, 375112.11it/s]


In [18]:
# Coverage after punctuation removal.
oov = check_coverage(vocab,embeddings_index)

100%|██████████████████████████████████████████████████████████████████████| 253623/253623 [00:00<00:00, 420308.39it/s]


Found embeddings for 57.38% of vocab
Found embeddings for  89.99% of all text


In [19]:
# Ten most frequent out-of-vocabulary tokens after punctuation removal.
oov[:10]
# Besides the stop words, numbers now dominate and need to be dealt with.

[('to', 406298),
 ('a', 403852),
 ('of', 332964),
 ('and', 254081),
 ('2017', 8781),
 ('2018', 7373),
 ('10', 6642),
 ('12', 3694),
 ('20', 2942),
 ('100', 2883)]

It turns out that numbers are in word2vec but are represented with hashes, i.e. 45 becomes ## and 345 becomes ###, etc.

In [20]:
def clean_numbers(x):
    """Mask every run of two or more ASCII digits with '#' characters.

    A run of 2, 3 or 4 digits becomes the same number of hashes; a run of
    five or more digits collapses to '#####'. Single digits are left as-is.
    """
    def _mask(run):
        # Cap at five hashes regardless of how long the digit run is.
        return '#' * min(len(run.group(0)), 5)

    return re.sub('[0-9]{2,}', _mask, x)

In [21]:
# Mask multi-digit numbers in place, then rebuild the vocabulary.
# NOTE: this overwrites train["question_text"] again, on top of the punctuation step.
train["question_text"] = train["question_text"].progress_apply(lambda x: clean_numbers(x))
sentences = train["question_text"].apply(lambda x: x.split())
vocab = build_vocab(sentences)

100%|████████████████████████████████████████████████████████████████████| 1306122/1306122 [00:11<00:00, 109143.34it/s]
100%|████████████████████████████████████████████████████████████████████| 1306122/1306122 [00:03<00:00, 344254.92it/s]


In [22]:
# Coverage after number masking.
oov = check_coverage(vocab,embeddings_index)

100%|██████████████████████████████████████████████████████████████████████| 242997/242997 [00:00<00:00, 357242.03it/s]


Found embeddings for 60.41% of vocab
Found embeddings for  90.75% of all text


In [23]:
# Twenty most frequent out-of-vocabulary tokens after number masking.
oov[:20]
# Next: remove the remaining stop words
# and normalise UK/US spelling differences.

[('to', 406298),
 ('a', 403852),
 ('of', 332964),
 ('and', 254081),
 ('favourite', 1247),
 ('bitcoin', 987),
 ('colour', 976),
 ('doesnt', 918),
 ('centre', 886),
 ('Quorans', 858),
 ('cryptocurrency', 822),
 ('Snapchat', 807),
 ('travelling', 705),
 ('counselling', 634),
 ('btech', 632),
 ('didnt', 600),
 ('Brexit', 493),
 ('cryptocurrencies', 481),
 ('blockchain', 474),
 ('behaviour', 468)]

In [24]:
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


# Replacement table: UK -> US spellings, apostrophe-less contractions, and
# brand / crypto terms that are mapped to generic phrases.
# Keys are case-sensitive, which is why some terms appear in two casings.
# NOTE(review): keys are matched as plain substrings, so a plural such as
# 'bitcoins' presumably turns into 'cryptography currencys' (compare the
# 'currencys' entry in the later OOV listing) - confirm and handle if needed.
# NOTE(review): 'citicise' may be a typo for 'criticise' - confirm against the data.
mispell_dict = {'colour':'color',
                'centre':'center',
                'behaviour': 'behavior',
                'didnt':'did not',
                'doesnt':'does not',
                'isnt':'is not',
                'shouldnt':'should not',
                'favourite':'favorite',
                'travelling':'traveling',
                'counselling':'counseling',
                'theatre':'theater',
                'cancelled':'canceled',
                'labour':'labor',
                'organisation':'organization',
                'wwii':'world war 2',
                'citicise':'criticize',
                'instagram': 'social medium',
                'whatsapp': 'social medium',
                'snapchat': 'social medium',
                'Snapchat': 'social medium',
                'Pinterest': 'social medium',
                'aluminium': 'aluminum',
                'bitcoin': 'cryptography currency',
                'cryptocurrency': 'cryptography currency',
                'cryptocurrencies': 'cryptography currency',
                'blockchain': 'cryptography currency',
                'Blockchain': 'cryptography currency',
                'Ethereum': 'cryptography currency',
                'ethereum': 'cryptography currency',
                'realise': 'realize',
                'defence': 'defense'
                }
# Compile the lookup pattern once; both names are read by replace_typical_misspell.
mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    """Replace every occurrence of a known misspelling/variant in ``text``.

    Uses the module-level ``mispellings_re`` pattern to find matches and the
    ``mispellings`` dict to look up each replacement.
    """
    return mispellings_re.sub(lambda hit: mispellings[hit.group(0)], text)

In [25]:
# Normalise spellings in place, then rebuild the vocabulary without the four
# stop words that have no embedding.
# NOTE: the stop words are only dropped from `sentences` (used for the coverage
# check); train["question_text"] itself still contains them.
train["question_text"] = train["question_text"].progress_apply(replace_typical_misspell)
sentences = train["question_text"].progress_apply(lambda x: x.split())
# A set gives constant-time membership tests inside the comprehension.
to_remove = {'a', 'to', 'of', 'and'}
sentences = [[word for word in sentence if word not in to_remove] for sentence in tqdm(sentences)]
vocab = build_vocab(sentences)

100%|████████████████████████████████████████████████████████████████████| 1306122/1306122 [00:09<00:00, 135898.89it/s]
100%|████████████████████████████████████████████████████████████████████| 1306122/1306122 [00:02<00:00, 502695.89it/s]
100%|████████████████████████████████████████████████████████████████████| 1306122/1306122 [00:03<00:00, 335866.12it/s]
100%|████████████████████████████████████████████████████████████████████| 1306122/1306122 [00:03<00:00, 396367.66it/s]


In [26]:
# Coverage after spelling normalisation and stop-word removal.
oov = check_coverage(vocab,embeddings_index)

100%|██████████████████████████████████████████████████████████████████████| 242897/242897 [00:00<00:00, 430593.53it/s]


Found embeddings for 60.44% of vocab
Found embeddings for  99.00% of all text


In [27]:
# Twenty most frequent tokens that are still out of vocabulary.
oov[:20]

[('Quorans', 858),
 ('btech', 632),
 ('Brexit', 493),
 ('upvotes', 432),
 ('programme', 402),
 ('Redmi', 379),
 ('KVPY', 349),
 ('Paytm', 334),
 ('grey', 299),
 ('currencys', 282),
 ('mtech', 281),
 ('Btech', 262),
 ('honours', 252),
 ('learnt', 248),
 ('upvote', 247),
 ('licence', 242),
 ('…', 210),
 ('Whatis', 209),
 ('bcom', 199),
 ('Isnt', 192),
 ('favour', 175),
 ('INTJ', 173),
 ('cheque', 159),
 ('INFJ', 157),
 ('aadhar', 150),
 ('judgement', 145),
 ('Fiverr', 143),
 ('modelling', 143),
 ('Xiaomi', 140),
 ('Coursera', 137),
 ('Fortnite', 130),
 ('OnePlus', 125),
 ('recognise', 124),
 ('Lyft', 124),
 ('wasnt', 123),
 ('UCEED', 123),
 ('AFCAT', 122),
 ('jewellery', 121),
 ('hasnt', 117),
 ('practise', 113),
 ('WeChat', 112),
 ('INFP', 111),
 ('travelled', 111),
 ('vape', 111),
 ('analyse', 110),
 ('GDPR', 107),
 ('demonetisation', 106),
 ('Nodejs', 105),
 ('UPSEE', 105),
 ('recognised', 105),
 ('Coinbase', 104),
 ('programmes', 104),
 ('upvoted', 102),
 ('BNBR', 99),
 ('Manaphy', 9

## Modelling

In [28]:
# Features are the cleaned question strings; labels come from the `target` column.
X = train.question_text.values
y = train.target.values

# Hold out 10% of the rows, stratified on the label, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

In [29]:
%%time
# Embed every question as the unweighted mean of its word vectors
# (weighted_vec=False, so no tf-idf model is fitted).
embed_transformer = EmbeddingVectorizer(embeddings_index, weighted_vec=False)
X_train_wv = embed_transformer.fit_transform(X_train)
X_test_wv = embed_transformer.transform(X_test)

Wall time: 5min 9s


In [30]:
%%time
# Train a logistic regression baseline on the averaged embeddings,
# using the library's default hyperparameters.
clf = LogisticRegression()
clf.fit(X_train_wv, y_train)



Wall time: 1min 39s


In [31]:
# Score the hard class predictions on the held-out split:
# confusion matrix first, then F1.
test_predictions = clf.predict(X_test_wv)
print(metrics.confusion_matrix(y_test, test_predictions))
print(metrics.f1_score(y_test, test_predictions))

[[121084   1448]
 [  5651   2430]]
0.4063884940212393


In [32]:
# Tune the decision threshold on the positive-class probability (column 1).
# NOTE(review): the threshold is selected on the same held-out split it is
# scored on, so the reported F1 is likely optimistic - consider a separate
# validation split.
predictions_proba = clf.predict_proba(X_test_wv)
search_result = threshold_search(y_test, predictions_proba[:,1])
print(search_result)


{'threshold': 0.23, 'f1': 0.5436260464071015}


Performs better after this preprocessing than the general preprocessing in notebook 2.

## MLP on question vectors

In [40]:
from tensorflow import keras
from keras_tqdm import TQDMNotebookCallback
import tensorflow.keras.backend as K

In [34]:
# Standardise the embedding features; the scaler is fitted on the training
# split only and then applied unchanged to the held-out split.
scl = StandardScaler()
X_train_scaled = scl.fit_transform(X_train_wv)
X_test_scaled = scl.transform(X_test_wv)

# Network dimensions: input width follows the embedding matrix; both hidden layers use LAYER_SIZE units.
INPUT_DIM = X_train_wv.shape[1]
LAYER_SIZE = 300

In [41]:
# Create model
# Clear any previous Keras session state before rebuilding, so re-running
# this cell starts from a fresh model.
K.clear_session()

model = keras.Sequential()

# Hidden block 1: Dense(relu) -> Dropout(0.2) -> BatchNormalization
model.add(keras.layers.Dense(LAYER_SIZE, input_dim=INPUT_DIM, activation='relu'))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.BatchNormalization())

# Hidden block 2: Dense(relu) -> Dropout(0.3) -> BatchNormalization
model.add(keras.layers.Dense(LAYER_SIZE, activation='relu'))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.BatchNormalization())


# Output: a single unit followed by a sigmoid, i.e. one probability per question.
model.add(keras.layers.Dense(1))
model.add(keras.layers.Activation('sigmoid'))

# compile the model
model.compile(loss='binary_crossentropy', optimizer='adam')

In [42]:
%%time
# Train for up to 20 epochs, with early stopping monitoring val_loss (patience=4).
# Keras' own logging is silenced (verbose=0); progress is shown by the tqdm callback.
# NOTE(review): validation_data is the same held-out split that is later used
# to report F1, so early stopping has seen the evaluation data - consider a
# separate validation split.
earlystop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=4, verbose=1, mode='auto')
model.fit(X_train_scaled, y=y_train, batch_size=1024, 
          epochs=20, verbose=0, 
          validation_data=(X_test_scaled, y_test), callbacks=[earlystop, TQDMNotebookCallback(leave_inner=True, leave_outer=True)])

HBox(children=(IntProgress(value=0, description='Training', max=20, style=ProgressStyle(description_width='ini…

HBox(children=(IntProgress(value=0, description='Epoch 0', max=1175509, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='Epoch 1', max=1175509, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='Epoch 2', max=1175509, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='Epoch 3', max=1175509, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='Epoch 4', max=1175509, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='Epoch 5', max=1175509, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='Epoch 6', max=1175509, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='Epoch 7', max=1175509, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='Epoch 8', max=1175509, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='Epoch 9', max=1175509, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='Epoch 10', max=1175509, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 11', max=1175509, style=ProgressStyle(description_width…

Epoch 00012: early stopping

Wall time: 2min 24s


In [37]:
# Hard 0/1 predictions at a 0.5 cut-off on the sigmoid output.
# `Sequential.predict_classes` is no longer available in current TensorFlow;
# thresholding `predict` is the equivalent for a single sigmoid output.
keras_predict = (model.predict(X_test_scaled) > 0.5).astype('int32')
metrics.f1_score(y_test, keras_predict)

0.6154272645366968

In [38]:
# Tune the decision threshold on the network's predicted probabilities.
# NOTE(review): as with the logistic regression, the threshold is chosen on
# the same split it is scored on, so this F1 is likely optimistic.
keras_probas = model.predict(X_test_scaled)
keras_search = threshold_search(y_test, keras_probas)
print(keras_search)

{'threshold': 0.28, 'f1': 0.6525347689546882}


A lot better than anything before; deep learning is the way forward.