### Imports

In [1]:
from tqdm import tqdm
from collections import Counter
from nltk.corpus import stopwords
from itertools import combinations
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from bert_embedding import BertEmbedding
from allennlp.commands.elmo import ElmoEmbedder

from transformers import *
import torch
import keras

import imp, gzip
import pickle, nltk
import gensim
import multiprocessing
from copy import deepcopy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import utils as my_utils



import pandas as pd 
import numpy as np
import re
import collections
import matplotlib.pyplot as plt
from pathlib import Path

# Packages for data preparation
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional

# Packages for modeling
from keras import models
from keras import layers
from keras import regularizers

Using TensorFlow backend.


In [2]:
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

### Definitions

In [34]:
def loadGloveModel(gloveFile):
    print("Loading Glove Model")
    f = open(gloveFile,'r', encoding='utf8')
    model = {}
    for line in tqdm(f):
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print("Done.",len(model)," words loaded!")
    return model

In [3]:
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def getDF(path):
    i = 0
    df = {}
    for d in tqdm(parse(path)):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [4]:
def process_df(df):
    df['text'] = my_utils.preprocess2(df['text'])
    return df

In [38]:
def process_glove_vocab(df):
    df['text'] = df['text'].apply(lambda x: [i for i in x if i in glove_vocab])
    return df

### Start

In [5]:
dataset_movies = getDF('datasets_raw/reviews_Movies_and_TV_5.json.gz')
dataset_movies.shape

1697533it [02:58, 9501.76it/s] 


(1697533, 9)

In [6]:
dataset_kindle = getDF('datasets_raw/reviews_Kindle_Store_5.json.gz')
dataset_kindle.shape

982619it [01:27, 11208.69it/s]


(982619, 9)

In [7]:
dataset_home = getDF('datasets_raw/reviews_Home_and_Kitchen_5.json.gz')
dataset_home.shape

551682it [00:47, 11515.54it/s]


(551682, 9)

In [8]:
# dataset = dataset_home

In [9]:
dataset = pd.concat([dataset_movies, dataset_home, dataset_kindle])

In [10]:
dataset = dataset.drop(columns=['reviewerID', 'asin', 'reviewerName', 'helpful', 'summary', 'unixReviewTime', 'reviewTime'])

In [11]:
dataset = dataset.rename(columns={'reviewText': 'text', 'overall': 'sentiment'})

In [12]:
dataset.shape

(3231834, 2)

In [35]:
glove_embedding_dim = 300
glove_embeddings_index = loadGloveModel("nongit_resources/glove.6B.300d.txt")

0it [00:00, ?it/s]

Loading Glove Model


400000it [01:06, 6048.37it/s]

Done. 400000  words loaded!





In [13]:
dataset = dataset[dataset.sentiment != 3]

In [14]:
dataset.sentiment = dataset.sentiment > 4

In [15]:
dataset.sentiment = dataset.sentiment.astype(int)

In [16]:
Counter(dataset.sentiment)

Counter({0: 1057711, 1: 1831568})

In [18]:
dataset.shape

(2889279, 2)

In [23]:
dataset.text = dataset.text.apply(lambda x: " ".join(x))

In [24]:
dataset['len'] = dataset.text.apply(lambda x: len(x.split(" ")))

In [25]:
dataset.shape

(2889279, 3)

In [27]:
dataset = dataset[dataset.len > 50]

In [30]:
dataset = dataset[dataset.len < 400]

In [31]:
dataset.shape

(1576511, 3)

In [32]:
n_cores = 45

In [33]:
%%time
n = int(dataset.shape[0]/n_cores)
list_df = [dataset[i:i+n] for i in range(0, dataset.shape[0],n)]

pool = multiprocessing.Pool(n_cores)
processed_list_df = pool.map(process_df, list_df)
pool.close()

dataset = pd.concat(processed_list_df)
dataset.shape

CPU times: user 3.79 s, sys: 55.4 s, total: 59.2 s
Wall time: 2min 37s


(1576511, 3)

In [36]:
glove_vocab = glove_embeddings_index.keys()

In [37]:
len(glove_vocab)

400000

In [39]:
dataset.text = dataset.text.apply(lambda x: x.split(" "))

In [40]:
%%time
n = int(dataset.shape[0]/n_cores)
list_df = [dataset[i:i+n] for i in range(0, dataset.shape[0],n)]

pool = multiprocessing.Pool(n_cores)
processed_list_df = pool.map(process_glove_vocab, list_df)
pool.close()

dataset = pd.concat(processed_list_df)
dataset.shape

CPU times: user 1min 26s, sys: 1min 38s, total: 3min 5s
Wall time: 3min 3s


(1576511, 3)

In [41]:
dataset['len'] = dataset.text.apply(lambda x: len(x))

In [42]:
dataset.len.describe()

count    1.576511e+06
mean     7.373376e+01
std      4.616416e+01
min      1.100000e+01
25%      3.700000e+01
50%      5.800000e+01
75%      9.800000e+01
max      3.610000e+02
Name: len, dtype: float64

In [45]:
dataset = dataset[dataset.len > 50]

In [46]:
dataset = dataset[dataset.len < 300]

In [47]:
dataset.shape

(909869, 3)

In [48]:
dataset.text = dataset.text.apply(lambda x: " ".join(x))

In [49]:
Counter(dataset.sentiment)

Counter({1: 514278, 0: 395591})

In [50]:
g = dataset.groupby('sentiment')

In [51]:
d = g.apply(lambda x: x.sample(g.size().min()).reset_index(drop=True)).reset_index(drop=True)

In [52]:
d.head()

Unnamed: 0,text,sentiment,len
0,information product detail section non dvd rel...,0,60
1,change pace let begin review oxo good grip box...,0,124
2,film ha really interesting style presentation ...,0,110
3,believe seeing review people serious film even...,0,97
4,catching fire marked improvement hunger game e...,0,160


In [53]:
Counter(d.sentiment)

Counter({0: 395591, 1: 395591})

In [54]:
max_features = 20000

In [None]:
# dataset.text = dataset.text.apply(lambda x: " ".join(x))

In [None]:
# d = dataset.sample(1000000)

In [61]:
vectorizer = TfidfVectorizer()

In [62]:
vectorizer.fit(d.text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [63]:
vocab = vectorizer.get_feature_names()

In [64]:
len(vocab)

126862

In [65]:
d.text = d.text.apply(lambda x: x.split(" "))

In [66]:
# %%time
# d.text = d.text.apply(lambda x: [i for i in x if i in vocab])

In [67]:
n_cores = 45

In [68]:
def process_df(df):
    df['text'] = df['text'].apply(lambda x: [i for i in x if i in vocab])
    return df

In [None]:
%%time
n = int(d.shape[0]/n_cores)
list_df = [d[i:i+n] for i in range(0, d.shape[0],n)]

pool = multiprocessing.Pool(n_cores)
processed_list_df = pool.map(process_df, list_df)
pool.close()

d_ = pd.concat(processed_list_df)
d_.shape

In [70]:
# d_ = d_[d_.text.apply(lambda x: len(x)>3 and len(x)<50)]
# d_.shape
d.text = d.text.apply(lambda x: " ".join(x))

In [72]:
X_train, X_test, y_train, y_test = train_test_split(d.text, d.sentiment, test_size=0.3, random_state=37, stratify=d.sentiment)

In [73]:
Counter(y_test)

Counter({0: 118677, 1: 118678})

In [74]:
%%time
tk = Tokenizer(split=" ")
tk.fit_on_texts(X_train)

X_train_seq = tk.texts_to_sequences(X_train)
X_test_seq = tk.texts_to_sequences(X_test)

CPU times: user 2min 18s, sys: 5.57 s, total: 2min 23s
Wall time: 2min 24s


In [75]:
seq_lengths = X_train.apply(lambda x: len(x.split(' ')))
seq_lengths.describe()

count    553827.000000
mean        102.028763
std          42.489579
min          51.000000
25%          67.000000
50%          91.000000
75%         129.000000
max         294.000000
Name: text, dtype: float64

In [78]:
MAX_LEN = 294

In [79]:
X_train_seq_trunc = pad_sequences(X_train_seq, maxlen=MAX_LEN)
X_test_seq_trunc = pad_sequences(X_test_seq, maxlen=MAX_LEN)

In [80]:
X_test_seq_trunc.shape

(237355, 294)

In [81]:
X_train_seq_trunc.shape

(553827, 294)

In [82]:
max_features = len(vocab)

In [83]:
max_features

126862

In [84]:
max_features = len(tk.word_index.items())

In [85]:
emb_matrix = np.zeros((max_features, glove_embedding_dim))

for w, i in tk.word_index.items():
    try:
        vect = glove_embeddings_index[w]
        emb_matrix[i] = vect
    except:
        pass

In [86]:
emb_matrix.shape

(118166, 300)

In [87]:
Counter(emb_matrix.sum(axis=1))[0.0] #words without embeddings

1

In [88]:
le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_test_le = le.transform(y_test)
y_train_oh = to_categorical(y_train_le)
y_test_oh = to_categorical(y_test_le)

In [89]:
def recall_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

def precision_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [98]:
emb_model2 = models.Sequential()
emb_model2.add(layers.Embedding(max_features, 300, input_length=MAX_LEN))
emb_model2.add(Bidirectional(LSTM(200)))
emb_model2.add(Dropout(0.6))
emb_model2.add(layers.Dense(256, activation='relu'))
emb_model2.add(layers.Dense(2, activation='softmax'))
emb_model2.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 294, 300)          35449800  
_________________________________________________________________
bidirectional_3 (Bidirection (None, 400)               801600    
_________________________________________________________________
dropout_3 (Dropout)          (None, 400)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 256)               102656    
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 514       
Total params: 36,354,570
Trainable params: 36,354,570
Non-trainable params: 0
_________________________________________________________________


In [99]:
emb_model2.layers[0].set_weights([emb_matrix])
emb_model2.layers[0].trainable = True

In [100]:
emb_model2.compile(optimizer='adam', loss='categorical_crossentropy' , metrics=['accuracy'])

In [101]:
history = emb_model2.fit(X_train_seq_trunc , y_train_oh, epochs=5,
                    batch_size=512 , validation_data=(X_test_seq_trunc, y_test_oh), verbose=1)

Train on 553827 samples, validate on 237355 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 5/5


In [102]:
embeddings = emb_model2.layers[0].get_weights()[0]

In [103]:
embeddings.shape

(118166, 300)

In [104]:
len(tk.word_index.items())

118166

In [105]:
words_embeddings = {w:embeddings[idx] for w, idx in tk.word_index.items() if idx < max_features}

In [107]:
import pickle

with open('nongit_resources/words_embeddings_trained.pickle', 'wb') as handle:
    pickle.dump(words_embeddings, handle)

In [None]:
# dataset = pd.read_csv("../Notebooks/NLP_training/amazon_reviews_cleaned.csv")
# dataset['text'] = dataset.clean_reviewtext
# dataset['sentiment'] = dataset.overall
# dataset = dataset.fillna("")