In [2]:
import os
import re
import datetime
import time
from itertools import islice
from operator import itemgetter

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

from sklearn.model_selection import train_test_split, ShuffleSplit

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score, f1_score, precision_score, accuracy_score, roc_auc_score, confusion_matrix
from sklearn import svm

from imblearn.over_sampling import SMOTE

import pickle

from keras.initializers import Constant
from keras import Model
from keras.models import Sequential
from keras.layers import Dense, Input
from keras.layers import LSTM,Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, text_to_word_sequence


from nltk.corpus import stopwords
import nltk
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from nltk import tokenize


Using TensorFlow backend.
[nltk_data] Downloading package stopwords to /home/dat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/dat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def get_run_time(t1, t2):
    diff = t2 - t1
    mins = int(diff / 60)
    secs = round(diff % 60, 3)
    return str(mins) + " mins and " + str(secs) + " seconds"

def clean_str(sentence):
    # Remove HTML
    review_text = BeautifulSoup(sentence, features="html.parser").text
    # Remove non-letters
    letters_only = re.sub("[^a-zA-Z\s\s+]", "", review_text).strip()
    return letters_only

def convert_plain_to_csv(text_file, csv_file):
    t0 = time.time()
    with open(text_file, "r") as f1, open(csv_file, "w") as f2:
        i = 0
        f2.write("productId,score,summary,text\n")
        while True:
            next_n_lines = list(islice(f1, 9))  # read 9 line
            if not next_n_lines:
                break

            output_line = ""
            for line in next_n_lines:
                if "product/productId:" in line:
                    output_line += line.split(":")[1].strip() + ","
                elif "review/score:" in line:
                    output_line += line.split(":")[1].strip() + ","
                elif "review/summary:" in line:
                    summary = clean_str(line.split(":")[1].strip()) + ","
                    output_line += summary
                elif "review/text:" in line:
                    text = clean_str(line.split(":")[1].strip()) + "\n"
                    output_line += text

            f2.write(output_line)

            # print status
            i += 1
            if i % 10000 == 0:
                print(i, "reviews converted...")

    print(datetime.datetime.now(), "- Converting completed in", get_run_time(t0, time.time()))

def get_data(file_name):
    if os.path.exists(file_name):
        print("-- " + file_name + " found locally")
        df = pd.read_csv(file_name)
    return df

def review_to_words(review):
    # 1. Convert to lower case, split into individual words
    words = review.lower().split()

    # 2. Get english stop words
    stops = set(stopwords.words("english"))
    
    # 3. Remove stop words
    meaningful_words = [w for w in words if not w in stops]
    
    return " ".join(meaningful_words)


def cleaning_data(dataset, file_name):
    t0 = time.time()
    num_reviews = dataset["text"].size
    clean_train_reviews = []

    # Loop over each review
    for i in range(0, num_reviews):
        # If the index is evenly divisible by 1000, print a message
        if (i + 1) % 10000 == 0:
            print("Review", i + 1, "of", num_reviews, "\n")

        productId = str(dataset["productId"][i])
        score = str(dataset["score"][i])
        summary = str(dataset["summary"][i])
        text = review_to_words(str(dataset["text"][i]))

        clean_train_reviews.append(productId + "," + score + "," + summary + "," + text + "\n")

    print("Writing clean train reviews...")
    with open(file_name, "w") as f:
        f.write("productId,score,summary,text\n")
        for review in clean_train_reviews:
            f.write("%s\n" % review)

    
    print(datetime.datetime.now(), "- Write file completed in", get_run_time(t0, time.time()))

In [4]:
MAX_SENT_LENGTH = 600
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100

In [5]:
# read data from file
reviews = pd.read_csv("clean_train_reviews.csv", nrows=20000)
# ignore all 3* reviews
reviews = reviews[reviews["score"] != 3]
# positive sentiment = 4* or 5* reviews (sentriment = True)
reviews["sentiment"] = reviews["score"] >= 4

X = reviews['text'].values.astype('U')
# X = reviews['text']
y = reviews['sentiment']

In [6]:
X.shape

(18351,)

In [7]:
tokenizer = Tokenizer(nb_words=20000)
tokenizer.fit_on_texts(X)

In [8]:

data = np.zeros((X.shape[0], MAX_SENT_LENGTH), dtype='int32')
for i,sen in enumerate(X.tolist()):
    for j,word in enumerate(sen.split()):
        if j < MAX_SENT_LENGTH and tokenizer.word_index[word] < MAX_NB_WORDS:
            data[i, j] = tokenizer.word_index[word]
        

word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))

Total 23197 unique tokens.


In [9]:
GLOVE_DIR = "."
embeddings_index = {}
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [10]:
print('Total %s word vectors.' % len(embeddings_index))

Total 400000 word vectors.


In [11]:
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [12]:
embedding_matrix.shape

(23198, 100)

In [41]:
# create the model
embedding_layer = Embedding(len(word_index)+1,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SENT_LENGTH,
                            trainable=False)


int_sequences_input = Input(shape=(MAX_SENT_LENGTH,), dtype="int32")
embedded_sequences = embedding_layer(int_sequences_input)
x = LSTM(200, dropout=0.25, recurrent_dropout=0.25)(embedded_sequences)
x = Dense(128, activation='relu')(x)
# x = Dropout(0.25)(x)
preds = Dense(1, activation='softmax')(x)
model = Model(int_sequences_input, preds)
model.summary()

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 600)               0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 600, 100)          2319800   
_________________________________________________________________
lstm_5 (LSTM)                (None, 200)               240800    
_________________________________________________________________
dense_9 (Dense)              (None, 128)               25728     
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 129       
Total params: 2,586,457
Trainable params: 266,657
Non-trainable params: 2,319,800
_________________________________________________________________


In [31]:
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2,random_state=42)

In [32]:
X_train.shape, X_test.shape

((14680, 600), (3671, 600))

In [33]:
lb_encoder = LabelEncoder()
y_train = lb_encoder.fit_transform(y_train)
y_test = lb_encoder.transform(y_test)
X_test.shape, y_test.shape

((3671, 600), (3671,))

In [34]:
X_train

array([[ 1544,     7,   311, ...,     0,     0,     0],
       [   20,     5,    40, ...,     0,     0,     0],
       [  455,  3275,   349, ...,     0,     0,     0],
       ...,
       [    7,    57, 16190, ...,     0,     0,     0],
       [  300,  3904,    19, ...,     0,     0,     0],
       [ 1044,    92,    24, ...,     0,     0,     0]], dtype=int32)

In [57]:
model.compile(
    loss='categorical_hinge',
    optimizer='nadam',
    metrics=['acc']
)

In [58]:
model.fit(X_train, y_train,
          batch_size=64,
          epochs=3,
          verbose=1,
          validation_data=(X_test, y_test))

Train on 14680 samples, validate on 3671 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x7fcbac475128>

In [59]:
# save the model to disk
filename = 'lstm_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [60]:

score, acc = model.evaluate(X_test, y_test,
                            batch_size=64)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.33560337773690396
Test accuracy: 0.8321983218193054


In [61]:
y_pred = model.predict(X_test)

In [66]:
accs =accuracy_score(y_test, y_pred)
f1s = f1_score(y_test, y_pred)
cms = confusion_matrix(y_test, y_pred)
pres = precision_score(y_test, y_pred)
recs = recall_score(y_test, y_pred)

print("\nAverage accuracy across folds: {:.2f}%".format(accs* 100))
print("\nAverage F1 score across folds: {:.2f}%".format(f1s * 100))
print("\nAverage Precision score across folds: {:.2f}%".format(pres* 100))
print("\nAverage Recall score across folds: {:.2f}%".format(recs* 100))
print("\nAverage Confusion Matrix across folds: \n {}".format(cms))


Average accuracy across folds: 83.22%

Average F1 score across folds: 90.84%

Average Precision score across folds: 83.22%

Average Recall score across folds: 100.00%

Average Confusion Matrix across folds: 
 [[   0  616]
 [   0 3055]]
