In [1]:
# Import libraries

import pandas as pd

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# Keras
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras import models
from keras import layers
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# Word2vec
import gensim
from gensim.models import Word2Vec

# Utility
import re
import numpy as np
import os
from collections import Counter
import logging
import time
import pickle
import itertools

In [2]:
# Define constants

W2V_SIZE = 300
W2V_WINDOW = 7
#W2V_EPOCH = 32
W2V_EPOCH = 8

SEQUENCE_LENGTH = 300
# EPOCHS = 8
EPOCHS = 3
BATCH_SIZE = 1024

In [3]:
# Import dataset
import pandas as pd
df = pd.read_pickle("preprocessed_labeled.pkl")
#df = df[['clean','sentiment']]
df = df[['clean_nouns','sentiment']]
df['sentiment'] = df['sentiment'].replace(1,0)
df['sentiment'] = df['sentiment'].replace(2,0)
df['sentiment'] = df['sentiment'].replace(0,1) # 1 is POSITIVE
df['sentiment'] = df['sentiment'].replace(-1,0) # 0 is NEGATIVE

In [4]:
# Define train/test
df_train = df[0:40000]
df_test = df[40000:]

In [5]:
# Word2Vec
#sentences = [word_tokenize(text) for text in df_train.clean]
sentences = [word_tokenize(text) for text in df_train.clean_nouns]
# Include unlabaled sentenced
unlabaled = pd.read_pickle("preprocessed.pkl")
#sentences_unlabaled = [word_tokenize(text) for text in unlabaled.base] # without non words and len smaller than 2
sentences_unlabaled = [word_tokenize(text) for text in unlabaled.clean_nouns]

sentences.extend(sentences_unlabaled)
word2vec_model = gensim.models.word2vec.Word2Vec(sentences, vector_size=W2V_SIZE, window=W2V_WINDOW, min_count=1, workers=8)

In [6]:
vocab_size = len(word2vec_model.wv)
print("Vocab size is", vocab_size)

Vocab size is 100286


In [7]:
tokenizer = Tokenizer()
# tokenizer.fit_on_texts(df_train.clean)
# tokenizer.fit_on_texts(unlabaled.base)
tokenizer.fit_on_texts(df_train.clean_nouns)
tokenizer.fit_on_texts(unlabaled.clean_nouns)

vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)

Total words 100291


In [8]:
# x_train = pad_sequences(tokenizer.texts_to_sequences(df_train.clean), maxlen=SEQUENCE_LENGTH)
# x_test = pad_sequences(tokenizer.texts_to_sequences(df_test.clean), maxlen=SEQUENCE_LENGTH)
x_train = pad_sequences(tokenizer.texts_to_sequences(df_train.clean_nouns), maxlen=SEQUENCE_LENGTH)
x_test = pad_sequences(tokenizer.texts_to_sequences(df_test.clean_nouns), maxlen=SEQUENCE_LENGTH)

In [9]:
# Index padded sequence and then replace after balancing
sequences = {}
x_train_indexed = []
i = 0
for sequence in x_train:
    sequences[i] = sequence
    x_train_indexed.append(i)
    i = i + 1

In [10]:
# Solve imbalanced data with SMOTE
from imblearn.over_sampling import SMOTE
X = np.array(x_train_indexed).reshape(-1,1) # each index in a list
y = df_train['sentiment']
print(len(X), len(y))
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)
print(len(X), len(y))

40000 40000
72916 72916


In [11]:
x_train_balanced = []

In [12]:
# Invert after balancing
for index in X:
    i = int(X[index][0])
    sequence = sequences[i]
    x_train_balanced.append(sequence)

In [13]:
x_train_balanced = np.array(x_train_balanced)

In [14]:
y_train = y
y_train = np.array(y_train).reshape(-1,1)
y_test = np.array(df_test['sentiment']).reshape(-1,1)

In [15]:
print("x_train", x_train_balanced.shape)
print("y_train", y_train.shape)
print()
print("x_test", x_test.shape)
print("y_test", y_test.shape)

x_train (72916, 300)
y_train (72916, 1)

x_test (3943, 300)
y_test (3943, 1)


In [16]:
# Create embedding matrix
embedding_matrix = np.zeros((vocab_size, W2V_SIZE))
for word, i in tokenizer.word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]
print(embedding_matrix.shape)

(100291, 300)


In [17]:
# Create Embedding Layer
embedding_layer = layers.Embedding(vocab_size, W2V_SIZE, weights=[embedding_matrix], input_length=SEQUENCE_LENGTH, trainable=False)

In [18]:
# Build model
model = models.Sequential()
model.add(embedding_layer)
model.add(layers.Dropout(0.5))
model.add(layers.LSTM(100, dropout=0.2, recurrent_dropout=0.2))
#model.add(layers.Dense(1, activation='sigmoid'))
model.add(layers.Dense(1, activation='relu'))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 300)          30087300  
                                                                 
 dropout (Dropout)           (None, 300, 300)          0         
                                                                 
 lstm (LSTM)                 (None, 100)               160400    
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 30,247,801
Trainable params: 160,501
Non-trainable params: 30,087,300
_________________________________________________________________


In [19]:
from tensorflow import keras

sgd = keras.optimizers.SGD(learning_rate=0.5, momentum=0.9, nesterov=True)
model.compile(loss = 'binary_crossentropy', optimizer = 'sgd', metrics = ['accuracy'])
     
#model.compile(loss='binary_crossentropy',optimizer=keras.optimizers.Adam(learning_rate=1e-3),metrics=['accuracy'])
callbacks = [ ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
              EarlyStopping(monitor='val_acc', min_delta=1e-4, patience=5)]
history = model.fit(x_train, y_train,
                    batch_size=BATCH_SIZE,
                    #epochs=EPOCHS,
                    epochs=2,
                    validation_split=0.1,
                    verbose=1,
                    callbacks=callbacks)

Epoch 1/2
Epoch 2/2


In [70]:
model.save("w2v_keras.h5")

In [20]:
scores = model.evaluate(x_test, y_test, batch_size=BATCH_SIZE, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))
print("Loss:",scores[0])

Accuracy: 88.59%
Loss: 1.5982820987701416


In [41]:
# Predict
def decode_sentiment(score):
    if score > 0.5: return 1
    elif score <= 0.5: return 0

def get_features(texts):
    #return pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=SEQUENCE_LENGTH)
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=SEQUENCE_LENGTH)

def predict(features):
    # Predict
    scores = model.predict(np.array(features))
    return scores

# Classify unlabaled data
unlabaled = pd.read_pickle("preprocessed.pkl")
features = []
vectors = get_features(unlabaled.clean_nouns)
scores = predict(vectors)



In [44]:
negatives = []
for i in range(0,len(scores)):
    if scores[i]<1: negatives.append(i)

In [45]:
len(negatives) # 19342 negatives out of 313985 

19342

In [46]:
with open('word2vec_keras_negatives.txt','w') as tfile:
    tfile.write(str(negatives))