In [1]:
# Import libraries

import pandas as pd

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# Keras
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras import models
from keras import layers
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# Word2vec
import gensim
from gensim.models import Word2Vec

# Utility
import re
import numpy as np
import os
from collections import Counter
import logging
import time
import pickle
import itertools

In [2]:
# Define constants

W2V_SIZE = 200
W2V_WINDOW = 7
W2V_EPOCH = 8

SEQUENCE_LENGTH = 200
EPOCHS = 3
BATCH_SIZE = 1024

In [3]:
# Import dataset
import pandas as pd
df = pd.read_pickle("preprocessed_labeled.pkl")
df = df[['clean_nouns','sentiment']]
df['sentiment'] = df['sentiment'].replace(1,0)
df['sentiment'] = df['sentiment'].replace(2,0)
df['sentiment'] = df['sentiment'].replace(0,1) # 1 is POSITIVE
df['sentiment'] = df['sentiment'].replace(-1,0) # 0 is NEGATIVE

In [4]:
# Word2Vec
sentences = [word_tokenize(text) for text in df.clean_nouns]
# Include unlabaled sentenced
unlabaled = pd.read_pickle("preprocessed.pkl")
sentences_unlabaled = [word_tokenize(text) for text in unlabaled.clean_nouns]

sentences.extend(sentences_unlabaled)
word2vec_model = gensim.models.word2vec.Word2Vec(sentences, vector_size=W2V_SIZE, window=W2V_WINDOW, min_count=1, workers=8)

In [5]:
vocab_size = len(word2vec_model.wv)
print("Vocab size is", vocab_size)

Vocab size is 101104


In [6]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.clean_nouns)
tokenizer.fit_on_texts(unlabaled.clean_nouns)

vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)

Total words 101109


In [7]:
x = pad_sequences(tokenizer.texts_to_sequences(df.clean_nouns), maxlen=SEQUENCE_LENGTH)

In [8]:
# Add vader columns
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

compound, neg, pos = [], [], []
for text in df['clean_nouns']:
    compound.append(sia.polarity_scores(text)['compound'])
    neg.append(sia.polarity_scores(text)['neg'])
    pos.append(sia.polarity_scores(text)['pos'])
df['compound'] = compound
df['neg'] = neg
df['pos'] = pos

In [9]:
# Add polarity and subjectivity
from textblob import TextBlob

polarity, subjectivity = [], []
for text in df['clean_nouns']:
    polarity.append(TextBlob(text).sentiment.polarity)
    subjectivity.append(TextBlob(text).sentiment.subjectivity)
df['polarity'] = polarity
df['subjectivity'] = subjectivity

In [10]:
# with additional features
x_new = []
for index, row in df.iterrows():
    vector = x[index]
    x_new.append(np.append(vector, [row['compound'],row['pos'],row['neg'],row['polarity'],row['subjectivity']]) )

In [11]:
len(x_new[0])

205

In [11]:
# Index padded sequence and then replace after balancing
sequences = {}
x_indexed = []
i = 0
for sequence in x_new:
    sequences[i] = sequence
    x_indexed.append(i)
    i = i + 1

In [9]:
# # Index padded sequence and then replace after balancing
# sequences = {}
# x_indexed = []
# i = 0
# for sequence in x:
#     sequences[i] = sequence
#     x_indexed.append(i)
#     i = i + 1

In [12]:
# Solve imbalanced data with SMOTE
from imblearn.over_sampling import SMOTE
X = np.array(x_indexed).reshape(-1,1) # each index in a list
y = df['sentiment']
print(len(X), len(y))
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)
print(len(X), len(y))

43943 43943
79906 79906


In [14]:
x_balanced = []
features = []

In [15]:
# Invert after balancing
for index in X:
    i = int(X[index][0])
    sequence = sequences[i][0:201]
    features.append(sequences[i][201:])
    x_balanced.append(sequence)

In [12]:
# if not vader
x_balanced = []
# Invert after balancing
for index in X:
    i = int(X[index][0])
    sequence = sequences[i]
    x_balanced.append(sequence)
x_balanced = np.array(x_balanced)
# In the first step we will split the data in training and remaining dataset
from sklearn.model_selection import train_test_split
X_train, X_rem, y_train, y_rem = train_test_split(x_balanced, y, train_size=0.6, random_state = 4)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state = 4)
X_train = np.array(X_train)
X_valid = np.array(X_valid)
X_test = np.array(X_test)
# Include validation dataset
print("x_train", X_train.shape)
print("y_train", y_train.shape)
print()
print("x_test", X_test.shape)
print("y_test", y_test.shape)
print()
print("x_valid", X_valid.shape)
print("y_valid", y_valid.shape)

x_train (47943, 200)
y_train (47943,)

x_test (15982, 200)
y_test (15982,)

x_valid (15981, 200)
y_valid (15981,)


In [32]:
x_balanced = np.array(x_balanced)

In [16]:
# In the first step we will split the data in training and remaining dataset
from sklearn.model_selection import train_test_split
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.7, random_state = 42)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state = 42)

In [16]:
len(sequences[0])

205

In [17]:
# Invert after balancing
X_train_embed, X_train_features = [],[]
for index in X_train:
    i = int(X[index][0])
    sequence = sequences[i][0:200]
    X_train_features.append(sequences[i][200:])
    X_train_embed.append(sequence)
    
X_valid_embed, X_valid_features = [],[]
for index in X_valid:
    i = int(X[index][0])
    sequence = sequences[i][0:200]
    X_valid_features.append(sequences[i][200:])
    X_valid_embed.append(sequence)
    
X_test_embed, X_test_features = [],[]
for index in X_test:
    i = int(X[index][0])
    sequence = sequences[i][0:200]
    X_test_features.append(sequences[i][200:])
    X_test_embed.append(sequence)

In [18]:
X_train_embed = np.array(X_train_embed)
X_train_features = np.array(X_train_features)
X_valid_embed = np.array(X_valid_embed)
X_valid_features = np.array(X_valid_features)
X_test_embed = np.array(X_test_embed)
X_test_features = np.array(X_test_features)

In [19]:
# Include validation dataset
print("x_train", X_train_embed.shape)
print("y_train", y_train.shape)
print()
print("x_test", X_test_embed.shape)
print("y_test", y_test.shape)
print()
print("x_valid", X_valid_embed.shape)
print("y_valid", y_valid.shape)

x_train (55934, 200)
y_train (55934,)

x_test (11986, 200)
y_test (11986,)

x_valid (11986, 200)
y_valid (11986,)


In [39]:
print("x_train", X_train_features.shape)

x_train (55934, 5)


In [20]:
# Create embedding matrix
embedding_matrix = np.zeros((vocab_size, W2V_SIZE))
for word, i in tokenizer.word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]
print(embedding_matrix.shape)

(101109, 200)


In [21]:
# Create Embedding Layer
embedding_layer = layers.Embedding(vocab_size, W2V_SIZE, weights=[embedding_matrix], input_length=SEQUENCE_LENGTH, trainable=False)

In [22]:
import keras
input1 = layers.Input(shape=(SEQUENCE_LENGTH,))
meta_input = layers.Input(shape=(5,))
emb = embedding_layer(input1)
lstm = layers.LSTM(128)(emb)
x = layers.Concatenate()([lstm, meta_input])
x = layers.Dense(64, activation='relu')(x)
x = layers.Dense(1, activation='sigmoid')(x)
model = models.Model(inputs=[input1 , meta_input], outputs=[x])
model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(learning_rate=1e-3), metrics=['accuracy'])
print(model.summary())

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 200)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 200, 200)     20221800    ['input_1[0][0]']                
                                                                                                  
 lstm (LSTM)                    (None, 128)          168448      ['embedding[0][0]']              
                                                                                                  
 input_2 (InputLayer)           [(None, 5)]          0           []                               
                                                                                              

In [23]:
callbacks = [ ReduceLROnPlateau(monitor='val_loss', patience=3),
              EarlyStopping(monitor='val_acc', min_delta=1e-3, patience=3)]
history = model.fit(x=[X_train_embed, X_train_features], y=y_train, batch_size=128, epochs=10, verbose=1,
                    callbacks=callbacks,
                    validation_data=([X_valid_embed, X_valid_features], y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [24]:
scores = model.evaluate([X_test_embed, X_test_features], y_test, batch_size=BATCH_SIZE, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))
print("Loss:",scores[0])

scores = model.predict([X_test_embed, X_test_features])

from sklearn import metrics
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

y_pred = []
for score in scores:
    if score >= 0.5: y_pred.append(1)
    else: y_pred.append(0)

print(metrics.classification_report(y_test, y_pred))
print(matthews_corrcoef(y_test, y_pred))
print(f1_score(y_test, y_pred, average='macro'))

print(precision_score(y_test, y_pred, average='macro'))
print(precision_score(y_test, y_pred, average=None))

print(recall_score(y_test, y_pred, average='macro'))
print(recall_score(y_test, y_pred, average=None))

Accuracy: 59.04%
Loss: 0.7153916954994202
              precision    recall  f1-score   support

           0       0.61      0.52      0.56      5998
           1       0.58      0.66      0.62      5988

    accuracy                           0.59     11986
   macro avg       0.59      0.59      0.59     11986
weighted avg       0.59      0.59      0.59     11986

0.1826172266275372
0.5883720481941733
0.5922123452527127
[0.60550815 0.57891654]
0.5904137384472179
[0.52050684 0.66032064]


In [None]:
roc_auc_score(y_test, y_pred)

In [None]:
scores = model.predict([X_test_embed, X_test_features])

In [41]:
# Predict
def decode_sentiment(score):
    if score > 0.5: return 1
    elif score <= 0.5: return 0

def get_features(texts):
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=SEQUENCE_LENGTH)

def predict(features):
    # Predict
    scores = model.predict(np.array(features))
    return scores

def get_vader(texts):
    features = []
    for i in texts:
        l = []
        l.append(sia.polarity_scores(i)['compound'])
        l.append(sia.polarity_scores(i)['neg'])
        l.append(sia.polarity_scores(i)['pos'])
        l.append(TextBlob(i).sentiment.polarity)
        l.append(TextBlob(i).sentiment.subjectivity)
        features.append(l)
    # Create the pandas DataFrame
    features = pd.DataFrame(features)
    # specifying column names
    features.columns = ['compound', 'neg', 'pos', 'polarity', 'subjectivity'] 
    return features
        
# Classify unlabaled data
unlabaled = pd.read_pickle("preprocessed.pkl")
features = []
vectors = get_features(unlabaled.clean_nouns)
features = np.array(get_vader(unlabaled.clean_nouns))

In [42]:
print(vectors.shape,features.shape)

(313985, 200) (313985, 5)


In [44]:
scores = model.predict([vectors, features])



In [47]:
negatives = []
for i in range(0,len(scores)):
    if scores[i]<0.5: negatives.append(i)

In [48]:
len(negatives) # 92373 negatives out of 313985 

92373

In [49]:
with open('word2vec_keras_negatives.txt','w') as tfile:
    tfile.write(str(negatives))