In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from nltk.tokenize import TweetTokenizer
import datetime
import lightgbm as lgb
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

In [8]:
from sklearn import metrics

In [9]:
from keras.models import Sequential

In [10]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D, GRU, BatchNormalization
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Add, Flatten
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras.models import Model, load_model
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras import backend as K
from keras.engine import InputSpec, Layer
from keras.optimizers import Adam

from keras.callbacks import ModelCheckpoint, TensorBoard, Callback, EarlyStopping

In [66]:
df = pd.read_csv("labelled_phrases.csv")

In [67]:
df

Unnamed: 0,phrase_id,phrase_tokens,sentiment_value,sentiment
0,3,['cockettes'],0.50000,neutral
1,4,['cockettes'],0.42708,neutral
2,5,"['cockettes', 'provide', 'window', 'subculture...",0.37500,negative
3,6,"['cockettes', 'provide', 'window', 'subculture...",0.41667,neutral
4,7,"['cockettes', 'provide', 'window', 'subculture...",0.54167,neutral
...,...,...,...,...
237439,239227,"['standard', 'hollywood', 'bio', 'pic']",0.36111,negative
237440,239228,"['typical', 'fish', 'water', 'story']",0.38889,negative
237441,239229,['zero'],0.33333,negative
237442,239230,"['zippy', 'jazzy', 'score']",0.88889,very_positive


In [68]:
reviews = np.array(df['phrase_tokens'])
sentiments = np.array(df['sentiment'])

# build train and test datasets
train_reviews, test_reviews, train_sentiments, test_sentiments = train_test_split(reviews, sentiments , test_size=0.20,  random_state=0)

In [11]:
full_text = list(train_reviews) + list(test_reviews)

In [12]:
tk = Tokenizer(lower = True, filters='', num_words=20000)
tk.fit_on_texts(full_text)

In [13]:
train_tokenized = tk.texts_to_sequences(train_reviews)
test_tokenized = tk.texts_to_sequences(test_reviews)

In [15]:
max_len = 30
X_train = pad_sequences(train_tokenized, maxlen = max_len)
X_test = pad_sequences(test_tokenized, maxlen = max_len)

In [69]:
Y_train = pd.get_dummies(train_sentiments).values

In [70]:
Y_test = pd.get_dummies(test_sentiments).values
Y_tests = Y_test.argmax(axis=-1)

In [18]:
max_features=20000

In [72]:
embed_dim = 100
lstm_out = 100

model = Sequential()
model.add(Embedding(max_features, embed_dim,input_length = X_train.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(lstm_out, dropout=0.1, recurrent_dropout=0.1))
model.add(Dense(5,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 30, 100)           2000000   
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 30, 100)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 505       
Total params: 2,080,905
Trainable params: 2,080,905
Non-trainable params: 0
_________________________________________________________________
None


In [21]:
X_val = X_train[-37000:]

In [22]:
X_train = X_train[:-37000]

In [71]:
Y_val = Y_train[-37000:]
Y_train = Y_train[:-37000]

In [73]:
model.fit(X_train, Y_train, epochs = 5, batch_size=32,validation_split=0.05,callbacks=[EarlyStopping(monitor='val_accuracy', patience=3, min_delta=0.0001)])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f3803de5450>

In [74]:
score,acc = model.evaluate(X_val, Y_val, verbose = 2, batch_size = 64)

579/579 - 8s - loss: 0.9175 - accuracy: 0.6432


In [75]:
y_pred = model.predict(X_val)
y_pred = y_pred.argmax(axis=-1)

In [76]:
Y_vals = Y_val.argmax(axis=-1)

In [28]:
def display_classification_report(true_labels, predicted_labels, target_names):

    report = metrics.classification_report(y_true=true_labels, y_pred=predicted_labels, target_names=target_names) 
    print(report)

def get_metrics(true_labels, predicted_labels):
    
    print('Accuracy:  {:2.2%} '.format(metrics.accuracy_score(true_labels, predicted_labels)))
    print('Precision: {:2.2%} '.format(metrics.precision_score(true_labels, predicted_labels, average='weighted')))
    print('Recall:    {:2.2%} '.format(metrics.recall_score(true_labels, predicted_labels, average='weighted')))
    print('F1 Score:  {:2.2%} '.format(metrics.f1_score(true_labels, predicted_labels, average='weighted')))
         
        
def display_model_performance_metrics(true_labels, predicted_labels, target_names):
    print('Model Performance metrics:')
    print('-'*30)
    get_metrics(true_labels=true_labels, predicted_labels=predicted_labels)
    print('\nModel Classification report:')
    print('-'*30)
    display_classification_report(true_labels=true_labels, predicted_labels=predicted_labels, target_names=target_names)

In [29]:
display_model_performance_metrics(true_labels=Y_vals, predicted_labels=y_pred,
                                  target_names=['very_negative', 'negative', 'neutral', 'positive', 'very_positive'])
display_classification_report(Y_vals, y_pred, target_names=['very_negative', 'negative', 'neutral', 'positive', 'very_positive'])

Model Performance metrics:
------------------------------
Accuracy:  80.89% 
Precision: 80.77% 
Recall:    80.89% 
F1 Score:  80.46% 

Model Classification report:
------------------------------


ValueError: ignored

In [5]:
bi = pd.read_csv("binary_labelled_phrases.csv")

In [6]:
bi

Unnamed: 0,phrase_id,phrase_tokens,sentiment_value,sentiment
0,3,['cockettes'],0.50000,negative
1,4,['cockettes'],0.42708,negative
2,5,"['cockettes', 'provide', 'window', 'subculture...",0.37500,negative
3,6,"['cockettes', 'provide', 'window', 'subculture...",0.41667,negative
4,7,"['cockettes', 'provide', 'window', 'subculture...",0.54167,negative
...,...,...,...,...
237439,239227,"['standard', 'hollywood', 'bio', 'pic']",0.36111,negative
237440,239228,"['typical', 'fish', 'water', 'story']",0.38889,negative
237441,239229,['zero'],0.33333,negative
237442,239230,"['zippy', 'jazzy', 'score']",0.88889,positive


In [7]:
reviews = np.array(bi['phrase_tokens'])
sentiments = np.array(bi['sentiment'])

# build train and test datasets
train_reviews, test_reviews, train_sentiments, test_sentiments = train_test_split(reviews, sentiments , test_size=0.20,  random_state=0)

In [77]:
y_preds = model.predict(X_test)
y_preds = y_preds.argmax(axis=-1)