In [71]:
import time
import os
import pandas as pd
import numpy as np
from collections import Counter
import re
import nltk
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sea
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from gensim.models import word2vec
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
#from wordcloud import WordCloud, STOPWORDS

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Activation, Embedding, Flatten, Dropout
from keras.utils import to_categorical
from keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ReduceLROnPlateau, EarlyStopping


import warnings
warnings.filterwarnings('ignore')

In [None]:
data_columns = ["target", "ids", "date", "flag", "user", "text"]
data = pd.read_csv('../input/sentiment140/training.1600000.processed.noemoticon.csv', encoding = "ISO-8859-1", names = data_columns)

In [None]:
print('Number of instances : {}'.format(data.shape[0]))
print('-'*100)
data.info()

In [None]:
data.head()

In [None]:
decode_map = {0: "NEGATIVE", 2: "NEUTRAL", 4: "POSITIVE"}
def decode_sentiment(label):
    return decode_map[int(label)]

In [None]:
%%time
data.target = data.target.apply(lambda x: decode_sentiment(x))

In [None]:
target_cnt = Counter(data.target)

plt.figure(figsize=(8,6))
plt.bar(target_cnt.keys(), target_cnt.values())
plt.title("Dataset labels distribuition")
plt.show()

In [None]:
stop_words = stopwords.words('spanish')
stemmer = SnowballStemmer('english')

In [None]:
TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

def preprocess(text, stem=False):
    # Remove link,user and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)

In [None]:
%%time
data.text = data.text.apply(lambda x: preprocess(x))

In [None]:
data_train, data_test = train_test_split(data, test_size = 0.2, random_state = 23)
print('Train Size : {}'.format(len(data_train)))
print('Test Size : {}'.format(len(data_test)))

In [None]:
data.text[0]

In [None]:
%%time
documents = [_text.split() for _text in data_train.text] 

In [None]:
w2v_model = word2vec.Word2Vec(size=300, 
                              window=10, 
                               min_count=10
                              )

In [None]:
w2v_model.build_vocab(documents)

In [None]:
words = w2v_model.wv.vocab.keys()
vocab_size = len(words) + 1
print("Vocab size", vocab_size)

In [None]:
%%time
w2v_model.train(documents, total_examples=len(documents), epochs=32)

In [54]:
w2v_model.most_similar("good")

[('great', 0.7157130241394043),
 ('goood', 0.6614766120910645),
 ('nice', 0.5850443243980408),
 ('gooood', 0.5783562660217285),
 ('bad', 0.5589054226875305),
 ('gud', 0.5531789064407349),
 ('rough', 0.5530114769935608),
 ('fantastic', 0.5488074421882629),
 ('gooooood', 0.5351353287696838),
 ('goooooood', 0.5340060591697693)]

In [None]:
%%time
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data_train.text)

voacb_size = len(tokenizer.word_index) + 1
print('Total words', vocab_size)

In [None]:
%%time
x_train = pad_sequences(tokenizer.texts_to_sequences(data_train.text), maxlen=300)
x_test = pad_sequences(tokenizer.texts_to_sequences(data_test.text), maxlen=300)

In [None]:
labels = data_train.target.unique().tolist()
labels.append('NEUTRAL')
labels

In [None]:
encoder = LabelEncoder()
encoder.fit(data_train.target.tolist())

y_train = encoder.transform(data_train.target.tolist())
y_test = encoder.transform(data_test.target.tolist())

In [None]:
y_train = y_train.reshape(-1,1)
y_test = y_test.reshape(-1,1)

print("y_train",y_train.shape)
print("y_test",y_test.shape)

In [None]:
print("x_train", x_train.shape)
print("y_train", y_train.shape)
print()
print("x_test", x_test.shape)
print("y_test", y_test.shape)

In [None]:
y_train[:10]

In [None]:
embedding_matrix = np.zeros((vocab_size, 300))

for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]
        
print(embedding_matrix.shape)

In [None]:
embedding_layer = Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=300, trainable=False)

In [None]:
model = Sequential()
model.add(embedding_layer)
model.add(Dropout(0.5))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.summary()

In [None]:
model.compile(loss='binary_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])

In [None]:
%%time
history = model.fit(x_train, y_train,
                    batch_size=1024,
                    epochs=5,
                    verbose=1
                    )

In [None]:
%%time
score = model.evaluate(x_test, y_test, batch_size=1024)
print()
print("ACCURACY:",score[1])
print("LOSS:",score[0])

In [69]:
SENTIMENT_THRESHOLDS = (0.4, 0.7)

def decode_sentiment(score, include_neutral=True):
    if include_neutral:        
        label = 'NEUTRAL'
        if score <= SENTIMENT_THRESHOLDS[0]:
            label = 'NEGATIVE'
        elif score >= SENTIMENT_THRESHOLDS[1]:
            label = 'POSITIVE'

        return label
    else:
        return 'NEGATIVE' if score < 0.5 else 'POSITIVE'

In [50]:
def predict(text, include_neutral=True):
    start_at = time.time()
    # Tokenize text
    x_test = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=300)
    # Predict
    score = model.predict([x_test])[0]
    # Decode sentiment
    label = decode_sentiment(score, include_neutral=include_neutral)

    return {"label": label, "score": float(score),
       "elapsed_time": time.time()-start_at} 

In [51]:
predict("I love the music")

{'label': 'POSITIVE',
 'score': 0.9904234409332275,
 'elapsed_time': 0.3687586784362793}

In [52]:
predict('I hate the rain')

{'label': 'NEGATIVE',
 'score': 0.007849573157727718,
 'elapsed_time': 0.34761738777160645}

In [55]:
predict('I dont know what I am doing')

{'label': 'NEGATIVE',
 'score': 0.20417726039886475,
 'elapsed_time': 0.3041715621948242}

In [67]:
predict('I dont know what I am doing')

{'label': 'NEGATIVE',
 'score': 0.20417726039886475,
 'elapsed_time': 0.3433356285095215}

In [70]:
%%time
y_pred_1d = []
y_test_1d = list(data_test.target)
scores = model.predict(x_test, verbose=1, batch_size=8000)
y_pred_1d = [decode_sentiment(score, include_neutral=False) for score in scores]

CPU times: user 17.8 s, sys: 2.34 s, total: 20.2 s
Wall time: 22.5 s


In [80]:
print('Accuracy on the test data : {}'.format(accuracy_score(y_pred_1d, y_test_1d)))
print()
print(classification_report(y_pred_1d, y_test_1d))

Accuracy on the test data : 0.81293125

              precision    recall  f1-score   support

    NEGATIVE       0.81      0.82      0.81    157784
    POSITIVE       0.82      0.81      0.81    162216

    accuracy                           0.81    320000
   macro avg       0.81      0.81      0.81    320000
weighted avg       0.81      0.81      0.81    320000

