## Train Sentiment Analysis

In [43]:
import warnings
warnings.filterwarnings('ignore')

In [44]:
from pathlib import Path
import pandas as pd

In [96]:
sentiment140_csv = pd.read_csv('../datasets/sentiment140/sentiment140.txt', delimiter=',', encoding="latin-1", header=None, names=['label', 'd1','d2','d3', 'd4', 'tweet'])

print(len(sentiment140_csv))

1600000


In [97]:
sentiment140_csv.head()

Unnamed: 0,label,d1,d2,d3,d4,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [98]:
sentiment140_df = sentiment140_csv.drop(['d1','d2','d3','d4'], axis=1)

## Label meaning
Positive (4), negative (0) or neutral (2).

In [99]:
sentiment140_df.head()

Unnamed: 0,label,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


## Data Processing

Preprocess the texts:
- Convert all characters to lower case
- Remove special characters: Remove links and usernames and trasform emojis to text
- Remove repetitions: Remove char repetitions (e.g. whaaaaat => what)
- Remove stop words: Remove common stop words

In [101]:
import re
from time import time
import nltk
from emoji import demojize

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Araceli\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [106]:
texts = sentiment140_df['tweet']

start = time()

# Lowercasing
texts = texts.str.lower()

# Remove special chars
texts = texts.str.replace(r"(http|@)\S+", "")
texts = texts.apply(demojize)
texts = texts.str.replace(r"::", ": :")
texts = texts.str.replace(r"â€™", "'")
texts = texts.str.replace(r"[^a-z\':_]", " ")

#Remove repetitions
pattern = re.compile(r"(.)\1{2,}", re.DOTALL)
texts = texts.str.replace(pattern, r"\1")


# Transform short negation form
texts = texts.str.replace(r"(can't|cannot)", 'can not')
texts = texts.str.replace(r"n't", ' not')

# Remove stop words
stopwords = nltk.corpus.stopwords.words('english')
stopwords.remove('not')
stopwords.remove('nor')
stopwords.remove('no')
texts = texts.apply(
    lambda x: ' '.join([word for word in x.split() if word not in stopwords])
)

print('Time to clean up: {:.2f} sec'.format(time() - start))

sentiment140_df.tweet = texts

Time to clean up: 462.24 sec


NameError: name 'data' is not defined

In [108]:
sentiment140_df.head()

Unnamed: 0,label,tweet
0,0,aw that's bummer shoulda got david carr third day
1,0,upset not update facebook texting might cry re...
2,0,dived many times ball managed save rest go bounds
3,0,whole body feels itchy like fire
4,0,no not behaving i'm mad not see


## Tokenize

Transform the text corpus to a vector representation

- **num_words**: Number of words to use

In [114]:
num_words = 10000

In [115]:
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer

In [117]:
tokenizer = Tokenizer(num_words=num_words, lower=True)
tokenizer.fit_on_texts(sentiment140_df.tweet)

file_to_save = Path('../datasets/sentiment140/tokenizer.pickle').resolve()
with file_to_save.open('wb') as file:
    pickle.dump(tokenizer, file)

## Split data
Split the dataset in train and test data (validation)

In [119]:
from sklearn.model_selection import train_test_split

In [127]:
train = pd.DataFrame(columns=['label', 'tweet'])
test = pd.DataFrame(columns=['label', 'tweet'])

for label in sentiment140_df['label'].unique():
    label_data = sentiment140_df[sentiment140_df['label'] == label]
    train_data, test_data = train_test_split(label_data, test_size = 0.3)
    
    train = pd.concat([train, train_data])
    test = pd.concat([test, test_data])
    
# TODO: Verify behavior with conventinoal train_test_split method

## Model

Define the bidirectional GRU model

In [135]:
from tensorflow.keras.layers import Input, Embedding, GRU
from tensorflow.keras.layers import Dropout, GlobalAvgPool1D
from tensorflow.keras.layers import Bidirectional, Dense
from tensorflow.keras.models import Sequential

In [142]:
input_dim = min(tokenizer.num_words, len(tokenizer.word_index) + 1)
embedding_dim = 200
input_length = 100
gru_units = 128
gru_dropout = 0.1
recurrent_dropout = 0.1
dropout = 0.1

In [145]:
## Estimator
model = Sequential()

model.add(Embedding(
    input_dim=input_dim,
    output_dim=embedding_dim,
    input_shape=(input_length,)
))

model.add(Bidirectional(GRU(
    gru_units,
    return_sequences=True,
    dropout=gru_dropout,
    recurrent_dropout=recurrent_dropout
)))

model.add(GlobalAvgPool1D())
model.add(Dense(32, activation='relu'))
model.add(Dropout(dropout))

model.add(Dense(1, activation='sigmoid'))

In [146]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 100, 200)          2000000   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100, 256)          252672    
_________________________________________________________________
global_average_pooling1d_1 ( (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                8224      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 2,260,929
Tr

## Prepare the data
Prepare the model input data

In [147]:
from tensorflow.keras.preprocessing.sequence import pad_sequences 

In [154]:
train_sequences = [text.split() for text in train['tweet']]
test_sequences = [text.split() for text in test['tweet']]

list_tokenized_train = tokenizer.texts_to_sequences(train_sequences)
list_tokenized_test = tokenizer.texts_to_sequences(test_sequences)

x_train = pad_sequences(list_tokenized_train, maxlen=input_length)
x_test = pad_sequences(list_tokenized_test, maxlen=input_length)

y_train = train['label'].replace(4, 1)
y_test = test['label'].replace(4, 1)

## Train model
Training model with previous data

In [156]:
batch_size = 128
epochs = 1

In [157]:
model.fit(
    x_train, 
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test)
)

Train on 1120000 samples, validate on 480000 samples


<tensorflow.python.keras.callbacks.History at 0x1776d491dc8>

In [159]:
model_file = Path('../models/sentiment_analysis/gru_model.h5').resolve()
model.save_weights(model_file.as_posix())

# Small testing

In [165]:
list_test = ["I feel sick and I dont want to go to the school, but tomorrow is going to be a better day", 
             "I feel sick and I dont want to go to the school"]

In [201]:

value_to_predict = "This is awesome, not really".split()
list_tokenized_value_to_predict = tokenizer.texts_to_sequences(value_to_predict)

x_prediction = pad_sequences(list_tokenized_value_to_predict, maxlen=input_length)

In [202]:
x_test

array([[   0,    0,    0, ..., 4882, 5445,   43],
       [   0,    0,    0, ...,    0,   96,  190],
       [   0,    0,    0, ..., 6268, 6515,   90],
       ...,
       [   0,    0,    0, ...,    6, 1541,   43],
       [   0,    0,    0, ...,  205,    3,   15],
       [   0,    0,    0, ...,  214,   18,  413]])

In [203]:
# small testing

import numpy as np
def get_score_range(mean):
  if mean < 0.5:
    return (0.0, mean)
  return (mean, 1.0)




result = model.predict(x_prediction)
mean = np.mean(result)
std = np.std(result)
low, high = get_score_range(mean)
print( ": Score Range: {:4f} - {:4f}".format(low, high))

: Score Range: 0.608004 - 1.000000
