In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
# path to the modified Sentiment140 training CSV, relative to the notebook's working directory
train_data_path = '../Dataset/sentiment140/modified/train/modified_train_data.csv'

In [3]:
# show up to 101 columns when a DataFrame is displayed
pd.set_option("display.max_columns", 101)

# load the full training CSV into memory
ori_train_data = pd.read_csv(train_data_path)

In [4]:
# work on a copy so that ori_train_data is left untouched by the steps below
copied_train_data = ori_train_data.copy()

In [5]:
# preview the first five rows with all original columns
copied_train_data.head()

Unnamed: 0,polarity,id,date,query,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
# select only polarity and tweet from the dataset; .copy() makes the result an
# independent frame, so adding columns to it later does not trigger pandas'
# SettingWithCopyWarning
copied_train_data = copied_train_data[['polarity', 'tweet']].copy()

In [7]:
# preview the frame after keeping only the polarity and tweet columns
copied_train_data.head()

Unnamed: 0,polarity,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [8]:
# Matches, tried in this order at each position: an @mention (letters, digits
# and underscores), any single character that is not an ASCII letter, digit,
# space or tab, or a URL of the form scheme://...
_TWEET_NOISE = re.compile(r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+://\S+)")


def _clean_tweet(tweet):
    """Replace every noise match with a space and collapse runs of whitespace.

    Non-string values (e.g. NaN read from an empty CSV field) become "".
    """
    if not isinstance(tweet, str):
        return ""
    return " ".join(_TWEET_NOISE.sub(" ", tweet).split())


def preprocess_data(data):
    """Add cleaned-text and binary-label columns to ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'tweet' column (raw text) and a 'polarity' column.

    Returns
    -------
    None
        ``data`` gains two columns:
        * 'mod_tweet'    -- the tweet with @mentions, URLs and all characters
          other than ASCII letters, digits and whitespace removed.
        * 'mod_polarity' -- 0 where polarity == 0 (negative), 1 for every
          other polarity value.
    """
    # A single pass over the column avoids the per-row Series construction
    # that DataFrame.iterrows() performs.
    data['mod_tweet'] = [_clean_tweet(tweet) for tweet in data['tweet']]

    # Anything that is not exactly 0 is labelled 1.
    data['mod_polarity'] = (data['polarity'] != 0).astype('int64')

In [9]:
# adds the 'mod_tweet' and 'mod_polarity' columns to copied_train_data in place
preprocess_data(copied_train_data)

In [10]:
# drop the raw columns, keeping only the preprocessed 'mod_tweet' and 'mod_polarity'
copied_train_data = copied_train_data.drop(['polarity', 'tweet'], axis=1)

In [11]:
# preview the preprocessed train data (cleaned text and binary label)
copied_train_data.head()

Unnamed: 0,mod_tweet,mod_polarity
0,Awww that s a bummer You shoulda got David Car...,0
1,is upset that he can t update his Facebook by ...,0
2,I dived many times for the ball Managed to sav...,0
3,my whole body feels itchy and like its on fire,0
4,no it s not behaving at all i m mad why am i h...,0


In [12]:
# shape of the preprocessed data as (rows, columns)
# NOTE(review): no cell in this notebook removes neutral (polarity=2) rows;
# preprocess_data labels every nonzero polarity as 1
copied_train_data.shape

(1600000, 2)

In [13]:
# column dtypes of the preprocessed frame
copied_train_data.dtypes

mod_tweet       object
mod_polarity     int64
dtype: object

# Keras RNN (LSTM)

In [14]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, LSTM
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, Callback

Using TensorFlow backend.


As a pre-processing step, we convert the sentence into word tokens. The word tokens are then mapped to a (numerical) word index. The final step involves 'padding' the list of indices with zeros to ensure every row has the same length.

In [15]:
# model inputs (cleaned tweet text) and binary targets (0 = negative, 1 = any other polarity)
X_train = copied_train_data['mod_tweet']
y_train = copied_train_data['mod_polarity']

In [16]:
# Build the vocabulary from the training tweets and encode every tweet as a
# list of word indices. num_words is kept equal to the Embedding layer's input
# size (top_words = 5000) so that no embedding rows are left unused and the
# vocabulary limit matches the description in the accompanying text.
token = Tokenizer(num_words=5000, lower=True, split=" ")
token.fit_on_texts(X_train)
X_train_sequence = token.texts_to_sequences(X_train)

In [17]:
# every encoded tweet is truncated or padded to this many tokens
# NOTE(review): 500 looks generous for tweets and fixes the padded matrix at
# 500 columns per row -- confirm a smaller value would not suffice
max_review_length = 500

In [18]:
# pad (or truncate) every encoded tweet to max_review_length tokens
X_train_pad_sequence = pad_sequences(
    X_train_sequence,
    maxlen=max_review_length,
)

In [19]:
# confirm the padded matrix is (number of tweets, max_review_length)
X_train_pad_sequence.shape

(1600000, 500)

In [20]:
# vocabulary size, used as the input dimension of the Embedding layer
top_words = 5000

In [21]:
# training hyperparameters (fixed values; no search is performed in this notebook)
optimizer = Adam(lr = 0.001)  # Adam optimizer with learning rate 0.001
objective = "binary_crossentropy"  # loss passed to model.compile
embedding_vecor_length = 32  # length of each word's embedding vector

We map each word onto a 32-dimensional real-valued vector. The embedding layer is sized for a vocabulary of 5000 word indices (`top_words`); words that fall outside the tokenizer's most-frequent-word limit are dropped during tokenization rather than zeroed out. Finally, the sequence length (number of words) in each tweet varies, so we constrain every tweet to 500 tokens, truncating longer tweets and padding shorter ones with zero values.

In [22]:
# create the model: Embedding -> LSTM -> single sigmoid output
model = Sequential()
# map each of the top_words indices to an embedding_vecor_length-sized vector
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
# model.add(Dropout(0.2))  # disabled; Dropout is not among the Keras imports shown above

# one LSTM layer with 100 units
model.add(LSTM(100))
# model.add(Dropout(0.2))  # disabled; Dropout is not among the Keras imports shown above

# single sigmoid unit for the binary label
model.add(Dense(1, activation="sigmoid"))

model.compile(loss=objective, optimizer=optimizer, metrics=["accuracy"])

In [23]:
# RNN (LSTM) model summary
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 213,301.0
Trainable params: 213,301
Non-trainable params: 0.0
_________________________________________________________________
None


In [24]:
# callback for loss logging per epoch
class LossHistory(Callback):
    """Record the training and validation loss reported at the end of each epoch.

    After training, ``losses`` and ``val_losses`` hold one entry per completed
    epoch (``None`` where the corresponding key was absent from ``logs``).
    """

    def on_train_begin(self, logs=None):
        """Start with empty histories each time training begins."""
        self.losses = []
        self.val_losses = []

    def on_epoch_end(self, batch, logs=None):
        """Append the 'loss' and 'val_loss' values found in ``logs``.

        The first positional argument keeps its original name ``batch``; as
        this hook runs at the end of an epoch, Keras passes the epoch index.
        It is not used here.
        """
        # None instead of a shared mutable {} default; fall back to an empty
        # dict so .get() still works when no logs are supplied.
        logs = logs or {}
        self.losses.append(logs.get('loss'))
        self.val_losses.append(logs.get('val_loss'))

In [25]:
history = LossHistory()

# patience has to be smaller than the number of epochs, otherwise early
# stopping can never trigger (it was 4 with only 3 epochs)
early_stopping = EarlyStopping(monitor='val_loss', patience=1, verbose=1, mode='auto')

# Keras' validation_split takes the LAST 30% of the rows *before* shuffle=True
# is applied. If the rows are ordered by polarity, the validation set would
# then contain a single class, so shuffle the rows once up front with a fixed
# seed for reproducibility.
# NOTE: fancy indexing makes a shuffled copy of the padded matrix in memory.
shuffled_idx = np.random.RandomState(42).permutation(len(X_train_pad_sequence))

model.fit(X_train_pad_sequence[shuffled_idx],
          np.asarray(y_train)[shuffled_idx],
          batch_size=64,
          epochs=3,
          validation_split=0.3,
          verbose=1,
          shuffle=True,
          callbacks=[history, early_stopping])

Train on 1120000 samples, validate on 480000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x21680fa90>

In [None]:
# prediction = model.predict(X_test, verbose=1)
# print("Validation Log Loss: {}".format(log_loss(y_test, prediction)))