Import libraries

In [21]:
from pre_processing import pre_processing
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D

# Import dataset from: https://www.kaggle.com/kazanova/sentiment140

In [2]:
# Load the Sentiment140 CSV; the column labels are supplied explicitly via `names`.
dataset = pd.read_csv(
    '../dataset_sentiment_analysis.csv',
    names=["sentiment", "id", "date", "query", "user", "tweet"],
)

# View information about dataset

In [3]:
# Size of the dataset and the distinct label values it contains.
n_rows = len(dataset)
print("Dim dataset: ", n_rows)
print("Unique sentiment: ", dataset["sentiment"].unique())

# Per-column null counts (sentiment and tweet are the columns used later on).
print("Number of null elements in columns:\n", dataset.isnull().sum())

# Class balance: label 4 marks positive tweets, label 0 negative tweets.
n_positive = len(dataset[dataset["sentiment"] == 4])
n_negative = len(dataset[dataset["sentiment"] == 0])
print("Percentuage of positive tweets: ", (n_positive / n_rows) * 100, "%")
print("Percentuage of negative tweets: ", (n_negative / n_rows) * 100, "%")

Dim dataset:  1600000
Unique sentiment:  [0 4]
Number of null elements in columns:
 sentiment    0
id           0
date         0
query        0
user         0
tweet        0
dtype: int64
Percentuage of positive tweets:  50.0 %
Percentuage of negative tweets:  50.0 %


# Dataset manipulation:
- Drop unnecessary columns (id, date, query, user)
- Replace the sentiment ids 0,4 (negative, positive) with 0,1 (negative, positive)
- Convert the tweet column from object to str

In [4]:
# Keep only the label and the tweet text; id, date, query and user are not used.
dataset = dataset.drop(columns=["id", "date", "query", "user"])

# Map the positive label 4 -> 1 so labels become 0 (negative) / 1 (positive).
# Series.replace returns a new Series, so the result must be assigned back;
# calling it without assignment leaves the column unchanged.
dataset["sentiment"] = dataset["sentiment"].replace(4, 1)

# Convert the tweet column from object to str.
dataset["tweet"] = dataset["tweet"].astype("str")

print("New shape of dataset: ", dataset.shape)

New shape of dataset:  (1600000, 2)


In [6]:
# Run the project's pre-processing routine over every raw tweet.
# NOTE(review): the progress bars suggest cleaning, word tokenization and
# stop-word removal -- confirm against the pre_processing module.
processing_tweet = pre_processing(dataset["tweet"].tolist())

Processing tweets:: 100%|██████████| 1600000/1600000 [01:20<00:00, 19766.10it/s]
word tokenize process: 100%|██████████| 1600000/1600000 [02:33<00:00, 10446.58it/s]
Remove stop word: 100%|██████████| 1600000/1600000 [02:41<00:00, 9899.09it/s] 


Convert each tokenized tweet from a list of tokens to a single string

In [7]:
# Attach the pre-processed tweets, then collapse each token sequence into one
# space-separated string (every token is passed through str() first).
dataset["processed_tweet"] = processing_tweet
dataset["processed_tweet"] = dataset["processed_tweet"].map(
    lambda tokens: " ".join(str(token) for token in tokens)
)

In [8]:
# Show the first ten rows (raw tweet next to its processed form).
print(dataset.head(10))

   sentiment                                              tweet  \
0          0  @switchfoot http://twitpic.com/2y1zl - Awww, t...   
1          0  is upset that he can't update his Facebook by ...   
2          0  @Kenichan I dived many times for the ball. Man...   
3          0    my whole body feels itchy and like its on fire    
4          0  @nationwideclass no, it's not behaving at all....   
5          0                      @Kwesidei not the whole crew    
6          0                                        Need a hug    
7          0  @LOLTrish hey  long time no see! Yes.. Rains a...   
8          0               @Tatiana_K nope they didn't have it    
9          0                          @twittera que me muera ?    

                                     processed_tweet  
0       awww bummer shoulda got david carr third day  
1  upset update facebook texting might cry result...  
2  dived many times ball managed save 50 rest go ...  
3                   whole body feels itchy

# Save dataset to pickle file
### Warning: saving the dataset produces a file of about 250 MB

In [9]:
# Shuffle all rows. A fixed random_state makes the resulting order (and hence
# the saved pickle) reproducible across kernel restarts.
dataset = dataset.sample(frac=1, random_state=42)

In [10]:
# Shuffle all rows again. A fixed random_state keeps this permutation
# reproducible across kernel restarts.
dataset = dataset.sample(frac=1, random_state=42)

In [11]:
# Shuffle all rows once more. A fixed random_state keeps this permutation
# reproducible across kernel restarts.
dataset = dataset.sample(frac=1, random_state=42)

In [36]:
# Persist the pre-processed DataFrame. The context manager closes the file
# handle, so the pickle is flushed to disk even if dump() raises.
with open("dataset_sentiment_analysis.pickle", "wb") as files:
    pickle.dump(dataset, files)


# Creating ML model using LSTM


In [12]:
# Fit a word-level tokenizer (num_words=5000) on the processed tweets, encode
# each tweet as a sequence of integer word ids, and pad the sequences to a
# common length so they form one 2-D array.
processed_texts = dataset['processed_tweet']

tokenizer = Tokenizer(num_words=5000, split=' ')
tokenizer.fit_on_texts(processed_texts.values)

X = pad_sequences(tokenizer.texts_to_sequences(processed_texts))



In [47]:
# Persist the fitted tokenizer so the same word index can be reused later.
# The context manager closes the file handle once the dump finishes.
with open("tokenizer.pickle", "wb") as files:
    pickle.dump(tokenizer, files)

In [13]:
# Hyper-parameters.
max_features = 5000  # must equal the num_words given to the Tokenizer
embed_dim = 300      # embedding width the model is actually built with (previously
                     # hard-coded in the layer while embed_dim = 128 went unused)
lstm_out = 196       # number of LSTM units

# Embedding -> LSTM -> 2-unit dense output (one unit per sentiment class).
model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length=X.shape[1]))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
# NOTE(review): a 2-unit sigmoid with binary_crossentropy scores the two outputs as
# independent labels; for mutually exclusive one-hot targets, softmax with
# categorical_crossentropy is the usual pairing. Left unchanged so the architecture
# matches the model that was already trained and saved -- confirm before switching.
model.add(Dense(2, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 48, 300)           1500000   
_________________________________________________________________
lstm (LSTM)                  (None, 196)               389648    
_________________________________________________________________
dense (Dense)                (None, 2)                 394       
Total params: 1,890,042
Trainable params: 1,890,042
Non-trainable params: 0
_________________________________________________________________
None


In [14]:
# One-hot encode the sentiment labels: one column per distinct label value.
Y = pd.get_dummies(dataset['sentiment']).to_numpy()

In [15]:
# Hold out 20% of the samples for evaluation. A fixed random_state makes the
# shuffled split reproducible, so reported metrics can be compared across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.20, shuffle=True, random_state=42
)

In [16]:
# Shapes of the feature / label arrays for the train split, then the test split.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, labels.shape)

(1280000, 48) (1280000, 2)
(320000, 48) (320000, 2)


In [17]:
# Number of training samples (first dimension of the feature array).
print(X_train.shape[0])

1280000


# Check class balance in the split dataset

In [18]:
# Count the samples of each class per split. Column 0 of the one-hot labels is
# reported as negative, column 1 as positive; an entry equal to 1 marks membership.
print("num neg y_train:", int((y_train[:, 0] == 1).sum()))
print("num pos y_train:", int((y_train[:, 1] == 1).sum()))

print("num neg y_test:", int((y_test[:, 0] == 1).sum()))
print("num pos y_test:", int((y_test[:, 1] == 1).sum()))

num neg y_train: 640089
num pos y_train: 639911
num neg y_test: 159911
num pos y_test: 160089


# Execute this block to train the model in chunks

In [None]:
# Train on the training set one chunk at a time, evaluating and saving a
# checkpoint after every chunk.
chunk_size = 320000

# Stepping up to len(X_train) includes the final chunk; slicing past the end of
# the array is safe, so a shorter last chunk is handled as well.
for start in range(0, len(X_train), chunk_size):
    stop = start + chunk_size
    print(start)
    # Fit only on the current slice -- the loop index selects the chunk.
    model.fit(X_train[start:stop], y_train[start:stop], epochs=10, batch_size=64, verbose=1)
    model.evaluate(X_test, y_test, verbose=1)
    print("******* SAVING MODEL *******")
    model.save("sentiment_model_lstm")

# Execute this block to train the model on the whole training set at once

In [20]:
# Single training run over the full training set, followed by an evaluation on
# the held-out test set; the trained model is then written to disk.
model.fit(
    X_train,
    y_train,
    epochs=4,
    batch_size=1024,
    verbose=1,
)
print("***** EVALUATION *****")
model.evaluate(X_test, y_test, verbose=1)
model.save("sentiment_model_lstm")

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
***** EVALUATION *****
INFO:tensorflow:Assets written to: sentiment_model_lstm/assets


## Accuracy score with LSTM model: 0.7819

In [8]:
# Restore the model from the directory written by model.save(), replacing any
# in-memory `model`.
model = load_model(filepath="sentiment_model_lstm")

# KNN Model

In [None]:
# KNeighborsClassifier expects one class label per sample. Passing the 2-column
# one-hot matrix turns this into a multi-output problem in which each column is
# voted on independently, so a predicted row is not guaranteed to be a valid
# one-hot vector. Collapse the one-hot rows to class indices first.
y_train_labels = y_train.argmax(axis=1)
y_test_labels = y_test.argmax(axis=1)

# Sweep k = 1..10 and report the test accuracy for each setting.
for k in range(1, 11):
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(X_train, y_train_labels)
    y_pred = knn_model.predict(X_test)
    print(f'k = {k}, accuracy: ', accuracy_score(y_test_labels, y_pred))