Import libraries

In [None]:
from pre_processing import pre_processing
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D

# Import dataset from: https://www.kaggle.com/kazanova/sentiment140

In [None]:
# names= supplies the column labels, so pandas reads every row of the file as data.
column_names = ["sentiment", "id", "date", "query", "user", "tweet"]
dataset = pd.read_csv('../dataset_sentiment_analysis.csv', names=column_names)

# View information about dataset

In [None]:
# Overall size and the set of label values present in the raw data
total_rows = len(dataset)
print("Dim dataset: ", total_rows)
print("Unique sentiment: ", dataset["sentiment"].unique())

# Count missing values in every column
print("Number of null elements in columns:\n", dataset.isnull().sum())

# Class balance: label 4 marks positive tweets, label 0 negative ones
positive_rows = len(dataset[dataset["sentiment"] == 4])
negative_rows = len(dataset[dataset["sentiment"] == 0])
print("Percentuage of positive tweets: ", (positive_rows / total_rows) * 100, "%")
print("Percentuage of negative tweets: ", (negative_rows / total_rows) * 100, "%")

# Dataset manipulation:
- Drop unnecessary information (id, date, query, user)
- Replace sentiment id from 0,4 (negative, positive) to 0,1 (negative, positive)
- Convert the tweet column from object to str

In [None]:
# Keep only the label and the text. errors="ignore" lets this cell be re-run
# after the columns are already gone instead of raising KeyError.
dataset = dataset.drop(columns=["id", "date", "query", "user"], errors="ignore")

# Map the positive label 4 to 1 so labels are 0 (negative) / 1 (positive).
# Series.replace returns a new Series, so the result has to be assigned back;
# calling it without assignment leaves the column unchanged.
dataset["sentiment"] = dataset["sentiment"].replace(4, 1)

# Make sure every tweet is a str
dataset["tweet"] = dataset["tweet"].astype("str")

# No shuffling here: the rows are shuffled later, once the preprocessed tweets
# (attached by position) are in the frame.
print("New shape of dataset: ", dataset.shape)

In [None]:
# Run the project's pre_processing helper over every tweet.
# Very slow on the full dataset (~8 hours for 1.6 million tweets), so the
# processing was split using MPI.
tweets = dataset["tweet"].tolist()
processing_tweet = pre_processing(tweets)

Assume the dataset was split into 4 preprocessing jobs (tweets 0–400k, 400k–800k, 800k–1.2M and 1.2M–1.6M) and that the result of each job was saved with the pickle library.

In [None]:
# Partial preprocessing results, listed in dataset row order
chunk_paths = [
    "../../processing_0_400k.pickle",
    "../../processing_400k_800k.pickle",
    "../../processing_800k_12kk.pickle",
    "../../processing_12kk_end.pickle",
]

# Concatenate the entries of every chunk into one flat list
dataset_processed = []
for chunk_path in chunk_paths:
    # NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
    # The with-block closes each file as soon as it has been read.
    with open(chunk_path, "rb") as chunk_file:
        dataset_processed.extend(pickle.load(chunk_file))
    

Convert each tokenized tweet from a list of tokens to a single space-separated string

In [None]:
# Join the tokens of each preprocessed tweet into one space-separated string
# and attach the result to the frame by position.
dataset["processed_tweet"] = [
    ' '.join(map(str, tokens)) for tokens in dataset_processed
]

# Save dataset to pickle file
### Warning: saving the dataset produces a file of about 250 MB

In [None]:
# Shuffle all rows (frac=1 keeps every row). A fixed seed makes the row order,
# and therefore the saved pickle and the later train/test split, reproducible.
dataset = dataset.sample(frac=1, random_state=42)

In [None]:
# Persist the prepared frame. The with-block flushes and closes the file even
# if pickling fails part-way through.
with open("dataset_sentiment_analysis.pickle", "wb") as dataset_file:
    pickle.dump(dataset, dataset_file)


# Creating ML model using LSTM


In [None]:
# Upper bound on the vocabulary used when turning text into index sequences
# (Keras keeps only the most frequent words, see Tokenizer's num_words).
MAX_WORDS = 1500

tokenizer = Tokenizer(num_words=MAX_WORDS, split=' ')

# Build the word index from the preprocessed tweets
tokenizer.fit_on_texts(dataset['processed_tweet'].values)

# Replace every tweet by its list of word indices
sequences = tokenizer.texts_to_sequences(dataset['processed_tweet'])

# Pad the sequences into one 2-D array; X.shape[1] is the padded length
X = pad_sequences(sequences)



In [None]:
# Persist the fitted tokenizer so the same word index can be reused later.
# The with-block flushes and closes the file even if pickling fails.
with open("tokenizer.pickle", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [None]:
# Layer sizes
vocab_size = 2000   # Embedding input dimension; must exceed every word index fed to the layer
embed_dim = 300     # size of each word vector (the value the Embedding layer was actually built with)
lstm_out = 196      # number of LSTM units

model = Sequential()
model.add(Embedding(vocab_size, embed_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
# Two mutually exclusive classes with one-hot labels: softmax turns the two
# outputs into a probability distribution, which categorical_crossentropy expects.
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

In [None]:
# One-hot encode the labels: one indicator column per distinct sentiment value
sentiment_dummies = pd.get_dummies(dataset['sentiment'])
Y = sentiment_dummies.values

In [None]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=42,
)

In [None]:
# Show the feature/label shapes of the training split, then of the test split
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, labels.shape)

In [None]:

# Train for 3 passes over the training split; batch_size is not passed,
# so Keras falls back to its default.
model.fit(x=X_train, y=y_train, epochs=3, verbose=1)


In [None]:
# Save the trained model twice: under a bare path and under a .h5 file name.
# NOTE(review): the names say "epoch_1" although the model is fit with epochs = 3 — confirm naming.
for model_path in ("model_lstm_epoch_1", "model_lstm_epoch_1.h5"):
    model.save(model_path)

In [None]:
# evaluate() returns the loss followed by the compiled metrics (accuracy here)
evaluation = model.evaluate(x=X_test, y=y_test, verbose=1)
score, acc = evaluation