### In this project, i implemented a ML model using LSTM  and the objective is to predict the sentiment as positive or negative on twitter_training dataset.

Import libraries

In [1]:
import numpy as np
import pandas as pd # to read and manage the dataset
import tensorflow as tf # for deep learning models
from tensorflow import keras 
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences # used to pad a sequence with zeros if the sequence length is less than the required length
from tensorflow.keras.preprocessing.text import Tokenizer # used for text partitioning
from tensorflow.keras.models import Sequential # used the sequence model for sequence data analysis
from tensorflow.keras.layers import LSTM,Dense, Dropout, SpatialDropout1D # layers to be used to build the DL model
from tensorflow.keras.layers import Embedding 
from sklearn.feature_extraction.text import CountVectorizer




Read the dataset with the following variables names 'seq','brand','sentiment','text' and print the first 5 rows. 


In [2]:
df = pd.read_csv('twitter_training.csv', names = ['seq','brand','sentiment','text'])
df.head()

Unnamed: 0,seq,brand,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [3]:
#select only the 'sentiment'and 'text' columns to analyze
data = df[['sentiment','text']]

In [4]:
data

Unnamed: 0,sentiment,text
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...
...,...,...
74673,Positive,Just realized that the Windows partition of my...
74674,Positive,Just realized that my Mac window partition is ...
74675,Positive,Just realized the windows partition of my Mac ...
74676,Positive,Just realized between the windows partition of...


In [6]:
#print the unique labels (sentiments) and their frequencies
print(data["sentiment"].value_counts())

#filter out the Neutral labels
data = data[data['sentiment'] != 'Neutral']

#filter out the Irrelevant labels
data = data[data['sentiment'] != 'Irrelevant']

#print the selected labels and the frequencies
print(data["sentiment"].value_counts())


Negative      22538
Positive      20832
Neutral       18318
Irrelevant    12990
Name: sentiment, dtype: int64
Negative    22538
Positive    20832
Name: sentiment, dtype: int64


In [7]:
# converts sentiment labels to numeric values,using factorize() function.
sentiment_label = data.sentiment.factorize()
print(sentiment_label)

(array([0, 0, 0, ..., 0, 0, 0], dtype=int64), Index(['Positive', 'Negative'], dtype='object'))


In [8]:
# process the NAN values (null) by either drop the rows with NAN or replace by ' '
nan_idx = data[pd.isnull(data['text'])].index.tolist() # return all rows (index) with null data
data.loc[nan_idx, 'text'] = ' ' #replace the null value wilth " "

In [9]:
# Extract the values of the 'text' column as the input sequence
InputData = data.text.values # this is the input sequence. To process the sequence data (the text), we need to do the following:
# 1- tokenize the data using the Tokenizer() function, set the word limit to 5000
# 2- convert the text sequence into sequence of codes (numbers) using the texts_to_sequences function
# 3- pad the sequence with zeros if the length is less than 200 using the pad_sequence function

# num_words: the maximum number of words to keep, 
# based on word frequency. Only the most common num_words-1 words will be kept.
tokenizer = Tokenizer(num_words=5000) 
tokenizer.fit_on_texts(InputData)
encoded_docs = tokenizer.texts_to_sequences(InputData) # convert to codes

# maxlen: Optional Int, maximum length of all sequences. 
# If not provided, sequences will be padded to the length of the longest individual sequence.
padded_sequence = pad_sequences(encoded_docs, maxlen=200)
#print(tokenizer.word_index)

In [10]:
# to show the text and its code and then the padding to have fixed sequence size
print(InputData[0])
print(encoded_docs[0])
print(padded_sequence[0])

im getting on borderlands and i will murder you all ,
[271, 146, 14, 113, 4, 2, 60, 1508, 13, 27]
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0

In [11]:
# create a deep learning sequential model by adding the following layers (you can modify)
#The Dropout layer randomly sets input units to 0 
# with a frequency of rate at each step during training time, which helps prevent overfitting.

#Embedding layer enables us to convert each word into a fixed length vector of defined size.
embedding_vector_length = 32
vocab_size = len(tokenizer.word_index) + 1
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_length,     
                                     input_length=200) )
model.add(SpatialDropout1D(0.25))
model.add(LSTM(50, dropout=0.5, recurrent_dropout=0.5))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam', 
                           metrics=['accuracy'])
print(model.summary())



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 32)           662592    
                                                                 
 spatial_dropout1d (Spatial  (None, 200, 32)           0         
 Dropout1D)                                                      
                                                                 
 lstm (LSTM)                 (None, 50)                16600     
                                                                 
 dropout (Dropout)           (None, 50)                0         
                                                                 
 dense (Dense)               (None, 1)                 51        
                                                                 
Total params: 679243 (2.59 MB)
Trainable params: 679243 (2.59 MB)
Non-trainable params: 0 (0.00 Byte)
__________________

In [12]:
# train the model with the fit function on the dataset
Trained_Model = model.fit(padded_sequence,sentiment_label[0],
                  validation_split=0.2, epochs=1, batch_size=32)






In [13]:
# Use the trained model to test any sequence for example "the quality of this product is very bad"
# The same sequence analysis process needs to be done before applying the model

test_sentence ="the quality of this product is very bad"
tokenizer.fit_on_texts(test_sentence)
tw = tokenizer.texts_to_sequences([test_sentence])
print(tw)
tw = pad_sequences(tw,maxlen=200)

prediction = int(model.predict(tw).round().item())
print (sentiment_label[1][prediction])

[[1, 979, 7, 8, 1262, 6, 132, 131]]
Negative
