In [1]:
import pandas as pd    # to load dataset
import numpy as np     # for mathematic equation
from nltk.corpus import stopwords   # to get collection of stopwords
from sklearn.model_selection import train_test_split       # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
import re

In [3]:
data = pd.read_csv("twitter_sentiment_data.csv")

print(data)

       sentiment                                            message  \
0             -1  @tiniebeany climate change is an interesting h...   
1              1  RT @NatGeoChannel: Watch #BeforeTheFlood right...   
2              1  Fabulous! Leonardo #DiCaprio's film on #climat...   
3              1  RT @Mick_Fanning: Just watched this amazing do...   
4              2  RT @cnalive: Pranita Biswasi, a Lutheran from ...   
...          ...                                                ...   
43938          1  Dear @realDonaldTrump,\nYeah right. Human Medi...   
43939          1  What will your respective parties do to preven...   
43940          2  RT @MikkiL: UN Poll Shows Climate Change Is th...   
43941          0  RT @taehbeingextra: i still can$q$t believe th...   
43942          1  @Likeabat77 @zachhaller \n\nThe wealthy + foss...   

                  tweetid  
0      792927353886371840  
1      793124211518832641  
2      793124402388832256  
3      793124635873275904  
4      

In [4]:
#StopWords
english_stops = set(stopwords.words('english'))

In [6]:
#Load and clean Dataset 
#Encode Sentiments 
def load_dataset():
    df = pd.read_csv(r"twitter_sentiment_data.csv")
    x_data = df['message']       # message/Input
    y_data = df['sentiment']    # Sentiment/Output

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case
    
    # ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace(0, 0)
    y_data = y_data.replace(1, 1)
    y_data = y_data.replace(2, 2)
    y_data = y_data.replace(-1, 3)
    return x_data, y_data

x_data, y_data = load_dataset()

print('Message')
print(x_data, '\n')
print('Sentiment')
print(y_data)

Message
0        [tiniebeany, climate, change, interesting, hus...
1        [rt, natgeochannel, watch, beforetheflood, rig...
2        [fabulous, leonardo, dicaprio, film, climate, ...
3        [rt, mick, fanning, just, watched, amazing, do...
4        [rt, cnalive, pranita, biswasi, lutheran, odis...
                               ...                        
43938    [dear, realdonaldtrump, yeah, right, human, me...
43939    [what, respective, parties, prevent, climate, ...
43940    [rt, mikkil, un, poll, shows, climate, change,...
43941    [rt, taehbeingextra, still, q, believe, gif, t...
43942    [likeabat, zachhaller, the, wealthy, fossil, f...
Name: message, Length: 43943, dtype: object 

Sentiment
0        3
1        1
2        1
3        1
4        2
        ..
43938    1
43939    1
43940    2
43941    0
43942    1
Name: sentiment, Length: 43943, dtype: int64


In [7]:
#Split dataset 
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)

Train Set
27202    [rt, enviro, voter, in, hrs, maria, went, cat,...
27644    [rt, allsoulkind, christiebeaches, johnnyfr, t...
28618    [rt, karen, douglas, our, chapter, climate, ch...
35219    [rt, usatoday, military, leaders, sounding, an...
5766     [rt, katrinanation, trump, denial, catastrophi...
                               ...                        
22585    [rt, time, justin, trudeau, kayaked, family, t...
41367    [if, treated, terrorism, urgency, considering,...
43394    [rt, katz, clinton, donald, thinks, climate, c...
32561    [ask, scientist, binghamton, university, can, ...
34919    [a, new, study, suggests, warm, blooded, anima...
Name: message, Length: 35154, dtype: object 

20267    [rt, redtraccoon, clean, energy, path, cleaner...
8723     [rt, miltonwolfmd, nice, try, thehill, the, te...
37838    [crayola, common, core, lessons, promote, maoi...
8381           [robhunterswords, global, warming, i, tell]
19949    [rt, jkuylenstierna, world, leaders, ignore, t...


In [8]:
#Fct for getting the maximum review length
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))

In [9]:
# ENCODE Message
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum message length: ', max_length)

Encoded X Train
 [[    5  3375  2193 ...   136   142   799]
 [    5 17153 17154 ...     7  1785     0]
 [    5  6082 11453 ... 17160 17161 17162]
 ...
 [    5 63296   273 ...     9    40     0]
 [  452   167  6441 ...  3007     0     0]
 [   27    21    95 ...     3     4 63297]] 

Encoded X Test
 [[   5 8051  354 ... 3339   24    1]
 [   5  714  749 ...  327  869  508]
 [ 961 2880 5325 ...  881 6946 2825]
 ...
 [   5 8385   11 ... 8204   61   79]
 [ 389 6712  293 ...    0    0    0]
 [   5 1836   65 ...   15    3    4]] 

Maximum message length:  15


In [10]:
# ARCHITECTURE
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(4, activation='softmax'))
model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 15, 32)            2025536   
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 4)                 260       
                                                                 
Total params: 2,050,628
Trainable params: 2,050,628
Non-trainable params: 0
_________________________________________________________________
None


In [11]:
#Training 
checkpoint = ModelCheckpoint(
    'models/LSTM.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

In [12]:
model.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint])

Epoch 1/5
Epoch 1: accuracy improved from -inf to 0.63239, saving model to models\LSTM.h5
Epoch 2/5
Epoch 2: accuracy improved from 0.63239 to 0.80176, saving model to models\LSTM.h5
Epoch 3/5
Epoch 3: accuracy improved from 0.80176 to 0.89680, saving model to models\LSTM.h5
Epoch 4/5
Epoch 4: accuracy improved from 0.89680 to 0.94564, saving model to models\LSTM.h5
Epoch 5/5
Epoch 5: accuracy improved from 0.94564 to 0.96922, saving model to models\LSTM.h5


<keras.callbacks.History at 0x249a1a02fd0>

In [13]:
#Testing 
predict_x = model.predict(x_test) 
y_pred = np.argmax(predict_x,axis=1)

true = 0
for i, y in enumerate(y_test):
    if y == y_pred[i]:
        true += 1

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))

Correct Prediction: 6207
Wrong Prediction: 2582
Accuracy: 70.62236887017863


In [14]:
loaded_model = load_model('models/LSTM.h5')

In [17]:
message = str(input('Message : '))

Message : @tiniebeany climate change is an interesting hustle as it was global warming but the planet stopped warming for 15 yes while the suv boom


In [18]:
# Pre-process input
regex = re.compile(r'[^a-zA-Z\s]')
review = regex.sub('', message)
print('Cleaned: ', message)

words = review.split(' ')
filtered = [w for w in words if w not in english_stops]
filtered = ' '.join(filtered)
filtered = [filtered.lower()]

print('Filtered: ', filtered)

Cleaned:  @tiniebeany climate change is an interesting hustle as it was global warming but the planet stopped warming for 15 yes while the suv boom
Filtered:  ['tiniebeany climate change interesting hustle global warming planet stopped warming  yes suv boom']


In [19]:
tokenize_words = token.texts_to_sequences(filtered)
tokenize_words = pad_sequences(tokenize_words, maxlen=max_length, padding='post', truncating='post')
print(tokenize_words)

[[30314     1     2   767 30315     6     7    81  1288     7   274  4586
   4277     0     0]]


In [20]:
result = loaded_model.predict(tokenize_words)
y_pred = np.argmax(result,axis=1)
print(y_pred)

[3]
