In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup 
import re
import nltk
nltk.download('omw-1.4')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, SpatialDropout1D, Embedding, LSTM
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\sheng\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sheng\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# read training data
data = pd.read_csv('./data/twitter_training.csv',header=None)
data.head()

Unnamed: 0,0,1,2,3
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [3]:
data_2 = data[data[2]!='Irrelevant']
data_2.drop(columns=[0,1],inplace=True)
data_2.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,2,3
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...


In [4]:
# removing not and no
my_stopwords = stopwords.words("english")
my_stopwords.remove('not')
my_stopwords.remove('no')

In [5]:

# intializing method for lemmatizing words
lemmatizer = WordNetLemmatizer()

# now creating funtion to clean our data
def cleaned_review(review):
    if isinstance(review,str):
        # remove any html tags
        new_review = BeautifulSoup(review).get_text()
        
        # remove urls from reviews
        no_urls = new_review.replace('http\S+', '').replace('www\S+', '')
        
        # remove any non-letters
        clean_review = re.sub("[^a-zA-Z]", " ", no_urls)
        
        # convert whole sentence to lowercase and split
        new_words = clean_review.lower().split()
        
        # converting stopwords list to set for faster search
        stops = set(my_stopwords)
        
        # using stopwords to remove irrelavent words and lemmatizing the final output
        final_words = [lemmatizer.lemmatize(word) for word in new_words if not word in stops]
        # return the final result
        return (" ".join(final_words))
    else:
        cleaned_review(str(review))

In [6]:
data_2['msg'] = data_2[3].apply(lambda x:cleaned_review(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_2['msg'] = data_2[3].apply(lambda x:cleaned_review(x))


In [7]:
sentiment = pd.get_dummies(data_2[2])
data_3 = pd.concat([data_2,sentiment],axis=1)
train_data=data_3[["msg","Negative","Neutral","Positive"]].dropna()
train_data.head()

Unnamed: 0,msg,Negative,Neutral,Positive
0,im getting borderland murder,0,0,1
1,coming border kill,0,0,1
2,im getting borderland kill,0,0,1
3,im coming borderland murder,0,0,1
4,im getting borderland murder,0,0,1


In [8]:
max_len = 40
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['msg'].values)
sequences = tokenizer.texts_to_sequences(train_data['msg'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
X = pad_sequences(sequences, maxlen=max_len, padding='post')
y=train_data[['Negative','Neutral','Positive']]

Found 23091 unique tokens.


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=42)
X_train.shape,X_test.shape

((60509, 40), (612, 40))

In [10]:

n_most_words = 23092
n_dim = 50

# Now we create our model
model = Sequential()
model.add(Embedding(n_most_words,n_dim,input_length = X_train.shape[1]))
model.add(SpatialDropout1D(0.25))
model.add(LSTM(100,dropout=0.25,recurrent_dropout=0.25))
model.add(Dense(50,activation='relu'))
model.add(Dense(25,activation='relu'))
model.add(Dense(3,activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

early_stop = EarlyStopping(patience=10)
model_history = model.fit(X_train,y_train,epochs=40,validation_split = 0.1, batch_size = 128 ,callbacks=[early_stop])

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40


In [11]:
import pickle
model.save('my_saved_model')



INFO:tensorflow:Assets written to: my_saved_model\assets


INFO:tensorflow:Assets written to: my_saved_model\assets


In [26]:
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:

# Checking the model loss and accuracy
import tensorflow as tf
loaded_model = tf.keras.models.load_model('my_saved_model')
accr = loaded_model.evaluate(X_test,y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.365
  Accuracy: 0.900


In [31]:
def find_sentiment(review,tokenizer,model):
    model_input=cleaned_review(review)
    seq = tokenizer.texts_to_sequences([model_input])
    padded = pad_sequences(seq, maxlen=max_len)
    pred = model.predict(padded)
    label = ['Negative','Neutral','Positive']
    return label[np.argmax(pred)]

with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

input_string="I have a lot of projects to do"


In [32]:
find_sentiment(input_string,tokenizer,loaded_model)



'Positive'