In [0]:
import nltk
from nltk import word_tokenize
from nltk import download
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import re

In [0]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package biocreative_ppi is already up-to-date!
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Package brown_tei is already up-to-date!
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Package cess_cat is already up-to-date!
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Package cess_esp is already up-to-date!
[nltk_data]    | Downloading packag

True

In [0]:
# read csv into a dataframe
train_data = pd.read_csv("/train_tweets.csv")

In [0]:
train_data.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [0]:
# exploratory data analysis
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
id       31962 non-null int64
label    31962 non-null int64
tweet    31962 non-null object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


In [0]:
# data clean up activities

# function to remove unwanted columns from dataframe
def drop_features(features,data):
    data.drop(features,inplace=True,axis=1)

# cleaning up the 'tweet' 
def process_tweet(tweet):
    return " ".join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])", "",tweet.lower()).split())

# adding a new column('processed_tweets') containing cleaned up tweets
train_data['processed_tweets'] = train_data['tweet'].apply(process_tweet)

In [0]:
train_data.head(10)

Unnamed: 0,id,label,tweet,processed_tweets
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit i cant use cause they d...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in ur
4,5,0,factsguide: society now #motivation,factsguide society now motivation
5,6,0,[2/2] huge fan fare and big talking before the...,22 huge fan fare and big talking before they l...
6,7,0,@user camping tomorrow @user @user @user @use...,camping tomorrow danny
7,8,0,the next school year is the year for exams.ð...,the next school year is the year for exams can...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...,we won love the land allin cavs champions clev...
9,10,0,@user @user welcome here ! i'm it's so #gr...,welcome here im its so gr8


In [0]:
drop_features(['id','tweet'],train_data)

In [0]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 2 columns):
label               31962 non-null int64
processed_tweets    31962 non-null object
dtypes: int64(1), object(1)
memory usage: 499.5+ KB


In [0]:
# separating out the labels for facilitating training process
# and splitting dataset into training and test sets

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(train_data["processed_tweets"],train_data["label"], test_size = 0.2)


'happy fathers day'

In [0]:
# vectorisation of training data using tfidf vectoriser

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

count_vect = CountVectorizer(stop_words='english')
transformer = TfidfTransformer(norm='l2',sublinear_tf=True)

x_train_counts = count_vect.fit_transform(x_train)
x_train_tfidf = transformer.fit_transform(x_train_counts)

In [0]:
print(x_train_counts.shape)
print(x_train_tfidf.shape)

(25569, 35467)
(25569, 35467)


In [0]:
# vectorisation of test data using tfidf vectoriser

x_test_counts = count_vect.transform(x_test)
x_test_tfidf = transformer.transform(x_test_counts)

In [0]:
print(x_test_counts.shape)
print(x_test_tfidf.shape)

(6393, 35467)
(6393, 35467)


# Training the model

# RandomForestClassifier

In [0]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=200)
model.fit(x_train_tfidf,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
predictions = model.predict(x_test_tfidf)
print(predictions)

[0 0 0 ... 0 0 0]


In [0]:
# model accuracy

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,predictions))

0.9607383075238543


In [0]:
# testing new set of data on the trained model

test_data = pd.read_csv("/content/drive/My Drive/NLP/Sentiment_Analysis/test_tweets.csv")

In [0]:
# data pre-processing of test dataset

test_data['processed_tweet'] = test_data['tweet'].apply(process_tweet)

drop_features(['tweet'],test_data)

train_counts = count_vect.fit_transform(train_data['processed_tweets'])
test_counts = count_vect.transform(test_data['processed_tweet'])

print(train_counts.shape)
print(test_counts.shape)

(31962, 41120)
(17197, 41120)


In [0]:
train_tfidf = transformer.fit_transform(train_counts)
test_tfidf = transformer.transform(test_counts)

In [0]:
model.fit(train_tfidf,train_data['label'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
predictions = model.predict(test_tfidf)

# putting the predictions into dataframe
final_result = pd.DataFrame({'id':test_data['id'],'label':predictions})

In [0]:
final_result

Unnamed: 0,id,label
0,31963,0
1,31964,1
2,31965,0
3,31966,0
4,31967,0
...,...,...
17192,49155,1
17193,49156,0
17194,49157,0
17195,49158,0


# Deep Learning

In [0]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Conv2D, Conv1D, LSTM, Bidirectional, GlobalMaxPooling1D, Flatten
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

In [0]:
# Train -test splitting of teh given dataset

X_train, X_test, y_train, y_test = train_test_split(train_data["processed_tweets"],train_data["label"], test_size=0.20)

The Tokenizer class from the keras.preprocessing.text module to create a word-to-index dictionary. In the word-to-index dictionary, each word in the corpus is used as a key, while a corresponding unique index is used as the value for the key

In [0]:
# further preprocessing of the cleaned up texts before applying word-embedding

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [0]:
# X_train and X_test are lists of sentences where each sentence is represented as a list of integers 
#(each word in the corpus is used as a key, while a corresponding unique index is used as the value for the key thus a list of integers)
print(len(X_train))

print(X_train[28])

# total number of different words present
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index
print(vocab_size)


25569
[138, 67, 18]
35932


In [0]:
# setting up maximum length for any sentence 
maxlen = 100

# padding: adding 0 at the end of the list until it reaches the max length of 100 
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [0]:
# Using GloVe embeddings to create our feature matrix. 
# load the GloVe word embeddings and create a dictionary that will contain words as keys and their corresponding embedding list as values

from numpy import array
from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()
glove_file = open('/glove.6B.100d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions
glove_file.close()

In [0]:
# creating an embedding matrix where each row number will correspond to the index of the word in the corpus
# the matrix will have 100 columns where each column will contain the GloVe word embeddings for the words in our corpus

embedding_matrix = zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [0]:
print(embedding_matrix.shape)
print(embedding_matrix.size)

(35932, 100)
3593200


# Classification with Simple Neural Network

In [0]:
model = Sequential()

# Since we are not training our own embeddings and using the GloVe embedding, set trainable to False and in the weights attribute we pass our own embedding matrix
# set output length to 100( = input length of embedding layer)
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen , trainable=False)

# The embedding layer is then added to the model
model.add(embedding_layer)

# since we are directly connecting our embedding layer to densely connected layer, we flatten the embedding layer.
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))











In [0]:
# model compilation
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

print(model.summary())



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 100)          3593200   
_________________________________________________________________
flatten_1 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 10001     
Total params: 3,603,201
Trainable params: 10,001
Non-trainable params: 3,593,200
_________________________________________________________________
None


**Explanation of the model compilation output:**

Since there are 35932 words in our corpus and each word is represented as a 100-dimensional vector, the number of trainable parameter will be 35932x100 in the embedding layer. In the flattening layer, we simply multiply rows and column. Finally in the dense layer the number of parameters are 10000 (from the flattening layer) and 1 for the bias parameter, for a total of 10001.

In [0]:
# training the model
history = model.fit(X_train, y_train, batch_size=128, epochs=6, verbose=1, validation_split=0.2)



Train on 20455 samples, validate on 5114 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [0]:
# model evaluation:
score = model.evaluate(X_test, y_test, verbose=1)
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

Test Score: 0.17698338600078062
Test Accuracy: 0.9421241982580257


**Observation**

The model is overfitting as there is a vast difference between the training and test accuracy

# Classification with a Convolutional Neural Network

In [0]:
model = Sequential()

embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen , trainable=False)
model.add(embedding_layer)

#one-dimensional convolutional layer with 128 features, or kernels. The kernel size is 5
model.add(Conv1D(128, 5, activation='relu'))

# add a global max pooling layer to reduce feature size
model.add(GlobalMaxPooling1D())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [0]:
print(model.summary())

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 100)          3593200   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 96, 128)           64128     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 3,657,457
Trainable params: 64,257
Non-trainable params: 3,593,200
_________________________________________________________________
None


**Explanation of the model compilation output:**
In the above case we don't need to flatten our embedding layer.Also note: the feature size is now reduced using the pooling layer.

In [0]:
# training the model:
history = model.fit(X_train, y_train, batch_size=128, epochs=6, verbose=1, validation_split=0.2)

Train on 20455 samples, validate on 5114 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [0]:
# model evaluation:

score = model.evaluate(X_test, y_test, verbose=1)
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

Test Score: 0.1336600723078082
Test Accuracy: 0.9571406224720097


**Observation**

CNN model is still overfitting as there is a vast difference between the training and test accuracy

# Classification with Recurrent Neural Network (LSTM)

In [0]:
model = Sequential()
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen , trainable=False)
model.add(embedding_layer)

# create an bidirectional LSTM layer with 128 neurons (number of nueurons can be adjusted to have better model performance)
model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))

model.add(Dense(512, activation='relu'))
model.add(Dropout(0.50))

model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [0]:
print(model.summary())

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 100, 100)          3593200   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               234496    
_________________________________________________________________
dense_4 (Dense)              (None, 512)               131584    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 513       
Total params: 3,959,793
Trainable params: 366,593
Non-trainable params: 3,593,200
_________________________________________________________________
None


In [0]:
# training the model
history = model.fit(X_train, y_train, batch_size=128, epochs=6, verbose=1, validation_split=0.2)

Train on 20455 samples, validate on 5114 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [0]:
# model evaluation

score = model.evaluate(X_test, y_test, verbose=1)

print("Test Score:", score[0])
print("Test Accuracy:", score[1])

Test Score: 0.25460220199272365
Test Accuracy: 0.9297669324986013


In [0]:
instance = train_data['processed_tweets'][1]
instance

'thanks for lyft credit i cant use cause they dont offer wheelchair vans in pdx disapointed getthanked'

In [0]:
# predicting any random tweet

instance = tokenizer.texts_to_sequences(instance)

flat_list = []
for sublist in instance:
    #print(sublist)
    for item in sublist:
        #print(item)
        flat_list.append(item)

flat_list = [flat_list]

instance = pad_sequences(flat_list, padding='post', maxlen=maxlen)

model.predict(instance)

array([[0.06360893]], dtype=float32)

As the score(0.06360893) is less than 0.5, the binary outcome of the result is considered to be 0 i.e it's not a hateful tweet