In [5]:
# libraries for dataframes and array
import numpy as np
import pandas as pd

#text prepreprocessing library
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
cachedStopWords = stopwords.words("english")

#library to map the words to numbers to pass into 
#the neural network
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

#library to create the neural network and the various
#layers in the model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Flatten
from sklearn.metrics import confusion_matrix

# fix random seed for reproducibility
np.random.seed(7)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def load_as_list(fname):
    """Read in the dataset csv, store it in
    a pandas dataframe and """
    df = pd.read_csv(fname)
    id = df['id'].values.tolist()
    label = df['label'].values.tolist()
    tweets = df['tweet'].values.tolist()
    return tweets, label

tweets, label = load_as_list("train.csv")

# removes stopwords and punctuation from tweets list
for i in range(len(tweets)):
    tweets[i] = ' '.join([word for word in tweets[i].split() if word not in cachedStopWords])
    tweets[i] = re.sub(r'[^\w\s]', '', tweets[i])

print(f'The first 5 tweets: {tweets[:5]}')
print(f'The length of tweets list is: {len(tweets)}')
print(f'The length of labels list is: {len(label)}')

The first 5 tweets: ['user father dysfunctional selfish drags kids dysfunction run', 'user user thanks lyft credit cant use cause offer wheelchair vans pdx disapointed getthanked', 'bihday majesty', 'model love u take u time urð ðððð ððð', 'factsguide society motivation']
The length of tweets list is: 31962
The length of labels list is: 31962


In [7]:
# counts the number of unique vocab in the twitter tweets list
word_count = {}
for word in tweets:
    word_list = word.split()
    for sub_word in word_list:
        if sub_word in word_count:
            word_count[sub_word] += 1
        else:
            word_count[sub_word] = 1
print("executed")

# filters only the vocab words that occur 30 or more times in the 
# twitter tweets list
word_count = {key:val for key, val in word_count.items() if val >= 30}
print(f'The number of unique vocab in twitter list is: {len(word_count)}')

executed
The number of unique vocab in twitter list is: 1304


In [8]:
# removing the uncommon words and only keeping the vocab words 
# with a frequency of 30 or more in the tweets list
accepted_list = list(word_count.keys())

for word in tweets:
    word_list = word.split()
    final_word_list = [word for word in word_list if word in accepted_list]
    word = ' '.join(final_word_list)
  
print(f'The first 5 tweets: {tweets[:5]}')
print(f'The length of tweets list is: {len(tweets)}')
print(f'The length of labels list is: {len(label)}')

def find_max_list(list):
    list_len = [len(i) for i in list]
    print(max(list_len))

#print output#
find_max_list(tweets)
    

The first 5 tweets: ['user father dysfunctional selfish drags kids dysfunction run', 'user user thanks lyft credit cant use cause offer wheelchair vans pdx disapointed getthanked', 'bihday majesty', 'model love u take u time urð ðððð ððð', 'factsguide society motivation']
The length of tweets list is: 31962
The length of labels list is: 31962
127


In [9]:
vocab_size = 1304
encoded_docs = [one_hot(d, vocab_size) for d in tweets]


# pad documents to a max length of 128 words
max_length = 128
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

x_train = padded_docs[:25570]
y_train = label[:25570]

x_test = padded_docs[25570:]
y_test = label[25570:]

print(f'The first 5 tweets after encoding is: {x_train[:5]}')
print(len(x_train))
print(f'The first 5 labels are: {x_test[:5]}')
print(len(x_test))

The first 5 tweets after encoding is: [[1245  569  449  370  829  568  315  568    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]
 [1245 1245 1166  707  723  555  570 1068   24  995 1241 1166  845 1208
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0  

In [10]:
# define the model
model = Sequential()
model.add(Embedding(vocab_size, 256, input_length=max_length))
model.add(LSTM(1024, return_sequences=True))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# fit the model
model.fit(x_train, np.array(y_train), epochs= 5, batch_size = 16)

with open('model_summary_RNN.txt', 'w') as f:
    model.summary(print_fn=lambda x: f.write(x + '\n'))

2022-04-19 03:05:44.645298: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1052] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-19 03:05:44.898277: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1052] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-19 03:05:44.898857: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1052] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-19 03:05:44.901030: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1052] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-19 03:05:44.901561: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1052] successful NUMA node read f

Epoch 1/5


2022-04-19 03:05:52.863930: I tensorflow/stream_executor/cuda/cuda_dnn.cc:377] Loaded cuDNN version 8302


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [11]:
# evaluate the model
loss, accuracy = model.evaluate(np.array(x_train), np.array(y_train))
print('Accuracy: %f' % (accuracy*100))

Accuracy: 98.251855


In [12]:
#predict the model 
y_pred = model.predict(x_test)
y_pred = y_pred.flatten()
y_pred = np.where(y_pred > 0.5, 1, 0)
print(y_pred[:5])
print(y_test[:5])

#Confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f'Confusion Matrix Matrics (Test dataset size: {len(y_test)})')
print(f'True Negatives: {tn}')
print(f'False Negatives: {fn}')
print(f'True Positives: {tp}')
print(f'False Positives: {fp}')


[0 0 0 0 0]
[1, 0, 0, 0, 0]
Confusion Matrix Matrics (Test dataset size: 6392)
True Negatives: 5873
False Negatives: 302
True Positives: 141
False Positives: 76
