In [None]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from google.colab import files
from google.colab import drive
import io

#mounting google drive
drive.mount("/content/gdrive")

#project folder on the mounted drive: the data is read from here and the models are saved here
#(relative path, so it only resolves while the working directory is /content; note the trailing slash)
project_path = "gdrive/My Drive/Colab Notebooks/Ignition Hack/"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


**Defining hyperparameters and settings in one place so they are easy to find and tune**

In [None]:
#hyperparameters and preprocessing settings shared by the cells below

#number of leading rows used for training; the remaining rows are used for validation
training_size = 700_000

#tokenizer settings
vocab_size = 28_000
oov_tok = "<OOV>"

#sequence settings: every tweet is padded/truncated at the end to max_length tokens
max_length = 32
padding_type = 'post'
trunc_type = 'post'

#width of the learned word embeddings
embedding_dim = 16

**Data preprocessing**

In [None]:
#loading the full training CSV file
raw_data = pd.read_csv(project_path + "/data/training_data.csv")

#clean the tweet text for case and punctuation:
# 1. lower-case and collapse runs of whitespace into single spaces
# 2. strip every character that is not a word character or whitespace
#    (raw string + regex=True: newer pandas versions treat the pattern as a literal by default)
cleaned_text = (
    raw_data["Text"]
    .apply(lambda text: " ".join(text.lower().split()))
    .str.replace(r'[^\w\s]', '', regex=True)
)

#build a new DataFrame instead of aliasing raw_data, so raw_data really stays raw
clean_data = raw_data.assign(Text=cleaned_text)

#display a sample from the cleaned DataFrame
clean_data.head()

Unnamed: 0,ID,User,Text,Sentiment
0,864192,Carly_FTS,i heart filling up dennisschaub desk 1 it mean...,1
1,523691,Open_Sourcing,sociomat people create prettier younger and b...,1
2,584154,xxcharlx,no way i dont want the tour to end,0
3,1527961,andreapuddu,hemalradia hi amazing brother sending limitles...,1
4,28609,umbec,flockmaster they are chocolate,1


In [None]:
#keeping only the Text and Sentiment columns by removing ID and User
clean_data = clean_data.drop(['ID', 'User'], axis='columns')

#display the new DataFrame size as (rows, columns)
clean_data.shape

(1000000, 2)

In [None]:
#display a sample from the reshaped DataFrame (ID and User were dropped in the previous cell)
clean_data.head()

Unnamed: 0,Text,Sentiment
0,i heart filling up dennisschaub desk 1 it mean...,1
1,sociomat people create prettier younger and b...,1
2,no way i dont want the tour to end,0
3,hemalradia hi amazing brother sending limitles...,1
4,flockmaster they are chocolate,1


In [None]:
#separating the DataFrame into two Series: the tweet text (inputs) and the sentiment labels (targets)
sentences, labels = clean_data['Text'], clean_data['Sentiment']

In [None]:
#display a sample from the sentences Series (the cleaned tweet text)
sentences.head()

0    i heart filling up dennisschaub desk 1 it mean...
1    sociomat  people create prettier younger and b...
2                   no way i dont want the tour to end
3    hemalradia hi amazing brother sending limitles...
4                       flockmaster they are chocolate
Name: Text, dtype: object

In [None]:
#display a sample from the labels Series (the Sentiment column)
labels.head()

0    1
1    1
2    0
3    1
4    1
Name: Sentiment, dtype: int64

In [None]:
#positional split: the first training_size rows are used for training, the rest for validation
training_sentences, testing_sentences = sentences.iloc[:training_size], sentences.iloc[training_size:]
training_labels, testing_labels = labels.iloc[:training_size], labels.iloc[training_size:]

**Tokenizing and padding the tweets**

In [None]:
#building the tokenizer from the training split only (vocabulary capped by vocab_size, unknown words use oov_tok)
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(training_sentences)

#word -> index dictionary learned by the tokenizer, and the reverse index -> word lookup
word_index = tokenizer.word_index
word_index_flip = {index: word for word, index in word_index.items()}

In [None]:
#tokenizing the training tweets into integer sequences, then padding/truncating them to max_length
#(the sequences get their own names so the raw text is not overwritten and this cell can be re-run safely)
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

#same steps for the testing split, using the tokenizer fitted on the training split
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [None]:
#converting the padded sequences and the labels to numpy arrays before handing them to the model
training_padded, training_labels = np.array(training_padded), np.array(training_labels)
testing_padded, testing_labels = np.array(testing_padded), np.array(testing_labels)

In [None]:
#building the model architecture layer by layer:
#embedding -> average over the sequence -> one hidden dense layer -> single sigmoid output
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(48, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

#binary classification set-up: cross-entropy loss, adam optimizer, accuracy reported during training
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
#print the layer-by-layer summary (output shapes and parameter counts) of the compiled model
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 32, 16)            448000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 48)                816       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 49        
Total params: 448,865
Trainable params: 448,865
Non-trainable params: 0
_________________________________________________________________


**Training the model**

In [None]:
#making directory to store best performing models
!mkdir "gdrive/My Drive/Colab Notebooks/Ignition Hack/best"


model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = "gdrive/My Drive/Colab Notebooks/Ignition Hack/best", 
    save_weights_only = False, 
    monitor = 'val_accuracy', 
    mode = 'auto', 
    save_best_only = True)

model.fit(training_padded, training_labels, epochs=20, validation_data=(testing_padded, testing_labels), verbose=1, callbacks=model_checkpoint_callback)

mkdir: cannot create directory ‘gdrive/My Drive/Colab Notebooks/Ignition Hack/best’: File exists
Epoch 1/20
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/Ignition Hack/best/assets
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f8d14339860>

In [None]:
#saving the model as it stands after the last training epoch, for future use
#(the best epoch by val_accuracy is stored separately by the checkpoint callback in the training cell)
#NOTE(review): the original note says the name "must be .model" - the reason is not visible here, confirm before renaming
model.save(project_path + "Sentiment_predictor.model")

INFO:tensorflow:Assets written to: gdrive/My Drive/Colab Notebooks/Ignition Hack/Sentiment_predictor.model/assets


**Using the best model (saved by the checkpoint callback during training) to predict the sentiment of the judgement data**

In [None]:
#loading the full judgement CSV file
judgement_data = pd.read_csv(project_path + "/data/judgement.csv")

#clean the text with the same lower-casing and punctuation stripping used for the training tweets
#(raw string + regex=True: newer pandas versions treat the pattern as a literal by default)
judgement_data["Text"] = (
    judgement_data["Text"]
    .apply(lambda text: " ".join(text.lower().split()))
    .str.replace(r'[^\w\s]', '', regex=True)
)

#display a sample from the cleaned DataFrame
judgement_data.head()

Unnamed: 0,ID,User,Text
0,599303,sauce_pot,im on my way to miss kacys 4th bday party at b...
1,359673,lovelyritaz,ripestapple i might not be the right person to...
2,391095,Dannymassacur,zomgkris i know it is
3,820049,Ms_Lady09,mii vision is blurryim goin to bednite niteihu...
4,658429,EvilSue,tealou anyways i did something good for someo...


In [None]:
#encoding the judgement tweets with the already-fitted tokenizer, then padding/truncating to max_length
text_encoded = tokenizer.texts_to_sequences(judgement_data['Text'])
text_padded = pad_sequences(
    text_encoded,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

In [None]:
#inspect the padded judgement sequences (padding_type is 'post', so padding sits at the end of each row)
text_padded

array([[  14,   15,    6, ...,    0,    0,    0],
       [   1,    2,  278, ...,    0,    0,    0],
       [   1,    2,   58, ...,    0,    0,    0],
       ...,
       [   1,   82,   14, ...,    0,    0,    0],
       [2191,    1,  108, ...,    0,    0,    0],
       [  44,  792,  478, ...,    0,    0,    0]], dtype=int32)

In [None]:
#loading the best model written by the checkpoint callback during training
model = keras.models.load_model(project_path + "best")

#make predictions on the judgement data: one sigmoid output in [0, 1] per tweet
probabilities = model.predict(text_padded)

#rounding to integer 0/1 labels and flattening (n, 1) -> (n,) so the result fits a single DataFrame column
predictions = np.around(probabilities).astype(int).ravel()

In [None]:
#reloading the judgement CSV so the exported rows carry the original (uncleaned) tweet text,
#and attaching the predicted labels as a new Sentiment column
judgement_data = pd.read_csv(project_path + "/data/judgement.csv").assign(Sentiment=predictions)

#saving as a CSV with the predicted values
judgement_data.to_csv(project_path + "/data/predictions.csv")