## Google Drive setup

In [1]:
# from google.colab import drive
# drive.mount('/content/drive/')

In [2]:
# %cd "/content/drive/MyDrive/colab_not"  

## Local setup

In [3]:
import os
import sys
from os import path, pardir
import pathlib

# Make the local "scripts" folder (next to the notebook's working directory) importable.
# os.path.abspath('') resolves to the current working directory.
scripts = os.path.join(os.path.abspath(''), 'scripts')
already_registered = scripts in sys.path
if not already_registered:
    sys.path.append(scripts)

# Import our libraries
from helpers import data_path
import data_cleaning as dc

## Project II

In [7]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.regularizers import l2
import numpy as np
import string
import pandas as pd

In [8]:
def split_set(data_to_split, ratio=0.8, seed=None):
    """Randomly split a DataFrame/Series into two parts, row by row.

    Each row is assigned independently: it goes to the first part with
    probability `ratio`, otherwise to the second. The resulting sizes are
    therefore only approximately `ratio` / `1 - ratio` of the input.

    Args:
        data_to_split: pandas object that supports boolean-mask indexing and
            `reset_index` (e.g. a DataFrame).
        ratio: probability for a row to land in the first part.
        seed: optional seed for a private random generator, making the split
            reproducible. When None (default) the global NumPy random state
            is used, exactly as before.

    Returns:
        A two-element list `[first_part, second_part]`, each with a fresh
        0..n-1 index.
    """
    n_rows = len(data_to_split)
    if seed is None:
        # Keep the historical behavior: draw from NumPy's global random state
        draws = np.random.rand(n_rows)
    else:
        # Private generator: reproducible and leaves the global state untouched
        draws = np.random.RandomState(seed).rand(n_rows)
    mask = draws < ratio
    return [data_to_split[mask].reset_index(drop=True), data_to_split[~mask].reset_index(drop=True)]

In [9]:
batch_size = 64
seed = 0  # used for both the shuffle and the train/validation split


def _load_labelled_lines(file_path, label):
    """Load a text file with one sample per line into a labelled DataFrame.

    The file is read with plain Python instead of `pd.read_table(sep="\\n")`:
    recent pandas versions refuse the line terminator as a separator, and the
    pandas parser would also turn lines such as "NA" or "null" into NaN.

    Args:
        file_path: path of the text file to read.
        label: value stored in the 'pred' column for every row.

    Returns:
        DataFrame with the text in column 0 and the label in column 'pred'.
        Empty lines are skipped.
    """
    with open(file_path, encoding="utf-8") as text_file:
        lines = [line.rstrip("\r\n") for line in text_file]
    frame = pd.DataFrame({0: [line for line in lines if line]})
    frame['pred'] = label
    return frame


raw_train_pos = _load_labelled_lines("data/train/train_pos_clean.txt", label=1)
raw_train_neg = _load_labelled_lines("data/train/train_neg_clean.txt", label=0)


#########################################################################
########### TESTING ONLY - Take only 25% of the whole dataset ###########
#########################################################################
# pos_total = len(raw_train_pos)
# neg_total = len(raw_train_neg)
# raw_train_pos = raw_train_pos[:int(pos_total * .25)]
# raw_train_neg = raw_train_neg[:int(neg_total * .25)]
#########################################################################
#########################################################################
#########################################################################

# Form training data
raw_train = pd.concat((raw_train_neg, raw_train_pos))

# Separating training and validation data.
# split_set draws its mask from NumPy's global random state, so seed it here;
# otherwise every run yields a different train/validation split.
np.random.seed(seed)
raw_train_tr, raw_train_val = split_set(raw_train.sample(frac=1, random_state=seed))
target_train_tr = raw_train_tr.pop('pred')
target_train_val = raw_train_val.pop('pred')

# Turning Pandas dataframes into Tensorflow datasets.
# After popping 'pred' only the text column is left; squeeze drops that axis.
raw_train_ds = tf.data.Dataset.from_tensor_slices((np.squeeze(raw_train_tr.values), target_train_tr.values))
raw_val_ds = tf.data.Dataset.from_tensor_slices((np.squeeze(raw_train_val.values), target_train_val.values))


# Batchify data
raw_train_ds = raw_train_ds.batch(batch_size=batch_size)
raw_val_ds = raw_val_ds.batch(batch_size=batch_size)

print(
    "Number of batches in raw_train_ds: %d"
    % tf.data.experimental.cardinality(raw_train_ds)
)
print(
    "Number of batches in raw_val_ds: %d" % tf.data.experimental.cardinality(raw_val_ds)
)

Number of batches in raw_train_ds: 2497
Number of batches in raw_val_ds: 629


In [10]:
# Text pipeline settings
sequence_length = 280           # every sample is padded/truncated to this many tokens
embedding_dim = 32
max_features = len(raw_train)   # upper bound on the vocabulary size handed to the vectorizer

vectorize_layer = TextVectorization(
    output_mode="int",
    max_tokens=max_features,
    output_sequence_length=sequence_length,
)

# The vocabulary is learned from the training texts only, so drop the labels first
text_ds = raw_train_ds.map(lambda text, _label: text)
vectorize_layer.adapt(text_ds)

# We want to know our vocabulary size, so that we can pass it to the embedding layer
vocab_size = len(vectorize_layer.get_vocabulary())

In [11]:
def vectorize_text(text, label):
    """Turn a (text, label) pair into (token ids, label) using `vectorize_layer`.

    A trailing axis is appended to `text` before it is handed to the
    vectorization layer; `label` is passed through unchanged.
    """
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label

# Vectorize the data, then cache the result and prefetch asynchronously
# (buffer of 10 elements) for best performance on GPU.
train_ds = raw_train_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
val_ds = raw_val_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
# test_ds = raw_test_ds.map(vectorize_text)
# test_ds = test_ds.cache().prefetch(buffer_size=10)

In [12]:
# Twitter sentiment model: learned word embeddings, flattened and fed to a
# single L2-regularized sigmoid unit (binary classification).
model = tf.keras.Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=sequence_length))
model.add(layers.Flatten())
model.add(layers.Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01)))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# only adds a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 280, 32)           2866208   
_________________________________________________________________
flatten (Flatten)            (None, 8960)              0         
_________________________________________________________________
dense (Dense)                (None, 1)                 8961      
Total params: 2,875,169
Trainable params: 2,875,169
Non-trainable params: 0
_________________________________________________________________
None


Importing the test data.

## IMDB Dataset tryouts

In [21]:
# Taken from https://www.liip.ch/en/blog/sentiment-detection-with-keras-word-embeddings-and-lstm-deep-learning-networks
#
# Stand-alone experiment on the IMDB reviews dataset. The network is bound to
# `imdb_model` (not `model`) so that it does not replace the Twitter model
# built earlier, which the following cells fit on `train_ds` and use for the
# test-set predictions.

# Import the data
from tensorflow.keras.datasets import imdb
top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

# Truncate and pad the review sequences
from tensorflow.keras.preprocessing import sequence
max_review_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

# Build the model
embedding_vector_length = 32
imdb_model = tf.keras.Sequential()
imdb_model.add(layers.Embedding(top_words, embedding_vector_length, input_length=max_review_length))
imdb_model.add(layers.Flatten())
imdb_model.add(layers.Dense(1, activation='sigmoid'))
imdb_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
# NOTE: the IMDB test split doubles as validation data here, so the reported
# validation accuracy is not an independent estimate.
imdb_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)

25000
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fe6bd8951f0>

## Our Twitter Dataset

In [10]:
epochs = 5
# Fit the model on the training set, monitoring the validation set after each epoch.
model.fit(
    train_ds,
    epochs=epochs,
    validation_data=val_ds,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fcc02e1de50>

In [292]:
# test_ds = tf.data.TextLineDataset("data/test/test_data_cl.txt")

In [41]:
def vectorize_text_test(text):
    """Turn unlabelled text into token ids using `vectorize_layer`.

    Label-free counterpart of `vectorize_text`: a trailing axis is appended
    to `text` before it is handed to the vectorization layer.
    """
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text)


# Build the test pipeline from the file itself so this cell runs on a fresh
# kernel and can be re-run safely (previously it re-mapped an undefined
# `test_ds` onto itself).
# NOTE(review): the path is the one from the commented-out loader above;
# confirm it points to the cleaned test file with one sample per line.
raw_test_ds = tf.data.TextLineDataset("data/test/test_data_cl.txt").batch(batch_size)

test_ds = raw_test_ds.map(vectorize_text_test)

test_ds = test_ds.cache().prefetch(buffer_size=10)

In [42]:
# Run the model over the vectorized test set; `preds` holds its raw outputs
preds = model.predict(test_ds)

In [43]:
# Threshold the model outputs at 0.5 and encode the classes as 1 / -1
preds = np.where(preds > 0.5, 1, -1)

In [46]:
preds_path = "predictions/preds.csv"

# open(..., "w") does not create missing folders, so make sure the target exists
preds_dir = os.path.dirname(preds_path)
if preds_dir:
    os.makedirs(preds_dir, exist_ok=True)

with open(preds_path, "w") as f:
    f.write("Id,Prediction\n")
    # Flatten first: if `preds` is 2-D (one single-element row per sample),
    # int() would be applied to an array, which newer NumPy versions deprecate.
    # Ids are 1-based.
    for i, y in enumerate(np.ravel(preds), start=1):
        f.write(str(i) + "," + str(int(y)) + "\n")
print(f"Done. Predictions can be found in {preds_path}.")

Done. Predictions can be found in predictions/preds.csv.
