## Google Drive setup

In [1]:
# from google.colab import drive
# drive.mount('/content/drive/')

In [2]:
# %cd "/content/drive/MyDrive/colab_not"  

## Local setup

In [3]:
import os
import sys
from os import path, pardir
import pathlib

# Make the local "scripts" folder importable.
# The folder is resolved relative to the current working directory.
scripts = str(pathlib.Path(os.path.abspath('')) / 'scripts')
if sys.path.count(scripts) == 0:
    sys.path.append(scripts)

## Project II

In [4]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.regularizers import l2
import numpy as np
import string
import pandas as pd

In [5]:
def split_set(data_to_split, ratio=0.8, seed=None):
    """Randomly split a pandas object into two parts (e.g. training / validation).

    Each row is assigned independently to the first part with probability
    ``ratio``, so the resulting sizes only approximate ``ratio`` and
    ``1 - ratio``.

    Args:
        data_to_split: pandas DataFrame (or Series) to split.
        ratio: Probability for a row to land in the first part.
        seed: Optional integer seed. When given, the split is reproducible and
            NumPy's global random state is left untouched. When ``None``
            (the default), the global NumPy random state is used.

    Returns:
        A list ``[first_part, second_part]``; both parts have their index reset.
    """
    n_rows = len(data_to_split)
    if seed is None:
        draws = np.random.rand(n_rows)
    else:
        # Dedicated generator: same seed -> same split, whatever ran before
        draws = np.random.RandomState(seed).rand(n_rows)
    mask = draws < ratio
    return [data_to_split[mask].reset_index(drop=True), data_to_split[~mask].reset_index(drop=True)]

# Data pre-processing experiments

## Purpose

The purpose of this section is to run **small pre-processing experiments**, in order to **assess whether a given pre-processing step
has a positive impact on the overall score** of the model predictions.

This part is _self-contained_ on purpose: it imports its own data and creates its own (very fast) model.

## How it works

We will construct _two datasets_: the **original dataset**, and **one with a single small pre-processing step applied to it**.

We will then **run our model on those two datasets** and compare the scores obtained on each of them.

We apply all of these pre-processing steps directly in the code, because we want to be able to test many experiments quickly.

## Process

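1. Make ONE CHANGE to the tweet processing in `preprocess_tweet(tweet)` inside `data_cleaning.py` in the repo.
2. Re-run the cell below and compare the scores with those obtained on the original dataset.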

In [25]:
# Autoreload the data_cleaning library, for faster tests
%load_ext autoreload
%autoreload 1
%aimport data_cleaning

def _read_tweets(file_path):
    """Read a text file into a one-column DataFrame, one row per line.

    Reading the lines directly replaces ``pd.read_table(..., sep="\\n")``:
    recent pandas versions reject the line terminator as a separator, and
    pandas' default NA parsing turns lines such as "NA" or "null" into NaN
    instead of keeping them as text.

    Args:
        file_path: Path of a UTF-8 text file.

    Returns:
        A DataFrame with a single ``tweet`` column. Blank lines are skipped.
    """
    with open(file_path, encoding="utf-8") as tweets_file:
        # Only the line terminator is removed; the text itself is kept verbatim
        tweets = [line.rstrip("\n") for line in tweets_file if line.strip()]
    return pd.DataFrame({"tweet": tweets})


# Import the raw data
raw_train_pos = _read_tweets("data/train/train_pos_textblob.txt")
raw_train_neg = _read_tweets("data/train/train_neg.txt")

# Add the labels to the raw datasets (1 for the "pos" file, 0 for the "neg" file)
raw_train_pos['pred'] = 1
raw_train_neg['pred'] = 0

# Drop duplicates
raw_train_pos = raw_train_pos.drop_duplicates()
raw_train_neg = raw_train_neg.drop_duplicates()

#########################################################################
########### TESTING ONLY - Take only 25% of the whole dataset ###########
#########################################################################
# pos_total = len(raw_train_pos)
# neg_total = len(raw_train_neg)
# raw_train_pos = raw_train_pos[:int(pos_total * .25)]
# raw_train_neg = raw_train_neg[:int(neg_total * .25)]
#########################################################################
#########################################################################
#########################################################################

# Form training data
raw_train = pd.concat((raw_train_neg, raw_train_pos))

# Separating training and validation data.
# The shuffle below is seeded, but split_set draws its mask from NumPy's global
# random state: seed it too, otherwise every run gets a different
# train/validation split and the scores of two experiments are not comparable.
np.random.seed(0)
raw_train_tr, raw_train_val = split_set(raw_train.sample(frac=1, random_state=0))
target_train_tr = raw_train_tr.pop('pred')
target_train_val = raw_train_val.pop('pred')

# Turning Pandas dataframes into Tensorflow datasets.
# Selecting the 'tweet' column always yields a 1-D array, whereas squeezing
# the whole frame collapses to a 0-d array when a split holds a single row.
raw_train_ds = tf.data.Dataset.from_tensor_slices((raw_train_tr['tweet'].values, target_train_tr.values))
raw_val_ds = tf.data.Dataset.from_tensor_slices((raw_train_val['tweet'].values, target_train_val.values))

# Group the samples of both splits into fixed-size batches
batch_size = 64
raw_train_ds, raw_val_ds = (
    dataset.batch(batch_size) for dataset in (raw_train_ds, raw_val_ds)
)

# Report how many batches each split holds
train_batches = tf.data.experimental.cardinality(raw_train_ds)
val_batches = tf.data.experimental.cardinality(raw_val_ds)
print("Number of batches in raw_train_ds: %d" % train_batches)
print("Number of batches in raw_val_ds: %d" % val_batches)

######################################
### Construct the embedding layer ####
######################################

# Hyper-parameters shared by the vectorizer and the embedding layer
max_words_in_vocab, embedding_dim, sequence_length = 10000, 32, 280

vectorize_layer = TextVectorization(
    output_mode="int",
    # Only the top "max_words_in_vocab" words are kept in the vocabulary
    max_tokens=max_words_in_vocab,
    # Every output sequence has this many entries (tokens, not characters)
    output_sequence_length=sequence_length,
)

# Build the vocabulary from the text only: labels are dropped
text_ds = raw_train_ds.map(lambda tweets, _labels: tweets)
vectorize_layer.adapt(text_ds)

####################################
########### FUN TUTORIAL ###########
####################################

# Print the top 10 words of our vocabulary
# vectorize_layer.get_vocabulary()[:10]

# Test the vectorizer
# output = vectorize_layer([["the cat sat on the mat"]])
# output.numpy()[0, :6]
### --------------------
def vectorize_text(text, label):
    """Vectorize a labelled text sample with ``vectorize_layer``.

    A trailing axis is appended to ``text`` before it is passed to the layer.

    Args:
        text: Text tensor to vectorize.
        label: Label attached to ``text``; returned unchanged.

    Returns:
        A ``(vectorized_text, label)`` tuple.
    """
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label

# Vectorize each split, then cache the result and prefetch it asynchronously
# (buffering, for best performance on GPU).
train_ds = raw_train_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
val_ds = raw_val_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
# test_ds = raw_test_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
### --------------------
# Seed TensorFlow so that the weight initialisation is repeatable: the scores
# of two pre-processing experiments should differ because of the data, not
# because of a different random start.
tf.random.set_seed(0)

embedding_layer = layers.Embedding(max_words_in_vocab, embedding_dim, input_length=sequence_length)
### --------------------
# Deliberately tiny model: embedding -> flatten -> one sigmoid unit
model = tf.keras.Sequential()
model.add(embedding_layer)
model.add(layers.Flatten())
model.add(layers.Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01)))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that adds a stray "None" line to the output).
model.summary()
### --------------------
epochs = 4
# Fit the model on the training set, evaluating on the validation set after each epoch.
history = model.fit(train_ds, validation_data=val_ds, epochs=epochs)
### --------------------

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Number of batches in raw_train_ds: 2268
Number of batches in raw_val_ds: 566
Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 280, 32)           320000    
_________________________________________________________________
flatten_4 (Flatten)          (None, 8960)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 8961      
Total params: 328,961
Trainable params: 328,961
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7f8e69351970>

## Option 2 - Create the embedding layer from scratch

## Create the embedding layer from scratch

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 280, 32)           320000    
_________________________________________________________________
flatten (Flatten)            (None, 8960)              0         
_________________________________________________________________
dense (Dense)                (None, 1)                 8961      
Total params: 328,961
Trainable params: 328,961
Non-trainable params: 0
_________________________________________________________________
None


## Our Twitter Dataset

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7fbcd8a7f730>

In [29]:
# Test set: one dataset element per line of the text file
test_data_path = "data/test/test_data_textblob_clean.txt"
test_ds = tf.data.TextLineDataset(test_data_path)

In [30]:
def vectorize_text_test(text):
    """Vectorize unlabelled text with ``vectorize_layer``.

    Counterpart of ``vectorize_text`` for data without labels: a trailing
    axis is appended to ``text`` before it is passed to the layer.

    Args:
        text: Text tensor to vectorize.

    Returns:
        The output of ``vectorize_layer`` for ``text``.
    """
    return vectorize_layer(tf.expand_dims(text, axis=-1))

# Batch the lines before vectorizing them. Without batching, each dataset
# element is a single tweet, so model.predict runs one step per tweet,
# which is needlessly slow. Predictions keep the same order and shape.
test_ds = test_ds.batch(batch_size).map(vectorize_text_test)

test_ds = test_ds.cache().prefetch(buffer_size=10)

In [31]:
# Score the vectorized test set with the current model
preds = model.predict(x=test_ds)

In [32]:
# Turn scores into class labels: 1 above the 0.5 threshold, -1 otherwise
above_threshold = preds > 0.5
preds = np.where(above_threshold, 1, -1)

In [33]:
preds_path = "predictions/preds.csv"
# The output folder may not exist yet (e.g. on a fresh checkout)
os.makedirs(os.path.dirname(preds_path), exist_ok=True)
with open(preds_path, "w") as f:
    f.write("Id,Prediction\n")
    # Flatten first so that every item is a scalar: calling int() on a
    # 1-element array is deprecated in recent NumPy versions.
    # Ids are 1-based.
    for i, y in enumerate(np.ravel(preds), start=1):
        f.write(f"{i},{int(y)}\n")
print(f"Done. Predictions can be found in {preds_path}.")

Done. Predictions can be found in predictions/preds.csv.


## Option 1 - Create the embedding layer using pre-trained Glove embeddings

In [19]:
# Adapted from https://keras.io/examples/nlp/pretrained_word_embeddings/

# Get the vocabulary from the Vectorize Layer and map each word to its index
voc = vectorize_layer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

# Load the pre-trained vectors: each line holds a word followed by its coefficients.
# The encoding is explicit because open() otherwise falls back to the
# platform default, which is not UTF-8 everywhere.
embeddings_index = {}
with open('data/glove/glove.twitter.27B.100d.txt', encoding="utf-8") as file:
    for line in file:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs


print("Found %s word vectors." % len(embeddings_index))

# Create the embedding matrix
# NOTE: this overwrites the notebook-level `embedding_dim` with the GloVe vector size.
num_tokens = len(voc) + 2
embedding_dim = 100
hits = 0
misses = 0

# Prepare embedding matrix.
# Rows start as zeros, so words not found in the embedding index stay all-zeros.
# This includes the representation for "padding" and "OOV".
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

# Frozen embedding layer initialised with the pre-trained vectors
embedding_layer = layers.Embedding(
    num_tokens,
    embedding_dim,
    weights=[embedding_matrix],
    trainable=False,
    input_length=sequence_length,
)

Found 1193514 word vectors.
Converted 18556 words (1444 misses)


## IMDB Dataset tryouts

In [12]:
# Taken from https://www.liip.ch/en/blog/sentiment-detection-with-keras-word-embeddings-and-lstm-deep-learning-networks

# Load the IMDB reviews, restricted to the `top_words` most frequent words
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence

top_words = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

# Bring every review to the same length (truncate / pad)
max_review_length = 500
X_train, X_test = (
    sequence.pad_sequences(reviews, maxlen=max_review_length)
    for reviews in (X_train, X_test)
)

# Build the model.
# NOTE: this rebinds the notebook-level `model` name.
embedding_vector_length = 32
model = tf.keras.Sequential([
    layers.Embedding(top_words, embedding_vector_length, input_length=max_review_length),
    # layers.LSTM(100),  # [0.8755, 0.8730, 0.8764], 2:10min per epoch, and my laptop was about to explode
    layers.Flatten(),  # -> [0.8737, 0.8783, 0.8844], 1s per epoch
    layers.Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model (the IMDB test split doubles as validation data)
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f90c60a3640>