## Google Drive setup

In [1]:
# from google.colab import drive
# drive.mount('/content/drive/')

In [2]:
# %cd "/content/drive/MyDrive/colab_not"  

## Local setup

In [3]:
import os
import sys
from os import path, pardir
import pathlib

# Make the local "scripts" folder importable: resolve it against the current
# working directory and register it in sys.path (once, so re-running this
# cell does not pile up duplicate entries).
scripts = path.join(os.path.abspath(''), 'scripts')
if sys.path.count(scripts) == 0:
    sys.path.append(scripts)

## Project II

In [4]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.regularizers import l2
import numpy as np
import string
import pandas as pd

In [5]:
def split_set(data_to_split, ratio=0.8, seed=None):
    """Randomly split a dataframe into two parts.

    Every row is independently sent to the first part with probability
    `ratio`, so the split sizes are only approximately `ratio` / `1 - ratio`.

    Args:
        data_to_split: pandas DataFrame (or Series) to split.
        ratio: probability for a row to land in the first part.
        seed: optional integer seed. When given, the split is reproducible
            and does not touch NumPy's global random state. When None
            (default), NumPy's global random state is used, as before.

    Returns:
        A list `[first_part, second_part]`, both with a fresh 0..n-1 index.
    """
    n_rows = len(data_to_split)
    if seed is None:
        draws = np.random.rand(n_rows)
    else:
        # A private generator keeps the split repeatable without side effects
        draws = np.random.RandomState(seed).rand(n_rows)
    mask = draws < ratio
    return [data_to_split[mask].reset_index(drop=True), data_to_split[~mask].reset_index(drop=True)]

# Data pre-processing experiments

## Purpose

The purpose of this section is to run **small pre-processing experiments**, in order to **assess whether a pre-processing
step has a positive impact on the overall score** of the model's predictions.

This part is _self-contained_ on purpose: it imports its own data and creates its own (very fast) model.

## How it works

We will construct _two datasets_ : the **original dataset**, and **one with one small preprocessing applied to it**.

We will then **run our model on those two datasets**, and we will compare the scores we obtain for both parts.

We do all of this pre-processing directly in the code, because we want to be able to test many experiments quickly.

## Important

**Do not change anything under the "RAW DATA" section**: this is your control group.

## Processes

### Validate that a new tweet preprocessing idea increases accuracy

1. Under the "PREPROCESSED DATA" section, set the `preprocessing` variable to `True`.
2. In `scripts/data_cleaning.py`, add your preprocessing in `def preprocess_tweet(tweet)` (we recommend adding only one, to know exactly which preprocessing has an impact).
3. Run the code below.


### Compare two datasets with our model

1. Under the "PREPROCESSED DATA" section, set the datasets you would like to use in the `pos_ds_path` and `neg_ds_path` variables.
2. [Optional] Set the `preprocessing` variable to `False` (if you want to preprocess the data on the fly, set it to `True`).
3. Run the code below.

In [6]:
# Autoreload the data_cleaning library, for faster tests:
# with `%autoreload 1`, only the modules registered through `%aimport` are
# reloaded before a cell runs, so edits made to scripts/data_cleaning.py are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 1
%aimport data_cleaning

# Useful constants
# Vocabulary size: used as `max_tokens` of the TextVectorization layer and as
# the input dimension of the Embedding layers.
max_words_in_vocab = 10000
# Size of each embedding vector.
embedding_dim = 32
# Number of tokens per vectorized tweet (used as `output_sequence_length`
# and as the Embedding `input_length`).
sequence_length = 280

# Reads a text file containing one tweet per line into a one-column dataframe
def _read_tweets(ds_path):
  """Load the tweets stored in `ds_path` (one tweet per line).

  The file is read as plain UTF-8 text rather than through `pd.read_table`:
  recent pandas versions refuse "\n" as a separator, and the CSV parser
  converts tweets such as "NA" or "null" into NaN.

  Args:
    ds_path: path of the text file to read.

  Returns:
    A DataFrame with a single string column named "tweet". Blank lines are
    skipped (as `pd.read_table` did by default).
  """
  with open(ds_path, encoding="utf-8") as f:
    tweets = [line.rstrip("\n") for line in f]
  return pd.DataFrame({"tweet": [tweet for tweet in tweets if tweet]})


# Returns a training dataset and a validation dataset
def prepare_datasets(pos_ds_path, neg_ds_path, preprocessing=False, split_seed=0):
  """Build batched training and validation `tf.data` datasets from two tweet files.

  Args:
    pos_ds_path: path of the file holding the positive tweets (label 1).
    neg_ds_path: path of the file holding the negative tweets (label 0).
    preprocessing: when True, `data_cleaning.preprocess_tweet` is applied to
      every tweet before the split.
    split_seed: seed of the random train/validation split. With the default
      (0), two calls on inputs of the same size produce the same split, so a
      raw run and a preprocessed run are evaluated on the same tweets.
      Pass None to get a different split on every call.

  Returns:
    A `(train_ds, val_ds)` tuple of datasets yielding `(tweet, label)`
    batches of 64 elements.
  """
  # Import the raw data
  pos_ds = _read_tweets(pos_ds_path)
  neg_ds = _read_tweets(neg_ds_path)

  # Add predictions to the raw datasets
  pos_ds['pred'] = 1
  neg_ds['pred'] = 0

  # Drop duplicates
  pos_ds = pos_ds.drop_duplicates()
  neg_ds = neg_ds.drop_duplicates()

  #########################################################################
  ########### TESTING ONLY - Take only 25% of the whole dataset ###########
  #########################################################################
  # pos_total = len(pos_ds)
  # neg_total = len(neg_ds)
  # pos_ds = pos_ds[:int(pos_total * .25)]
  # neg_ds = neg_ds[:int(neg_total * .25)]
  #########################################################################
  #########################################################################
  #########################################################################

  # Form training data
  full_ds = pd.concat((neg_ds, pos_ds))

  # Construct the preprocessed version of the tweets on the fly
  if preprocessing:
    full_ds.tweet = full_ds.tweet.apply(data_cleaning.preprocess_tweet)

  # Shuffle (seeded), then separate training and validation data
  shuffled_ds = full_ds.sample(frac=1, random_state=0)
  if split_seed is None:
    train_ds, val_ds = split_set(shuffled_ds)
  else:
    # split_set draws from NumPy's global random state: seed it only for the
    # duration of the split, then restore it so the rest of the notebook is
    # not affected.
    saved_state = np.random.get_state()
    np.random.seed(split_seed)
    try:
      train_ds, val_ds = split_set(shuffled_ds)
    finally:
      np.random.set_state(saved_state)

  # Turning Pandas dataframes into Tensorflow datasets
  train_ds = tf.data.Dataset.from_tensor_slices((train_ds.tweet.values, train_ds.pred.values))
  val_ds = tf.data.Dataset.from_tensor_slices((val_ds.tweet.values, val_ds.pred.values))

  # Batchify data
  batch_size = 64
  train_ds = train_ds.batch(batch_size=batch_size)
  val_ds = val_ds.batch(batch_size=batch_size)

  return (train_ds, val_ds)


######################################
########## Vectorize data ############
######################################

# Shared, stateful text-vectorization layer: its vocabulary is fitted by
# `vectorize_data` (through `adapt`) and applied by `vectorize_text`.
# Is used by both of the functions below...please refactor
vectorize_layer = TextVectorization(
  max_tokens=max_words_in_vocab, # We only consider the top "max_words_in_vocab" words for the vocabulary
  output_mode="int",
  output_sequence_length=sequence_length, # Every output has exactly "sequence_length" (280) tokens - not characters
)

def vectorize_text(text, label):
    """Map a `(text, label)` element to `(token ids, label)`.

    A trailing axis is appended to `text` before it goes through the shared
    `vectorize_layer`; `label` is passed through untouched.
    """
    text_column = tf.expand_dims(text, -1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label

def vectorize_data(dataset, adapt=True):
  """Turn a dataset of `(text, label)` batches into `(token ids, label)` batches.

  Args:
    dataset: `tf.data.Dataset` yielding `(text, label)` elements.
    adapt: when True (default, historical behaviour), the vocabulary of the
      shared `vectorize_layer` is first re-fitted on the texts of `dataset`.
      Because the layer is shared, this replaces the vocabulary learned from
      any previous dataset: pass False for validation/test data that must be
      encoded with the vocabulary learned from the training data.

  Returns:
    The vectorized dataset, cached and prefetched.
  """
  if adapt:
    # Keep only text
    text_ds = dataset.map(lambda x, y: x)
    vectorize_layer.adapt(text_ds)

  # Vectorize the data.
  returned_dataset = dataset.map(vectorize_text)

  # Do async prefetching / buffering of the data for best performance on GPU.
  returned_dataset = returned_dataset.cache().prefetch(buffer_size=10)

  return returned_dataset

In [8]:
##############################
##### EXPERIMENT RUNNER ######
##############################

def run_experiment(pos_ds_path, neg_ds_path, preprocessing=False, epochs=4):
  """Train the small reference model on one pair of tweet files.

  The same routine is used for the control group (raw data) and for the
  experiment (preprocessed data), so that both runs only differ by their
  input data.

  Args:
    pos_ds_path: path of the positive tweets file.
    neg_ds_path: path of the negative tweets file.
    preprocessing: forwarded to `prepare_datasets` (live tweet preprocessing).
    epochs: number of training epochs.

  Returns:
    A `(model, history)` tuple: the trained Keras model and the object
    returned by `model.fit`.
  """
  # Load/Prepare data
  train_ds, val_ds = prepare_datasets(pos_ds_path, neg_ds_path, preprocessing=preprocessing)

  # Fit the vocabulary on the training split only...
  train_ds = vectorize_data(train_ds)
  # ...and encode the validation split with that same vocabulary:
  # vectorize_data(val_ds) would re-fit the shared vectorize_layer on the
  # validation tweets.
  val_ds = val_ds.map(vectorize_text).cache().prefetch(buffer_size=10)

  # Clear any existing Keras session before creating the layers: we want to start from scratch!
  tf.keras.backend.clear_session()

  # Construct the model
  model = tf.keras.Sequential()
  model.add(layers.Embedding(max_words_in_vocab, embedding_dim, input_length=sequence_length))
  model.add(layers.Flatten())
  model.add(layers.Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01)))
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  # summary() prints by itself and returns None: no print() around it
  model.summary()

  # Train the model
  history = model.fit(train_ds, validation_data=val_ds, epochs=epochs)
  return model, history


##############################
########## RAW DATA ##########
##############################

epochs = 4
raw_model, raw_history = run_experiment('data/train/train_pos.txt', 'data/train/train_neg.txt', epochs=epochs)
raw_accuracies = raw_history.history['val_accuracy']

##############################
##### PREPROCESSED DATA ######
##############################

# Enable live preprocessing of tweets
preprocessing = True
pos_ds_path = 'data/train/train_pos.txt'
neg_ds_path = 'data/train/train_neg.txt'

# `model` (and the vocabulary left in `vectorize_layer`) come from this run:
# they are the ones used by the cells that follow.
model, history = run_experiment(pos_ds_path, neg_ds_path, preprocessing=preprocessing, epochs=epochs)
pre_accuracies = history.history['val_accuracy']

# Difference of the last-epoch validation accuracies, in percentage points
experiment_result_percentage = (pre_accuracies[-1] - raw_accuracies[-1]) * 100

print()
print('=====================================================================')
print("Your experiment validation accuracy difference with raw data: %s" % experiment_result_percentage)
print('=====================================================================')

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 280, 32)           320000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 8960)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 8961      
Total params: 328,961
Trainable params: 328,961
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 280, 32)           320000    
_________________________________________________________________
flatten (Flatten)            (None, 8960)        

# Testing the model

In [8]:
# Load the test set lazily: TextLineDataset yields each line of the file as
# one string element (presumably one tweet per line - confirm against the file).
# test_ds = tf.data.TextLineDataset("data/test/test_data_textblob_clean.txt")
test_ds = tf.data.TextLineDataset("data/test/test_data_cl.txt")

In [9]:
def vectorize_text_test(text):
  """Vectorize unlabeled text with the shared `vectorize_layer`.

  Same transformation as `vectorize_text`, for datasets that carry no
  label. The vocabulary used is the one currently held by `vectorize_layer`.
  """
  text = tf.expand_dims(text, -1)
  return vectorize_layer(text)

# Batch before vectorizing: without it every tweet is its own batch of one,
# and prediction runs one step per tweet. The order of the tweets (and so of
# the predictions) is unchanged.
test_ds = test_ds.batch(64).map(vectorize_text_test)

test_ds = test_ds.cache().prefetch(buffer_size=10)

In [10]:
# Score every test tweet with `model`: its single sigmoid output unit yields
# one value between 0 and 1 per tweet.
preds = model.predict(test_ds)

In [11]:
# Turn the scores into class labels: 1 when the score is above 0.5, -1 otherwise
# (training used 0 for the negative class; the output uses -1 instead).
preds = np.where(preds > 0.5, 1, -1)

In [12]:
preds_path = "predictions/preds.csv"

# Create the output folder when it does not exist yet, instead of failing on open()
os.makedirs(os.path.dirname(preds_path), exist_ok=True)

with open(preds_path, "w") as f:
    f.write("Id,Prediction\n")
    # Flatten the predictions so that every item is a scalar: calling int()
    # on a one-element array is deprecated in recent NumPy versions.
    # Ids start at 1.
    for i, y in enumerate(np.ravel(preds), start=1):
        f.write(f"{i},{int(y)}\n")
print(f"Done. Predictions can be found in {preds_path}.")

Done. Predictions can be found in predictions/preds.csv.
