## Google Drive setup

In [23]:
# from google.colab import drive
# drive.mount('/content/drive/')

In [24]:
# %cd "/content/drive/MyDrive/colab_not"  

## Local setup

In [25]:
from os import path, pardir
import pathlib
import sys

# Add the "scripts" folder (next to the notebook's working directory) to the
# module search path so the project-local modules imported below resolve.
# `path` is the only name imported from `os` in this cell, so use it directly
# instead of the (unimported) `os` module.
scripts = path.join(path.abspath(''), 'scripts')
# Guard against appending the same entry again when the cell is re-run.
if scripts not in sys.path:
    sys.path.append(scripts)

# Import our libraries
from helpers import data_path
import data_cleaning as dc

## Project II

In [26]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import tensorflow as tf
import numpy as np
import string
import pandas as pd

In [27]:
def split_set(data_to_split, ratio=0.8, seed=None):
    """Randomly partition the rows of ``data_to_split`` into two subsets.

    Every row is assigned independently to the first subset with probability
    ``ratio``; the resulting sizes are therefore only approximately
    ``ratio`` and ``1 - ratio`` of the input.

    Args:
        data_to_split: pandas object that supports boolean-mask indexing and
            ``reset_index`` (e.g. a DataFrame or Series).
        ratio: Probability that a row lands in the first subset.
        seed: Optional integer seed. When given, the mask is drawn from a
            dedicated ``np.random.RandomState(seed)`` so the split is
            reproducible and the global NumPy random state is untouched.
            When ``None`` (default) the global NumPy random state is used.

    Returns:
        A two-element list ``[first, second]`` whose indices have been reset
        to ``0..n-1``.
    """
    # Pick the random source: a private generator when seeded, the global one otherwise.
    draw = np.random.rand if seed is None else np.random.RandomState(seed).rand
    mask = draw(len(data_to_split)) < ratio
    first = data_to_split[mask].reset_index(drop=True)
    second = data_to_split[~mask].reset_index(drop=True)
    return [first, second]

In [28]:
batch_size = 512

# Load the cleaned training texts: one sample per line, no header row.
# quoting=3 is csv.QUOTE_NONE, so quote characters are kept as literal text.
raw_train_pos = pd.read_table("data/train/train_pos_clean.txt", sep="\n", header=None, quoting=3)
raw_train_pos['pred'] = 1  # label for the positive file

raw_train_neg = pd.read_table("data/train/train_neg_clean.txt", sep="\n", header=None, quoting=3)
raw_train_neg['pred'] = 0  # label for the negative file

# Form training data and remove duplicates (a fresh frame instead of an
# in-place mutation, so re-running the cell always starts from the raw files).
raw_train = pd.concat((raw_train_neg, raw_train_pos)).drop_duplicates()

# Separating training and validation data.
# The shuffle is pinned with random_state=0, but split_set draws its mask from
# the global NumPy generator; seed it too so the partition is reproducible.
np.random.seed(0)
raw_train_tr, raw_train_val = split_set(raw_train.sample(frac=1, random_state=0))
target_train_tr = raw_train_tr.pop('pred')
target_train_val = raw_train_val.pop('pred')

# Turning Pandas dataframes into Tensorflow datasets.
# After popping 'pred' a single text column remains; np.squeeze flattens the
# (n, 1) values array to (n,) so each dataset element is one string.
raw_train_ds = tf.data.Dataset.from_tensor_slices((np.squeeze(raw_train_tr.values), target_train_tr.values))
raw_val_ds = tf.data.Dataset.from_tensor_slices((np.squeeze(raw_train_val.values), target_train_val.values))

# Batchify data
raw_train_ds = raw_train_ds.batch(batch_size=batch_size)
raw_val_ds = raw_val_ds.batch(batch_size=batch_size)

print(
    "Number of batches in raw_train_ds: %d"
    % tf.data.experimental.cardinality(raw_train_ds)
)
print(
    "Number of batches in raw_val_ds: %d" % tf.data.experimental.cardinality(raw_val_ds)
)

Number of batches in raw_train_ds: 283
Number of batches in raw_val_ds: 71


In [30]:
def remove_skips(input_data):
    """Return ``input_data`` with every newline character replaced by a space."""
    return tf.strings.regex_replace(input_data, pattern="\n", rewrite=" ")

In [33]:
# Drop the labels: only the text component of each (text, label) batch is kept.
text_ds = raw_train_ds.map(lambda text, _label: text)

In [34]:
# Inspect which dataset class `map` produced.
type(text_ds)

tensorflow.python.data.ops.dataset_ops.MapDataset

In [35]:
# Report how many GPU devices TensorFlow can see on this machine.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  0


In [36]:
# Hyperparameters shared by the text vectorizer and the model below.
sequence_length = 280            # every sample is padded/truncated to this many tokens
embedding_dim = 128              # width of the learned token embeddings
max_features = len(raw_train)    # vocabulary cap = number of de-duplicated training rows

# Maps raw strings to fixed-length sequences of integer token ids.
vectorize_layer = TextVectorization(
    output_mode="int",
    max_tokens=max_features,
    output_sequence_length=sequence_length,
)

# Build the vocabulary from the training text only.
vectorize_layer.adapt(text_ds)

In [37]:
def vectorize_text(text, label):
    """Vectorize ``text`` with ``vectorize_layer`` and pass ``label`` through.

    A trailing axis is appended to ``text`` before it is handed to the
    vectorization layer.

    Returns:
        Tuple ``(token_ids, label)``.
    """
    expanded = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(expanded)
    return token_ids, label


# Vectorize the data, then cache the result and prefetch asynchronously
# (buffering of the data for best performance on GPU).
train_ds = raw_train_ds.map(vectorize_text).cache().prefetch(buffer_size=10)
val_ds = raw_val_ds.map(vectorize_text).cache().prefetch(buffer_size=10)

In [38]:
from tensorflow.keras import layers  

# Binary text classifier: embedding -> two strided 1D convolutions ->
# global max pooling -> dense hidden layer -> single sigmoid output.

# An integer input for vocab indices (variable sequence length).
inputs = tf.keras.Input(shape=(None,), dtype="int64")

# Next, we add a layer to map those vocab indices into a space of dimensionality
# 'embedding_dim'. The table has 'max_features' rows, the same value used as
# the vectorizer's 'max_tokens'.
x = layers.Embedding(max_features, embedding_dim)(inputs)
x = layers.Dropout(0.5)(x)

# Conv1D + global max pooling: 128 filters, kernel size 7, stride 3, no padding.
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
# Collapse the sequence axis by taking the maximum of each filter.
x = layers.GlobalMaxPooling1D()(x)

# We add a vanilla hidden layer:
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)

# We project onto a single unit output layer, and squash it with a sigmoid:
predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

model = tf.keras.Model(inputs, predictions)

# Compile the model with binary crossentropy loss and an adam optimizer.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

Importing the test data.

In [39]:
epochs = 4
# Fit the model on the training dataset, using the validation dataset for
# validation_data (the test data is not involved here).
model.fit(train_ds, validation_data=val_ds, epochs=epochs)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7f8e4409ba60>

In [40]:
# Stream the test file line by line; TextLineDataset yields one scalar string per line.
test_ds = tf.data.TextLineDataset("data/test/test_data_cl.txt")

In [41]:
def vectorize_text_test(text):
    """Vectorize unlabeled ``text`` with ``vectorize_layer``.

    A trailing axis is appended to ``text`` before it is handed to the
    vectorization layer.
    """
    expanded = tf.expand_dims(text, axis=-1)
    return vectorize_layer(expanded)

# TextLineDataset yields one scalar string per line, so mapping it directly
# makes every line its own batch of one and model.predict runs once per line.
# Group the lines into batches of `batch_size` first (as done for the
# training data); the order and number of predictions are unchanged.
test_ds = test_ds.batch(batch_size).map(vectorize_text_test)

# Cache the vectorized batches and prefetch them asynchronously.
test_ds = test_ds.cache().prefetch(buffer_size=10)

In [42]:
# Run the trained model on the vectorized test set; the output layer is a
# single sigmoid unit, so each prediction is one value per test sample.
preds = model.predict(test_ds)

In [43]:
# Threshold the sigmoid outputs at 0.5 and map them to the labels written to
# the predictions file: 1 above the threshold, -1 otherwise (training used 1/0).
preds = np.where(preds > 0.5, 1, -1)

In [45]:
preds_path = "predictions/preds.csv"

# Create the output folder if it does not exist yet so open() cannot fail on it.
pathlib.Path(preds_path).parent.mkdir(parents=True, exist_ok=True)

# Write one "Id,Prediction" row per test sample, with ids starting at 1.
with open(preds_path, "w") as f:
    f.write("Id,Prediction\n")
    # The model emits one value per row (shape (n, 1)); flatten so each item
    # is a scalar and int() is not applied to a 1-element array.
    for i, y in enumerate(np.ravel(preds), start=1):
        f.write(f"{i},{int(y)}\n")
print(f"Done. Predictions can be found in {preds_path}.")

Done. Predictions can be found in data/preds.csv.
