In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Flatten

In [None]:
# Mount Google Drive so the project folder is reachable from the Colab VM.
drive_dir = '/content/drive'
drive.mount(drive_dir, force_remount=True)

# Derive the dataset path from the mount point instead of repeating the
# '/content/drive' literal, so the two cannot drift apart.
data_path = f"{drive_dir}/My Drive/CSCI2470_final_project/clean.pkl"

# NOTE(security): unpickling can execute arbitrary code embedded in the file;
# only load pickles produced by this project's own preprocessing step.
df = pd.read_pickle(data_path)
print(df.head())

Mounted at /content/drive
                                       padded_tokens  label
0  [4654, 2564, 8701, 5920, 2890, 13013, 2135, 10...      1
1  [2572, 1045, 6881, 1045, 2123, 1005, 1056, 213...      0
2  [2633, 12609, 2003, 2471, 2058, 1012, 1012, 10...      0
3  [1045, 2342, 2393, 29427, 2393, 2033, 10047, 6...      1
4  [1045, 1521, 1049, 2061, 2439, 18223, 2080, 10...      1


In [None]:
# Labels are taken straight from the 'label' column of the loaded frame.
y = df['label']

# Stack the per-row token lists into one dense array of shape
# (n_samples, sequence_length). The earlier `X = df['padded_tokens']`
# assignment was immediately overwritten, so it has been dropped.
X = np.array(df['padded_tokens'].tolist())

# Fail fast if the rows are not all the same length: the model below
# needs a rectangular 2-D input.
assert X.ndim == 2, "padded_tokens rows must all have the same length"

print('X shape:', X.shape)

X shape: (310601, 250)


In [None]:
# Hold out 20% of the samples as a test set; the fixed random_state keeps
# the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Largest token id in the corpus. X was built from the same 'padded_tokens'
# column, so a single vectorized max over the array replaces the Python-level
# loop over every token list. Cast to a plain int for downstream layer arguments.
max_token_id = int(X.max())

max_length = X.shape[1]  # padded sequence length (250 for this dataset)
# Token ids are zero-based, so ids 0..max_token_id need max_token_id + 1 embedding rows.
vocab_size = max_token_id + 1
embedding_dim = 3

In [None]:
# Baseline CNN text classifier, assembled layer by layer:
#   token ids -> learned embeddings -> one 1-D convolution -> flatten -> sigmoid unit.
model = Sequential()
model.add(
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=max_length,
    )
)
model.add(Conv1D(filters=3, kernel_size=2, activation='relu'))
model.add(Flatten())
# Single sigmoid output, paired with binary cross-entropy for the 0/1 labels.
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer output shapes and parameter counts of the compiled model.
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 250, 3)            91566     
                                                                 
 conv1d_4 (Conv1D)           (None, 249, 3)            21        
                                                                 
 flatten_2 (Flatten)         (None, 747)               0         
                                                                 
 dense_4 (Dense)             (None, 1)                 748       
                                                                 
Total params: 92335 (360.68 KB)
Trainable params: 92335 (360.68 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Stop once validation loss has not improved for 3 consecutive epochs and
# roll the model back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)

# Train for up to 5 epochs, carving 20% of the training data off for validation.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=64,
    epochs=5,
    callbacks=[early_stopping],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Score the trained model on the held-out test split. With one compiled
# metric, evaluate() yields the loss followed by the accuracy.
evaluation = model.evaluate(X_test, y_test)
test_loss, test_accuracy = evaluation
print(f"Test accuracy: {test_accuracy:.4f}, Test loss: {test_loss:.4f}")

Test accuracy: 0.9373, Test loss: 0.1750


In [None]:
# Persist the trained model to the project folder on Drive in HDF5 format.
# NOTE(review): the target path has no '.h5' extension even though
# save_format="h5" is requested; confirm downstream loaders expect this
# exact file name before renaming it.
model_path = '/content/drive/My Drive/CSCI2470_final_project/CNN_base'
model.save(model_path, save_format="h5")

  saving_api.save_model(
