In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import tensorflow as tf

from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Layer, Input, Embedding, Conv1D, Add, ReLU, LeakyReLU, GlobalAveragePooling1D, Dense, Dropout, BatchNormalization, Concatenate



In [None]:
# Mount Google Drive into the Colab filesystem. force_remount=True asks Colab
# to mount again even if the mount point is already in use.
drive_dir = '/content/drive'
drive.mount(drive_dir, force_remount=True)

# Derive the dataset location from the mount point so the two cannot drift apart.
data_path = f"{drive_dir}/My Drive/CSCI2470_final_project/clean.pkl"
# NOTE(security): unpickling can execute arbitrary code embedded in the file;
# only load pickles that you produced yourself or otherwise trust.
df = pd.read_pickle(data_path)
print(df.head())

Mounted at /content/drive
                                       padded_tokens  label
0  [4654, 2564, 8701, 5920, 2890, 13013, 2135, 10...      1
1  [2572, 1045, 6881, 1045, 2123, 1005, 1056, 213...      0
2  [2633, 12609, 2003, 2471, 2058, 1012, 1012, 10...      0
3  [1045, 2342, 2393, 29427, 2393, 2033, 10047, 6...      1
4  [1045, 1521, 1049, 2061, 2439, 18223, 2080, 10...      1


In [None]:
# Target column, kept as a pandas Series.
y = df['label']

# Stack the per-row token-id sequences into a single NumPy array. This is a
# 2-D (examples x sequence length) array as long as every row has the same
# length, which the printed shape lets the reader verify.
X = np.array(df['padded_tokens'].tolist())

print('X shape:', X.shape)

X shape: (310601, 250)


In [None]:
# Hold out 20% of the examples as a test set; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Largest token id present in the data; the model cell sizes the embedding
# table as max_token_id + 1. X already holds every token sequence as a NumPy
# array, so a single vectorised max replaces the per-row Python loop.
# int() converts the NumPy scalar to a plain Python integer.
max_token_id = int(X.max())

In [None]:
class GatedConv1D(Layer):
    """1-D convolution whose LeakyReLU-activated output is scaled by a sigmoid gate.

    Two parallel ``Conv1D`` layers (same ``filters``, same ``kernel_size``,
    ``padding='same'``) read the same input. The first is linear and followed
    by ``LeakyReLU``; the second uses a sigmoid activation. The layer returns
    their element-wise product.

    Args:
        filters: Number of output channels used by both convolutions.
        kernel_size: Convolution window width used by both convolutions.
        **kwargs: Forwarded to the base ``Layer`` constructor (e.g. ``name``).
    """

    def __init__(self, filters, kernel_size, **kwargs):
        super().__init__(**kwargs)
        # Remembered so get_config() can report the constructor arguments.
        self.filters = filters
        self.kernel_size = kernel_size
        self.conv = Conv1D(filters, kernel_size, activation='linear', padding='same')
        self.gate = Conv1D(filters, kernel_size, activation='sigmoid', padding='same')
        # Built once here instead of creating a fresh layer object on every call().
        self.leaky_relu = LeakyReLU()

    def call(self, inputs):
        """Return ``LeakyReLU(conv(inputs)) * gate(inputs)`` (element-wise)."""
        conv_output = self.leaky_relu(self.conv(inputs))
        gate_output = self.gate(inputs)
        return conv_output * gate_output

    def get_config(self):
        """Return the constructor arguments so the layer can be serialised.

        Reloading a saved model that contains this layer still requires passing
        the class through ``custom_objects`` (or registering it with Keras).
        """
        config = super().get_config()
        config.update({
            'filters': self.filters,
            'kernel_size': self.kernel_size,
        })
        return config

In [None]:
# Parameters
seq_length = 250
vocab_size = max_token_id + 1
embedding_dim = 50
num_filters = 128
kernel_sizes_first = [3, 5]  # for the first Conv1D layers (Filter1_1 and Filter1_2)
kernel_sizes_second = [5, 7]  # for the second Conv1D layers (Filter2_1 and Filter2_2)
# Kernel width shared by every gated convolution. Spelled out explicitly rather
# than relying on whatever value a loop variable happens to hold afterwards.
# NOTE(review): this is the last second-stage kernel size for all branches;
# confirm that a per-branch kernel size was not intended.
gated_kernel_size = kernel_sizes_second[-1]

inputs = Input(shape=(seq_length,))
# The sequence length is already fixed by `inputs`, so Embedding needs no
# separate input_length argument (newer Keras releases deprecate it).
x_embedded = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(inputs)

# first Conv1D layer: one branch per kernel size, all reading the embeddings
conv_branches = []
for kernel_size in kernel_sizes_first:
    conv_branch = Conv1D(filters=num_filters, kernel_size=kernel_size, padding='same', activation='relu')(x_embedded)
    conv_branches.append(conv_branch)

# second Conv1D layer that further splits each branch from the first layer
# (len(kernel_sizes_second) * len(conv_branches) branches in total)
split_conv_branches = []
for kernel_size in kernel_sizes_second:
    for conv_branch in conv_branches:
        split_conv_branch = Conv1D(filters=num_filters, kernel_size=kernel_size, padding='same', activation='relu')(conv_branch)
        split_conv_branches.append(split_conv_branch)

# gated convolutions for each of the split branches
gated_branches = []
for split_conv_branch in split_conv_branches:
    gated_branch = GatedConv1D(filters=num_filters, kernel_size=gated_kernel_size)(split_conv_branch)
    gated_branch = BatchNormalization()(gated_branch)
    gated_branch = Dropout(0.5)(gated_branch)

    # residual connection: add the branch input back onto the gated output
    res_block = Add()([split_conv_branch, gated_branch])
    res_block = ReLU()(res_block)

    gated_branches.append(res_block)

# pool each branch (average over the sequence axis)
pooled_outputs = [GlobalAveragePooling1D()(branch) for branch in gated_branches]

# concatenate the pooled outputs from each branch
concatenated = Concatenate()(pooled_outputs)

# final dense layers; a single sigmoid unit gives a binary-class probability
dense = Dense(64, activation='relu')(concatenated)
dense = Dropout(0.5)(dense)
outputs = Dense(1, activation='sigmoid')(dense)

model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 250)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 250, 50)              1526100   ['input_1[0][0]']             
                                                                                                  
 conv1d (Conv1D)             (None, 250, 128)             19328     ['embedding[0][0]']           
                                                                                                  
 conv1d_1 (Conv1D)           (None, 250, 128)             32128     ['embedding[0][0]']           
                                                                                              

In [None]:
# Stop once val_loss has gone `patience` epochs without improving, and roll the
# model back to the weights from its best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)

# Train for at most 10 epochs, holding out 20% of the training arrays for
# validation so the callback has a val_loss to monitor.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=32,
    callbacks=[early_stopping],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 5: early stopping


In [None]:
# The model is compiled with a single 'accuracy' metric, so evaluate() yields
# the loss followed by the accuracy on the held-out test split.
test_loss, test_accuracy = model.evaluate(
    X_test,
    y_test,
)
print(f"Test accuracy: {test_accuracy:.4f}, Test loss: {test_loss:.4f}")

Test accuracy: 0.9455, Test loss: 0.1480


In [None]:
# Persist the trained model to Drive in HDF5 format. The target path has no
# file extension; the format is selected by save_format instead.
# NOTE(review): the model contains the custom GatedConv1D layer, so loading it
# back presumably needs that class supplied via custom_objects - confirm.
model.save(
    '/content/drive/My Drive/CSCI2470_final_project/MGL_CNN',
    save_format="h5",
)

  saving_api.save_model(
