In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import tensorflow as tf

from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Layer, Input, Embedding, Conv1D, Add, ReLU, LeakyReLU, GlobalAveragePooling1D, Dense, Dropout, BatchNormalization, Concatenate


In [None]:
# Mount point for Google Drive and the pickled dataset stored on it.
drive_dir = '/content/drive'
data_path = "/content/drive/My Drive/CSCI2470_final_project/clean.pkl"

# force_remount=True lets this cell be re-run even when Drive is already mounted.
drive.mount(drive_dir, force_remount=True)

# NOTE: unpickling can execute arbitrary code; only load pickles you trust.
# The frame's 'padded_tokens' and 'label' columns are consumed by later cells.
df = pd.read_pickle(data_path)
print(df.head(n=5))

Mounted at /content/drive
                                       padded_tokens  label
0  [4654, 2564, 8701, 5920, 2890, 13013, 2135, 10...      1
1  [2572, 1045, 6881, 1045, 2123, 1005, 1056, 213...      0
2  [2633, 12609, 2003, 2471, 2058, 1012, 1012, 10...      0
3  [1045, 2342, 2393, 29427, 2393, 2033, 10047, 6...      1
4  [1045, 1521, 1049, 2061, 2439, 18223, 2080, 10...      1


In [None]:
# Labels are taken straight from the 'label' column.
y = df['label']

# Each row of 'padded_tokens' holds one token-id sequence; stacking the rows
# produces a single array with one sequence per row.
X = np.array(df['padded_tokens'].tolist())

# Fail fast if the sequences were not padded to a common length (ragged rows
# cannot form a 2-D array) or if features and labels are misaligned.
assert X.ndim == 2, f"expected a 2-D token array, got shape {X.shape}"
assert len(X) == len(y), "features and labels must have the same number of rows"

print('X shape:', X.shape)

X shape: (310601, 250)


In [None]:
# Hold out 20% of the samples as a test set; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Largest token id in the corpus; the model cell uses it to size the embedding
# table. X already holds every padded sequence as one array, so a single
# vectorized reduction replaces the per-row Python loop over df['padded_tokens'].
max_token_id = int(X.max())

In [None]:
class GatedConv1D(Layer):
    """Gated 1-D convolution: ``LeakyReLU(conv(x)) * gate(x)``.

    Two parallel ``Conv1D`` layers with the same filter count, kernel size and
    ``'same'`` padding read the same input. One branch produces features
    (linear convolution followed by ``LeakyReLU``); the other applies a
    sigmoid activation and acts as a gate. The layer returns their
    element-wise product.

    Args:
        filters: Number of output channels of both convolutions.
        kernel_size: Width of the convolution window of both convolutions.
        **kwargs: Forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, filters, kernel_size, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can report the constructor arguments.
        self.filters = filters
        self.kernel_size = kernel_size
        self.conv = Conv1D(filters, kernel_size, activation='linear', padding='same')
        self.gate = Conv1D(filters, kernel_size, activation='sigmoid', padding='same')
        # Create the activation once here instead of instantiating a new layer
        # object on every forward pass.
        self.leaky_relu = LeakyReLU()

    def call(self, inputs):
        """Return the gated feature map computed from ``inputs``."""
        features = self.leaky_relu(self.conv(inputs))
        gate_output = self.gate(inputs)
        return features * gate_output

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild this layer."""
        config = super().get_config()
        config.update({'filters': self.filters, 'kernel_size': self.kernel_size})
        return config

In [None]:
# Parameters
seq_length = 250  # each sequence is padded to 250
vocab_size = max_token_id + 1  # ids are 0-based, so the table needs max id + 1 rows
embedding_dim = 50
num_classes = 2  # not used below: the output head is a single sigmoid unit
num_filters = 128
kernel_sizes = [3, 5]  # different kernel sizes for parallel blocks
gate_kernel_size = 7  # window of the gated convolution inside every block

# Seed TensorFlow before any layer is created so weight initialization and
# dropout are repeatable (the train/test split already uses random_state=42).
tf.random.set_seed(42)

inputs = Input(shape=(seq_length,))
# Input already fixes the sequence length, so Embedding does not need the
# `input_length` argument (deprecated in newer Keras releases).
x = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(inputs)

# two parallel gated convolutional blocks with residual connections
pooled_outputs = []
for kernel_size in kernel_sizes:
    # Branch-specific convolution; its output is also the residual shortcut.
    conv = Conv1D(filters=num_filters, kernel_size=kernel_size, padding='same', activation='linear')(x)
    gated = GatedConv1D(filters=num_filters, kernel_size=gate_kernel_size)(conv)
    gated = BatchNormalization()(gated)
    gated = Dropout(0.5)(gated)

    # residual connection
    res_block = Add()([conv, gated])
    res_block = ReLU()(res_block)

    # pool each branch over the sequence axis
    pooled = GlobalAveragePooling1D()(res_block)
    pooled_outputs.append(pooled)

# concatenate parallel block outputs (a single branch is passed through as is)
concatenated = Concatenate()(pooled_outputs) if len(pooled_outputs) > 1 else pooled_outputs[0]

# dense layers: one hidden layer, then a single sigmoid unit for the binary label
x_final = Dense(64, activation='relu')(concatenated)
x_final = Dropout(0.5)(x_final)
outputs = Dense(1, activation='sigmoid')(x_final)

model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 250)]                0         []                            
                                                                                                  
 embedding_1 (Embedding)     (None, 250, 50)              1526100   ['input_2[0][0]']             
                                                                                                  
 conv1d_6 (Conv1D)           (None, 250, 128)             19328     ['embedding_1[0][0]']         
                                                                                                  
 conv1d_9 (Conv1D)           (None, 250, 128)             32128     ['embedding_1[0][0]']         
                                                                                            

In [None]:
# Stop training once validation loss has failed to improve for `patience`
# epochs, and restore the weights from the best epoch seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)

# 20% of the training data is held out for validation via validation_split.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
    callbacks=[early_stopping],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 5: early stopping


In [None]:
# evaluate() yields the loss followed by the metric requested at compile time
# (accuracy), measured on the held-out test split.
test_metrics = model.evaluate(X_test, y_test)
test_loss, test_accuracy = test_metrics
print(f"Test accuracy: {test_accuracy:.4f}, Test loss: {test_loss:.4f}")

Test accuracy: 0.9430, Test loss: 0.1527


In [None]:
# Persist the trained model to Drive in HDF5 format (save_format="h5"); note
# that the target path itself carries no file extension.
# NOTE(review): the model contains the custom GatedConv1D layer, so reloading it
# will presumably need that class passed via `custom_objects` — confirm.
model.save(
    '/content/drive/My Drive/CSCI2470_final_project/SGL_CNN',
    save_format="h5",
)

  saving_api.save_model(
