In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

In [2]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
# NOTE(review): later cells read 'gdrive/My Drive/...' as *relative* paths, which
# only resolve to this mount when the working directory is /content -- confirm.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# Load the three CSV splits, pool train + dev, and carve out a 10% validation set.
# The seed (random_state=42) must not change: per the original author's note, the
# embeddings saved on disk correspond to the rows produced by this exact split.
# NOTE(review): that note mentions "USE" embeddings, while this notebook loads
# SBERT_*.csv files -- confirm they were generated with the same seed.

# quoting=3 is csv.QUOTE_NONE: quote characters are treated as ordinary text.
train_data = pd.read_csv('gdrive/My Drive/train_2024.csv', quoting=3)
eval_data = pd.read_csv('gdrive/My Drive/test_2024.csv', quoting=3)
dev_data = pd.read_csv('gdrive/My Drive/dev_2024.csv', quoting=3)

dev_texts, dev_labels = dev_data['text'].tolist(), dev_data['label'].tolist()
eval_texts, eval_labels = eval_data['text'].tolist(), eval_data['label'].tolist()

# Pool the official train and dev rows (train first, dev appended) before re-splitting.
X = train_data['text'].tolist() + dev_texts
y = train_data['label'].tolist() + dev_labels

train_texts, val_texts, train_labels, val_labels = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

# Displayed as the cell result: sizes of the training and evaluation sets.
len(train_texts), len(train_labels), len(eval_texts), len(eval_labels)

(99000, 99000, 12001, 12001)

In [4]:
# Convert the three label collections to NumPy arrays (same names, same order).
train_labels, val_labels, eval_labels = (
    np.array(label_list) for label_list in (train_labels, val_labels, eval_labels)
)

In [5]:
# Read the precomputed sentence embeddings (comma-separated, one row per text)
# for the train, validation and eval splits, in that order.
train_embeddings, val_embeddings, eval_embeddings = (
    np.genfromtxt(csv_path, delimiter=',')
    for csv_path in (
        'gdrive/My Drive/SBERT_train.csv',
        'gdrive/My Drive/SBERT_val.csv',
        'gdrive/My Drive/SBERT_eval.csv',
    )
)

In [6]:
# Sanity check: (rows, embedding_dim) for each split, shown as the cell result.
tuple(matrix.shape for matrix in (train_embeddings, val_embeddings, eval_embeddings))

((99000, 384), (11000, 384), (12001, 384))

Simple MLP

In [10]:
import tensorflow as tf
from sklearn.metrics import accuracy_score, f1_score

In [11]:
# Define the model architecture
def create_mlp(input_shape, hidden_units=(256, 128, 64, 32, 16), num_classes=2):
    """Build an uncompiled fully connected softmax classifier.

    Args:
        input_shape: Shape of a single input sample without the batch axis,
            e.g. ``(384,)``.
        hidden_units: Widths of the ReLU hidden layers, in order. The default
            reproduces the original 256-128-64-32-16 stack.
        num_classes: Number of units in the softmax output layer. The default
            of 2 matches the original binary-classification setup.

    Returns:
        A ``tf.keras.models.Sequential`` model; the caller is responsible for
        compiling it.

    Raises:
        ValueError: If ``num_classes`` is smaller than 2 or any hidden width
            is not positive.
    """
    widths = list(hidden_units)
    if num_classes < 2:
        raise ValueError(f"num_classes must be at least 2, got {num_classes}")
    if any(width <= 0 for width in widths):
        raise ValueError(f"hidden_units must all be positive, got {widths}")

    # Hidden layers use ReLU; the final layer is a softmax over the classes.
    layer_specs = [(width, 'relu') for width in widths] + [(num_classes, 'softmax')]

    layers = []
    for position, (units, activation) in enumerate(layer_specs):
        # Only the first layer declares the input shape, as in the original stack.
        first_layer_kwargs = {'input_shape': input_shape} if position == 0 else {}
        layers.append(
            tf.keras.layers.Dense(units, activation=activation, **first_layer_kwargs)
        )
    return tf.keras.models.Sequential(layers)

# Seed the Python, NumPy and TensorFlow RNGs *before* any weights are created so
# that a Restart & Run All reproduces the same initialisation and batch shuffling.
# (GPU kernels may still introduce small run-to-run differences.)
tf.keras.utils.set_random_seed(42)

# Create the model; 384 is the embedding width shown by the shapes cell above.
input_shape = (384,)
model = create_mlp(input_shape)

# Compile the model. Sparse categorical cross-entropy expects integer class ids
# as targets, paired with the softmax output layer.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Display the model summary
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 256)               98560     
                                                                 
 dense_5 (Dense)             (None, 128)               32896     
                                                                 
 dense_6 (Dense)             (None, 64)                8256      
                                                                 
 dense_7 (Dense)             (None, 32)                2080      
                                                                 
 dense_8 (Dense)             (None, 16)                528       
                                                                 
 dense_9 (Dense)             (None, 2)                 34        
                                                                 
Total params: 142354 (556.07 KB)
Trainable params: 142

In [12]:
# Train for 7 epochs in mini-batches of 32, reporting validation metrics after each epoch.
history = model.fit(
    x=train_embeddings,
    y=train_labels,
    epochs=7,
    batch_size=32,
    validation_data=(val_embeddings, val_labels),
)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [13]:
# Score the trained network.
# NOTE: the printed labels say "Test", but val_embeddings / val_labels are the
# same arrays passed as validation_data to model.fit -- these are validation metrics.
loss, accuracy = model.evaluate(x=val_embeddings, y=val_labels)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Test Loss: 0.3872690796852112
Test Accuracy: 0.8518182039260864


In [14]:
# Class probabilities -> hard labels: pick the index of the larger softmax output per row.
val_pred = model.predict(x=val_embeddings).argmax(axis=1)



In [15]:
# F1 of the MLP predictions on the validation split (shown as the cell result).
f1_score(y_true=val_labels, y_pred=val_pred)

0.806872037914692

Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression

# Linear baseline on the same embeddings; max_iter is raised to 500 iterations.
LR = LogisticRegression(max_iter=500)
LR.fit(X=train_embeddings, y=train_labels)

In [19]:
# Overwrites val_pred with the logistic-regression labels for the validation split.
val_pred = LR.predict(X=val_embeddings)

In [21]:
# F1 of whatever val_pred currently holds, against the validation labels.
f1_score(y_true=val_labels, y_pred=val_pred)

0.7787929481405225

In [23]:
# Agreement check for the logistic-regression model on the eval split.
# NOTE: this counts matches against the 'label' column of submission-stack.csv;
# it is not an accuracy against eval_labels.
eval_pred = LR.predict(eval_embeddings)

fei_op = pd.read_csv('gdrive/My Drive/submission-stack.csv')
reference_labels = fei_op['label'].to_numpy()
if len(reference_labels) < len(eval_pred):
    # The original per-row loop would fail with a KeyError here; fail clearly instead.
    raise ValueError(
        f"submission-stack.csv has {len(reference_labels)} rows, "
        f"expected at least {len(eval_pred)}"
    )

# Vectorised element-wise comparison replaces the per-row Python loop. Only the
# first len(eval_pred) rows are compared, the same range the loop covered.
count_matches = int(np.count_nonzero(reference_labels[:len(eval_pred)] == eval_pred))

print(count_matches)
print(count_matches/len(eval_pred))

9934
0.8277643529705858


KNN

In [24]:
from sklearn.neighbors import KNeighborsClassifier

In [25]:
# k-nearest-neighbours baseline: each prediction is a vote among the 100 closest
# training embeddings (all other settings are left at the sklearn defaults).
knn = KNeighborsClassifier(n_neighbors=100)

In [26]:
# Fit the KNN classifier on the training embeddings and their labels.
knn.fit(X=train_embeddings, y=train_labels)

In [27]:
# Overwrites val_pred with the KNN labels for the validation split.
val_pred = knn.predict(X=val_embeddings)

In [28]:
# Validation accuracy and F1 for the KNN predictions, printed on one line.
acc = accuracy_score(y_true=val_labels, y_pred=val_pred)
f1 = f1_score(y_true=val_labels, y_pred=val_pred)
print(acc, f1)

0.7921818181818182 0.7404632152588556


In [29]:
# Agreement check for the KNN model on the eval split.
# NOTE: this counts matches against the 'label' column of submission-stack.csv;
# it is not an accuracy against eval_labels.
eval_pred = knn.predict(eval_embeddings)
fei_op = pd.read_csv('gdrive/My Drive/submission-stack.csv')
reference_labels = fei_op['label'].to_numpy()
if len(reference_labels) < len(eval_pred):
    # The original per-row loop would fail with a KeyError here; fail clearly instead.
    raise ValueError(
        f"submission-stack.csv has {len(reference_labels)} rows, "
        f"expected at least {len(eval_pred)}"
    )

# Vectorised element-wise comparison replaces the per-row Python loop. Only the
# first len(eval_pred) rows are compared, the same range the loop covered.
count_matches = int(np.count_nonzero(reference_labels[:len(eval_pred)] == eval_pred))

print(count_matches)
print(count_matches/len(eval_pred))

9531
0.7941838180151654
