In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

In [2]:
# Mount Google Drive (Colab only). The mount point is '/content/gdrive'; the
# relative 'gdrive/My Drive/...' paths used later in this notebook therefore
# presumably rely on the working directory being '/content' — TODO confirm
# before running this notebook anywhere else.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# Data split -- random_state=42 must not be changed.
# The cached embedding CSVs loaded later were produced for exactly this split,
# so a different seed would silently misalign embedding rows and labels.
# NOTE(review): the original comment referred to "USE embeddings", while the
# files loaded below are named SBERT_*.csv — confirm which encoder was used.

# quoting=3 is csv.QUOTE_NONE: quote characters are read as ordinary text.
train_data = pd.read_csv('gdrive/My Drive/train_2024.csv', quoting=3)
eval_data = pd.read_csv('gdrive/My Drive/test_2024.csv', quoting=3)
dev_data = pd.read_csv('gdrive/My Drive/dev_2024.csv', quoting=3)

dev_texts, dev_labels = dev_data['text'].tolist(), dev_data['label'].tolist()
eval_texts, eval_labels = eval_data['text'].tolist(), eval_data['label'].tolist()

# Pool the official train and dev sets (train rows first, dev rows after)
# before carving out our own 10% validation split.
X = train_data['text'].tolist() + dev_texts
y = train_data['label'].tolist() + dev_labels

train_texts, val_texts, train_labels, val_labels = train_test_split(
    X, y, test_size=0.1, random_state=42
)
len(train_texts), len(train_labels), len(eval_texts), len(eval_labels)

(99000, 99000, 12001, 12001)

In [4]:
# Turn the three label lists into NumPy arrays for the model-fitting and
# metric calls further down.
train_labels, val_labels, eval_labels = map(
    np.array, (train_labels, val_labels, eval_labels)
)

In [5]:
# Load the pre-computed embeddings stored in the SBERT_*.csv files
# (comma-separated, one row per example).
train_embeddings = np.genfromtxt('gdrive/My Drive/SBERT_train.csv', delimiter=',')
val_embeddings = np.genfromtxt('gdrive/My Drive/SBERT_val.csv', delimiter=',')
eval_embeddings = np.genfromtxt('gdrive/My Drive/SBERT_eval.csv', delimiter=',')

# The cached files are only valid for the seed-42 split made earlier. Row
# counts are the one thing we can check cheaply, so fail fast here instead of
# training on embeddings that do not line up with their labels.
assert len(train_embeddings) == len(train_labels), \
    "SBERT_train.csv row count does not match train_labels"
assert len(val_embeddings) == len(val_labels), \
    "SBERT_val.csv row count does not match val_labels"
assert len(eval_embeddings) == len(eval_labels), \
    "SBERT_eval.csv row count does not match eval_labels"

In [6]:
# Sanity check: display the shape of each loaded embedding matrix.
train_embeddings.shape, val_embeddings.shape, eval_embeddings.shape

((99000, 384), (11000, 384), (12001, 384))

Simple MLP

In [7]:
import tensorflow as tf
from sklearn.metrics import accuracy_score, f1_score

In [8]:
# Define the model architecture
def create_mlp(input_shape, hidden_units=(256, 128, 64, 32, 16), num_classes=2):
    """Build an uncompiled fully connected softmax classifier.

    Args:
        input_shape: Shape tuple of one input example, e.g. ``(384,)``.
        hidden_units: Widths of the ReLU hidden layers, in order. The default
            reproduces the original 256-128-64-32-16 stack. An empty sequence
            yields a single softmax layer directly on the input.
        num_classes: Number of softmax output units. Defaults to 2, i.e.
            binary classification with one probability per class.

    Returns:
        A ``tf.keras.models.Sequential`` model; the caller is responsible for
        compiling it.
    """
    widths = [*tuple(hidden_units), num_classes]
    activations = ['relu'] * (len(widths) - 1) + ['softmax']

    layers = []
    for position, (width, activation) in enumerate(zip(widths, activations)):
        # Only the first layer declares the input shape; later layers infer
        # theirs from the layer before them.
        extra = {'input_shape': input_shape} if position == 0 else {}
        layers.append(tf.keras.layers.Dense(width, activation=activation, **extra))

    return tf.keras.models.Sequential(layers)

# Seed Python's `random`, NumPy and TensorFlow before any weights are created,
# so weight initialisation (and any other RNG-driven step) starts from the
# same state on every Restart & Run All.
# NOTE(review): some GPU ops may still be non-deterministic; see
# tf.config.experimental.enable_op_determinism() if bit-exact runs are needed.
tf.keras.utils.set_random_seed(42)

# Create the model; 384 matches the column count of the loaded embeddings.
input_shape = (384,)
model = create_mlp(input_shape)

In [9]:
# Configure training: Adam optimiser, sparse categorical cross-entropy loss,
# accuracy reported as the tracked metric.
# NOTE(review): the sparse loss presumably relies on the labels being integer
# class ids (0/1) rather than one-hot vectors — confirm against the CSV.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               98560     
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 dense_3 (Dense)             (None, 32)                2080      
                                                                 
 dense_4 (Dense)             (None, 16)                528       
                                                                 
 dense_5 (Dense)             (None, 2)                 34        
                                                                 
Total params: 142354 (556.07 KB)
Trainable params: 14235

In [10]:
# Train for 7 epochs with mini-batches of 32, scoring the held-out validation
# split after every epoch; the returned History object is kept in `history`.
history = model.fit(
    train_embeddings,
    train_labels,
    validation_data=(val_embeddings, val_labels),
    batch_size=32,
    epochs=7,
)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [11]:
# Evaluate the model on the held-out validation split.
# val_embeddings / val_labels come from the 10% train_test_split hold-out, not
# from test_2024.csv, so the numbers are labelled "Validation" rather than
# "Test" to avoid reporting them as test-set performance.
loss, accuracy = model.evaluate(val_embeddings, val_labels)
print("Validation Loss:", loss)
print("Validation Accuracy:", accuracy)

Test Loss: 0.38159245252609253
Test Accuracy: 0.8539999723434448


In [12]:
# Predicted class per validation row: index of the larger of the two softmax
# outputs.
val_pred = model.predict(val_embeddings).argmax(axis=1)



In [13]:
# F1 of the MLP on the validation split (f1_score's default binary averaging,
# which scores the class labelled 1 as positive).
f1_score(val_labels, val_pred)

0.8038113852919619

In [14]:
# Predicted class per training row, used to compare train vs. validation fit.
tr_pred = model.predict(train_embeddings).argmax(axis=1)



In [15]:
# F1 of the MLP on its own training data. The recorded value (~0.92) is well
# above the recorded validation F1 (~0.80), which points to some overfitting.
f1_score(train_labels, tr_pred)

0.9195212434671938

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression baseline trained on the full 384-column embeddings.
# max_iter=500 raises the solver's iteration cap above scikit-learn's default
# of 100 — presumably to let the solver converge on this data.
LR = LogisticRegression(max_iter=500)
LR.fit(train_embeddings, train_labels)

In [None]:
# Logistic-regression predictions for the validation split (overwrites the
# `val_pred` produced by the previous model).
val_pred = LR.predict(val_embeddings)

In [None]:
# Validation F1 of whichever model last wrote `val_pred` (logistic regression
# when the cells are run in notebook order).
f1_score(val_labels, val_pred)

0.7787929481405225

In [None]:
eval_pred = LR.predict(eval_embeddings)

# Agreement check, not an accuracy: count how many eval predictions equal the
# labels stored in another submission file (submission-stack.csv).
fei_op = pd.read_csv('gdrive/My Drive/submission-stack.csv')
reference_labels = fei_op['label'].to_numpy()
if len(reference_labels) < len(eval_pred):
    raise ValueError(
        f"submission-stack.csv has {len(reference_labels)} rows but "
        f"{len(eval_pred)} predictions were made"
    )

# Vectorised comparison instead of a per-row Python loop over the Series.
# Only the first len(eval_pred) reference rows are compared, as before;
# int() keeps the printed count a plain Python integer.
count_matches = int(np.sum(reference_labels[:len(eval_pred)] == eval_pred))


print(count_matches)
print(count_matches/len(eval_pred))

9934
0.8277643529705858


KNN

In [16]:
from sklearn.neighbors import KNeighborsClassifier

In [17]:
# K-nearest-neighbours classifier that considers the 100 nearest training
# points for each prediction.
knn = KNeighborsClassifier(n_neighbors=100)

In [18]:
# Fit KNN on the full 384-column training embeddings.
knn.fit(train_embeddings, train_labels)

In [19]:
# KNN predictions for the validation split (overwrites any earlier `val_pred`).
val_pred = knn.predict(val_embeddings)

In [20]:
# Validation accuracy and F1 for the KNN predictions held in `val_pred`.
acc, f1 = accuracy_score(val_labels, val_pred), f1_score(val_labels, val_pred)
print(acc, f1)

0.7921818181818182 0.7404632152588556


In [None]:
# NOTE: `knn` is rebound further down to a model fitted on 100-component PCA
# features. This cell feeds it the 384-column eval embeddings, so it must run
# while `knn` still refers to the model fitted on the raw embeddings.
eval_pred = knn.predict(eval_embeddings)

# Agreement check, not an accuracy: count how many eval predictions equal the
# labels stored in another submission file (submission-stack.csv).
fei_op = pd.read_csv('gdrive/My Drive/submission-stack.csv')
reference_labels = fei_op['label'].to_numpy()
if len(reference_labels) < len(eval_pred):
    raise ValueError(
        f"submission-stack.csv has {len(reference_labels)} rows but "
        f"{len(eval_pred)} predictions were made"
    )

# Vectorised comparison instead of a per-row Python loop over the Series.
# Only the first len(eval_pred) reference rows are compared, as before;
# int() keeps the printed count a plain Python integer.
count_matches = int(np.sum(reference_labels[:len(eval_pred)] == eval_pred))


print(count_matches)
print(count_matches/len(eval_pred))

9531
0.7941838180151654


In [21]:
import numpy as np
from sklearn.decomposition import PCA

In [22]:
# Reduce the 384-column embeddings to 100 principal components.
# random_state is fixed because the default svd_solver='auto' may pick the
# randomized SVD for this shape, which would otherwise give slightly different
# components (and downstream scores) on every run.
pca = PCA(n_components=100, random_state=42)

In [23]:
# Fit the projection on the training embeddings only, then apply that same
# fitted projection to the validation embeddings (no refit on validation data).
# Note: eval_embeddings is not projected in this cell.
new_training_embeddings = pca.fit_transform(train_embeddings)
new_val_embeddings = pca.transform(val_embeddings)

In [24]:
# Rebinds `knn` to a fresh 100-neighbour classifier trained on the PCA-reduced
# features. After this cell the name no longer refers to the model fitted on
# the raw embeddings, so `knn` must only be given PCA-transformed inputs.
knn = KNeighborsClassifier(n_neighbors=100)
knn.fit(new_training_embeddings, train_labels)

In [25]:
# Predict validation labels from the PCA-reduced validation features.
val_pred = knn.predict(new_val_embeddings)

In [26]:
# Validation accuracy and F1 for the predictions currently in `val_pred`.
acc, f1 = accuracy_score(val_labels, val_pred), f1_score(val_labels, val_pred)
print(acc, f1)

0.8088181818181818 0.7436928702010969


In [27]:
# Predict on the PCA-reduced training features themselves, to compare the fit
# on training data with the validation scores.
tr_pred = knn.predict(new_training_embeddings)

In [28]:
# Training-set accuracy and F1 for the predictions currently in `tr_pred`.
acc, f1 = accuracy_score(train_labels, tr_pred), f1_score(train_labels, tr_pred)
print(acc, f1)

0.8154848484848485 0.7507402606263219


SVM Model

In [29]:
from sklearn import svm
# Linear-kernel support vector classifier with C=0.01, trained on the
# PCA-reduced training features.
# NOTE(review): SVC fitting can be slow on a training set of this size
# (~99k rows); sklearn.svm.LinearSVC may be worth evaluating — verify.
svm_model = svm.SVC(kernel='linear', C = 0.01)
svm_model.fit(new_training_embeddings, train_labels)

In [30]:
# `val_labels` was already converted to an array near the top of the notebook;
# the conversion is repeated here so the cell does not depend on that.
val_labels = np.array(val_labels)

# Score the SVM on the PCA-reduced validation features.
val_preds = svm_model.predict(new_val_embeddings)
acc, f1 = accuracy_score(val_labels, val_preds), f1_score(val_labels, val_preds)
print(acc, f1)

0.8193636363636364 0.7377590075227662


In [31]:
# `train_labels` was already converted to an array near the top of the
# notebook; the conversion is repeated here so the cell does not depend on that.
train_labels = np.array(train_labels)

# Score the SVM on the PCA-reduced features it was trained on.
tr_preds = svm_model.predict(new_training_embeddings)
acc, f1 = accuracy_score(train_labels, tr_preds), f1_score(train_labels, tr_preds)
print(acc, f1)

0.8234646464646465 0.7432307353265262


AdaBoost

In [32]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# AdaBoost ensemble of 100 depth-2 decision trees, trained on the PCA-reduced
# training features.
# random_state is fixed (0, the same value the random forest in this notebook
# uses) because decision-tree split search permutes features randomly, so
# tied splits could otherwise resolve differently between runs. AdaBoost
# propagates its random_state to the base estimators it creates.
adaboost = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=2),
    n_estimators=100,
    random_state=0,
)
adaboost.fit(new_training_embeddings, train_labels)

In [33]:
# AdaBoost on the PCA-reduced validation features: predict, then report
# accuracy and F1.
val_pred = adaboost.predict(new_val_embeddings)
acc, f1 = accuracy_score(val_labels, val_pred), f1_score(val_labels, val_pred)
print(acc, f1)

0.8119090909090909 0.7370695132799594


In [34]:
# AdaBoost on the PCA-reduced training features it was fitted on: predict,
# then report accuracy and F1 for comparison with the validation scores.
tr_pred = adaboost.predict(new_training_embeddings)
acc, f1 = accuracy_score(train_labels, tr_pred), f1_score(train_labels, tr_pred)
print(acc, f1)

0.827050505050505 0.7588043049529498


Random Forest

In [35]:
from sklearn.ensemble import RandomForestClassifier

In [36]:
# Random forest of 150 trees, each limited to depth 20, with a fixed seed;
# trained on the PCA-reduced training features.
# NOTE(review): the recorded outputs show train F1 of about 0.995 against
# validation F1 of about 0.697, so this configuration overfits heavily —
# consider a smaller max_depth or stronger regularisation.
rf = RandomForestClassifier(max_depth=20, random_state=0, n_estimators=150)
rf.fit(new_training_embeddings, train_labels)

In [37]:
# Random forest on the PCA-reduced training features it was fitted on:
# predict, then report accuracy and F1.
train_pred = rf.predict(new_training_embeddings)
acc, f1 = accuracy_score(train_labels, train_pred), f1_score(train_labels, train_pred)
print(acc, f1)

0.9963939393939394 0.9950702183189031


In [38]:
# Random forest on the PCA-reduced validation features: predict, then report
# accuracy and F1.
val_pred = rf.predict(new_val_embeddings)
acc, f1 = accuracy_score(val_labels, val_pred), f1_score(val_labels, val_pred)
print(acc, f1)

0.8019090909090909 0.6973190720933463
