# Libraries

In [1]:
# Optional one-time environment setup: uncomment and run these in a fresh environment.
# NOTE(review): torchtext is pinned to 0.6.0, presumably because this notebook relies on
# the `torchtext.data.Field` API imported below -- confirm before upgrading.
# !pip install scikit-learn
# !pip install spacy
# !python -m spacy download en_core_web_sm
# !pip install torchtext==0.6.0
# !pip install transformers sentence-transformers
# !pip install tqdm

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)
Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
   ---------------------------------------- 0.0/171.5 kB ? eta -:--:--
   ------- -------------------------------- 30.7/171.5 kB 1.3 MB/s eta 0:00:01
   --------------------- ------------------ 92.2/171.5 kB 1.3 MB/s eta 0:00:01
   ---------------------------------------- 171.5/171.5 kB 1.7 MB/s eta 0:00:00
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.7.0


In [17]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import re
import string
from torchtext.vocab import GloVe
import torch
from torchtext import data
import spacy
import random
from joblib import dump, load

# Read Data

In [3]:
# Load the training and validation (dev) splits from CSV files in the working directory.
train_csv_path = 'train.csv'
val_csv_path = 'dev.csv'

train_data = pd.read_csv(train_csv_path)
val_data = pd.read_csv(val_csv_path)

# Tokenisation

In [4]:
# Module-level state initialised for the tokenisation stage.
# NOTE(review): none of these names is read by the later cells shown in this
# notebook -- confirm whether they are still needed.
tokens = []
count = num_inv = num_oov = 0

# Mode switches, bundled in a fixed order: (glove, update_inv, update_oov).
glove_mode = True
update_inv_mode = update_oov_mode = False
word_mode = (glove_mode, update_inv_mode, update_oov_mode)

In [5]:
# Load the SpaCy English pipeline; only its tokenizer is used below.
spacy_en = spacy.load('en_core_web_sm')


def _spacy_tokenize(text):
    """Split `text` into a list of token strings using the SpaCy tokenizer."""
    return [token.text for token in spacy_en.tokenizer(text)]


# torchtext Field configured with lower=True and the SpaCy tokenizer above.
inputs = data.Field(tokenize=_spacy_tokenize, lower=True)

In [6]:
# Run `inputs.preprocess` over both text columns of both splits.
# Values are cast to str first, so a missing cell is processed as its string form.
for frame in (train_data, val_data):
    for text_column in ('premise', 'hypothesis'):
        as_text = frame[text_column].astype(str)
        frame[f'processed_{text_column}'] = as_text.apply(inputs.preprocess)

# Embedding

In [7]:
# Load pretrained GloVe vectors (the '840B' release, 840 billion tokens) at EMBEDDING_DIM dimensions.
EMBEDDING_DIM = 300
glove = GloVe(dim=EMBEDDING_DIM, name='840B')

# Short aliases for the pieces of the GloVe object used by later cells.
wr_size = glove.dim       # embedding dimension
wv_arr = glove.vectors    # embedding matrix
wv_dict = glove.stoi      # word -> row index into wv_arr

In [8]:
def embedding(tokens, vocab=None, vectors=None, dim=None):
    """Return the mean word vector of `tokens` as a 1-D tensor of length `dim`.

    Known words contribute their row of `vectors`; each out-of-vocabulary
    occurrence contributes a fresh random vector drawn uniformly from
    [-0.05, 0.05] per component (via the `random` module, so results vary
    between runs unless `random` is seeded by the caller).

    Args:
        tokens: sequence of token strings.
        vocab: word -> row-index mapping; defaults to the module-level `wv_dict`.
        vectors: embedding matrix indexed by row; defaults to the module-level `wv_arr`.
        dim: embedding dimension; defaults to the module-level `EMBEDDING_DIM`.

    Returns:
        torch.Tensor of shape (dim,). An empty `tokens` yields a zero vector
        (the original divided by zero and produced an all-NaN vector).
    """
    # Resolve defaults at call time so `.apply(embedding)` keeps working unchanged.
    vocab = wv_dict if vocab is None else vocab
    vectors = wv_arr if vectors is None else vectors
    dim = EMBEDDING_DIM if dim is None else dim

    # Guard: nothing to average; avoid 0/0 -> NaN, which downstream estimators reject.
    if len(tokens) == 0:
        return torch.zeros(dim)

    total = torch.zeros(dim)
    for word in tokens:
        if word in vocab:
            total += vectors[vocab[word]]
        else:
            total += torch.tensor(
                [random.uniform(-0.05, 0.05) for _ in range(dim)],
                dtype=torch.float32,
            )
    return total / len(tokens)
    
# Turn each processed token list into a sentence vector with `embedding`.
# Order (train premise, train hypothesis, val premise, val hypothesis) is kept
# because `embedding` draws random vectors for out-of-vocabulary words.
for frame in (train_data, val_data):
    for part in ('premise', 'hypothesis'):
        frame[f'{part}_vec'] = frame[f'processed_{part}'].apply(embedding)

In [9]:
def _concat_pair_vectors(frame):
    """Return one tensor per row of `frame`: `premise_vec` followed by `hypothesis_vec`.

    Reads the 'premise_vec' and 'hypothesis_vec' columns and concatenates each
    pair along dim 0, so every result is twice the length of a single vector.
    """
    # zip over the two columns avoids the per-row Series construction of iterrows().
    # torch.cat is the canonical name; torch.concatenate is an alias that older
    # torch releases do not provide.
    return [
        torch.cat((premise_vec, hypothesis_vec), dim=0)
        for premise_vec, hypothesis_vec in zip(frame['premise_vec'], frame['hypothesis_vec'])
    ]


# Same feature construction for both splits.
train_data['combined_vector'] = _concat_pair_vectors(train_data)
val_data['combined_vector'] = _concat_pair_vectors(val_data)

# SVM

In [12]:
# Bootstrapping
# Feature matrix and label vector for the full training set.
X_train, y_train = np.array(train_data["combined_vector"].tolist()), np.array(train_data["label"].tolist())

# Bootstrap resamples (sampling rows with replacement, same size as the training set).
# These dicts are read by the "First" SVM cell; with the code commented out that cell
# raises NameError on a fresh kernel. A seeded generator keeps the resamples
# reproducible across Restart & Run All.
N_BOOTSTRAP_SAMPLES = 3
BOOTSTRAP_SEED = 42
bootstrap_rng = np.random.default_rng(BOOTSTRAP_SEED)

X_bootstrap_samples = {}
y_bootstrap_samples = {}

n_samples = len(X_train)
for i in range(N_BOOTSTRAP_SAMPLES):
    # `high` is exclusive, so indices fall in [0, n_samples).
    bootstrap_indices = bootstrap_rng.integers(0, n_samples, size=n_samples)

    X_bootstrap_samples[f'X_train_{i+1}'] = X_train[bootstrap_indices]
    y_bootstrap_samples[f'y_train_{i+1}'] = y_train[bootstrap_indices]

# Validation features/labels are kept as plain lists, as in the original cell.
X_val, y_val = val_data["combined_vector"].tolist(), val_data["label"].tolist()

## First

In [26]:
# First model: RBF-kernel SVM (C=1) fit on the first bootstrap resample.
# Requires X_bootstrap_samples / y_bootstrap_samples from the Bootstrapping cell
# (make sure that code is enabled, otherwise this raises NameError).
svm_1 = SVC(C=1, kernel='rbf')
svm_1.fit(
    X=X_bootstrap_samples['X_train_1'],
    y=y_bootstrap_samples['y_train_1'],
)

In [27]:
# Evaluate svm_1 on the validation split and print the per-class report.
y_pred_1 = svm_1.predict(X_val)
report_1 = classification_report(y_true=y_val, y_pred=y_pred_1)
print(report_1)

              precision    recall  f1-score   support

           0       0.67      0.55      0.61      3259
           1       0.64      0.75      0.69      3478

    accuracy                           0.65      6737
   macro avg       0.66      0.65      0.65      6737
weighted avg       0.66      0.65      0.65      6737



## Second

In [13]:
# Second model: RBF-kernel SVM (C=1) fit on the full training set (no resampling).
train_features = train_data["combined_vector"].tolist()
train_labels = train_data['label'].tolist()

svm_2 = SVC(C=1, kernel='rbf')
svm_2.fit(train_features, train_labels)

# Evaluate on the validation split.
y_pred_2 = svm_2.predict(X_val)
report_2 = classification_report(y_true=y_val, y_pred=y_pred_2)
print(report_2)

              precision    recall  f1-score   support

           0       0.69      0.56      0.62      3259
           1       0.65      0.76      0.70      3478

    accuracy                           0.66      6737
   macro avg       0.67      0.66      0.66      6737
weighted avg       0.67      0.66      0.66      6737



## Third

In [16]:
# Third model: same setup as the second, with a larger regularisation parameter (C=3.5).
train_features = train_data["combined_vector"].tolist()
train_labels = train_data['label'].tolist()

svm_3 = SVC(C=3.5, kernel='rbf')
svm_3.fit(train_features, train_labels)

# Evaluate on the validation split.
y_pred_3 = svm_3.predict(X_val)
report_3 = classification_report(y_true=y_val, y_pred=y_pred_3)
print(report_3)

              precision    recall  f1-score   support

           0       0.69      0.59      0.64      3259
           1       0.66      0.76      0.71      3478

    accuracy                           0.67      6737
   macro avg       0.68      0.67      0.67      6737
weighted avg       0.68      0.67      0.67      6737



In [20]:
# Fourth model: RBF-kernel SVM with C=3 on the full training set.
# This is the model that gets persisted to disk further down.
train_features = train_data["combined_vector"].tolist()
train_labels = train_data['label'].tolist()

svm_4 = SVC(C=3, kernel='rbf')
svm_4.fit(train_features, train_labels)

# Evaluate on the validation split.
y_pred_4 = svm_4.predict(X_val)
report_4 = classification_report(y_true=y_val, y_pred=y_pred_4)
print(report_4)

              precision    recall  f1-score   support

           0       0.70      0.59      0.64      3259
           1       0.66      0.76      0.71      3478

    accuracy                           0.68      6737
   macro avg       0.68      0.67      0.67      6737
weighted avg       0.68      0.68      0.67      6737



In [21]:
# Persist the C=3 model with joblib; the call returns the list of files written.
dump(value=svm_4, filename='svm_glove.joblib')

['svm_glove.joblib']

In [32]:
# Weighted vote over the four SVMs: weights 0.5, 1, 1, 3 (total 5.5), thresholded at 0.5.
# NOTE(review): svm_4 holds 3 of the 5.5 total weight, i.e. more than half. Assuming the
# predictions are 0/1 labels (as the reports above show), the thresholded vote therefore
# always equals y_pred_4 -- the other three models cannot change the outcome.
member_preds = np.stack([y_pred_1, y_pred_2, y_pred_3, y_pred_4])
member_weights = np.array([0.5, 1.0, 1.0, 3.0])

y_pred_combined = member_weights @ member_preds / member_weights.sum()
y_pred_final = np.where(y_pred_combined >= 0.5, 1, 0)

final_report = classification_report(y_true=y_val, y_pred=y_pred_final)
print(final_report)

              precision    recall  f1-score   support

           0       0.69      0.59      0.64      3259
           1       0.66      0.76      0.71      3478

    accuracy                           0.68      6737
   macro avg       0.68      0.67      0.67      6737
weighted avg       0.68      0.68      0.67      6737



In [74]:
# Save the individual predictions of all four SVMs side by side.
# Written to its own file: the following cell writes 'result.csv', so using that
# name here meant this table was silently overwritten on a top-to-bottom run.
df = pd.DataFrame({"prediction": y_pred_1,
                  "prediction2": y_pred_2,
                  "prediction3": y_pred_3,
                  "prediction4": y_pred_4})
df.to_csv('result_all_models.csv', index=False)

In [33]:
# Write the ensemble's final labels as a single-column CSV (no index column).
df = pd.Series(y_pred_final, name="prediction").to_frame()
df.to_csv(path_or_buf='result.csv', index=False)