# Libraries

In [None]:
# !pip install scikit-learn
# !pip install spacy
# !python -m spacy download en_core_web_sm
# !pip install torchtext==0.6.0
# !pip install transformers sentence-transformers
# !pip install tqdm

In [45]:
from joblib import dump, load
import pandas as pd
import numpy as np
import spacy
from torchtext import data
from torchtext.vocab import GloVe
import torch
import random
from transformers import DebertaTokenizer, DebertaModel
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
from sklearn.metrics import classification_report

# Load models and data

In [29]:
# Evaluation frame. Note: it is named `test_data` but is read from dev.csv.
test_data = pd.read_csv('dev.csv')
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code while
# loading — only load model files that come from a trusted source.
# One persisted classifier per feature pipeline used later in the notebook.
svm_glove = load('models/svm_glove.joblib')
svm_DeBERTa = load('models/svm_DeBERTa.joblib')
svm_sBERT = load('models/svm_sBERT.joblib')

# Process data

## GloVe

### Tokenization

In [30]:
# Bookkeeping state for the GloVe pipeline.
# NOTE(review): none of these names is read again in the visible cells — they
# look like leftovers from the training notebook; confirm before removing.
tokens = []
count, num_inv, num_oov = 0, 0, 0
glove_mode = True
update_inv_mode = update_oov_mode = False
word_mode = (glove_mode, update_inv_mode, update_oov_mode)

# SpaCy's English pipeline supplies the tokenizer used by the torchtext Field.
spacy_en = spacy.load('en_core_web_sm')


def _spacy_tokenize(text):
    """Return the surface text of every token SpaCy's tokenizer finds in `text`."""
    return [token.text for token in spacy_en.tokenizer(text)]


# Field is configured with lower=True and the SpaCy tokenizer above.
inputs = data.Field(lower=True, tokenize=_spacy_tokenize)

# Run the Field's preprocessing over both sentences of every pair; cells are
# cast to str first so non-string values do not break the tokenizer.
premise_text = test_data['premise'].astype(str)
hypothesis_text = test_data['hypothesis'].astype(str)
test_data['processed_premise'] = premise_text.apply(inputs.preprocess)
test_data['processed_hypothesis'] = hypothesis_text.apply(inputs.preprocess)

### Embedding

In [31]:
def embedding(tokens):
    """Average the word vectors of `tokens` into a single sentence vector.

    Tokens present in `wv_dict` contribute their row of `wv_arr`. Every other
    token contributes a fresh vector drawn uniformly from [-0.05, 0.05] with
    the `random` module, so the result depends on that module's seed state.

    Args:
        tokens: sequence of token strings.

    Returns:
        A 1-D torch tensor of length EMBEDDING_DIM. An empty token sequence
        yields the all-zero vector instead of the NaNs a 0/0 division would
        produce.
    """
    # Size the accumulator from the shared constant rather than a literal 300
    # so it always matches the out-of-vocabulary vectors built below.
    embeddings = torch.zeros(EMBEDDING_DIM)

    # Nothing to average: return zeros instead of dividing by zero.
    if len(tokens) == 0:
        return embeddings

    for word in tokens:
        if word in wv_dict:
            embeddings += wv_arr[wv_dict[word]]
        else:
            embeddings += torch.Tensor([random.uniform(-0.05, 0.05) for _ in range(EMBEDDING_DIM)])

    return embeddings / len(tokens)

# Load GloVe embeddings
EMBEDDING_DIM = 300
glove = GloVe(name='840B', dim=EMBEDDING_DIM) # Using GloVe with 840 billion tokens and 300 dimensions

# GloVe's vocabulary and vectors
wv_dict = glove.stoi  # Word to index mapping
wv_arr = glove.vectors  # Embedding matrix
wr_size = glove.dim  # Embedding dimension (not read again in the visible cells)

# `embedding` draws random vectors for out-of-vocabulary tokens with the
# `random` module. Seed it right before embedding so the GloVe features (and
# therefore the reported GloVe metrics) are identical on every run.
GLOVE_OOV_SEED = 42
random.seed(GLOVE_OOV_SEED)

# One averaged sentence vector per premise and per hypothesis.
test_data['premise_vec'] = test_data['processed_premise'].apply(embedding)
test_data['hypothesis_vec'] = test_data['processed_hypothesis'].apply(embedding)

In [32]:
# Concatenate each pair's premise and hypothesis vectors (premise first) into
# one feature vector per row.
stacked_tensors = [
    torch.concatenate((premise_vec, hypothesis_vec), dim=0)
    for premise_vec, hypothesis_vec in zip(test_data['premise_vec'], test_data['hypothesis_vec'])
]
test_data['combined_vector'] = stacked_tensors

## DeBERTa

### Initialization

In [33]:
# Tokenizer and encoder loaded from the 'microsoft/deberta-base' checkpoint.
tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')
# NOTE(review): the name `model` is rebound to a SentenceTransformer in the
# sBERT section further down, so the DeBERTa embedding cell only works if it
# runs before that cell.
model = DebertaModel.from_pretrained('microsoft/deberta-base')

### Tokenization

In [34]:
def tokenize(premises, hypotheses, max_length=128, truncation='longest_first'):
    """Tokenize premise/hypothesis pairs with the global `tokenizer`.

    Every pair is padded to `max_length` and returned as PyTorch tensors.

    The previous version passed `truncation=True` together with the deprecated
    `truncation_strategy='only_second'` keyword. When `truncation` is set, the
    deprecated keyword is ignored, so pairs were actually truncated with the
    'longest_first' strategy (the tokenizer warnings emitted by this notebook
    mention 'longest_first'). The dead keyword is removed and the effective
    strategy is now explicit and configurable.

    Args:
        premises: list of premise strings.
        hypotheses: list of hypothesis strings, aligned with `premises`.
        max_length: length every encoded pair is padded/truncated to.
        truncation: truncation strategy forwarded to the tokenizer. The default
            'longest_first' reproduces what the previous code actually did.
            NOTE(review): presumably the saved SVM was fit on features built
            this way — confirm before switching to e.g. 'only_second'.

    Returns:
        The tokenizer's output for the batch of pairs.
    """
    tokenized_output = tokenizer(premises, hypotheses,
                                 padding='max_length',
                                 truncation=truncation,
                                 max_length=max_length,
                                 return_tensors="pt")

    return tokenized_output
    
# Cast every cell to str so non-string values are still tokenized as text.
premise_texts = list(map(str, test_data["premise"].tolist()))
hypothesis_texts = list(map(str, test_data["hypothesis"].tolist()))
tokenized_test = tokenize(premise_texts, hypothesis_texts)

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

### Embedding

In [37]:
def get_embeddings(model, tokenized_input):
    """Mean-pool the model's last hidden state into one vector per input row.

    Args:
        model: encoder exposing a `.device` attribute; it is called with
            `input_ids` and `attention_mask` keyword arguments and must return
            an object with a `last_hidden_state` tensor.
        tokenized_input: mapping holding 'input_ids' and 'attention_mask'
            tensors.

    Returns:
        NumPy array with the mean of `last_hidden_state` over dim 1 for each
        row.
    """
    target_device = model.device
    ids = tokenized_input['input_ids'].to(target_device)
    mask = tokenized_input['attention_mask'].to(target_device)

    # Inference only, so no autograd graph is recorded.
    with torch.no_grad():
        hidden_states = model(input_ids=ids, attention_mask=mask).last_hidden_state
        # NOTE(review): the mean covers every position along dim 1; the
        # attention mask is passed to the model but not used for pooling, so
        # padded positions are averaged in too. Kept as is because changing
        # the pooling would change the features the saved classifier receives.
        pooled = hidden_states.mean(dim=1)

    # Hand back a CPU NumPy array for the downstream NumPy/scikit-learn code.
    return pooled.cpu().numpy()

def get_embeddings_in_chunks(model, tokenized_inputs, chunk_size=128):
    """Embed `tokenized_inputs` in row chunks and stack the results.

    The previous version sliced the tokenizer output object itself
    (`tokenized_inputs[i:i + chunk_size]`). That only yields a dict of sliced
    tensors for some tokenizer/library combinations and fails for a plain
    dict. Each tensor is now sliced explicitly, which works for any mapping
    of row-aligned tensors.

    Args:
        model: encoder forwarded to `get_embeddings`.
        tokenized_inputs: mapping of row-aligned tensors; must contain
            'input_ids' (used to determine the number of rows).
        chunk_size: number of rows embedded per forward pass.

    Returns:
        NumPy array with the chunk embeddings concatenated along axis 0.
    """
    total_rows = len(tokenized_inputs['input_ids'])
    chunk_starts = range(0, total_rows, chunk_size)
    all_embeddings = []

    for index, start in enumerate(chunk_starts, start=1):
        print(f'{index}/{len(chunk_starts)}')

        # Slice every tensor to the same row window so the keys stay aligned.
        stop = start + chunk_size
        chunk = {key: values[start:stop] for key, values in tokenized_inputs.items()}

        all_embeddings.append(get_embeddings(model, chunk))

    # Concatenate all chunk embeddings
    return np.concatenate(all_embeddings, axis=0)

# DeBERTa feature matrix for the evaluation set, computed in chunks.
# `model` must still refer to the DebertaModel when this cell runs.
X_test = get_embeddings_in_chunks(model, tokenized_test)

1/53
2/53
3/53
4/53
5/53
6/53
7/53
8/53
9/53
10/53
11/53
12/53
13/53
14/53
15/53
16/53
17/53
18/53
19/53
20/53
21/53
22/53
23/53
24/53
25/53
26/53
27/53
28/53
29/53
30/53
31/53
32/53
33/53
34/53
35/53
36/53
37/53
38/53
39/53
40/53
41/53
42/53
43/53
44/53
45/53
46/53
47/53
48/53
49/53
50/53
51/53
52/53
53/53


## sBERT

### Initialization

In [40]:
# NOTE(review): this rebinds `model`, which held the DebertaModel until now;
# re-running the DeBERTa embedding cell after this one would use the wrong model.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

### Sentence Embedding

In [43]:
def encode_with_progress(series):
    """Encode every entry of `series` with the global `model`, showing a progress bar.

    Entries are cast to str before encoding. `progress_apply` is only available
    once `tqdm.pandas()` has been called.

    Args:
        series: pandas Series of sentences.

    Returns:
        Series holding the result of `model.encode` for each entry.
    """
    as_text = series.astype(str)
    encoded = as_text.progress_apply(model.encode)
    return encoded

def combine_embeddings(embedding1, embedding2):
    """Build a pair feature vector from two embeddings.

    The result is the concatenation, in this order, of: `embedding1`,
    `embedding2`, their element-wise difference (first minus second) and
    their element-wise product.

    Args:
        embedding1: first embedding (array-like).
        embedding2: second embedding (array-like), same shape as `embedding1`.

    Returns:
        NumPy array with the four parts concatenated.
    """
    parts = (
        embedding1,
        embedding2,
        np.subtract(embedding1, embedding2),  # element-wise difference
        np.multiply(embedding1, embedding2),  # element-wise product
    )
    return np.concatenate(parts)

# Register `progress_apply` on pandas objects.
tqdm.pandas()

# Sentence embeddings for each side of the pair.
test_data['embedding_premise'] = encode_with_progress(test_data['premise'])
test_data['embedding_hypothesis'] = encode_with_progress(test_data['hypothesis'])


def _combine_row(row):
    """Build the combined feature vector for one DataFrame row."""
    return combine_embeddings(row['embedding_premise'], row['embedding_hypothesis'])


# One combined feature vector per row.
test_data['combined_embedding'] = test_data.apply(_combine_row, axis=1)

  0%|          | 0/6737 [00:00<?, ?it/s]

  0%|          | 0/6737 [00:00<?, ?it/s]

# Predict

In [46]:
# Gold labels of the evaluation frame; used as ground truth in every report below.
y_val = test_data["label"].tolist()

## GloVe

In [44]:
# Predict with the GloVe SVM from the concatenated premise/hypothesis vectors
# (passed as a list with one torch tensor per row).
y_pred_glove = svm_glove.predict(test_data["combined_vector"].tolist())

In [47]:
# Per-class precision/recall/F1 of the GloVe SVM against the gold labels.
print(classification_report(y_val, y_pred_glove))

              precision    recall  f1-score   support

           0       0.70      0.59      0.64      3259
           1       0.66      0.76      0.71      3478

    accuracy                           0.68      6737
   macro avg       0.68      0.67      0.67      6737
weighted avg       0.68      0.68      0.67      6737



## DeBERTa

In [48]:
# Score the DeBERTa feature matrix with its SVM and report per-class metrics.
y_pred_DeBERTa = svm_DeBERTa.predict(X_test)
print(classification_report(y_val, y_pred_DeBERTa))

              precision    recall  f1-score   support

           0       0.65      0.67      0.66      3259
           1       0.68      0.66      0.67      3478

    accuracy                           0.67      6737
   macro avg       0.67      0.67      0.67      6737
weighted avg       0.67      0.67      0.67      6737



## sBERT

In [50]:
# Score the combined sBERT pair features with their SVM and report per-class metrics.
y_pred_sBERT = svm_sBERT.predict(test_data['combined_embedding'].tolist())
print(classification_report(y_val, y_pred_sBERT))

              precision    recall  f1-score   support

           0       0.79      0.69      0.74      3259
           1       0.74      0.83      0.78      3478

    accuracy                           0.76      6737
   macro avg       0.77      0.76      0.76      6737
weighted avg       0.77      0.76      0.76      6737



In [68]:
# Weighted vote over the three classifiers.
# NOTE(review): the weights look like rounded dev-set scores of each model —
# confirm where they come from. For 0/1 predictions no single weight reaches
# half of the total while any two do, so this acts as a majority vote.
GLOVE_WEIGHT, DEBERTA_WEIGHT, SBERT_WEIGHT = 68, 67, 77
weighted_votes = (
    y_pred_glove*GLOVE_WEIGHT
    + y_pred_DeBERTa*DEBERTA_WEIGHT
    + y_pred_sBERT*SBERT_WEIGHT
)
y_pred_combined = weighted_votes / (GLOVE_WEIGHT + DEBERTA_WEIGHT + SBERT_WEIGHT)

# Label 1 when the normalized vote reaches 0.5, otherwise 0.
y_pred_final = np.where(y_pred_combined >= 0.5, 1, 0)
print(classification_report(y_val, y_pred_final))

              precision    recall  f1-score   support

           0       0.76      0.68      0.72      3259
           1       0.73      0.80      0.76      3478

    accuracy                           0.74      6737
   macro avg       0.74      0.74      0.74      6737
weighted avg       0.74      0.74      0.74      6737



In [69]:
# Write the sBERT-only predictions (not the ensemble's `y_pred_final`) to CSV
# as a single `prediction` column without the index.
df = pd.DataFrame({"prediction": y_pred_sBERT})
df.to_csv('Group_9_A.csv', index=False)