In [2]:
from sentence_transformers import SentenceTransformer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
from joblib import dump, load
from tqdm.auto import tqdm

In [3]:
# Read the training and validation splits (paths are relative to the working directory)
train_data, val_data = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'dev.csv'))

# Sentence encoder shared by the embedding helper defined further down
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

In [4]:
# Function to apply model encoding with a progress bar
def encode_with_progress(series):
    """Encode every entry of ``series`` with the notebook-level ``model``.

    Values are cast to ``str`` first, then the whole column is handed to
    ``model.encode`` in a single call so the encoder can batch the sentences
    instead of being invoked once per row.

    Parameters
    ----------
    series : pandas.Series
        Column of texts to embed.

    Returns
    -------
    pandas.Series
        Object-dtype Series sharing ``series.index``; each element is the
        embedding produced for the corresponding row.
    """
    texts = series.astype(str).tolist()
    # One batched call; the encoder draws its own progress bar, so this
    # helper does not depend on ``tqdm.pandas()`` having been run first.
    embeddings = model.encode(texts, show_progress_bar=True)
    # Split the result back into one embedding per row, keeping the
    # original index so the column aligns when assigned to a DataFrame.
    return pd.Series(list(embeddings), index=series.index, dtype=object)

# Function to combine premise and hypothesis
def combine_embeddings(embedding1, embedding2):
    """Build a single feature vector from two embeddings.

    The result is the concatenation, in this order, of ``embedding1``,
    ``embedding2``, their element-wise difference (``embedding1 - embedding2``)
    and their element-wise product.
    """
    feature_parts = (
        embedding1,
        embedding2,
        np.subtract(embedding1, embedding2),  # element-wise difference
        np.multiply(embedding1, embedding2),  # element-wise product
    )
    return np.concatenate(feature_parts)

In [5]:
# Register tqdm's `progress_apply` on pandas objects
tqdm.pandas()

# Embed both text columns of each split: train first, then validation
for split_frame in (train_data, val_data):
    for text_column in ('premise', 'hypothesis'):
        split_frame['embedding_' + text_column] = encode_with_progress(split_frame[text_column])

  0%|          | 0/26944 [00:00<?, ?it/s]

  0%|          | 0/26944 [00:00<?, ?it/s]

  0%|          | 0/6737 [00:00<?, ?it/s]

  0%|          | 0/6737 [00:00<?, ?it/s]

In [6]:
# Combine the premise/hypothesis embeddings row by row, for train then validation
for split_frame in (train_data, val_data):
    split_frame['combined_embedding'] = split_frame.apply(
        lambda row: combine_embeddings(row['embedding_premise'], row['embedding_hypothesis']),
        axis=1,
    )

In [7]:
# Materialise features and labels as plain Python lists for scikit-learn
X_train = train_data['combined_embedding'].tolist()
y_train = train_data['label'].tolist()
X_val = val_data['combined_embedding'].tolist()
y_val = val_data['label'].tolist()

# RBF-kernel SVM; `fit` returns the fitted estimator itself
svm = SVC(C=2.0, kernel='rbf').fit(X_train, y_train)

# Evaluate
y_pred = svm.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.69      0.74      3259
           1       0.74      0.83      0.78      3478

    accuracy                           0.76      6737
   macro avg       0.77      0.76      0.76      6737
weighted avg       0.77      0.76      0.76      6737



In [8]:
# Persist the fitted classifier to disk with joblib
dump(value=svm, filename='svm_sBERT.joblib')

['svm_sBERT.joblib']

In [10]:
# Write the validation predictions as a single-column CSV (no index column)
df = pd.Series(y_pred, name="prediction").to_frame()
df.to_csv('result_sBERT.csv', index=False)