# Logistic Regression & SVM


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [None]:
# Read the training and validation (dev) splits; both CSV files are expected
# in the notebook's working directory.
train_df = pd.read_csv("train.csv")
dev_df = pd.read_csv("dev.csv")

# Count the examples per class in each split to expose any class imbalance.
val_label_counts = dev_df["label"].value_counts()
label_counts = train_df["label"].value_counts()

# Show the dev distribution first, then the training distribution.
print(val_label_counts, label_counts, sep="\n")

label
0    4286
1    1640
Name: count, dtype: int64
label
0    15654
1     5854
Name: count, dtype: int64


In [None]:
# Build the model input by joining each claim with its evidence, separated by
# a single space.
# Missing values are replaced with "" first: in pandas `str + NaN` yields NaN,
# and TfidfVectorizer rejects NaN documents when it is fitted on this column.
train_df["text"] = train_df["Claim"].fillna("") + " " + train_df["Evidence"].fillna("")
dev_df["text"] = dev_df["Claim"].fillna("") + " " + dev_df["Evidence"].fillna("")

# Target labels for the training and validation splits.
y_train = train_df["label"]
y_dev = dev_df["label"]

In [None]:
# Turn the combined claim/evidence text into sparse TF-IDF features.
# ngram_range=(1, 4) extracts unigrams up to and including 4-grams, and
# max_features=18000 caps the vocabulary size.
tfidf = TfidfVectorizer(
    ngram_range=(1, 4),
    max_features=18000,
)

# The vocabulary and IDF weights are learned from the training text only;
# the dev text is projected onto that same fitted vocabulary.
X_train_tfidf = tfidf.fit_transform(train_df["text"])
X_dev_tfidf = tfidf.transform(dev_df["text"])

## SMOTE

In [None]:
# Oversample the training features with SMOTE (Synthetic Minority
# Over-sampling Technique) to counter the class imbalance seen in the label
# counts. Only the training split is resampled; the dev split is left as-is.
from imblearn.over_sampling import SMOTE

# Fixed seed so the synthetic samples are reproducible across runs.
smote = SMOTE(random_state=42)
X_train_resampled_smote, y_train_resampled_smote = smote.fit_resample(
    X_train_tfidf,
    y_train,
)

In [None]:
print("\n--- Logistic Regression with SMOTE ---")

# Class weights: each class starts from the *other* class's share of the
# resampled training set, then `variation` moves weight from class 1 to
# class 0. If SMOTE has equalised the two classes, this is simply
# 0.5 + variation for class 0 and 0.5 - variation for class 1.
pos = np.sum(y_train_resampled_smote == 1)
neg = np.sum(y_train_resampled_smote == 0)

variation = 0.09  # Hand-tuned offset that biases the model toward class 0
class_weights = {0: pos/(neg+pos) + variation, 1: neg/(pos+neg) - variation}

# Logistic Regression model with SMOTE
logreg_smote = LogisticRegression(max_iter=500, tol=0.00005, class_weight=class_weights, C=1.18, random_state=42)

# Fit on the SMOTE-resampled training data only, then predict on the untouched
# dev features (the dev split is never used for fitting).
logreg_smote.fit(X_train_resampled_smote, y_train_resampled_smote)
y_pred_logreg_smote = logreg_smote.predict(X_dev_tfidf)

# Calculating metrics
# f1_score defaults to the binary F1 of the positive class (label 1).
f1_logreg_smote = f1_score(y_dev, y_pred_logreg_smote)
accuracy_logreg_smote = accuracy_score(y_dev, y_pred_logreg_smote)
# Macro F1 is the unweighted mean of the per-class F1 scores. This previously
# used average='weighted' (support-weighted mean), which did not match the
# variable name or the printed "Macro F1-Score" label.
macrof1_logreg_smote = f1_score(y_dev, y_pred_logreg_smote, average='macro')

# Printing metrics and report
print(f"Validation F1-Score (SMOTE): {f1_logreg_smote}")
print(f"Validation Accuracy (SMOTE): {accuracy_logreg_smote}")
print(f"Validation Macro F1-Score (SMOTE): {macrof1_logreg_smote}")
print(classification_report(y_dev, y_pred_logreg_smote))


--- Logistic Regression with SMOTE ---
{0: np.float64(0.59), 1: np.float64(0.41000000000000003)}
Validation F1-Score (SMOTE): 0.6425970873786407
Validation Accuracy (SMOTE): 0.8012149848126898
Validation Macro F1-Score (SMOTE): 0.8015116054717532
              precision    recall  f1-score   support

           0       0.86      0.86      0.86      4286
           1       0.64      0.65      0.64      1640

    accuracy                           0.80      5926
   macro avg       0.75      0.75      0.75      5926
weighted avg       0.80      0.80      0.80      5926



# SVM

In [None]:
# Disabled experiment: linear-kernel SVM trained on the SMOTE-resampled
# training data and scored on the dev set.
# NOTE(review): SVC is not imported in any visible cell; add
# `from sklearn.svm import SVC` before re-enabling this code.
# print("\n--- SVM with SMOTE ---")
# svm_smote = SVC(kernel="linear", C=1.0, class_weight=None)
# svm_smote.fit(X_train_resampled_smote, y_train_resampled_smote)
# y_pred_svm_smote = svm_smote.predict(X_dev_tfidf)
# f1_svm_smote = f1_score(y_dev, y_pred_svm_smote)
# print(f"Validation F1-Score (SMOTE): {f1_svm_smote}")
# print(classification_report(y_dev, y_pred_svm_smote))

## Original Logistic Regression and SVM


In [None]:
# Disabled baseline: logistic regression on the original (non-resampled)
# TF-IDF features with hand-built class weights.
# NOTE(review): `pos` / `neg` below are hard-coded to the training label
# counts printed by the data-loading cell (5854 positives, 15654 negatives);
# they go stale if train.csv changes.
# NOTE(review): `macrof1` is computed with average='weighted' but is printed
# as "Macro F1-Score"; use average='macro' or relabel before re-enabling.
# # Train Logistic Regression model
# pos = 5854
# neg = 15654
# variation = 0.09
# class_weights = {0: pos/(neg+pos) + variation, 1: neg/(pos+neg) - variation}

# logreg = LogisticRegression(max_iter=500, class_weight=class_weights, C=1.17)
# logreg.fit(X_train_tfidf, y_train)

# # Predict on validation set
# y_pred_logreg = logreg.predict(X_dev_tfidf)

# # Evaluate performance
# print("Logistic Regression Performance:")
# f1 = f1_score(y_dev, y_pred_logreg)
# accuracy = accuracy_score(y_dev, y_pred_logreg)
# macrof1 = f1_score(y_dev, y_pred_logreg, average='weighted')
# print(f"Validation F1-Score: {f1}")
# print(f"Validation Accuracy: {accuracy}")
# print(f"Validation Macro F1-Score: {macrof1}")
# print(classification_report(y_dev, y_pred_logreg))

In [None]:
# Disabled baseline: linear-kernel SVM with class_weight='balanced' on the
# original (non-resampled) TF-IDF features.
# NOTE(review): SVC is not imported in any visible cell; add
# `from sklearn.svm import SVC` before re-enabling this code.
# NOTE(review): the value printed under the label "Accuracy" is the F1 score
# (`f1`), not accuracy; fix the label or the metric before re-enabling.
# # Train SVM model
# svm = SVC(kernel="linear", C=1.0, class_weight='balanced')  # Linear kernel is best for text classification
# svm.fit(X_train_tfidf, y_train)

# # Predict on validation set
# y_pred_svm = svm.predict(X_dev_tfidf)

# # Evaluate performance
# print("SVM Performance:")
# f1 = f1_score(y_dev, y_pred_svm)
# print(f"Accuracy: {f1}")
# print(classification_report(y_dev, y_pred_svm))

## Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

# Tune the regularisation strength C of a class-balanced logistic regression
# using 5-fold cross-validation on the original (non-resampled) TF-IDF
# features, ranking the candidates by weighted F1.
# NOTE(review): X_train_tfidf was vectorised on the full training set before
# the folds are split, so each fold's held-out part has already influenced the
# vocabulary/IDF weights; put the vectoriser in a Pipeline for leakage-free
# scores.
# NOTE(review): this search uses class_weight='balanced' without SMOTE, while
# the final model uses custom class weights on SMOTE-resampled data, so the
# selected C may not transfer directly.
param_grid = {"C": [1.17, 1.16]}
grid_logreg = GridSearchCV(
    estimator=LogisticRegression(class_weight="balanced", max_iter=500),
    param_grid=param_grid,
    scoring="f1_weighted",
    cv=5,
)
grid_logreg.fit(X_train_tfidf, y_train)
print("Best parameters for Logistic Regression:", grid_logreg.best_params_)
