# Baseline Classical Models for ANLI R2

This notebook trains three classical ML baseline models for Natural Language Inference on the ANLI Round 2 dataset:

### Models Included:
1. **Logistic Regression**
2. **Linear SVM (LinearSVC)**
3. **XGBoost Classifier**

### Workflow:
1. Load dataset (`src/data_loading.py`)
2. Preprocess text → combine Premise + Hypothesis
3. Train classical models
4. Evaluate on validation & test sets
5. Save each model
6. Produce comparison metrics table


## 1. Imports & Setup

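_Assumes the notebook is run locally from a directory one level below the project root (for example `notebooks/`), so that `..` resolves to the folder containing the `src/` package._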

In [1]:
import os
import sys

# The project root is taken to be the parent of the current working directory;
# putting it on sys.path is what makes the local `src` package importable.
project_root = os.path.abspath("..")

# Guard against duplicates so re-running this cell leaves sys.path unchanged,
# and insert at the front so this checkout's `src` takes precedence over any
# other importable package that happens to be called `src`.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

print("PYTHONPATH updated:", project_root)


PYTHONPATH updated: /Users/ashmitgupta/Desktop/anli-nli-project


In [2]:
# Sanity check: show which file on disk backs `src.evaluation`
# (presumably to confirm the import resolves to this project's copy).
import src.evaluation

evaluation_module_path = src.evaluation.__file__
print(evaluation_module_path)


/Users/ashmitgupta/Desktop/anli-nli-project/src/evaluation.py


In [3]:
from datasets import load_dataset
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from xgboost import XGBClassifier
import joblib
import numpy as np
import pandas as pd

# Local imports
from src.data_loading import load_anli_r2, LABEL2NAME
from src.preprocessing import combine_premise_hypothesis
from src.evaluation import evaluate_and_print

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(


## 2. Load Dataset

In [4]:
# load_anli_r2() yields three objects, unpacked here as the train,
# validation and test splits used throughout the notebook.
anli_splits = load_anli_r2()
train, val, test = anli_splits

# Bare last expression: display the summary of the training split.
train

Dataset({
    features: ['uid', 'premise', 'hypothesis', 'label', 'reason'],
    num_rows: 45460
})

## 3. Prepare Inputs (X and y)

In [5]:
# Inputs: one value per example, produced by combine_premise_hypothesis.
# Targets: the "label" column of the corresponding split.
X_train = list(map(combine_premise_hypothesis, train))
y_train = train["label"]

X_val = list(map(combine_premise_hypothesis, val))
y_val = val["label"]

X_test = list(map(combine_premise_hypothesis, test))
y_test = test["label"]

# Class names ordered by ascending label id, passed to the evaluation reports.
target_names = [LABEL2NAME[label_id] for label_id in sorted(LABEL2NAME.keys())]

## 4. Model 1 — Logistic Regression (TF-IDF)

In [6]:
# Step 1: unigram + bigram TF-IDF features, vocabulary capped at 100,000 terms.
logreg_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=100_000)

# Step 2: logistic regression with class-balanced sample weights.
logreg_estimator = LogisticRegression(
    class_weight="balanced",
    max_iter=300,
    n_jobs=-1,
)

# Chain both steps so raw text can be passed straight to fit/predict.
logreg_clf = Pipeline(
    steps=[
        ("tfidf", logreg_vectorizer),
        ("logreg", logreg_estimator),
    ]
)

print("Training Logistic Regression...")
logreg_clf.fit(X_train, y_train)
print("Done.")

Training Logistic Regression...
Done.


In [7]:
# Predict on both held-out splits first; the predictions are reused by the
# comparison table at the end of the notebook.
logreg_val_preds = logreg_clf.predict(X_val)
logreg_test_preds = logreg_clf.predict(X_test)

# Report validation metrics, then test metrics.
evaluate_and_print(
    y_val, logreg_val_preds, target_names,
    prefix="Logistic Regression — Validation",
)
evaluate_and_print(
    y_test, logreg_test_preds, target_names,
    prefix="Logistic Regression — Test",
)

# Persist the whole fitted pipeline (vectorizer + classifier) in the
# current working directory.
logreg_model_path = "baseline_logreg.joblib"
joblib.dump(logreg_clf, logreg_model_path)
print(f"Saved: {logreg_model_path}")


===== Logistic Regression — Validation =====
               precision    recall  f1-score   support

   entailment       0.33      0.37      0.35       334
      neutral       0.33      0.29      0.31       333
contradiction       0.33      0.32      0.33       333

     accuracy                           0.33      1000
    macro avg       0.33      0.33      0.33      1000
 weighted avg       0.33      0.33      0.33      1000

Confusion Matrix:
 [[124 103 107]
 [123  98 112]
 [128  97 108]]

===== Logistic Regression — Test =====
               precision    recall  f1-score   support

   entailment       0.32      0.36      0.34       334
      neutral       0.39      0.36      0.37       333
contradiction       0.32      0.30      0.31       333

     accuracy                           0.34      1000
    macro avg       0.34      0.34      0.34      1000
 weighted avg       0.34      0.34      0.34      1000

Confusion Matrix:
 [[120  98 116]
 [113 119 101]
 [142  91 100]]
Saved: b

## 5. Model 2 — Linear SVM (LinearSVC)

In [8]:
# Same TF-IDF front end as the logistic-regression baseline (unigrams + bigrams,
# vocabulary capped at 100,000 terms), followed by a linear SVM.
svm_clf = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=100_000, ngram_range=(1, 2))),
    # LinearSVC's dual coordinate-descent solver shuffles the data with a
    # pseudo-random generator; pin the seed so the reported metrics can be
    # reproduced after Restart & Run All.
    ("svm", LinearSVC(random_state=42))
])

print("Training LinearSVC...")
svm_clf.fit(X_train, y_train)
print("Done.")

Training LinearSVC...




Done.


In [9]:
# Predict on both held-out splits first; the predictions are reused by the
# comparison table at the end of the notebook.
svm_val_preds = svm_clf.predict(X_val)
svm_test_preds = svm_clf.predict(X_test)

# Report validation metrics, then test metrics.
evaluate_and_print(
    y_val, svm_val_preds, target_names,
    prefix="Linear SVM — Validation",
)
evaluate_and_print(
    y_test, svm_test_preds, target_names,
    prefix="Linear SVM — Test",
)

# Persist the whole fitted pipeline (vectorizer + classifier) in the
# current working directory.
svm_model_path = "baseline_svm.joblib"
joblib.dump(svm_clf, svm_model_path)
print(f"Saved: {svm_model_path}")


===== Linear SVM — Validation =====
               precision    recall  f1-score   support

   entailment       0.32      0.36      0.34       334
      neutral       0.35      0.43      0.38       333
contradiction       0.35      0.23      0.27       333

     accuracy                           0.34      1000
    macro avg       0.34      0.34      0.33      1000
 weighted avg       0.34      0.34      0.33      1000

Confusion Matrix:
 [[121 142  71]
 [119 143  71]
 [133 125  75]]

===== Linear SVM — Test =====
               precision    recall  f1-score   support

   entailment       0.34      0.37      0.35       334
      neutral       0.39      0.49      0.44       333
contradiction       0.32      0.20      0.25       333

     accuracy                           0.36      1000
    macro avg       0.35      0.36      0.35      1000
 weighted avg       0.35      0.36      0.35      1000

Confusion Matrix:
 [[124 128  82]
 [106 164  63]
 [136 129  68]]
Saved: baseline_svm.joblib

## 6. Model 3 — XGBoost Classifier
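XGBoost does accept SciPy sparse matrices, but its tree booster treats entries that are not stored (the zeros of a TF-IDF matrix) as *missing* values rather than as zeros. To keep zeros as ordinary feature values, the TF-IDF matrices are converted to dense arrays here. The vocabulary is capped at 10,000 features (versus 100,000 for the linear models), which keeps the dense arrays at a manageable size.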

In [10]:
# XGBoost stores feature values as 32-bit floats, so building the TF-IDF
# matrix in float32 halves the footprint of the dense arrays below: with the
# default float64 the training matrix alone is about
# 45,460 rows x 10,000 columns x 8 bytes ≈ 3.6 GB.
tfidf_vec = TfidfVectorizer(
    max_features=10_000,
    ngram_range=(1, 2),
    dtype=np.float32,
)

# fit_transform learns the vocabulary / IDF weights and vectorises the training
# texts in a single pass instead of tokenising them twice (fit, then transform).
X_train_tfidf = tfidf_vec.fit_transform(X_train).toarray()

# Validation and test texts are only transformed with the training vocabulary.
X_val_tfidf   = tfidf_vec.transform(X_val).toarray()
X_test_tfidf  = tfidf_vec.transform(X_test).toarray()

print("TF-IDF vectors ready.")

TF-IDF vectors ready.


In [11]:
# Hyper-parameters gathered in one mapping so they can be read at a glance.
xgb_params = dict(
    # Learning task: multi-class with per-class probabilities; three classes.
    objective="multi:softprob",
    num_class=3,
    eval_metric="mlogloss",
    # Tree construction.
    tree_method="hist",
    n_estimators=300,
    max_depth=8,
    learning_rate=0.1,
    # Row / column subsampling applied per tree.
    subsample=0.8,
    colsample_bytree=0.8,
)
xgb_clf = XGBClassifier(**xgb_params)

print("Training XGBoost (300 trees)...")
xgb_clf.fit(X_train_tfidf, y_train)
print("Done.")

Training XGBoost (300 trees)...


Done.


In [12]:
# Predict on both held-out splits first; the predictions are reused by the
# comparison table at the end of the notebook.
xgb_val_preds = xgb_clf.predict(X_val_tfidf)
xgb_test_preds = xgb_clf.predict(X_test_tfidf)

# Report validation metrics, then test metrics.
evaluate_and_print(
    y_val, xgb_val_preds, target_names,
    prefix="XGBoost — Validation",
)
evaluate_and_print(
    y_test, xgb_test_preds, target_names,
    prefix="XGBoost — Test",
)

# The classifier was trained on pre-computed TF-IDF arrays, so the fitted
# vectorizer is stored alongside it as a (model, vectorizer) tuple.
xgb_bundle = (xgb_clf, tfidf_vec)
joblib.dump(xgb_bundle, "baseline_xgboost.joblib")
print("Saved: baseline_xgboost.joblib (model + TF-IDF vectorizer)")


===== XGBoost — Validation =====
               precision    recall  f1-score   support

   entailment       0.35      0.34      0.35       334
      neutral       0.36      0.58      0.45       333
contradiction       0.37      0.17      0.23       333

     accuracy                           0.36      1000
    macro avg       0.36      0.36      0.34      1000
 weighted avg       0.36      0.36      0.34      1000

Confusion Matrix:
 [[114 173  47]
 [ 94 192  47]
 [115 163  55]]

===== XGBoost — Test =====
               precision    recall  f1-score   support

   entailment       0.39      0.40      0.40       334
      neutral       0.39      0.59      0.47       333
contradiction       0.40      0.17      0.24       333

     accuracy                           0.39      1000
    macro avg       0.39      0.39      0.37      1000
 weighted avg       0.39      0.39      0.37      1000

Confusion Matrix:
 [[135 152  47]
 [ 95 198  40]
 [112 164  57]]
Saved: baseline_xgboost.joblib (

## 7. Compare Model Performances

In [13]:
# Map each model's display name to its (validation, test) predictions,
# in the order the rows should appear.
model_predictions = {
    "Logistic Regression": (logreg_val_preds, logreg_test_preds),
    "Linear SVM": (svm_val_preds, svm_test_preds),
    "XGBoost": (xgb_val_preds, xgb_test_preds),
}

# One record per model; the DataFrame is built once from the full list.
results = pd.DataFrame(
    [
        {
            "Model": model_name,
            "Val Accuracy": accuracy_score(y_val, val_preds),
            "Test Accuracy": accuracy_score(y_test, test_preds),
        }
        for model_name, (val_preds, test_preds) in model_predictions.items()
    ]
)

results

Unnamed: 0,Model,Val Accuracy,Test Accuracy
0,Logistic Regression,0.33,0.339
1,Linear SVM,0.339,0.356
2,XGBoost,0.361,0.39
