In [14]:
import pandas as pd
import numpy as np
import string
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from IPython.display import display
import itertools
import joblib
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [4]:
df_train = pd.read_csv('/kaggle/input/trainingdata/train.csv')
df_valid = pd.read_csv('/kaggle/input/trainingdata/dev.csv')

df_train[:10]
print("Train columns:", df_train.columns)
print("Valid columns:", df_valid.columns)


Train columns: Index(['premise', 'hypothesis', 'label'], dtype='object')
Valid columns: Index(['premise', 'hypothesis', 'label'], dtype='object')
Test columns: Index(['premise', 'hypothesis'], dtype='object')


In [5]:
# stop_words = set(stopwords.words('english'))
def pre_clean(text):
    # Case folding
    text = text.lower()
    # keep english words,numbers and space 
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    words = text.split()
    # Remove stop words
    # words = [word for word in words if word not in stop_words]
    return ' '.join(words)
df_train['premise'] = df_train['premise'].apply(pre_clean)
df_train['hypothesis'] = df_train['hypothesis'].apply(pre_clean)
df_valid['premise'] = df_valid['premise'].apply(pre_clean)
df_valid['hypothesis'] = df_valid['hypothesis'].apply(pre_clean) 


In [6]:
X_train = df_train['premise'] + ' [SEP] ' + df_train['hypothesis'].astype(str)
y_train = df_train['label']
X_valid = df_valid['premise'] + ' [SEP] ' + df_valid['hypothesis'].astype(str)
y_valid = df_valid['label']

diff_results = []

In [7]:
param_grid = {
    "max_features": [3000, 5000 ,10000],
    "ngrams": [(1, 1), (1, 2),(1,3)],
    "Cs": [0.2, 0.5, 1.0],
    "Alphas":[0.2, 0.5, 1.0],
    "n_estimators":[30, 50, 100]
}
param_combinations = list(itertools.product(
    param_grid["max_features"],
    param_grid["ngrams"],
    param_grid["Cs"],
    param_grid["Alphas"],
    param_grid["n_estimators"],
))

In [8]:
# 1. TF-IDF + Logistic Regression
lr_combinations = list(set([
    (m, ngram, C) for (m, ngram, C, _, _) in param_combinations
]))
for m, ngram, C_val in lr_combinations:
    vectorizer_lr = TfidfVectorizer(
                        max_features=m,
                        ngram_range=ngram,
                        min_df=2,
                        max_df=0.9,
                        sublinear_tf=True
                        
                    )
    # Transform the training and validation data into TF-IDF feature vectors
    X_train_tfidf_lr = vectorizer_lr.fit_transform(X_train)
    X_valid_tfidf_lr = vectorizer_lr.transform(X_valid)
    # Initialize a Logistic Regression
    reg = LogisticRegression(
        C = C_val, # regularization strength
        max_iter=1000, 
        solver='liblinear',
        verbose = 1
                    )
    # Train the Logistic Regression model
    reg.fit(X_train_tfidf_lr, y_train)
    y_pred_lr = reg.predict(X_valid_tfidf_lr)
    diff_results.append({
        "Model": "TF-IDF + Logistic Regression",
        "max_features": m,
        "ngram_range": ngram,
        "C": C_val,
        "Accuracy": accuracy_score(y_valid, y_pred_lr),
        "F1 Score": f1_score(y_valid, y_pred_lr, average='weighted')
    })

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

In [9]:
# 2. Bag-of-Words + SVM

 # Create a CountVectorizer (BoW) with the current max_features and n-gram settingbow_combinations = list(set([
    (m, ngram, C) for (m, ngram, C, _, _) in param_combinations
]))
for m, ngram, C_val in bow_combinations:
     # Create a CountVectorizer (BoW) with the current max_features and n-gram setting
    vectorizer_bow = CountVectorizer(
                        max_features=m,
                        ngram_range=ngram,
                    )
    # Transform the training and validation text data into BoW feature vectors
    X_train_bow = vectorizer_bow.fit_transform(X_train)
    X_valid_bow = vectorizer_bow.transform(X_valid)
    # Initialize a Logistic Regression classifier
    svm = LinearSVC(
        C = C_val,
        max_iter=20000
    )
    # Train the SVM model on the BoW-transformed training data
    svm.fit(X_train_bow, y_train)
    y_pred_svm = svm.predict(X_valid_bow)
    diff_results.append({
        "Model": "BoW + SVM",
        "max_features": m,
        "ngram_range": ngram,
        "C": C_val,
        "Accuracy": accuracy_score(y_valid, y_pred_svm),
        "F1 Score": f1_score(y_valid, y_pred_svm, average='weighted')
    })



In [10]:
# 3. TF-IDF + Naive Bayes
nb_combinations = list(set([
    (m, ngram, aa) for (m, ngram, _, aa, _) in param_combinations
]))
for m, ngram,aa,in nb_combinations:
    # Create a TF-IDF vectorizer with the specified parameters
    vectorizer_nb = TfidfVectorizer(
                        max_features=m,
                        ngram_range=ngram,
                        min_df=1,
                        max_df=0.9,
                        sublinear_tf=True
                    )
    # Transform training and validation data into TF-IDF feature vectors
    X_train_tfidf_nb = vectorizer_nb.fit_transform(X_train)
    X_valid_tfidf_nb = vectorizer_nb.transform(X_valid)
    try:    
        nb = MultinomialNB(
            fit_prior=False, 
            alpha=aa, # smoothing parameter 
            force_alpha=True
        )
        # Train the Naive Bayes classifier
        nb.fit(X_train_tfidf_nb, y_train)
        y_pred_nb = nb.predict(X_valid_tfidf_nb)
        
        diff_results.append({
            "Model": "TF-IDF + Naive Bayes",
            "max_features": m,
            "ngram_range": ngram,
            "alpha": aa,
            "Accuracy": accuracy_score(y_valid, y_pred_nb),
            "F1 Score": f1_score(y_valid, y_pred_nb, average='weighted')
        })
    except Exception as e:
        print(f"Skipped alpha={aa}, m={m}, ngram={ngram}")

In [11]:
# 4. TF-IDF + Random Forest
rf_combinations = list(set([
    (m, ngram, est) for (m, ngram, _, _, est) in param_combinations
]))
for m, ngram,est in rf_combinations:
    # Create a TF-IDF vectorizer with specified max_features and n-gram range
    vectorizer_rf = TfidfVectorizer(
        max_features=m,
        ngram_range=ngram,
        min_df=2,
        max_df=0.9,
        sublinear_tf=True
    )
    # Transform the text data into TF-IDF feature vectors
    X_train_tfidf_rf = vectorizer_rf.fit_transform(X_train)
    X_valid_tfidf_rf = vectorizer_rf.transform(X_valid)
    # Initialize the Random Forest classifier
    rf = RandomForestClassifier(
        n_estimators= est, 
        random_state= 42,
        class_weight='balanced'
    )
    # Train the Random Forest model
    rf.fit(X_train_tfidf_rf, y_train)
    y_pred_rf = rf.predict(X_valid_tfidf_rf)
    
    diff_results.append({
        "Model": "TF-IDF + Random Forest",
        "max_features": m,
        "ngram_range": ngram,
        "n_ests":est,
        "Accuracy": accuracy_score(y_valid, y_pred_rf),
        "F1 Score": f1_score(y_valid, y_pred_rf, average='weighted')
    })


In [15]:
import pandas as pd

df = pd.DataFrame(diff_results)

if "C" not in df.columns:
    df["C"] = None
if "alpha" not in df.columns:
    df["alpha"] = None
if "n_ests" not in df.columns:
    df["n_ests"] = None

# Top 3 for every model
summary_df = (
    df.sort_values(by="Accuracy", ascending=False)
      .groupby("Model")
      .head(3)
      .reset_index(drop=True)
)

summary_df = summary_df[[
    "Model",
    "max_features",
    "C",
    "ngram_range",
    "alpha",
    "n_ests",
    "Accuracy",
    "F1 Score"
]]
summary_df.columns = ["Model", "Max_features", "C", "Ngram_range","Alpha","n_ests","Accuracy", "F1 Score"]

summary_df["Accuracy"] = summary_df["Accuracy"].round(4)
summary_df["F1 Score"] = summary_df["F1 Score"].round(4)

display(summary_df)


Unnamed: 0,Model,Max_features,C,Ngram_range,Alpha,n_ests,Accuracy,F1 Score
0,TF-IDF + Random Forest,5000,,"(1, 3)",,100.0,0.6263,0.6239
1,TF-IDF + Random Forest,3000,,"(1, 2)",,50.0,0.6237,0.6225
2,TF-IDF + Random Forest,3000,,"(1, 2)",,100.0,0.6222,0.6203
3,TF-IDF + Logistic Regression,10000,0.2,"(1, 3)",,,0.6177,0.6115
4,TF-IDF + Logistic Regression,5000,0.5,"(1, 2)",,,0.6177,0.6154
5,TF-IDF + Logistic Regression,3000,0.2,"(1, 2)",,,0.6176,0.6128
6,TF-IDF + Naive Bayes,3000,,"(1, 2)",0.2,,0.5917,0.5917
7,TF-IDF + Naive Bayes,3000,,"(1, 2)",0.5,,0.5909,0.5909
8,TF-IDF + Naive Bayes,3000,,"(1, 3)",1.0,,0.5904,0.5903
9,BoW + SVM,3000,0.2,"(1, 3)",,,0.5904,0.59


In [13]:
# best model
vectorizer_rf_best = TfidfVectorizer(
        max_features = 5000,
        ngram_range=(1,3),
        min_df=2,
        max_df=0.9,
        sublinear_tf=True
    )
X_train_tfidf_rf_best = vectorizer_rf_best.fit_transform(X_train)
X_valid_tfidf_rf_best = vectorizer_rf_best.transform(X_valid)

rf_best = RandomForestClassifier(
    n_estimators= 100, 
    random_state= 42,
    class_weight='balanced'
)
rf_best.fit(X_train_tfidf_rf_best, y_train)
y_pred_rf_best = rf_best.predict(X_valid_tfidf_rf_best)
    
print("Accuracy:", accuracy_score(y_valid, y_pred_rf_best))
print("F1 Score (weighted):", f1_score(y_valid, y_pred_rf_best, average='weighted'))
print("\nClassification Report:\n", classification_report(y_valid, y_pred_rf_best))

Accuracy: 0.6263361045130641
F1 Score (weighted): 0.6238936747518368

Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.55      0.59      3258
           1       0.62      0.70      0.66      3478

    accuracy                           0.63      6736
   macro avg       0.63      0.62      0.62      6736
weighted avg       0.63      0.63      0.62      6736



In [16]:
# Save best model
# Save the TF-IDF vectorizer
joblib.dump(vectorizer_rf_best, 'vectorizer_rf_best.pkl')

# Save the trained RandomForest model
joblib.dump(rf_best, 'rf_best_model.pkl')

print("The model and vectorizer have been saved successfully")


The model and vectorizer have been saved successfully
