In [2]:
import os
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline

# Train a TF-IDF + logistic-regression stance classifier, report
# cross-validated and held-out metrics, and persist the fitted vectorizer and
# model under ./models (relative to the current working directory).
print(" Training stance classification model...")

# --- Configuration ----------------------------------------------------------
# The dataset location can be overridden with the STANCE_DATA_PATH environment
# variable so the notebook can run on another machine. The fallback is the
# original author's local path, kept so existing runs behave exactly as before.
# TODO(review): replace the fallback with a repo-relative path once the
# notebook's location inside the project is fixed.
DEFAULT_DATA_PATH = r"C:\Users\Ayush Ahlawat\OneDrive\Documents\Public Comment Analysis\public-comment-analysis\dataset\policy_comment_dataset_500_cleaned.csv"
TEXT_COLUMN = "cleaned_comment_text"
LABEL_COLUMN = "stance_label"
SEED = 42

# Load cleaned dataset
data_path = os.environ.get("STANCE_DATA_PATH", DEFAULT_DATA_PATH)
if not os.path.isfile(data_path):
    raise FileNotFoundError(
        f"Dataset not found at {data_path!r}. "
        "Set the STANCE_DATA_PATH environment variable to the CSV location."
    )
df = pd.read_csv(data_path)

missing_columns = {TEXT_COLUMN, LABEL_COLUMN} - set(df.columns)
if missing_columns:
    raise KeyError(f"Dataset is missing required column(s): {sorted(missing_columns)}")

# Features & labels
# Drop incomplete rows BEFORE casting: .astype(str) would otherwise turn NaN
# into the literal string "nan", creating a bogus document or a bogus class.
df_model = df.dropna(subset=[TEXT_COLUMN, LABEL_COLUMN])
dropped_rows = len(df) - len(df_model)
if dropped_rows:
    print(f"Dropped {dropped_rows} row(s) with missing text or label.")

X = df_model[TEXT_COLUMN].astype(str)
y = df_model[LABEL_COLUMN].astype(str)

# Train/test split (stratified so each split keeps the label proportions)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

# Sanity check: identical comments on both sides of the split inflate the test
# score, so surface them next to the metrics instead of leaving it hidden.
duplicated_in_test = int(X_test.isin(set(X_train)).sum())
if duplicated_in_test:
    print(
        f"WARNING: {duplicated_in_test} of {len(X_test)} test comments also appear "
        "verbatim in the training split; held-out metrics may be optimistic."
    )

# TF-IDF vectorizer
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=5000,
    ngram_range=(1, 2)
)

# Model
# n_jobs is intentionally not set on the estimator: scikit-learn documents it
# as affecting only one-vs-rest fitting, so with the default solver it does not
# speed up this multiclass fit. Parallelism comes from cross_val_score below.
model = LogisticRegression(
    max_iter=1000,
    C=4.0,
    class_weight="balanced"
)

# Cross validation
# CV runs on raw text through a vectorizer+model pipeline, so TF-IDF vocabulary
# and IDF weights are learned from the training folds only. Vectorizing before
# CV would let every validation fold influence the features it is scored on.
# cross_val_score clones the pipeline per fold, so `vectorizer` and `model`
# themselves are still unfitted afterwards.
print("Performing 5-fold Cross Validation...")
cv_scores = cross_val_score(
    make_pipeline(vectorizer, model), X_train, Y_train, cv=5, n_jobs=-1
)
print("Cross-validation scores:", cv_scores)
print("CV Mean Accuracy:", cv_scores.mean())

# Fit the final vectorizer and model on the whole training split (the test
# split stays held out for the evaluation below).
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)
model.fit(X_train_vec, Y_train)

# Evaluate
Y_pred = model.predict(X_test_vec)
print("Test Accuracy:", accuracy_score(Y_test, Y_pred))
print("\nClassification Report:\n", classification_report(Y_test, Y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(Y_test, Y_pred))

# Save model
# The vectorizer and classifier are stored as two separate artifacts, exactly
# as before, so any existing loader of these files keeps working.
os.makedirs("models", exist_ok=True)
joblib.dump(model, "models/stance_model.joblib")
joblib.dump(vectorizer, "models/vectorizer.joblib")

print("\n Model trained and saved successfully!")
print("Saved in models folder:")
print("  stance_model.joblib")
print("  vectorizer.joblib")


 Training stance classification model...
Performing 5-fold Cross Validation...
Cross-validation scores: [1. 1. 1. 1. 1.]
CV Mean Accuracy: 1.0
Test Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

     Neutral       1.00      1.00      1.00        31
      Oppose       1.00      1.00      1.00        33
     Support       1.00      1.00      1.00        36

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100


Confusion Matrix:
 [[31  0  0]
 [ 0 33  0]
 [ 0  0 36]]

 Model trained and saved successfully!
Saved in models folder:
  stance_model.joblib
  vectorizer.joblib
