In [8]:
import pandas as pd
import os
import joblib

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

print("üî• Training improved stance classification model...")

data_path = r"C:\Users\Ayush Ahlawat\OneDrive\Documents\Public Comment Analysis\public-comment-analysis\dataset\final_clean_dataset.csv"
df = pd.read_csv(data_path)

df = df.dropna(subset=["comment_text", "stance_label"])

X = df["comment_text"].astype(str)
y = df["stance_label"].astype(str)

# Custom sentiment transformer
class SentimentExtractor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.analyzer = SentimentIntensityAnalyzer()
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        scores = [self.analyzer.polarity_scores(text)["compound"] for text in X]
        return np.array(scores).reshape(-1, 1)

vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=6000,
    ngram_range=(1, 2)
)

pipeline = Pipeline([
    ("features", FeatureUnion([
        ("tfidf", vectorizer),
        ("sentiment", SentimentExtractor())
    ])),
    ("clf", LogisticRegression(
        max_iter=2000,
        C=3.0,
        class_weight="balanced"
    ))
])

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("‚è≥ Cross-validation...")
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, n_jobs=-1)
print("CV Mean Accuracy:", cv_scores.mean())

pipeline.fit(X_train, y_train)

# Evaluate
y_pred = pipeline.predict(X_test)
print("üéØ Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nüìù Classification Report:\n", classification_report(y_test, y_pred))
print("\nüî¢ Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

os.makedirs("models", exist_ok=True)
joblib.dump(pipeline, "models/stance_model_v2.joblib")

print("üéâ Improved model saved successfully!")


üî• Training improved stance classification model...
‚è≥ Cross-validation...
CV Mean Accuracy: 0.9684706310238225
üéØ Test Accuracy: 0.9693877551020408

üìù Classification Report:
               precision    recall  f1-score   support

     Neutral       0.96      0.99      0.97       138
      Oppose       0.97      0.97      0.97        80
     Support       0.99      0.93      0.96        76

    accuracy                           0.97       294
   macro avg       0.97      0.96      0.97       294
weighted avg       0.97      0.97      0.97       294


üî¢ Confusion Matrix:
 [[136   1   1]
 [  2  78   0]
 [  4   1  71]]
üéâ Improved model saved successfully!
