In [17]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# ----------------------------
# 1. Load data
# ----------------------------
# The CSV is read without a header row, so the four columns are named
# explicitly. Lines pandas cannot parse are skipped instead of raising.
DATASET_CSV = "C:/Users/ABHISHEK KINAGI/Programs/Projects/interview_experience/real_interview_dataset.csv"
COLUMN_NAMES = ["text", "difficulty", "result", "sentiment"]

df = (
    pd.read_csv(DATASET_CSV, header=None, on_bad_lines="skip")
    .set_axis(COLUMN_NAMES, axis=1)
    # Discard rows that lack either the narrative or its sentiment tag.
    .dropna(subset=["text", "sentiment"])
)

# ----------------------------
# 2. Label encoding
# Positive + Neutral -> 1
# Negative -> 0
# ----------------------------
# Keys are lower-case because the raw sentiment strings are normalised
# (whitespace stripped, lower-cased) before the lookup. An exact-match
# lookup would silently discard values such as "positive" or "Negative ".
SENTIMENT_TO_LABEL = {"positive": 1, "neutral": 1, "negative": 0}

sentiment_norm = df["sentiment"].astype(str).str.strip().str.lower()
df = df.assign(label=sentiment_norm.map(SENTIMENT_TO_LABEL))

# Report what is about to be discarded instead of dropping it silently
# (e.g. typos or a stray header row read as data).
unmapped = df.loc[df["label"].isna(), "sentiment"]
if not unmapped.empty:
    print(
        f"Dropping {len(unmapped)} row(s) with unrecognised sentiment: "
        f"{sorted(unmapped.astype(str).unique())}"
    )

# Rows without a label cannot be used for training. The cast to int is
# needed because the NaNs introduced by map() make the column float.
df = df.dropna(subset=["label"]).astype({"label": int})

print("Class distribution:")
print(df["label"].value_counts())

# ----------------------------
# 3. Train-test split (STRATIFIED)
# ----------------------------
# Features are the raw narratives; the target is the binary label.
X, y = df["text"], df["label"]

# Hold out 20% for evaluation. Stratifying on y keeps the class ratio the
# same in both partitions; the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# ----------------------------
# 4. Pipeline (Improved TF-IDF + LR)
# ----------------------------
# Text featuriser: unigrams and bigrams with English stop words removed.
tfidf_step = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words="english",
    min_df=2,            # ignore terms found in fewer than 2 documents
    max_df=0.9,          # ignore terms found in more than 90% of documents
    max_features=8000,   # cap on vocabulary size
)

# Classifier: balanced class weights offset the uneven label counts.
lr_step = LogisticRegression(
    solver="liblinear",
    class_weight="balanced",
    max_iter=2000,
)

model = Pipeline(steps=[("tfidf", tfidf_step), ("clf", lr_step)])

# ----------------------------
# 5. Train
# ----------------------------
# Fits the vectoriser and the classifier on the training narratives only.
model.fit(X_train, y_train)

# ----------------------------
# 6. Evaluate
# ----------------------------
# Hard predictions feed the report and the confusion matrix; the
# probability in column 1 (label 1) feeds ROC-AUC.
y_pred = model.predict(X_test)
test_proba = model.predict_proba(X_test)
y_prob = test_proba[:, 1]

print("\nClassification Report:")
report_text = classification_report(y_test, y_pred)
print(report_text)

print("Confusion Matrix:")
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

auc_value = roc_auc_score(y_test, y_prob)
print("ROC-AUC:", auc_value)


Class distribution:
label
0    175
1     93
Name: count, dtype: int64

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.83      0.87        35
           1       0.73      0.84      0.78        19

    accuracy                           0.83        54
   macro avg       0.82      0.84      0.82        54
weighted avg       0.84      0.83      0.84        54

Confusion Matrix:
[[29  6]
 [ 3 16]]
ROC-AUC: 0.9293233082706767


In [18]:
import joblib

# Persist the whole fitted pipeline (vectoriser + classifier) as a single
# artefact so inference code can load one file and call predict directly.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
MODEL_PATH = "C:/Users/ABHISHEK KINAGI/Programs/Projects/interview_experience/ml_integration/interview_sentiment_model.pkl"
joblib.dump(model, MODEL_PATH)

['C:/Users/ABHISHEK KINAGI/Programs/Projects/interview_experience/ml_integration/interview_sentiment_model.pkl']

Sentiment Analysis Model
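Trained on realistic interview narratives to classify emotional tone as non-negative (Positive or Neutral) versus Negative.
Achieved 83% accuracy on the held-out test set (54 samples, ROC-AUC 0.93) with comparable macro-averaged precision (0.82) and recall (0.84); the remaining errors reflect the subjective and nuanced nature of interview experiences.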

['interview_sentiment_model.pkl']