In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [45]:
df = pd.read_csv(
    "C:/Users/ABHISHEK KINAGI/Programs/Projects/interview_experience/real_interview_dataset.csv",
    header=None,
    on_bad_lines="skip"
)

df.columns = ["text", "difficulty", "result", "sentiment"]


In [46]:
df = df.dropna(subset=["text", "sentiment"])
print(df.describe())

        text difficulty    result sentiment
count    137        137       137       137
unique   137          4         3         4
top     text     Medium  Selected  Positive
freq       1         67        93        93


In [53]:
df["label"] = df["sentiment"].map({
    "Positive": 1,
    "Neutral": 1,
    "Negative": 0
})





In [54]:
df = df.dropna(subset=["label", "text"])
df["label"] = df["label"].astype(int)


In [55]:

X = df["text"]
y = df["label"]

In [56]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [57]:
model = Pipeline([
    ("tfidf", TfidfVectorizer(
        stop_words="english",
        max_features=5000,
        ngram_range=(1,2)
    )),
    ("clf", LogisticRegression(max_iter=1000, class_weight="balanced"))
])


In [59]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


In [60]:
df = df.dropna(subset=["label"])

In [61]:

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.50      0.67      0.57         6
           1       0.89      0.80      0.84        20

    accuracy                           0.77        26
   macro avg       0.69      0.73      0.71        26
weighted avg       0.80      0.77      0.78        26



In [63]:
import joblib
joblib.dump(model, "interview_sentiment_model.pkl")

['interview_sentiment_model.pkl']