In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report,roc_auc_score, confusion_matrix

df = pd.read_csv(
    "C:/Users/ABHISHEK KINAGI/Programs/Projects/interview_experience/real_interview_dataset.csv",
    header=None,
    on_bad_lines="skip"
)

df.columns = ["text", "difficulty", "result", "sentiment"]
df = df.dropna(subset=["text", "result"])
print(df.describe())
df["selection_label"] = df["result"].map({
    "Selected": 1,
    "Rejected": 0
})

df = df.dropna(subset=["selection_label"])
X = df["text"]
y = df["selection_label"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y  
)

model = Pipeline([
    ("tfidf", TfidfVectorizer(
    stop_words="english",
    max_features=3000,     
    min_df=3,               
    max_df=0.9,              
    ngram_range=(1,2),
    sublinear_tf=True         
)),
    ("clf", LogisticRegression(
    max_iter=3000,
    class_weight="balanced",
    C=0.5,            
    solver="liblinear"
))
])


model.fit(X_train, y_train)

y_prob = model.predict_proba(X_test)[:, 1]
THRESHOLD = 0.45
y_pred = (y_prob >= THRESHOLD).astype(int)

print("Class distribution:")
print(df["selection_label"].value_counts())

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nROC-AUC:", roc_auc_score(y_test, y_prob))



                                                     text difficulty  \
count                                                 278        278   
unique                                                278          4   
top     Company Shopee. Role Server Engineer. Experien...     Medium   
freq                                                    1        137   

          result sentiment  
count        278       278  
unique         3         4  
top     Rejected  Negative  
freq         184       175  
Class distribution:
selection_label
0.0    184
1.0     93
Name: count, dtype: int64

Classification Report:
              precision    recall  f1-score   support

         0.0       0.96      0.59      0.73        37
         1.0       0.55      0.95      0.69        19

    accuracy                           0.71        56
   macro avg       0.75      0.77      0.71        56
weighted avg       0.82      0.71      0.72        56


Confusion Matrix:
[[22 15]
 [ 1 18]]

ROC-AUC: 0.958748221

In [23]:
import joblib
joblib.dump(model, "C:/Users/ABHISHEK KINAGI/Programs/Projects/interview_experience/ml_integration/interview_selection_model.pkl")

['C:/Users/ABHISHEK KINAGI/Programs/Projects/interview_experience/ml_integration/interview_selection_model.pkl']

The model prioritizes recall for selected candidates to avoid missing strong profiles. With a ROC-AUC of 0.95, it demonstrates strong ranking capability. False positives are intentionally tolerated and handled through a review-based decision system