In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier


In [2]:
# Cleaned resume data
resumes = pd.read_csv("combined_resume_cleaned.csv")

# BERT embeddings
resume_embeddings = np.load("resume_embeddings.npy")


In [3]:
y = resumes['career_label']

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)


In [10]:
indices = np.arange(len(y_encoded))

X_train_idx, X_test_idx, y_train, y_test = train_test_split(
    indices,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded
)


In [6]:
print(resumes.columns.tolist())


['career_label', 'skills', 'experience_years', 'education_level']


In [11]:
tfidf_text = (
    'skills ' + resumes['skills'].astype(str) +
    ' experience ' + resumes['experience_years'].astype(str) + ' years' +
    ' education ' + resumes['education_level'].astype(str)
)


In [12]:
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words='english'
)

X_tfidf = tfidf.fit_transform(tfidf_text)

X_train_tfidf = X_tfidf[X_train_idx]
X_test_tfidf = X_tfidf[X_test_idx]


In [13]:
lr_model = LogisticRegression(
    max_iter=1000,
    n_jobs=-1
)

lr_model.fit(X_train_tfidf, y_train)
y_pred_lr = lr_model.predict(X_test_tfidf)




In [14]:
print("=== Baseline 1: Logistic Regression + TF-IDF ===")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))


=== Baseline 1: Logistic Regression + TF-IDF ===
Accuracy: 0.5938061041292639
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00         3
           2       1.00      0.96      0.98        45
           3       0.00      0.00      0.00        47
           4       0.00      0.00      0.00         7
           5       0.00      0.00      0.00         5
           6       0.00      0.00      0.00         8
           7       0.78      0.56      0.65        50
           8       0.61      0.49      0.54        51
           9       0.00      0.00      0.00         5
          10       0.54      0.49      0.52        51
          11       0.45      0.35      0.40        48
          12       0.17      0.29      0.22        48
          13       0.34      0.26      0.29        47
          14       0.00      0.00      0.00         2
          15       0.80      0.83      0.82       103
   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [15]:
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=20,
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train_tfidf, y_train)
y_pred_rf = rf_model.predict(X_test_tfidf)


In [16]:
print("=== Baseline 2: Random Forest + TF-IDF ===")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


=== Baseline 2: Random Forest + TF-IDF ===
Accuracy: 0.6032315978456014
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00         3
           2       0.93      0.96      0.95        45
           3       0.00      0.00      0.00        47
           4       0.50      0.14      0.22         7
           5       0.40      0.40      0.40         5
           6       1.00      0.62      0.77         8
           7       0.67      0.58      0.62        50
           8       0.55      0.61      0.58        51
           9       1.00      0.40      0.57         5
          10       0.54      0.53      0.53        51
          11       0.42      0.40      0.41        48
          12       0.20      0.35      0.26        48
          13       0.36      0.19      0.25        47
          14       0.00      0.00      0.00         2
          15       0.78      0.83      0.81       103
         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [17]:
X_train_bert = resume_embeddings[X_train_idx]
X_test_bert = resume_embeddings[X_test_idx]


In [18]:
xgb_model = XGBClassifier(
    objective='multi:softmax',
    num_class=len(np.unique(y_encoded)),
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='mlogloss',
    random_state=42
)

xgb_model.fit(X_train_bert, y_train)
y_pred_xgb = xgb_model.predict(X_test_bert)


In [19]:
print("=== Proposed Hybrid Model: BERT + XGBoost ===")
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))


=== Proposed Hybrid Model: BERT + XGBoost ===
Accuracy: 0.6449730700179533
              precision    recall  f1-score   support

           0       0.29      0.50      0.36         4
           1       0.00      0.00      0.00         3
           2       1.00      0.96      0.98        45
           3       0.23      0.15      0.18        47
           4       0.50      0.14      0.22         7
           5       0.33      0.20      0.25         5
           6       0.35      1.00      0.52         8
           7       0.66      0.66      0.66        50
           8       0.58      0.67      0.62        51
           9       1.00      0.40      0.57         5
          10       0.76      0.63      0.69        51
          11       0.53      0.42      0.47        48
          12       0.23      0.29      0.26        48
          13       0.45      0.36      0.40        47
          14       0.00      0.00      0.00         2
          15       0.85      0.80      0.82       103
      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [2]:
import pandas as pd

career_results_df = pd.DataFrame({
    "Model": [
        "Logistic Regression + TF-IDF",
        "Random Forest + TF-IDF",
        "Proposed Hybrid (BERT + XGBoost)"
    ],
    "Accuracy": [
        0.594,   # from your output
        0.603,   # from your output
        0.645    # from your output
    ],
    "Precision": [
        0.56,
        0.60,
        0.65
    ],
    "Recall": [
        0.59,
        0.60,
        0.64
    ],
    "F1-Score": [
        0.57,
        0.59,
        0.64
    ]
})

career_results_df


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,Logistic Regression + TF-IDF,0.594,0.56,0.59,0.57
1,Random Forest + TF-IDF,0.603,0.6,0.6,0.59
2,Proposed Hybrid (BERT + XGBoost),0.645,0.65,0.64,0.64


In [3]:
career_results_df.to_csv("career_prediction_results.csv", index=False)
