In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Load and clean data


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_excel("/home/malladi/projects/resume_project/clas.xlsx")

# Define valid roles
roles = [
    "Backend Software Engineer", "Frontend Developer (React)", "DevOps Engineer",
    "QA Automation Engineer", "Data Analyst", "Machine Learning Engineer",
    "Cloud Engineer", "Security Engineer", "Software Product Manager", "Full Stack Developer"
]

# Clean and filter
print("Original rows:", len(df))
df = df[df['role'].notnull()]
df = df[df['role'].isin(roles)]
df = df.drop_duplicates(subset='resume_str')
df = df[df['resume_str'].str.len() > 50]
print("Cleaned rows:", len(df))



Original rows: 209
Cleaned rows: 189


In [3]:
from sklearn.utils import resample

# Oversample Full Stack Developer class
fs_mask = df['role'] == 'Full Stack Developer'
df_fs = df[fs_mask]
df_upsampled = resample(df_fs, replace=True, n_samples=30, random_state=42)

# Combine with other data
df = pd.concat([df, df_upsampled])


In [4]:
# Encode labels
le = LabelEncoder()
df['label'] = le.fit_transform(df['role'])

# Generate sentence embeddings using BERT (MiniLM)
print("Generating embeddings...")
model = SentenceTransformer('all-mpnet-base-v2')
X = model.encode(df['resume_str'].tolist(), show_progress_bar=True)
y = df['label']


Generating embeddings...


Batches: 100%|██████████| 7/7 [00:01<00:00,  5.94it/s]


In [5]:

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [8]:
clf = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
clf.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [6]:
print("\nTraining Logistic Regression...")
log_reg = LogisticRegression(max_iter=1000, class_weight='balanced')
log_reg.fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)
print("\nClassification Report (Logistic Regression):")
print(classification_report(y_test, y_pred_lr, target_names=le.classes_))


Training Logistic Regression...

Classification Report (Logistic Regression):
                            precision    recall  f1-score   support

 Backend Software Engineer       0.60      0.75      0.67         4
            Cloud Engineer       0.33      0.50      0.40         4
              Data Analyst       1.00      0.50      0.67         4
           DevOps Engineer       0.75      0.75      0.75         4
Frontend Developer (React)       0.40      0.50      0.44         4
      Full Stack Developer       0.71      0.50      0.59        10
 Machine Learning Engineer       0.80      1.00      0.89         4
    QA Automation Engineer       0.50      0.50      0.50         4
         Security Engineer       0.50      0.33      0.40         3
  Software Product Manager       0.50      0.67      0.57         3

                  accuracy                           0.59        44
                 macro avg       0.61      0.60      0.59        44
              weighted avg       0.

['xgb_resume_classifier.pkl']

In [None]:
import joblib
from sentence_transformers import SentenceTransformer

# Load saved models
clf = joblib.load("xgb_resume_classifier.pkl")
le = joblib.load("label_encoder.pkl")

# Load the same embedding model you used for training
model = SentenceTransformer('all-mpnet-base-v2')

# Example: new resume text
new_resume = """
full stack developer with exp in react mongodb html css mysql 
"""

# Generate embedding
embedding = model.encode([new_resume])

# Predict
pred_label = clf.predict(embedding)[0]
pred_role = le.inverse_transform([pred_label])[0]

print(f"Predicted Role: {pred_role}")



Predicted Role: Frontend Developer (React)
