In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [8]:
file_path = "candidate_priority.csv"
df = pd.read_csv(file_path)

print("=== CANDIDATE PRIORITY RANKING MODEL ===\n")
print(f"Total candidates: {len(df)}")
print(f"Features: {df.columns.tolist()}\n")

=== CANDIDATE PRIORITY RANKING MODEL ===

Total candidates: 200
Features: ['id', 'years_exp_band', 'skills_coverage_band', 'referral_flag', 'english_level', 'location_match', 'priority']



In [9]:
df = df.drop(columns=['id'])
df['years_exp_band'] = df['years_exp_band'].str.strip().str.replace('â€"', '-', regex=False)
df['skills_coverage_band'] = df['skills_coverage_band'].str.strip().str.title()
df['english_level'] = df['english_level'].str.strip().str.upper()
df['location_match'] = df['location_match'].str.strip().str.title()
df['priority'] = df['priority'].str.strip().str.title()

df.to_csv("cleaned_data.csv", index=False)
for col in ['years_exp_band', 'skills_coverage_band', 'english_level', 'location_match']:
    if df[col].isnull().sum() > 0:
        df[col].fillna(df[col].mode()[0], inplace=True)


df['referral_flag'].fillna(0, inplace=True)


mapping_exp = {'0-1': 0, '1-3': 1, '3-6': 2, '6+': 3}
mapping_skills = {'Low': 0, 'Medium': 1, 'High': 2}
mapping_english = {'A1': 0, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 5, 'None': 0}
mapping_location = {'Local': 2, 'Remoteok': 1, 'Relocate': 0}
mapping_priority = {'Low': 0, 'Med': 1, 'High': 2}

df['years_exp_band'] = df['years_exp_band'].map(mapping_exp)
df['skills_coverage_band'] = df['skills_coverage_band'].map(mapping_skills)
df['english_level'] = df['english_level'].map(mapping_english)
df['location_match'] = df['location_match'].map(mapping_location)
df['priority'] = df['priority'].map(mapping_priority)

df['years_exp_band'].fillna(df['years_exp_band'].median(), inplace=True)

print(f"Clean dataset size: {len(df)} candidates")
print(f"\nPriority distribution:")
print(df['priority'].value_counts().sort_index())
print()

df.to_csv("cleaned_data.csv", index=False)

X = df.drop('priority', axis=1)
y = df['priority']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {len(X_train)} candidates")
print(f"Test set: {len(X_test)} candidates\n")


model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    class_weight='balanced'
)

model.fit(X_train, y_train)


y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)


print("=== MODEL PERFORMANCE ===\n")
print("Classification Report:")
print(classification_report(
    y_test,
    y_pred,
    target_names=['Low', 'Med', 'High'],
    zero_division=0
))

print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(pd.DataFrame(
    cm,
    index=['Actual Low', 'Actual Med', 'Actual High'],
    columns=['Pred Low', 'Pred Med', 'Pred High']
))


cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f"\nCross-validation accuracy: {cv_scores.mean():.2%} (+/- {cv_scores.std():.2%})")


print("\n=== FEATURE IMPORTANCE ===\n")
importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=False)
print(importances.to_string(index=False))


def predict_candidate_priority(years_exp, skills, referral, english, location):
    """
    Predict candidate priority with confidence score

    Parameters:
    - years_exp: '0-1', '1-3', '3-6', '6+'
    - skills: 'Low', 'Medium', 'High'
    - referral: 0 or 1
    - english: 'A1', 'A2', 'B1', 'B2', 'C1', 'C2'
    - location: 'Local', 'RemoteOK', 'Relocate'
    """
    exp_encoded = mapping_exp.get(years_exp, 1)
    skills_encoded = mapping_skills.get(skills.title(), 1)
    english_encoded = mapping_english.get(english.upper(), 2)
    location_encoded = mapping_location.get(location.title(), 1)
    features = np.array([[exp_encoded, skills_encoded, referral,
                         english_encoded, location_encoded]])

    prediction = model.predict(features)[0]
    probabilities = model.predict_proba(features)[0]

    priority_labels = {0: 'Low', 1: 'Medium', 2: 'High'}
    confidence = probabilities[int(prediction)] * 100

    return {
        'priority': priority_labels[prediction],
        'confidence': f"{confidence:.1f}%",
        'probabilities': {
            'Low': f"{probabilities[0]*100:.1f}%",
            'Medium': f"{probabilities[1]*100:.1f}%",
            'High': f"{probabilities[2]*100:.1f}%"
        }
    }
print("\n=== EXAMPLE PREDICTIONS ===\n")

examples = [
    ("6+", "High", 1, "C1", "Local"),
    ("1-3", "Medium", 0, "B2", "RemoteOK"),
    ("0-1", "Low", 0, "A2", "Relocate")
]

for exp, skills, ref, eng, loc in examples:
    result = predict_candidate_priority(exp, skills, ref, eng, loc)
    ref_text = "Yes" if ref == 1 else "No"
    print(f"Candidate: {exp} years | {skills} skills | Referral: {ref_text} | {eng} English | {loc}")
    print(f"Priority: {result['priority']} (Confidence: {result['confidence']})")
    print(f"Probabilities: {result['probabilities']}")
    print()

print("\n=== USE THIS MODEL TO: ===")
print("1. Rank candidates for interview priority")
print("2. Identify high-potential candidates quickly")
print("3. Optimize recruiter time by focusing on top candidates")
print("4. Make data-driven hiring decisions with confidence scores")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['referral_flag'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values a

Clean dataset size: 200 candidates

Priority distribution:
priority
0    37
1    94
2    69
Name: count, dtype: int64

Training set: 160 candidates
Test set: 40 candidates

=== MODEL PERFORMANCE ===

Classification Report:
              precision    recall  f1-score   support

         Low       0.83      0.71      0.77         7
         Med       0.83      0.79      0.81        19
        High       0.81      0.93      0.87        14

    accuracy                           0.82        40
   macro avg       0.83      0.81      0.82        40
weighted avg       0.83      0.82      0.82        40


Confusion Matrix:
             Pred Low  Pred Med  Pred High
Actual Low          5         2          0
Actual Med          1        15          3
Actual High         0         1         13

Cross-validation accuracy: 73.50% (+/- 9.17%)

=== FEATURE IMPORTANCE ===

             Feature  Importance
      years_exp_band    0.348843
skills_coverage_band    0.220997
      location_match    0.1781



In [10]:
import pickle
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)