In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn import set_config
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
df=pd.read_csv('/kaggle/input/hirings-dataset/hiring.csv')

In [3]:
df.head(2)

Unnamed: 0,Age,Gender,EducationLevel,ExperienceYears,PreviousCompanies,DistanceFromCompany,InterviewScore,SkillScore,PersonalityScore,RecruitmentStrategy,HiringDecision
0,26,1,2,0,3,27,48,78,91,1,1
1,39,1,4,12,3,26,35,68,80,2,1


In [4]:
X = df.drop(['HiringDecision'], axis=1)
y = df['HiringDecision']

In [5]:
df.columns

Index(['Age', 'Gender', 'EducationLevel', 'ExperienceYears',
       'PreviousCompanies', 'DistanceFromCompany', 'InterviewScore',
       'SkillScore', 'PersonalityScore', 'RecruitmentStrategy',
       'HiringDecision'],
      dtype='object')

In [6]:
# Step 1: Define preprocessing for numerical and categorical columns
numerical_features = ['Age', 'ExperienceYears', 'PreviousCompanies', 'DistanceFromCompany', 'InterviewScore', 'SkillScore', 'PersonalityScore']
categorical_features = ['Gender', 'EducationLevel', 'RecruitmentStrategy']

# Create transformers and scaling
numerical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')),
                                        ('scaler', StandardScaler())
                                       ])
# One hot encode categorical data
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                                          ('onehot', OneHotEncoder(handle_unknown='ignore'))
                                         ])

# Combine them into a column transformer
preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, numerical_features),
                                               ('cat', categorical_transformer, categorical_features)
                                              ])

# ull pipeline with preprocessing and model training
pipeline = Pipeline(steps=[('preprocessor', preprocessor),                 # Preprocessing
                           ('smote', SMOTE(sampling_strategy='auto', random_state=42)),  # SMOTE Oversampling
                           ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
                          ])
                           


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'\nModel Accuracy: {accuracy * 100:.2f}%')
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Model Accuracy: 92.00%

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.94      0.94       215
           1       0.86      0.86      0.86        85

    accuracy                           0.92       300
   macro avg       0.90      0.90      0.90       300
weighted avg       0.92      0.92      0.92       300



In [7]:
confusion_matrix(y_test, y_pred)

array([[203,  12],
       [ 12,  73]])

# Making Prediction and recommendations

In [8]:
# === Make Predictions for All Candidates ===
df['Hire_Probability'] = pipeline.predict_proba(X)[:, 1]  # Probability of being hired

# Filter candidates who were not hired but have a high hire probability
not_hired = df[df['HiringDecision'] == 0]

# Sort by highest predicted probability
top_not_hired = not_hired.sort_values('Hire_Probability', ascending=False)

In [9]:
top_not_hired

Unnamed: 0,Age,Gender,EducationLevel,ExperienceYears,PreviousCompanies,DistanceFromCompany,InterviewScore,SkillScore,PersonalityScore,RecruitmentStrategy,HiringDecision,Hire_Probability
1440,42,1,2,11,4,5,60,99,61,1,0,0.94
1402,30,0,3,14,5,29,57,4,88,1,0,0.89
1445,27,1,2,14,1,45,30,13,77,1,0,0.80
1027,40,1,2,4,4,16,50,58,9,1,0,0.75
1348,50,1,3,5,1,35,14,47,55,1,0,0.66
...,...,...,...,...,...,...,...,...,...,...,...,...
576,34,1,2,1,2,18,29,28,23,3,0,0.00
1185,35,1,2,13,2,18,23,54,54,2,0,0.00
579,27,1,2,0,1,35,41,28,12,3,0,0.00
1174,28,0,1,9,1,29,56,58,46,2,0,0.00


In [10]:
set_config(display='diagram')
pipeline

In [11]:
# === Sample Data (same structure as the original dataset) ===
sample_data = pd.DataFrame({
    'Age': [28, 35, 92],
    'ExperienceYears': [4, 10, 15],
    'PreviousCompanies': [2, 3, 5],
    'DistanceFromCompany': [10,55, 20],
    'InterviewScore': [85, 90, 75],
    'SkillScore': [55,29,52],
    'PersonalityScore': [45, 55, 70],
    'Gender': [1, 0, 1],
    'EducationLevel': [2, 3, 1],
    'RecruitmentStrategy': [1, 1, 2],
})

# === Make Predictions ===
sample_preds = pipeline.predict(sample_data)

# === Display Results ===
print("\nSample Predictions:", sample_preds)


Sample Predictions: [1 1 0]


# Working on Neural network

In [12]:
# # ✅ Import Libraries
# import pandas as pd
# import numpy as np
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from sklearn.base import BaseEstimator, ClassifierMixin
# from sklearn.pipeline import Pipeline
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.impute import SimpleImputer
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
# from imblearn.over_sampling import SMOTE
# import seaborn as sns
# import matplotlib.pyplot as plt

# # ✅ Load Dataset
# # Replace with your actual dataset
# df=pd.read_csv('/kaggle/input/hirings-dataset/hiring.csv')

# X = df.drop(['HiringDecision'], axis=1)  # Features
# y = df['HiringDecision']  # Target variable

# # ✅ Preprocessing Steps
# numerical_features = ['Age', 'ExperienceYears', 'PreviousCompanies', 
#                       'DistanceFromCompany', 'InterviewScore', 
#                       'SkillScore', 'PersonalityScore']

# categorical_features = ['Gender', 'EducationLevel', 'RecruitmentStrategy']

# # Transformers
# numerical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='mean')),
#     ('scaler', StandardScaler())
# ])

# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))
# ])

# # ColumnTransformer
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numerical_transformer, numerical_features),
#         ('cat', categorical_transformer, categorical_features)
#     ])

# # ✅ Custom PyTorch Model wrapped in sklearn-style Estimator
# class NeuralNetworkClassifier(BaseEstimator, ClassifierMixin):
#     def __init__(self, input_dim, hidden_dim=64, epochs=50, batch_size=32, learning_rate=0.001):
#         self.input_dim = input_dim
#         self.hidden_dim = hidden_dim
#         self.epochs = epochs
#         self.batch_size = batch_size
#         self.learning_rate = learning_rate
#         self.model = self._build_model()
#         self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
#         self.criterion = nn.BCELoss()

#     def _build_model(self):
#         model = nn.Sequential(
#             nn.Linear(self.input_dim, self.hidden_dim),
#             nn.ReLU(),
#             nn.Dropout(0.3),
#             nn.Linear(self.hidden_dim, 32),
#             nn.ReLU(),
#             nn.Linear(32, 1),
#             nn.Sigmoid()
#         )
#         return model

#     def fit(self, X, y):
#         X_tensor = torch.tensor(X, dtype=torch.float32)
#         y_tensor = torch.tensor(y.values.reshape(-1, 1), dtype=torch.float32)

#         dataset = torch.utils.data.TensorDataset(X_tensor, y_tensor)
#         dataloader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

#         self.model.train()
#         for epoch in range(self.epochs):
#             for batch_X, batch_y in dataloader:
#                 self.optimizer.zero_grad()
#                 outputs = self.model(batch_X)
#                 loss = self.criterion(outputs, batch_y)
#                 loss.backward()
#                 self.optimizer.step()

#     def predict(self, X):
#         self.model.eval()
#         with torch.no_grad():
#             X_tensor = torch.tensor(X, dtype=torch.float32)
#             outputs = self.model(X_tensor)
#             predictions = (outputs >= 0.5).int().flatten().numpy()
#         return predictions

#     def predict_proba(self, X):
#         self.model.eval()
#         with torch.no_grad():
#             X_tensor = torch.tensor(X, dtype=torch.float32)
#             outputs = self.model(X_tensor)
#             probabilities = torch.cat([1 - outputs, outputs], dim=1).numpy()
#         return probabilities

# # ✅ Pipeline with SMOTE and Neural Network
# pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),                 # Preprocessing
#     ('smote', SMOTE(sampling_strategy='auto', random_state=42)),  # SMOTE Oversampling
#     ('nn', NeuralNetworkClassifier(input_dim=X.shape[1], epochs=50, batch_size=32, learning_rate=0.001))  # Neural Network
# ])

# # ✅ Train-Test Split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # ✅ Model Training
# pipeline.fit(X_train, y_train)

# # ✅ Model Predictions
# y_pred = pipeline.predict(X_test)
# y_prob = pipeline.predict_proba(X_test)[:, 1]

# # ✅ Evaluation Metrics
# accuracy = accuracy_score(y_test, y_pred)
# print(f'\n🔥 Model Accuracy: {accuracy * 100:.2f}%')
# print("\nClassification Report:\n", classification_report(y_test, y_pred))