In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# Display limits for rendered DataFrames: at most 20 columns and up to 999 rows.
pd.set_option("display.max_columns", 20)
pd.set_option("display.max_rows", 999)


from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier


In [None]:
# Read train_hr.csv into a DataFrame indexed by the "employee_id" column
data_ori = pd.read_csv(
    "train_hr.csv",
    index_col="employee_id",
)


In [None]:
# Work on an independent copy so the transformations in this notebook never
# touch data_ori (a plain assignment would only give the same DataFrame
# object a second name).
data = data_ori.copy()

In [None]:
# Impute missing values with fixed defaults: "Bachelor's" for education and
# 3.0 for previous_year_rating.
# The filled columns are assigned back instead of calling
# fillna(inplace=True) on an attribute-accessed column: that chained in-place
# form triggers a FutureWarning in recent pandas and leaves the frame
# unchanged once Copy-on-Write is active.
data["education"] = data["education"].fillna("Bachelor's")
data["previous_year_rating"] = data["previous_year_rating"].fillna(3.0)

### Grouping categorical values

#### `region` column: regions with fewer than 2,000 rows are pooled into `others_regions`

In [None]:
# Pool infrequent regions into a single "others_regions" category.
MIN_REGION_COUNT = 2000  # regions with fewer rows than this are pooled

# Rows per region, aligned with data's index. Held in a local Series rather
# than a temporary column, so nothing is added to and then dropped from the
# frame. "size" counts rows directly and does not depend on another column
# being non-null.
region_sizes = data.groupby("region")["region"].transform("size")
data["region_corrected"] = data["region"].where(
    region_sizes >= MIN_REGION_COUNT, "others_regions"
)

# Last expression of the cell, so the resulting distribution is displayed.
data["region_corrected"].value_counts()

In [None]:
# Group no_of_trainings: values of 6 or more share the "+6 trainings" label,
# every other value keeps its own category.
# The values are converted to strings explicitly; np.where performed the same
# conversion implicitly when it mixed the numeric column with a string label.
trainings = data["no_of_trainings"]
data["train_grouped"] = trainings.astype(str).mask(trainings >= 6, "+6 trainings")

# Last expression of the cell, so the category counts are displayed.
data["train_grouped"].value_counts()

### Converting categorical variables to dummies

In [None]:
# Columns expanded into one indicator column per level.
all_level_cols = [
    "department",
    "region_corrected",
    "education",
    "train_grouped",
    "recruitment_channel",
    "previous_year_rating",
]
# Columns encoded with their first level dropped.
first_dropped_cols = ["KPIs_met >80%", "awards_won?"]

# Two passes keep the indicator groups in the order listed above.
dt = pd.get_dummies(data, columns=all_level_cols, drop_first=False)
dt = pd.get_dummies(dt, columns=first_dropped_cols, drop_first=True)

# Remove the raw columns that are not passed on to the model.
dt = dt.drop(columns=["region", "no_of_trainings", "gender"])


In [None]:
# Preview the first rows of the encoded frame
dt.head()

In [None]:
# List the dtype of every column in the encoded frame
dt.dtypes

In [None]:
# The target is the "is_promoted" column; every other column of dt is a feature
target_col = "is_promoted"
X = dt.drop(columns=[target_col])
y = dt[target_col]


In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# XGBoost binary classifier: logistic objective, 100 estimators, learning rate 0.1
xgb_params = {
    "objective": "binary:logistic",
    "n_estimators": 100,
    "learning_rate": 0.1,
}
xgb_model = XGBClassifier(**xgb_params)

In [None]:
# Fit the XGBoost classifier on the training split
xgb_model.fit(X_train, y_train)

### XGBoost model: predictions and evaluation

In [None]:
# Predicted class label for every row of the test set
xgb_preds = xgb_model.predict(X_test)

In [None]:
# Predicted class probabilities for the test set (column 1 is used by the metrics below)
y_pred_proba = xgb_model.predict_proba(X_test)

In [None]:
# Share of test rows whose predicted class matches the true label
xgb_acc = accuracy_score(y_true=y_test, y_pred=xgb_preds)
print("XGBoost accuracy:", xgb_acc)

In [None]:
# Log loss of the predicted class probabilities against the true test labels
log_loss(y_test, y_pred_proba)

In [None]:
# ROC AUC computed from column 1 of the predicted probabilities
positive_scores = y_pred_proba[:, 1]
auc = roc_auc_score(y_test, positive_scores)
print('AUC: %.3f' % auc)

In [None]:
# ROC curve points (false/true positive rates and their thresholds) from the column-1 probabilities
fpr, tpr, threshold = metrics.roc_curve(y_test, y_pred_proba[:,1])

In [None]:
# ROC curve of the model, with the dashed diagonal as the reference line
fig, ax = plt.subplots()
ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % auc)
ax.plot([0, 1], [0, 1], 'r--')
ax.set_title('Receiver Operating Characteristic')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
# only the labelled model curve appears in the legend
ax.legend(loc='lower right')
plt.show()

In [None]:
# Precision-recall curve data
# (precision_recall_curve is imported with the other sklearn metrics at the
# top of the notebook.)

# Column-1 probabilities predicted for the test set
yhat = xgb_model.predict_proba(X_test)
pos_probs = yhat[:, 1]
precision, recall, _ = precision_recall_curve(y_test, pos_probs)

# No-skill baseline: share of positive labels in the set the curve is
# evaluated on (the test labels), so the baseline and the curve describe the
# same rows.
no_skill = float((y_test == 1).mean())

In [None]:
# Precision-recall curve of the model against the no-skill baseline.
# precision, recall and no_skill are computed in the preceding cell, so the
# curve is not recalculated here.
fig, ax = plt.subplots()
# horizontal no-skill reference line
ax.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
# model curve, labelled after the classifier that produced the probabilities
ax.plot(recall, precision, marker='.', label='XGBoost')
ax.set_title('Precision-Recall Curve')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.legend()
plt.show()

In [None]:
# Classification report for the test-set predictions
# (classification_report is imported with the other sklearn metrics at the
# top of the notebook.)
print(classification_report(y_test, xgb_preds))