In [1]:
import os
from kaggle.api.kaggle_api_extended import KaggleApi


import kaggle
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.tree import export_text
from sklearn.feature_extraction import DictVectorizer 
from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import roc_auc_score

import pickle


# Set random seed
SEED = 42


In [2]:

# Create the Kaggle API client and authenticate it
api = KaggleApi()
api.authenticate()

# Kaggle dataset identifier in "<owner>/<dataset>" form (update if the dataset moves)
dataset_name = "itssuru/HR-Employee-Attrition"

# Target folder for the data: <current working directory>/data (created if missing)
project_dir = os.path.join(os.getcwd(), "data")
os.makedirs(project_dir, exist_ok=True)

# Fetch the dataset archive into the data folder and unpack it there
api.dataset_download_files(dataset_name, unzip=True, path=project_dir)

print(f"Dataset downloaded and extracted to {project_dir}")


Dataset URL: https://www.kaggle.com/datasets/itssuru/HR-Employee-Attrition
Dataset downloaded and extracted to C:\Users\Acer\Documents\workspace\ml-zoomcamp\mlzoomcamp-midterm-project\notebooks\data


In [3]:
# Path to the data directory, resolved relative to the current working directory
# (<cwd>/data, the same folder the download cell writes to) instead of a
# machine-specific absolute path, so the notebook runs on any checkout.
project_dir = os.path.join(os.getcwd(), "data")
print(f"Dataset downloaded and extracted to {project_dir}")

# Name of the CSV file inside the extracted dataset
csv_file_name = "HR-Employee-Attrition.csv"  # Correct this if the file has a different name
csv_file_path = os.path.join(project_dir, csv_file_name)

# Fail fast when the file is missing: every later cell needs `df`, so merely
# printing a message would only postpone the failure to a confusing NameError.
if not os.path.exists(csv_file_path):
    raise FileNotFoundError(f"File not found: {csv_file_path}")

# Read the CSV file into the DataFrame `df`
df = pd.read_csv(csv_file_path)

# Displaying the first few rows of the DataFrame to verify
print(df.head())

Dataset downloaded and extracted to C:\Users\Acer\Documents\workspace\ml-zoomcamp\mlzoomcamp-midterm-project\notebooks\data
   Age Attrition     BusinessTravel  DailyRate              Department  \
0   41       Yes      Travel_Rarely       1102                   Sales   
1   49        No  Travel_Frequently        279  Research & Development   
2   37       Yes      Travel_Rarely       1373  Research & Development   
3   33        No  Travel_Frequently       1392  Research & Development   
4   27        No      Travel_Rarely        591  Research & Development   

   DistanceFromHome  Education EducationField  EmployeeCount  EmployeeNumber  \
0                 1          2  Life Sciences              1               1   
1                 8          1  Life Sciences              1               2   
2                 2          2          Other              1               4   
3                 3          4  Life Sciences              1               5   
4                 2          1 

In [4]:
# Normalise the column headers: lower-case, spaces replaced by underscores
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Names of all columns whose dtype is `object`
is_object_dtype = df.dtypes == 'object'
categorical_columns = list(df.dtypes[is_object_dtype].index)

# Apply the same normalisation to the values stored in those columns
for column_name in categorical_columns:
    lowered = df[column_name].str.lower()
    df[column_name] = lowered.str.replace(' ', '_')

In [5]:
# Drop the constant attributes (a single unique value each, per the result above);
# they cannot make a difference to the model.
# `df` is re-bound and errors='ignore' is used so this cell can be re-run safely:
# the former in-place drop raised KeyError on a second execution because the
# columns were already gone.
df = df.drop(columns=['employeecount', 'over18', 'standardhours'], errors='ignore')


In [6]:
# First split: hold out 20% of all rows as the test set.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=SEED)
# Second split: 25% of the remaining 80% (i.e. 20% of all rows) becomes the validation set.
# NOTE(review): df_train / df_val are not used by the later cells shown here, which work on df_full_train.
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=SEED)

In [7]:
# Build the boolean target arrays: True where attrition == 'yes'.
# Note that y_train is taken from df_full_train (train + validation rows).
y_train = df_full_train['attrition'].eq('yes').to_numpy()
y_test = df_test['attrition'].eq('yes').to_numpy()

In [8]:
# Remove the target column from both feature frames so it cannot leak into the features
for frame in (df_full_train, df_test):
    del frame['attrition']

In [9]:
# Replace missing values with 0, then convert each DataFrame into a list of
# per-row dictionaries (the input format the DictVectorizer below consumes).
dict_train = df_full_train.fillna(0).to_dict(orient='records')
dict_test = df_test.fillna(0).to_dict(orient='records')

In [10]:
# Fit the vectorizer on the training records only, then reuse the fitted
# vectorizer for the test records so both matrices share the same columns.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(dict_train)
X_test = dv.transform(dict_test)

In [11]:
# Wrap the feature matrices and labels in XGBoost DMatrix objects, attaching the
# vectorizer's feature names to both (the model itself is trained in a later cell).
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=dv.feature_names_)
dtest = xgb.DMatrix(X_test, label=y_test, feature_names=dv.feature_names_)

In [12]:
# Hyper-parameters handed to xgb.train
xgb_params = dict(
    # learning rate and tree-shape settings
    eta=0.3,
    max_depth=6,
    min_child_weight=1,
    # task definition and threading
    objective='binary:logistic',
    nthread=8,
    # NOTE(review): seed is 1 although the notebook-level SEED is 42 — confirm which is intended
    seed=1,
    verbosity=1,
)


In [13]:
# Number of boosting rounds used when training the final model
num_trees = 100

In [14]:
# Train the booster on the training DMatrix with xgb_params for num_trees rounds
model = xgb.train(xgb_params, dtrain, num_boost_round=num_trees)

In [15]:
# Score the held-out test set and compute the ROC AUC of those scores;
# the AUC is the cell's last expression, so the notebook displays it.
y_pred_xgb = model.predict(dtest)
roc_auc_score(y_test, y_pred_xgb)

np.float64(0.7860231271995978)

In [16]:
# Parameter constants
# C is used in the saved model's file name and as the default of train()'s `C` argument.
C= 1.0
# NOTE(review): n_splits is not referenced by any cell shown here — confirm whether it is still needed.
n_splits = 5

In [17]:
# Train function
def train(df_train, y_train, C=C):
    """Fit a DictVectorizer and an XGBoost booster on the given training data.

    Mirrors the notebook's main pipeline: missing values are filled with 0,
    every column of ``df_train`` is vectorized, and the booster is trained
    with ``xgb_params`` for ``num_trees`` rounds.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Feature columns only; the target column must already be removed.
    y_train : array-like
        Binary labels aligned row-by-row with ``df_train``.
    C : float, optional
        Not used by the XGBoost model; kept so existing calls that pass it
        keep working.

    Returns
    -------
    tuple
        ``(dv, model)`` - the fitted DictVectorizer and the trained booster.
    """
    # Use every column of the frame passed in (the previously referenced
    # cat_cols / num_cols lists are not defined anywhere in this notebook).
    dicts = df_train.fillna(0).to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(dicts)

    # Build the DMatrix from the arguments instead of relying on the
    # notebook-level `dtrain`, so the function really trains on its inputs.
    dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=dv.feature_names_)

    model = xgb.train(xgb_params, dtrain, num_boost_round=num_trees)
    return dv, model

In [18]:
# Predict function

def predict(df, dv, model):
    """Score the rows of ``df`` with a fitted vectorizer and a trained booster.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns only, in the same raw form used for training.
    dv : DictVectorizer
        Vectorizer fitted on the training data.
    model : xgboost booster
        Model trained on the matrix produced by ``dv``.

    Returns
    -------
    numpy.ndarray
        One model score per row of ``df``.
    """
    # Same preprocessing as training: every column, missing values -> 0
    # (the previously referenced cat_cols / num_cols lists are not defined
    # anywhere in this notebook).
    dicts = df.fillna(0).to_dict(orient='records')

    X = dv.transform(dicts)
    # The booster's predict() accepts a DMatrix, not a raw array. Attaching the
    # feature names lets XGBoost's default feature validation catch a mismatch
    # between the vectorizer and the model.
    dtest = xgb.DMatrix(X, feature_names=dv.feature_names_)
    y_pred = model.predict(dtest)
    return y_pred

In [19]:
# File name for the pickled model; embeds the current value of C
output_file = 'model_C={}.bin'.format(C)
output_file

'model_C=1.0.bin'

In [20]:
# Persist the fitted vectorizer together with the trained model as one tuple
artifact = (dv, model)
with open(output_file, 'wb') as model_file:
    pickle.dump(artifact, model_file)



#### Testing the saved model
##### Load the model
* Restart the kernel first (Menu -> Kernel -> Restart Kernel) so the cells below run without any state left over from training.

In [1]:
import pickle
import xgboost as xgb
from xgboost import XGBClassifier

In [2]:
# Must match the file name written by the training part of the notebook
output_file = 'model_C=1.0.bin'
 
output_file

'model_C=1.0.bin'

In [3]:
# Restore the (dv, model) tuple from the pickle file.
# NOTE: unpickling can execute code embedded in the file - only load model
# files you produced yourself or otherwise trust.
with open(output_file, 'rb') as model_file:
    dv, model = pickle.load(model_file)



In [4]:
# Example employee record used to smoke-test the saved model.
# NOTE(review): over18, employeecount and standardhours are dropped from the
# training data earlier in the notebook; DictVectorizer ignores features it
# did not see during fit, so they should have no effect here - verify.
employee = dict(
    # categorical attributes
    businesstravel='travel_rarely',
    department='sales',
    educationfield='life_sciences',
    gender='female',
    jobrole='sales_executive',
    maritalstatus='single',
    over18='y',
    overtime='no',
    # numeric attributes
    age=30,
    dailyrate=374,
    distancefromhome=20,
    education=3,
    employeecount=1,
    employeenumber=2046,
    environmentsatisfaction=4,
    hourlyrate=50,
    jobinvolvement=3,
    joblevel=2,
    jobsatisfaction=3,
    monthlyincome=1050,
    monthlyrate=23333,
    numcompaniesworked=8,
    percentsalaryhike=15,
    performancerating=3,
    relationshipsatisfaction=3,
    standardhours=80,
    stockoptionlevel=0,
    totalworkingyears=8,
    trainingtimeslastyear=3,
    worklifebalance=3,
    yearsatcompany=5,
    yearsincurrentrole=3,
    yearssincelastpromotion=0,
    yearswithcurrmanager=1,
)

In [5]:
# Vectorize the single record; transform() takes a list of dicts, hence the one-element list
X = dv.transform([employee])

In [6]:
# Wrap the feature row in a DMatrix carrying the vectorizer's feature names
dX = xgb.DMatrix(X, feature_names=dv.feature_names_)

In [7]:
# y_pred_xgb = model.predict(dX)
# y_pred_xgb 

In [8]:
# Score the single record and pull the one value out of the returned array
prediction = model.predict(dX)
score = prediction.item()
print('prediction: %.3f' % score)

# 0.5 is the decision threshold between the two verdicts
verdict = 'Verdict: Attrition' if score >= 0.5 else 'Verdict: Attrition Not Likely'
print(verdict)


prediction: 0.470
Verdict: Attrition Not Likely
