In [1]:
import os
from kaggle.api.kaggle_api_extended import KaggleApi


import kaggle
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.tree import export_text
from sklearn.feature_extraction import DictVectorizer 
from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import roc_auc_score

import pickle


# Set random seed
SEED = 42


In [2]:

# Initialize Kaggle API
api = KaggleApi()
api.authenticate()

# Define project directory and data path
project_dir = os.path.join(os.getcwd(), "data")
os.makedirs(project_dir, exist_ok=True)

# Kaggle dataset identifier (update with correct dataset ID)
dataset_name = "itssuru/HR-Employee-Attrition"

# Download dataset to the project data directory
api.dataset_download_files(dataset_name, path=project_dir, unzip=True)

print(f"Dataset downloaded and extracted to {project_dir}")


Dataset URL: https://www.kaggle.com/datasets/itssuru/HR-Employee-Attrition
Dataset downloaded and extracted to C:\Users\Acer\Documents\workspace\ml-zoomcamp\mlzoomcamp-midterm-project\notebooks\data


In [3]:
# Path to the project directory
project_dir = r"C:\Users\Acer\Documents\workspace\ml-zoomcamp\mlzoomcamp-midterm-project\notebooks\data"
print(f"Dataset downloaded and extracted to {project_dir}")

# Replace 'your_file_name.csv' with the actual filename from the dataset
csv_file_name = "HR-Employee-Attrition.csv"  # Correct this if the file has a different name
csv_file_path = os.path.join(project_dir, csv_file_name)

# Check if the file exists
if not os.path.exists(csv_file_path):
    print(f"File not found: {csv_file_path}")
else:
    # Assigning a handle `hr` to read the CSV file
    df = pd.read_csv(csv_file_path)

    # Displaying the first few rows of the DataFrame to verify
    print(df.head())

Dataset downloaded and extracted to C:\Users\Acer\Documents\workspace\ml-zoomcamp\mlzoomcamp-midterm-project\notebooks\data
   Age Attrition     BusinessTravel  DailyRate              Department  \
0   41       Yes      Travel_Rarely       1102                   Sales   
1   49        No  Travel_Frequently        279  Research & Development   
2   37       Yes      Travel_Rarely       1373  Research & Development   
3   33        No  Travel_Frequently       1392  Research & Development   
4   27        No      Travel_Rarely        591  Research & Development   

   DistanceFromHome  Education EducationField  EmployeeCount  EmployeeNumber  \
0                 1          2  Life Sciences              1               1   
1                 8          1  Life Sciences              1               2   
2                 2          2          Other              1               4   
3                 3          4  Life Sciences              1               5   
4                 2          1 

In [4]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

In [5]:
df.attrition = (df.attrition == 'yes').astype(int)

In [6]:
# Drop all constant attributes (1 from the result about) that will not make a difference to our model description
df.drop(['employeecount', 'over18','standardhours' ],axis=1,inplace=True)


In [7]:
# Identify categorical and numerical columns
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
num_cols = df.select_dtypes(include=['number']).columns.tolist()
num_cols.remove('attrition')  # Remove target variable from numeric columns

In [8]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=SEED)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=SEED)

In [9]:
 df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [10]:
y_train = df_train.attrition.values
y_val = df_val.attrition.values
y_test = df_test.attrition.values
y_full_train =df_full_train.attrition.values

In [11]:
del df_train['attrition']
del df_val['attrition']
del df_test['attrition']

In [12]:
dict_train = df_train.to_dict(orient='records')
dict_val = df_val.to_dict(orient='records')

In [13]:
dv = DictVectorizer(sparse=False)

X_train = dv.fit_transform(dict_train)
X_val = dv.fit_transform(dict_val)

In [14]:
feature_names = dv.feature_names_

In [15]:
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=dv.feature_names_)

In [16]:
dval = xgb.DMatrix(X_val, label=y_val, feature_names=dv.feature_names_)

In [17]:
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,

    'objective': 'binary:logistic',
    'nthread': 8,

    'seed': 1,
    'verbosity': 1,
}


In [18]:
model = xgb.train(xgb_params, dtrain, num_boost_round=100)

In [19]:
y_pred = model.predict(dval) # The risk score for each employee in the validation dataset

In [20]:
y_pred

array([1.51200016e-04, 1.54876010e-02, 2.62746006e-01, 5.74889928e-02,
       8.85628257e-03, 7.27617741e-03, 6.59438372e-01, 5.61093271e-04,
       8.59615393e-05, 3.83243430e-03, 5.04899165e-03, 1.74491579e-04,
       1.76951420e-02, 5.30233234e-03, 8.90194904e-04, 5.21506695e-03,
       5.23285149e-03, 1.57800727e-02, 3.31193302e-03, 4.55631502e-03,
       1.19681835e-01, 9.75431561e-01, 2.43347534e-03, 4.41563316e-03,
       7.11973943e-03, 7.05151469e-04, 2.77456373e-01, 1.08045423e-02,
       6.27815537e-03, 4.01561614e-04, 6.64086686e-03, 5.03221788e-02,
       2.08768062e-02, 7.88952748e-04, 1.77160382e-01, 2.60945852e-03,
       6.55420299e-05, 5.11757156e-04, 1.45796742e-02, 2.20155343e-01,
       2.93012918e-03, 3.52598913e-03, 9.55339707e-03, 1.19043544e-01,
       1.15026012e-02, 6.78058714e-05, 1.88100651e-01, 4.87559289e-02,
       4.38843650e-04, 3.86657077e-03, 8.88632476e-01, 5.74631896e-03,
       7.79724896e-01, 5.40162297e-03, 9.01266560e-03, 1.89385295e-03,
      

In [21]:
roc_auc_score(y_val, y_pred)

np.float64(0.8315573770491803)

#### Selecting the Best Model

In [None]:
dt = DecisionTreeClassifier(max_depth=10, min_samples_leaf=15)
dt.fit(X_train, y_train)

In [None]:
y_pred = dt.predict_proba(X_val)[:, 1]
roc_auc_score(y_val, y_pred)

In [None]:
rf = RandomForestClassifier(n_estimators=200, 
                            max_depth=10,
                            min_samples_leaf=3,
                            random_state=42)
rf.fit(X_train, y_train)

In [None]:
y_pred = rf.predict_proba(X_val)[:, 1]
roc_auc_score(y_val, y_pred)

In [None]:
# XGBoost
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,

    'objective': 'binary:logistic',
    'nthread': 8,

    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, dtrain, num_boost_round=100)

In [None]:
y_pred = model.predict(dval)
roc_auc_score(y_val, y_pred)

In [None]:
df_full_train

In [None]:
df_full_train = df_full_train.reset_index(drop=True)

In [None]:
y_full_train = df_full_train.attrition

In [None]:
del df_full_train['attrition']

In [None]:
dicts_full_train = df_full_train.to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_full_train = dv.fit_transform(dicts_full_train)

dicts_test = df_test.to_dict(orient='records')
X_test = dv.transform(dicts_test)


In [None]:
feature_names

In [None]:
# feature_names = dv.get_feature_names_out().tolist()

dfulltrain = xgb.DMatrix(X_full_train, label=y_full_train, 
                        feature_names=dv.get_feature_names_out().tolist())

dtest = xgb.DMatrix(X_test, feature_names=dv.get_feature_names_out().tolist())

In [None]:
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,

    'objective': 'binary:logistic',
    'nthread': 8,

    'seed': 1,
    'verbosity': 1,
}
model = xgb.train(xgb_params, dtrain, num_boost_round=10)

In [None]:
y_pred = model.predict(dtest)

In [None]:
y_pred[:10]

In [None]:
roc_auc_score(y_test, y_pred)

## Full Retrain

In [None]:
# Parameter constants
C= 1.0
n_splits = 5

In [None]:
# Train function
def train(df_train, y_train, C=C):
    dicts = df_train[cat_cols + num_cols].to_dict(orient='records')

    dv = DictVectorizer(sparce=False)
    X_train = dv.fit_transform(dicts)

    model = xgb.train(xgb_params, dtrain, num_boost_round=10)
    return dv, model

In [None]:
# Predict function

def predict(df, dv, model):
    dicts = df[cat_cols + num_cols].to_dict(orient='records')
    
    dtest = dv.transform(dicts)
    y_pred = model.predict(dtest,validate_features=False)
    return y_pred

In [None]:
output_file = f'model_C={C}.bin'
output_file

In [None]:
with open(output_file, 'wb') as f_out:
    pickle.dump((dv, model), f_out)



In [None]:
import pickle
import xgboost as xgb
from xgboost import XGBClassifier


In [None]:
output_file = 'model_C=1.0.bin'
 
output_file

In [None]:
with open(output_file, 'rb') as f_in:
    (dv, model) = pickle.load(f_in)



In [None]:
employee = {'businesstravel': 'travel_rarely',
 'department': 'sales',
 'educationfield': 'life_sciences',
 'gender': 'female',
 'jobrole': 'sales_executive',
 'maritalstatus': 'single',
 'over18': 'y',
 'overtime': 'no',
 'age': 45,
 'dailyrate': 374,
 'distancefromhome': 20,
 'education': 3,
 'employeecount': 1,
 'employeenumber': 2046,
 'environmentsatisfaction': 4,
 'hourlyrate': 50,
 'jobinvolvement': 3,
 'joblevel': 2,
 'jobsatisfaction': 3,
 'monthlyincome': 4850,
 'monthlyrate': 23333,
 'numcompaniesworked': 8,
 'percentsalaryhike': 15,
 'performancerating': 3,
 'relationshipsatisfaction': 3,
 'standardhours': 80,
 'stockoptionlevel': 0,
 'totalworkingyears': 8,
 'trainingtimeslastyear': 3,
 'worklifebalance': 3,
 'yearsatcompany': 5,
 'yearsincurrentrole': 3,
 'yearssincelastpromotion': 0,
 'yearswithcurrmanager': 1,
}

In [None]:
X = dv.transform([employee])

In [None]:
dtest = xgb.DMatrix(X, feature_names.feature_name)

In [None]:
# model.predict_proba(X)[0, 1]
dtest = xgb.DMatrix(X, feature_names=model.feature_names)


In [None]:
print(model.feature_names)  # From the trained model

In [None]:
print(dtest.feature_names)  # From the test DMatrix

In [None]:

# Assume `model` is a Booster
dtest = xgb.DMatrix(X)  # Convert your test data to DMatrix
y_pred = model.predict(dtest)  # Use `predict` directly