In [1]:
import warnings
import random

import pandas as pd
import numpy as np

import optuna as op
import lightgbm as lg

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RepeatedStratifiedKFold

warnings.filterwarnings("ignore")
op.logging.set_verbosity(op.logging.INFO)
warnings.filterwarnings("ignore", category=op.exceptions.ExperimentalWarning, module="optuna.*")

random.seed(1)

# Data Import

In [2]:
train = pd.read_csv('/kaggle/input/playground-series-s3e3/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s3e3/test.csv')
original = pd.read_csv('/kaggle/input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv')

In [3]:
original = original.rename(columns={'EmployeeNumber': 'id'})
original['Attrition'] = (original['Attrition'] == 'Yes').astype(int)
original = original[train.columns.tolist()]

In [4]:
data = pd.concat([train, original]).reset_index(drop=True)

In [5]:
data.head()

Unnamed: 0,id,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,...,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,0,36,Travel_Frequently,599,Research & Development,24,3,Medical,1,4,...,80,1,10,2,3,10,0,7,8,0
1,1,35,Travel_Rarely,921,Sales,8,3,Other,1,1,...,80,1,4,3,3,4,2,0,3,0
2,2,32,Travel_Rarely,718,Sales,26,3,Marketing,1,3,...,80,2,4,3,3,3,2,1,2,0
3,3,38,Travel_Rarely,1488,Research & Development,2,3,Medical,1,3,...,80,0,15,1,1,6,0,0,2,0
4,4,50,Travel_Rarely,1017,Research & Development,5,4,Medical,1,2,...,80,0,31,0,3,31,14,4,10,1


In [6]:
print(f"Orignial Training Dataset\n {train.Attrition.value_counts()}")
print(f"Extended Dataset\n {data.Attrition.value_counts()}")

Orignial Training Dataset
 0    1477
1     200
Name: Attrition, dtype: int64
Extended Dataset
 0    2710
1     437
Name: Attrition, dtype: int64


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3147 entries, 0 to 3146
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   id                        3147 non-null   int64 
 1   Age                       3147 non-null   int64 
 2   BusinessTravel            3147 non-null   object
 3   DailyRate                 3147 non-null   int64 
 4   Department                3147 non-null   object
 5   DistanceFromHome          3147 non-null   int64 
 6   Education                 3147 non-null   int64 
 7   EducationField            3147 non-null   object
 8   EmployeeCount             3147 non-null   int64 
 9   EnvironmentSatisfaction   3147 non-null   int64 
 10  Gender                    3147 non-null   object
 11  HourlyRate                3147 non-null   int64 
 12  JobInvolvement            3147 non-null   int64 
 13  JobLevel                  3147 non-null   int64 
 14  JobRole                 

In [8]:
original.nunique().sort_values()

StandardHours                  1
EmployeeCount                  1
Over18                         1
Attrition                      2
PerformanceRating              2
OverTime                       2
Gender                         2
BusinessTravel                 3
Department                     3
MaritalStatus                  3
WorkLifeBalance                4
StockOptionLevel               4
RelationshipSatisfaction       4
JobSatisfaction                4
JobInvolvement                 4
EnvironmentSatisfaction        4
Education                      5
JobLevel                       5
EducationField                 6
TrainingTimesLastYear          7
JobRole                        9
NumCompaniesWorked            10
PercentSalaryHike             15
YearsSinceLastPromotion       16
YearsWithCurrManager          18
YearsInCurrentRole            19
DistanceFromHome              29
YearsAtCompany                37
TotalWorkingYears             40
Age                           43
HourlyRate

# Preprocessing

## Lightgbm

In [9]:
def preprocessing_lightgbm(df):
    df = df.drop(['id'], axis=1)
    
    category_columns = ['StandardHours', 'EmployeeCount', 'Over18','Education', 'PerformanceRating',
                        'OverTime', 'Gender', 'BusinessTravel', 'Department', 'MaritalStatus',
                        'WorkLifeBalance', 'StockOptionLevel', 'RelationshipSatisfaction',
                        'JobSatisfaction', 'EnvironmentSatisfaction', 'Education', 'JobLevel',
                        'EducationField', 'TrainingTimesLastYear', 'JobRole', 'Department']
    for col in category_columns:
        df[col] = df[col].astype('category')
    
    numerical_columns = ['NumCompaniesWorked', 'PercentSalaryHike', 'DistanceFromHome',
                         'YearsSinceLastPromotion', 'YearsWithCurrManager', 'YearsInCurrentRole',
                         'YearsAtCompany', 'TotalWorkingYears', 'MonthlyIncome']
    for col in numerical_columns:
        df[col] = df[col].astype(int)
    
    # Approximately Poisson Distributed -> Square-Root-Transformation
    transform_columns = ['DistanceFromHome', 'YearsSinceLastPromotion', 'YearsWithCurrManager',
                        'YearsInCurrentRole', 'YearsAtCompany', 'TotalWorkingYears', 'MonthlyIncome']
    for col in transform_columns:
        df[col] = np.sqrt(df[col])
     
    return df

In [10]:
train_lgb = preprocessing_lightgbm(train)
data_lgb = preprocessing_lightgbm(data)
test_lgb = preprocessing_lightgbm(test)

In [11]:
train_lgb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1677 entries, 0 to 1676
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   Age                       1677 non-null   int64   
 1   BusinessTravel            1677 non-null   category
 2   DailyRate                 1677 non-null   int64   
 3   Department                1677 non-null   category
 4   DistanceFromHome          1677 non-null   float64 
 5   Education                 1677 non-null   category
 6   EducationField            1677 non-null   category
 7   EmployeeCount             1677 non-null   category
 8   EnvironmentSatisfaction   1677 non-null   category
 9   Gender                    1677 non-null   category
 10  HourlyRate                1677 non-null   int64   
 11  JobInvolvement            1677 non-null   int64   
 12  JobLevel                  1677 non-null   category
 13  JobRole                   1677 non-null   catego

In [12]:
y = data_lgb['Attrition']
x = data_lgb.drop(['Attrition'], axis=1)

The large dataset will be used for hyperparameter tuning. However, the model will only be validated on the actual data set provided as part of this competition to prevent the validation on a data set that might come from a slightly different distribution [...]

In [13]:
y_val = train_lgb['Attrition']
x_val = train_lgb.drop(['Attrition'], axis=1)

In [14]:
#from sklearn.utils.class_weight import compute_sample_weight
#weights_df = pd.DataFrame(compute_sample_weight('balanced', data.Attrition), columns=['weight'])

# Optuna Hyperparameter Tuning

Run trials that include [repeated] stratified K-fold cross validation. A high split gives more certainty towards the prevention of overfitting, the repetitions add to this.

In [15]:
rskf = RepeatedStratifiedKFold(n_repeats=5, n_splits=25, random_state=1)

def objective(trial):
    
    scores = []
    
    params = {
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        'boosting_type': 'gbdt',
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-5, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-4, 10.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 2, 75),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.35, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 8),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 25),
    }
    
    for i, (train_idx, test_index) in enumerate(rskf.split(x, y)):
        
        dtrain = lg.Dataset(x.iloc[train_idx], label=y.iloc[train_idx])
        dval = lg.Dataset(x.iloc[test_index], label=y.iloc[test_index])
        
        pruning_callback = op.integration.LightGBMPruningCallback(trial, metric='auc', valid_name='val')
        gbm = lg.train(params=params, train_set=dtrain, valid_sets=[dtrain, dval],
                       valid_names=['train', 'val'], num_boost_round=10_000,
                       callbacks=[lg.log_evaluation(250, show_stdv=False), lg.early_stopping(10, verbose = False),
                                 lg.print_evaluation(250, show_stdv=False), pruning_callback])
        
        pred = gbm.predict(data=x_val)
        score = roc_auc_score(y_val, pred)
        scores.append(score)
        
    return np.mean(scores)
    

In [16]:
pruner = op.pruners.MedianPruner(n_startup_trials=35, n_warmup_steps=50)
sampler = op.samplers.TPESampler(multivariate=True, n_startup_trials=35)
study = op.create_study(direction="maximize", sampler=sampler, pruner=pruner)

study.optimize(objective, n_trials=100)  # Beware! This might take a while :)

[32m[I 2023-01-17 21:09:52,303][0m A new study created in memory with name: no-name-ca057e52-02e0-4ba6-bee7-ef967a68724e[0m
[32m[I 2023-01-17 21:10:21,945][0m Trial 0 finished with value: 0.9658109952606636 and parameters: {'lambda_l1': 0.00016005753784128064, 'lambda_l2': 0.07983533948177961, 'max_depth': 21, 'num_leaves': 192, 'feature_fraction': 0.39008524797023303, 'bagging_fraction': 0.629188642838665, 'bagging_freq': 7, 'min_child_samples': 7}. Best is trial 0 with value: 0.9658109952606636.[0m
[32m[I 2023-01-17 21:10:41,357][0m Trial 1 finished with value: 0.9441300880162493 and parameters: {'lambda_l1': 4.236675696797469e-05, 'lambda_l2': 0.0018781463283669613, 'max_depth': 21, 'num_leaves': 53, 'feature_fraction': 0.6559081899075623, 'bagging_fraction': 0.5899936543575481, 'bagging_freq': 8, 'min_child_samples': 14}. Best is trial 0 with value: 0.9658109952606636.[0m
[32m[I 2023-01-17 21:10:55,977][0m Trial 2 finished with value: 0.8490251861882193 and parameters: {

In [17]:
print("Number of finished trials: {}".format(len(study.trials)))
print("Best trial:")
trial = study.best_trial
print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials: 100
Best trial:
  Value: 0.9847593500338525
  Params: 
    lambda_l1: 0.011275010242960529
    lambda_l2: 0.0032813361527887267
    max_depth: 28
    num_leaves: 218
    feature_fraction: 0.7664729878100122
    bagging_fraction: 0.9159150490160863
    bagging_freq: 2
    min_child_samples: 9


# Optuna Visualizations

In [18]:
fig = op.visualization.plot_contour(study, params=['max_depth', 'num_leaves'])
fig.show()

In [19]:
fig = op.visualization.plot_param_importances(study)
fig.show()

In [20]:
fig = op.visualization.plot_intermediate_values(study)
fig.show()

# Model Training

In [21]:
best_params = study.best_params
model = lg.LGBMClassifier(**best_params)
model.fit(x_val, y_val, verbose=0)



LGBMClassifier(bagging_fraction=0.9159150490160863, bagging_freq=2,
               feature_fraction=0.7664729878100122,
               lambda_l1=0.011275010242960529, lambda_l2=0.0032813361527887267,
               max_depth=28, min_child_samples=9, num_leaves=218)

# Prediction & Submission

In [22]:
test_pred = model.predict_proba(x)[:, 1]
roc_auc_score(y, test_pred)

0.9205552787793324

In [23]:
pred = model.predict_proba(test_lgb)[:, 1]
submission = pd.DataFrame({"id": test.id, "Attrition": pred})
submission.head()

Unnamed: 0,id,Attrition
0,1677,0.056057
1,1678,0.000461
2,1679,0.001308
3,1680,0.000651
4,1681,0.693144


In [24]:
submission.to_csv("submission.csv", index=False)