In [1]:
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.core.metrics import make_scorer

import numpy as np
import pandas as pd

from sklearn.metrics import f1_score

# Let pandas show up to 500 columns before it starts truncating wide frames.
pd.set_option('display.max_columns', 500)

In [2]:
# Load the credit-card default data from the Excel workbook.
# skiprows=1 skips the first row of the sheet, so the column names are taken
# from the row that follows it.
df = pd.read_excel('../data/default of credit card clients.xls', skiprows=1)

In [3]:
# Rename the target variable to a short, code-friendly name.
df.rename(columns={"default payment next month": "target"}, inplace=True)

# Drop the ID column; errors="ignore" keeps the cell safe to re-run after
# the column has already been removed.
df.drop(columns="ID", errors="ignore", inplace=True)

# Change numeric encodings to labels.
#
# The result is assigned back to the column instead of calling
# `df[col].replace(..., inplace=True)`: an in-place update on a Series pulled
# out of the frame is a chained assignment, which is not guaranteed to
# propagate to `df` (and never does under pandas Copy-on-Write).
df["SEX"] = df["SEX"].replace({1: "Male", 2: "Female"})

df["MARRIAGE"] = df["MARRIAGE"].replace(
    {1: "Married", 2: "Single", 3: "Other", 0: "Other"}
)

# Codes 0, 4, 5 and 6 are all folded into a single "Other" level.
df["EDUCATION"] = df["EDUCATION"].replace(
    {
        1: "GraduateSchool",
        2: "University",
        3: "HighSchool",
        4: "Other",
        0: "Other",
        5: "Other",
        6: "Other",
    }
)

In [4]:
# Relative frequency of each class of the target over the full data set.
target_share = df["target"].value_counts(normalize=True)
header = "Distribution of the target variable - full data:\n"
print(header, target_share, sep='')

Distribution of the target variable - full data:
target
0    0.7788
1    0.2212
Name: proportion, dtype: float64


In [5]:
def bank_profit(y_true, y_pred, alpha=1/3):
    """Average profit per observation that was predicted as class 0.

    Only predicted negatives (``y_pred == 0``) contribute:

        profit = (alpha * TN - (1 - alpha) * FN) / (TN + FN)

    where TN counts predicted negatives whose true label is also 0 and FN
    counts predicted negatives whose true label differs. Presumably a
    predicted 0 corresponds to a client the bank would accept; confirm with
    the label definition.

    Parameters
    ----------
    y_true : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels, same length as ``y_true``.
    alpha : float, default 1/3
        Gain earned on each true negative; ``1 - alpha`` is the loss on each
        false negative.

    Returns
    -------
    float
        The profit ratio, at most ``alpha``. Returns 0.0 when no observation
        is predicted as class 0, instead of dividing zero by zero.
    """
    # Work on plain arrays so lists and pandas objects are compared by position.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    predicted_negative = y_pred == 0
    n_predicted_negative = int(np.count_nonzero(predicted_negative))
    if n_predicted_negative == 0:
        # Nothing was predicted as class 0, so there is nothing to earn or lose.
        return 0.0

    true_negative = int(np.count_nonzero(predicted_negative & (y_true == 0)))
    false_negative = n_predicted_negative - true_negative

    return (alpha * true_negative - (1 - alpha) * false_negative) / n_predicted_negative


# AutoGluon scorer wrapping sklearn's binary F1; `average` is forwarded to
# f1_score.
f1_ag = make_scorer(name="f1_binary",
                    score_func=f1_score,
                    average="binary",
                    greater_is_better=True,
                    optimum=1)

# AutoGluon scorer wrapping the custom bank_profit metric.
# The best value bank_profit can reach is alpha (every predicted negative is a
# true negative), i.e. 1/3 with the default alpha, so the optimum is stated
# explicitly instead of falling back to make_scorer's default of 1.
# NOTE(review): AutoGluon's custom-metric guide recommends defining custom
# metrics in an importable module so they can be pickled; confirm whether
# that matters for this workflow.
bank_profit_ag = make_scorer(name="bank_profit",
                             score_func=bank_profit,
                             greater_is_better=True,
                             optimum=1/3)

In [None]:
# Name of the column to predict.
label = "target"

# Training budget: `hours` hours of `minutes` minutes each, converted to
# seconds when passed to fit().
hours = 2
minutes = 60

# Optimise directly for the custom bank_profit scorer.
predictor_ag = TabularPredictor(label=label, eval_metric=bank_profit_ag)
predictor_ag.fit(
    train_data=df,
    presets='best_quality',  # alternatives: medium_quality, good_quality
    time_limit=hours * minutes * 60,  # in seconds
    num_gpus=1,
)

In [7]:
# Show what AutoGluon inferred about the task and about each feature's type.
problem_type = predictor_ag.problem_type
feature_types = predictor_ag.feature_metadata

print("AutoGluon infers problem type is:", problem_type)
print("AutoGluon identified the following types of features:", feature_types, sep="\n")

AutoGluon infers problem type is: binary
AutoGluon identified the following types of features:
('category', [])  :  2 | ['EDUCATION', 'MARRIAGE']
('int', [])       : 20 | ['LIMIT_BAL', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', ...]
('int', ['bool']) :  1 | ['SEX']


In [8]:
# Summarise the fit: per-model validation score and timing, plus details of
# the models that were trained. The returned value is shown as the cell output.
predictor_ag.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                          model  score_val  eval_metric  pred_time_val     fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0     NeuralNetTorch_r79_BAG_L2   0.187523  bank_profit      17.152004  3825.446359                0.872996         285.182202            2       True         43
1           WeightedEnsemble_L3   0.187523  bank_profit      17.236003  3866.274917                0.083999          40.828557            3       True         47
2         NeuralNetTorch_BAG_L2   0.186627  bank_profit      17.033011  3810.412966                0.754003         270.148809            2       True         40
3           WeightedEnsemble_L2   0.183815  bank_profit       1.567027  1403.388195                0.084002          40.671755            2       True         30
4     NeuralNetTorch_r79_BAG_L1   0.183759  bank_profit       0.506002   442.649470                0.506002         442.649470  



{'model_types': {'KNeighborsUnif_BAG_L1': 'StackerEnsembleModel_KNN',
  'KNeighborsDist_BAG_L1': 'StackerEnsembleModel_KNN',
  'LightGBMXT_BAG_L1': 'StackerEnsembleModel_LGB',
  'LightGBM_BAG_L1': 'StackerEnsembleModel_LGB',
  'RandomForestGini_BAG_L1': 'StackerEnsembleModel_RF',
  'RandomForestEntr_BAG_L1': 'StackerEnsembleModel_RF',
  'CatBoost_BAG_L1': 'StackerEnsembleModel_CatBoost',
  'ExtraTreesGini_BAG_L1': 'StackerEnsembleModel_XT',
  'ExtraTreesEntr_BAG_L1': 'StackerEnsembleModel_XT',
  'NeuralNetFastAI_BAG_L1': 'StackerEnsembleModel_NNFastAiTabular',
  'XGBoost_BAG_L1': 'StackerEnsembleModel_XGBoost',
  'NeuralNetTorch_BAG_L1': 'StackerEnsembleModel_TabularNeuralNetTorch',
  'LightGBMLarge_BAG_L1': 'StackerEnsembleModel_LGB',
  'CatBoost_r177_BAG_L1': 'StackerEnsembleModel_CatBoost',
  'NeuralNetTorch_r79_BAG_L1': 'StackerEnsembleModel_TabularNeuralNetTorch',
  'LightGBM_r131_BAG_L1': 'StackerEnsembleModel_LGB',
  'NeuralNetFastAI_r191_BAG_L1': 'StackerEnsembleModel_NNFastAiT

In [9]:
# Permutation-based importance of each feature, rendered as a table.
# NOTE(review): `df` is the same frame that was passed to fit(); importance is
# normally measured on held-out data — confirm whether a separate split
# should be used here.
display(predictor_ag.feature_importance(df))

Computing feature importance via permutation shuffling for 23 features using 5000 rows with 5 shuffle sets...


	1574.37s	= Expected runtime (314.87s per shuffle set)
	925.6s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
PAY_0,0.040344,0.002796,3e-06,5,0.046101,0.034588
LIMIT_BAL,0.014924,0.001387,9e-06,5,0.01778,0.012067
BILL_AMT1,0.011954,0.002035,9.7e-05,5,0.016144,0.007765
PAY_AMT1,0.010478,0.001546,5.5e-05,5,0.013661,0.007295
PAY_2,0.009419,0.001599,9.6e-05,5,0.012712,0.006127
PAY_AMT2,0.009379,0.002343,0.000431,5,0.014203,0.004554
PAY_AMT3,0.009178,0.001892,0.000205,5,0.013074,0.005282
PAY_3,0.00858,0.001125,3.5e-05,5,0.010897,0.006263
PAY_AMT4,0.008247,0.00141,9.9e-05,5,0.01115,0.005344
BILL_AMT2,0.008206,0.001676,0.000198,5,0.011656,0.004756
