In [1]:
# Install the third-party packages for this notebook (no versions are pinned).
# pycaret and wandb are imported below and optuna is requested via
# search_library='optuna'; catboost is not imported directly here —
# presumably it is an optional model backend for pycaret (TODO confirm).
!pip install catboost
!pip install pycaret
!pip install optuna
!pip install wandb



In [2]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from pycaret.classification import *

import argparse
import wandb
# Start a Weights & Biases run named "pycaret" in the "DACON_235892" project;
# the configuration and metrics logged further down are attached to this run.
wandb.init(project="DACON_235892", name="pycaret")

def str2bool(value):
    """Convert a command-line token into a real boolean.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is ``True``
    because every non-empty string is truthy, so the flag could never be
    switched off from the command line.

    Args:
        value: A bool (returned unchanged) or a string such as
            "true"/"false", "yes"/"no", "1"/"0" (case-insensitive).

    Returns:
        The corresponding ``bool``. The empty string maps to ``False``, which
        is what ``bool("")`` returned before.

    Raises:
        argparse.ArgumentTypeError: If the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ("true", "t", "yes", "y", "1"):
        return True
    if token in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


# Experiment hyperparameters, declared with argparse so that the resulting
# namespace can be handed to wandb as the run configuration.
parser = argparse.ArgumentParser(description='pycaret')
parser.add_argument('--top_n_model', default=5, type=int)
parser.add_argument('--tune_iter', default=10, type=int)
parser.add_argument('--cv', default=5, type=int)
parser.add_argument('--seed', default=1011, type=int)
parser.add_argument('--ensemble_method', default="stack", type=str) # blend or stack
parser.add_argument('--high_cardinality_method', default="frequency", type=str) # frequency or clustering
parser.add_argument('--remove_outliers', default=False, type=str2bool)
# An explicit empty argument list keeps argparse away from the notebook
# kernel's own sys.argv, so every option takes its default value.
args = parser.parse_args([])

# Record the parsed options as the configuration of the active W&B run.
wandb.config.update(args)

# Expose every option as a notebook-level name used by the cells below.
top_n_model, tune_iter, cv = args.top_n_model, args.tune_iter, args.cv
seed = args.seed
ensemble_method = args.ensemble_method
high_cardinality_method, remove_outliers = (
    args.high_cardinality_method,
    args.remove_outliers,
)

def set_seeds(seed=seed):
    """Seed NumPy's global RNG and the stdlib `random` module, and export PYTHONHASHSEED.

    Args:
        seed: Integer seed; defaults to the notebook-level `seed` value that
            is bound when this function is defined.
    """
    np.random.seed(seed)
    random.seed(seed)
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up,
    # so this likely only influences child processes — confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)


set_seeds()

# Locations of the competition CSV files on the mounted Google Drive.
train_csv = "/content/drive/MyDrive/DACON-Basic/235892_인구 데이터 기반 소득 예측 경진대회/data/train.csv"
test_csv = "/content/drive/MyDrive/DACON-Basic/235892_인구 데이터 기반 소득 예측 경진대회/data/test.csv"

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

# Preview the first rows of the training frame.
train.head()

  defaults = yaml.load(f)
[34m[1mwandb[0m: Currently logged in as: [33mgnoeyheat[0m (use `wandb login --relogin` to force relogin)


Unnamed: 0,id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,target
0,0,32,Private,309513,Assoc-acdm,12,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,0
1,1,33,Private,205469,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1
2,2,46,Private,149949,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,0
3,3,23,Private,193090,Bachelors,13,Never-married,Adm-clerical,Own-child,White,Female,0,0,30,United-States,0
4,4,55,Private,60193,HS-grad,9,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,0


In [3]:
# Remove the "id" column from both frames before feature work and modelling.
train, test = train.drop(columns="id"), test.drop(columns="id")

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17480 entries, 0 to 17479
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             17480 non-null  int64 
 1   workclass       15644 non-null  object
 2   fnlwgt          17480 non-null  int64 
 3   education       17480 non-null  object
 4   education.num   17480 non-null  int64 
 5   marital.status  17480 non-null  object
 6   occupation      15637 non-null  object
 7   relationship    17480 non-null  object
 8   race            17480 non-null  object
 9   sex             17480 non-null  object
 10  capital.gain    17480 non-null  int64 
 11  capital.loss    17480 non-null  int64 
 12  hours.per.week  17480 non-null  int64 
 13  native.country  16897 non-null  object
 14  target          17480 non-null  int64 
dtypes: int64(7), object(8)
memory usage: 2.0+ MB


In [4]:
# Interaction feature: age multiplied by weekly working hours, added to both frames.
for frame in (train, test):
    frame["age_hours"] = frame["age"].mul(frame["hours.per.week"])

train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,target,age_hours
0,32,Private,309513,Assoc-acdm,12,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,0,1280
1,33,Private,205469,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1,1320
2,46,Private,149949,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,0,1840
3,23,Private,193090,Bachelors,13,Never-married,Adm-clerical,Own-child,White,Female,0,0,30,United-States,0,690
4,55,Private,60193,HS-grad,9,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,0,2200


In [5]:
# Pairwise correlations between the training columns (target and age_hours included).
# NOTE(review): the train frame still holds object-dtype columns; the table shown
# only covers the numeric ones, so this relies on corr() skipping non-numeric
# columns — newer pandas releases may need an explicit numeric selection (confirm).
train.corr()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week,target,age_hours
age,1.0,-0.077435,0.034002,0.071202,0.05767,0.039661,0.228554,0.677726
fnlwgt,-0.077435,1.0,-0.02945,0.004603,-0.007775,-0.015124,-0.006451,-0.06575
education.num,0.034002,-0.02945,1.0,0.117565,0.087377,0.138406,0.339855,0.116083
capital.gain,0.071202,0.004603,0.117565,1.0,-0.030652,0.066213,0.225092,0.098512
capital.loss,0.05767,-0.007775,0.087377,-0.030652,1.0,0.055125,0.153856,0.0801
hours.per.week,0.039661,-0.015124,0.138406,0.066213,0.055125,1.0,0.23609,0.709141
target,0.228554,-0.006451,0.339855,0.225092,0.153856,0.23609,1.0,0.32943
age_hours,0.677726,-0.06575,0.116083,0.098512,0.0801,0.709141,0.32943,1.0


In [6]:
# Same correlation table for the test frame (which has no target column),
# for comparison with the training correlations.
# NOTE(review): relies on corr() skipping the object-dtype columns — confirm
# against the installed pandas version.
test.corr()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week,age_hours
age,1.0,-0.075825,0.040259,0.085941,0.058318,0.108277,0.731087
fnlwgt,-0.075825,1.0,-0.059419,-0.004425,-0.012994,-0.023271,-0.074165
education.num,0.040259,-0.059419,1.0,0.128804,0.071125,0.159157,0.1273
capital.gain,0.085941,-0.004425,0.128804,1.0,-0.032738,0.093803,0.129693
capital.loss,0.058318,-0.012994,0.071125,-0.032738,1.0,0.052692,0.078919
hours.per.week,0.108277,-0.023271,0.159157,0.093803,0.052692,1.0,0.710936
age_hours,0.731087,-0.074165,0.1273,0.129693,0.078919,0.710936,1.0


In [7]:
# Drop the "fnlwgt" column from both frames in one pass.
train, test = (frame.drop(columns="fnlwgt") for frame in (train, test))

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17480 entries, 0 to 17479
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             17480 non-null  int64 
 1   workclass       15644 non-null  object
 2   education       17480 non-null  object
 3   education.num   17480 non-null  int64 
 4   marital.status  17480 non-null  object
 5   occupation      15637 non-null  object
 6   relationship    17480 non-null  object
 7   race            17480 non-null  object
 8   sex             17480 non-null  object
 9   capital.gain    17480 non-null  int64 
 10  capital.loss    17480 non-null  int64 
 11  hours.per.week  17480 non-null  int64 
 12  native.country  16897 non-null  object
 13  target          17480 non-null  int64 
 14  age_hours       17480 non-null  int64 
dtypes: int64(7), object(8)
memory usage: 2.0+ MB


In [8]:
# Drop the textual "education" column and store "education.num" with object
# dtype; it is passed to setup() as an ordinal feature further down.
train = train.drop(columns="education").astype({"education.num": "object"})
test = test.drop(columns="education").astype({"education.num": "object"})

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17480 entries, 0 to 17479
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             17480 non-null  int64 
 1   workclass       15644 non-null  object
 2   education.num   17480 non-null  object
 3   marital.status  17480 non-null  object
 4   occupation      15637 non-null  object
 5   relationship    17480 non-null  object
 6   race            17480 non-null  object
 7   sex             17480 non-null  object
 8   capital.gain    17480 non-null  int64 
 9   capital.loss    17480 non-null  int64 
 10  hours.per.week  17480 non-null  int64 
 11  native.country  16897 non-null  object
 12  target          17480 non-null  int64 
 13  age_hours       17480 non-null  int64 
dtypes: int64(6), object(8)
memory usage: 1.9+ MB


In [9]:
# train = train.dropna(axis=0)
# test = test.dropna(axis=0)

# train.info()

In [10]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric training columns.
train.describe()

Unnamed: 0,age,capital.gain,capital.loss,hours.per.week,target,age_hours
count,17480.0,17480.0,17480.0,17480.0,17480.0,17480.0
mean,38.720995,1076.644508,83.8746,40.00246,0.234897,1556.010469
std,14.079617,7439.49862,396.03288,12.671265,0.423947,751.493549
min,17.0,0.0,0.0,1.0,0.0,21.0
25%,27.0,0.0,0.0,38.0,0.0,1040.0
50%,37.0,0.0,0.0,40.0,0.0,1488.0
75%,48.0,0.0,0.0,45.0,0.0,2016.0
max,90.0,99999.0,4356.0,99.0,1.0,8910.0


In [11]:
# Summary (count, unique, top, freq) of the object-dtype training columns.
train.describe(include='object')

Unnamed: 0,workclass,education.num,marital.status,occupation,relationship,race,sex,native.country
count,15644,17480,17480,15637,17480,17480,17480,16897
unique,8,16,7,14,6,5,2,41
top,Private,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
freq,11568,5566,8003,2113,6972,14864,11590,15393


In [12]:
# Show the sorted distinct "education.num" values rendered as strings.
print([str(level) for level in np.sort(train["education.num"].unique())])

['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16']


In [13]:
# Ordered levels of "education.num" as strings, in ascending order.
education_levels = [str(level) for level in np.sort(train["education.num"].unique())]

# Configure the PyCaret experiment; the method, outlier, fold and seed options
# come from the hyperparameters parsed at the top of the notebook.
s = setup(
    train,
    target='target',
    categorical_imputation='mode',
    ordinal_features={'education.num': education_levels},
    high_cardinality_features=['native.country'],
    high_cardinality_method=high_cardinality_method,
    bin_numeric_features=["age"],
    remove_outliers=remove_outliers,
    normalize=True,
    fix_imbalance=True,
    fold_strategy='stratifiedkfold',
    fold=cv,
    session_id=seed,
    log_experiment=True,
)

Unnamed: 0,Description,Value
0,session_id,1011
1,Target,target
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(17480, 14)"
5,Missing Values,True
6,Numeric Features,5
7,Categorical Features,8
8,Ordinal Features,True
9,High Cardinality Features,True


In [14]:
%%time

if ensemble_method=="stack":
    model=stack_models(compare_models(sort='Accuracy', n_select=top_n_model), choose_better=True)
elif ensemble_method=="blend":
    model=blend_models(compare_models(sort='Accuracy', n_select=top_n_model), choose_better=True)

tuned_model = tune_model(model, n_iter=tune_iter, optimize='Accuracy', search_library='optuna', choose_better=True)
final_model = finalize_model(tuned_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.839,0.9099,0.7617,0.6302,0.6898,0.5824,0.5871
1,0.8472,0.9177,0.7548,0.6507,0.6989,0.5972,0.6002
2,0.8463,0.9139,0.7426,0.6519,0.6943,0.5923,0.5945
3,0.8459,0.9152,0.7339,0.6533,0.6912,0.5891,0.5908
4,0.8435,0.9151,0.7791,0.6364,0.7005,0.5961,0.6016
Mean,0.8444,0.9144,0.7544,0.6445,0.6949,0.5914,0.5949
Std,0.003,0.0026,0.0156,0.0094,0.0042,0.0054,0.0055


CPU times: user 22min 2s, sys: 20.7 s, total: 22min 22s
Wall time: 38min 11s


In [15]:
# Fetch the score grid once so that all six logged values come from the same
# table, instead of calling pull() separately for every metric.
# NOTE(review): pull() presumably returns the most recent PyCaret score grid,
# with "Mean" and "Std" rows as in the table shown above — confirm.
score_grid = pull()

# Keys are "<metric>_<stat>", e.g. "Accuracy_Mean", "Accuracy_Std", "AUC_Mean", ...
wandb.log({
    f"{metric}_{stat}": score_grid[metric].loc[stat]
    for metric in ("Accuracy", "AUC", "F1")
    for stat in ("Mean", "Std")
})

In [16]:
# Open PyCaret's interactive evaluation widget (plot-type selector) for the finalized model.
evaluate_model(final_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [17]:
# Score the test frame with the finalized model; the result's "Label" column
# is used for the submission in the next cell.
predictions = predict_model(final_model, data=test)
# Display the shape as a quick sanity check on row/column counts.
predictions.shape

(15081, 15)

In [18]:
# Predicted class labels taken from the "Label" column of the predictions frame.
submission = predictions['Label']

sample_submission = pd.read_csv("/content/drive/MyDrive/DACON-Basic/235892_인구 데이터 기반 소득 예측 경진대회/data/sample_submission.csv")
# Use bracket assignment: `sample_submission.target = ...` only sets a plain
# Python attribute when no "target" column exists yet, which would silently
# write the CSV without the predictions. Values are still aligned on the index,
# exactly as before.
sample_submission["target"] = submission
sample_submission.to_csv("/content/drive/MyDrive/DACON-Basic/235892_인구 데이터 기반 소득 예측 경진대회/submission.csv", index=False)