# 0. 데이터 디렉토리 마운트 

# 1. 데이터 로드

In [1]:
import pandas as pd

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submit = pd.read_csv('sample_submission.csv')

In [2]:
train.head()

Unnamed: 0,id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,target
0,0,32,Private,309513,Assoc-acdm,12,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,0
1,1,33,Private,205469,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1
2,2,46,Private,149949,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,0
3,3,23,Private,193090,Bachelors,13,Never-married,Adm-clerical,Own-child,White,Female,0,0,30,United-States,0
4,4,55,Private,60193,HS-grad,9,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,0


In [3]:
test.head()

Unnamed: 0,id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
0,0,47,Private,304133,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States
1,1,34,Self-emp-inc,154227,Some-college,10,Never-married,Sales,Not-in-family,White,Male,0,0,75,United-States
2,2,31,Local-gov,158291,Bachelors,13,Never-married,Craft-repair,Not-in-family,White,Male,8614,0,40,United-States
3,3,28,Private,183155,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,55,United-States
4,4,54,Local-gov,182543,Some-college,10,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,Mexico


In [4]:
submit.head()

Unnamed: 0,id,target
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


# 2. 데이터 결측치 처리

- 데이터 상태 및 처리 코드는 [baseline](https://dacon.io/competitions/official/235892/codeshare/4830) 활용

In [5]:
def check_missing_col(dataframe):
    missing_col = []
    for col in dataframe.columns:
        missing_values = sum(dataframe[col].isna())
        is_missing = True if missing_values >= 1 else False
        if is_missing:
            print(f'결측치가 있는 컬럼은: {col} 입니다')
            print(f'해당 컬럼에 총 {missing_values} 개의 결측치가 존재합니다.')
            missing_col.append([col, dataframe[col].dtype])
    if missing_col == []:
        print('결측치가 존재하지 않습니다')
    return missing_col

missing_col = check_missing_col(train)

결측치가 있는 컬럼은: workclass 입니다
해당 컬럼에 총 1836 개의 결측치가 존재합니다.
결측치가 있는 컬럼은: occupation 입니다
해당 컬럼에 총 1843 개의 결측치가 존재합니다.
결측치가 있는 컬럼은: native.country 입니다
해당 컬럼에 총 583 개의 결측치가 존재합니다.


In [6]:
# 결측치를 처리하는 함수를 작성합니다.
def handle_na(data, missing_col):
    temp = data.copy()
    for col, dtype in missing_col:
        if dtype == 'O':
            # 범주형 feature가 결측치인 경우 해당 행들을 삭제해 주었습니다.
            temp = temp.dropna(subset=[col])
    return temp

train = handle_na(train, missing_col)

# 결측치 처리가 잘 되었는지 확인해 줍니다.
missing_col = check_missing_col(train) 

결측치가 존재하지 않습니다


In [7]:
#라벨인코딩을 하기 위함 dictionary map 생성 함수
def make_label_map(dataframe):
    label_maps = {}
    for col in dataframe.columns:
        if dataframe[col].dtype=='object':
            label_map = {'unknown':0}
            for i, key in enumerate(dataframe[col].unique()):
                label_map[key] = i  #새로 등장하는 유니크 값들에 대해 1부터 1씩 증가시켜 키값을 부여해줍니다.
            label_maps[col] = label_map
    return label_maps

# 각 범주형 변수에 인코딩 값을 부여하는 함수
def label_encoder(dataframe, label_map):
    for col in dataframe.columns:
        if dataframe[col].dtype=='object':
            dataframe[col] = dataframe[col].map(label_map[col])
            #dataframe[col] = dataframe[col].fillna(label_map[col]['unknown']) #혹시 모를 결측값은 unknown의 값(0)으로 채워줍니다.
    return dataframe

train = label_encoder(train, make_label_map(train))

In [8]:
test = label_encoder(test, make_label_map(test))

In [9]:
train.head()

Unnamed: 0,id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,target
0,0,32,0,309513,0,12,0,0,0,0,0,0,0,40,0,0
1,1,33,0,205469,1,10,0,1,0,0,0,0,0,40,0,1
2,2,46,0,149949,1,10,0,0,0,0,0,0,0,40,0,0
3,3,23,0,193090,2,13,1,2,1,0,1,0,0,30,0,0
4,4,55,0,60193,3,9,2,2,2,0,1,0,0,40,0,0


# 3. Pycaret - AutoML 라이브러리

## 3-1. Pycaret 설치하기

- colab에서 활용할 수 있게 설치하기. 
- 순서: pip 설치 -> 에러 발생 -> 코드 실행 -> 런타임 초기화 후 재실행 -> pip 재설치 

In [10]:
# !pip install pycaret

In [11]:
from pycaret.utils import enable_colab
enable_colab()

Colab mode enabled.


In [12]:
# !pip install pycaret

## 3-2. Pycaret classification 사용

- 분데데분 님의 [글](https://dacon.io/codeshare/4458?dtype=recent) 을 참고하였습니다.
- 아래의 코드 실행 후, 칸에서 엔터 눌러주세요. 그래야 실행됩니다.

In [14]:
from pycaret.classification import *
exp_101=setup(data=train, target='target', session_id=123,fold_shuffle=True)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,target
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(15081, 16)"
5,Missing Values,False
6,Numeric Features,7
7,Categorical Features,8
8,Ordinal Features,False
9,High Cardinality Features,False


AttributeError: 'Simple_Imputer' object has no attribute 'fill_value_categorical'

### 가장 좋은 모델 확인하기

In [None]:
best_model= compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.866,0.9176,0.6437,0.7816,0.7055,0.6198,0.625,0.141
gbc,Gradient Boosting Classifier,0.8573,0.9121,0.5735,0.799,0.6672,0.5796,0.5928,0.727
ada,Ada Boost Classifier,0.854,0.9083,0.6148,0.7558,0.6774,0.5845,0.5901,0.227
rf,Random Forest Classifier,0.8478,0.8973,0.613,0.7342,0.6676,0.57,0.5743,0.641
lda,Linear Discriminant Analysis,0.8341,0.8886,0.561,0.7146,0.6281,0.5234,0.5301,0.078
ridge,Ridge Classifier,0.8324,0.0,0.5136,0.7363,0.6045,0.5026,0.516,0.026
et,Extra Trees Classifier,0.8246,0.8718,0.591,0.6687,0.6272,0.5132,0.515,0.693
dt,Decision Tree Classifier,0.804,0.7464,0.6311,0.6031,0.6165,0.485,0.4855,0.052
lr,Logistic Regression,0.7914,0.5989,0.2695,0.7207,0.3921,0.2966,0.3491,0.387
nb,Naive Bayes,0.7911,0.8046,0.3101,0.68,0.4258,0.3193,0.3562,0.025


### 가장 좋은 모델 선택하여 모델 생성

- 본 결과에서 가장 좋은 모델은 lightgbm

In [None]:
lgb = create_model('lightgbm')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8551,0.9137,0.5833,0.7817,0.6681,0.5779,0.5881
1,0.8598,0.9107,0.6326,0.7661,0.6929,0.6032,0.6079
2,0.8693,0.9311,0.6477,0.7917,0.7125,0.629,0.6343
3,0.8778,0.9212,0.6705,0.8082,0.7329,0.6546,0.6594
4,0.8532,0.9107,0.5833,0.7739,0.6652,0.5736,0.583
5,0.8759,0.9318,0.678,0.7956,0.7321,0.6521,0.6556
6,0.8502,0.903,0.635,0.7293,0.6789,0.5818,0.5842
7,0.8578,0.909,0.6426,0.7511,0.6926,0.6009,0.604
8,0.8834,0.9233,0.7083,0.8026,0.7525,0.6766,0.6789
9,0.8768,0.9218,0.6553,0.816,0.7269,0.6486,0.6551


### 기계학습 모델 튜닝

In [None]:
tuned_lgb= tune_model(lgb)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8542,0.9113,0.6098,0.7594,0.6765,0.5838,0.5896
1,0.8542,0.9059,0.6136,0.757,0.6778,0.5849,0.5903
2,0.8617,0.929,0.625,0.7783,0.6933,0.6054,0.6115
3,0.8598,0.9105,0.6364,0.7636,0.6942,0.6043,0.6085
4,0.8561,0.9081,0.6174,0.7617,0.682,0.5903,0.5957
5,0.8778,0.9331,0.6932,0.7922,0.7394,0.6601,0.6626
6,0.8483,0.8977,0.6274,0.7269,0.6735,0.5754,0.578
7,0.8455,0.9061,0.6008,0.7315,0.6597,0.561,0.5656
8,0.8758,0.9138,0.678,0.7956,0.7321,0.652,0.6555
9,0.8701,0.9182,0.6439,0.7981,0.7128,0.6301,0.6362


### 튜닝 결과에서 최종 모델 선택

In [None]:
final_lgb=finalize_model(tuned_lgb)
final_lgb

LGBMClassifier(bagging_fraction=0.7, bagging_freq=6, boosting_type='gbdt',
               class_weight=None, colsample_bytree=1.0, feature_fraction=0.5,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=66, min_child_weight=0.001, min_split_gain=0.4,
               n_estimators=90, n_jobs=-1, num_leaves=90, objective=None,
               random_state=123, reg_alpha=0.0005, reg_lambda=0.1,
               silent='warn', subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)

### 최종 모델을 활용하여 예측 실행

In [None]:
pred = predict_model(final_lgb, data=test)

### 결과 출력 및 제출파일 생성

In [None]:
submit['target']= pred['Label']
submit.to_csv('submission.csv', index = False)