# 대회 설명
- 주제 : 신용카드 사용자 데이터를 보고 사용자의 대금 연체 정도를 예측하는 알고리즘 개발 
- 목표 : 신용카드 사용자들의 개인 신상정보 데이터로 사용자의 신용카드 대금 연체 정도 예측

# 변수 설명 (총 19개)
- Numerical : index, child_num, income_total, family_size
- Binary : gender, car, reality, FLAG_MOBIL, work_phone, phone, email
- Ordinal : edu_type
- Nomial : incom_type, family_type, house_type, occupy_type
- 날짜형 : DAYS_BIRTH, DAYS_EMPLOYED, begin_month

# [ Libraries ]

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import sparse

plt.style.use('seaborn')

from collections import Counter

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

sns.set(font_scale=2)

import glob
from sklearn.preprocessing import OneHotEncoder
import random

In [None]:
# 일 단위를 년 단위로 바꿔줌
def days_to_age(x):
    return (x * -1) / 365

# 개월 변환
def days_to_month(x):
    return (x * -1) / 30

# 마이너스 값 변환
def minus(x):
    return x * -1

# [ Load Data ]

In [None]:
path = 'C:/Users/LG/Desktop/DACON/data/'

In [None]:
train = pd.read_csv(path+'train.csv', index_col = 'index')
test = pd.read_csv(path+'test.csv', index_col = 'index')
submission = pd.read_csv(path+'sample_submission.csv')

# data = pd.concat([train, test], axis=0)

# [ 전처리 ]

### 중복데이터 처리 (더 높게 나옴)

In [None]:
train.duplicated().sum()

1634

In [None]:
test.duplicated().sum()

352

In [None]:
train = train.drop_duplicates()
test = train.drop_duplicates()

### FLAG_MOBIL 제거
- FLAG_MOBIL은 변수 요소가 1개이므로 무의미한 정보이다.

In [None]:
train = train.drop('FLAG_MOBIL', axis=1, inplace=False)
test = test.drop('FLAG_MOBIL', axis=1, inplace=False)

## 1. Numerical
- child_num, family_size, income_total, DAYS_BIRTH, DAYS_EMPLOYED, begin_month

### (1) child_num과 family_size의 관계

####  family_size - child_num가 음수인 경우가 있을 수 없으므로 삭제

In [None]:
# train[(train['family_size']-train['child_num']).isin([-1])]

In [None]:
# test[(test['family_size']-test['child_num']).isin([-1])]

In [None]:
train = train.drop(index = 14900)
test = test.drop(index = 31375)

#### family_size, child_num의 이상치 변경

In [None]:
# family_size에서 7이상의 값을 7로 모두 변경 
# train['family_size'] = train['family_size'].apply(lambda x : x if x < 7 else 7)
# test['family_size'] = test['family_size'].apply(lambda x : x if x < 7 else 7)

# child_num에서 3이상의 값을 3로 모두 변경
train['child_num'] = train['child_num'].apply(lambda x : x if x < 3 else 3)
test['child_num'] = test['child_num'].apply(lambda x : x if x < 3 else 3)

#### child_num (or family_size) 열 삭제
- 다중공선성 문제

In [None]:
# del train['child_num']
# del test['child_num']

del train['family_size']
del test['family_size']

### (2) DAYS_EMPLOYED 무직자인 경우 0으로 변환
- log변환하는 것도 가능

In [None]:
# 무직자 => 0
train.loc[(train.DAYS_EMPLOYED)>=0,'DAYS_EMPLOYED'] = 0
test.loc[(test.DAYS_EMPLOYED)>=0,'DAYS_EMPLOYED'] = 0

In [None]:
# log 변환

### (3) 날짜변수 단위 변환

In [None]:
# DAYS_EMPLOYED 년도 단위로 변경
train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].apply(days_to_age)
test['DAYS_EMPLOYED'] = test['DAYS_EMPLOYED'].apply(days_to_age)
# train['DAYS_EMPLOYED'].hist(bins=30)

# DAY_BIRTH 나이로 변환
train['DAYS_BIRTH'] = train['DAYS_BIRTH'].apply(days_to_age)
test['DAYS_BIRTH'] = test['DAYS_BIRTH'].apply(days_to_age)

# begin_month 양수로 변환
train['begin_month'] = train['begin_month'].apply(minus)
test['begin_month'] = test['begin_month'].apply(minus)

### (4) income_total log변환

In [None]:
# train['income_total'] = train['income_total'].apply(np.log)

## 2. Binary
- gener, car, reality
- Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

train['gender'] = encoder.fit_transform(train['gender'])
train['car'] = encoder.fit_transform(train['car'])
train['reality'] = encoder.fit_transform(train['reality'])

test['gender'] = encoder.fit_transform(test['gender'])
test['car'] = encoder.fit_transform(test['car'])
test['reality'] = encoder.fit_transform(test['reality'])

## 3. Ordinal : edu_type
- 순서형 변수이므로 Mapping Encoding 이용

In [None]:
edu_order = {
    'Lower secondary' : 0, # 중학교 미만
    'Secondary / secondary special' : 1, #중학교
    'Incomplete higher' : 2, # 고등학교 중퇴
    'Higher education' : 3, # 고등학교 졸업
     'Academic degree' : 4 # 학사 이상
}
train.edu_type = train.edu_type.map(edu_order)
test.edu_type = test.edu_type.map(edu_order)

## 4. Nomial
- income_type, family_type, house_type, occupy_type
- Lable Encoding
- https://mizykk.tistory.com/12
- 결측치 처리 : https://m.blog.naver.com/youji4ever/221791455668

In [None]:
# occupy_type 결측치가 있는 행 제거
# train = train.dropna(axis=0)
# test = test.dropna(axis=0)

# occupy_type 결측치 대체
train = train.fillna(method='pad')
test = test.fillna(method='pad')

In [None]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
target_col = {'income_type', 'family_type', 'house_type', 'occyp_type'}

for i in target_col:
    le.fit(train[i])
    train[i] = le.transform(train[i])
    
for i in target_col:
    le.fit(test[i])
    test[i] = le.transform(test[i])
    
train.head()

Unnamed: 0_level_0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,work_phone,phone,email,occyp_type,family_size,begin_month,credit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,0,0,0,0,202500.0,0,3,1,2,38.079452,12.90137,0,0,0,18,2.0,6.0,1.0
1,0,0,1,1,247500.0,0,1,0,1,31.178082,4.219178,0,0,1,8,3.0,5.0,1.0
2,1,1,1,0,450000.0,4,3,1,1,52.293151,12.147945,0,1,0,10,2.0,22.0,2.0
3,0,0,1,0,202500.0,0,1,1,1,41.336986,5.731507,0,1,0,14,2.0,37.0,0.0
4,0,1,1,0,157500.0,2,3,1,1,41.19726,5.767123,0,0,0,10,2.0,26.0,2.0


# [ Scaling ]

### (1) Standardization - 데이터 표준화

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled = scaler.fit_transform(train)
train_scaled = pd.DataFrame(data = scaled, columns=train.columns)

scaled2 = scaler.fit_transform(test)
test_scaled = pd.DataFrame(data = scaled2, columns=test.columns)

### (2) 이상치 제거

In [None]:
from scipy import stats

train['zscore_child'] = stats.zscore(train['child_num'])
train = train[train['zscore_child'].between(-2, 2)]
train = train.drop('zscore_child', axis=1, inplace=False)
# train['zscore_family'] = stats.zscore(train['family_size'])
# train = train[train['zscore_family'].between(-2, 2)]
# train = train.drop('zscore_family', axis=1, inplace=False)

test['zscore_child'] = stats.zscore(test['child_num'])
test = test[test['zscore_child'].between(-2, 2)]
test = test.drop('zscore_child', axis=1, inplace=False)
# test['zscore_family'] = stats.zscore(test['family_size'])
# test = test[train['zscore_family'].between(-2, 2)]
# test = test.drop('zscore_family', axis=1, inplace=False)

### (3) Normalization - 데이터 정규화

In [None]:
# min-max scaling
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_2 = scaler.fit_transform(train_scaled)
train_scaled2 = pd.DataFrame(data=scaled_2, columns=train.columns)

scaled2_2 = scaler.fit_transform(test_scaled)
test_scaled2 = pd.DataFrame(data=scaled2_2, columns=test.columns)

# [ 데이터 분할 ]

In [None]:
#test = data[-10000:]
#train = data[:-10000]

In [None]:
train_x = train.drop('credit', axis=1)
train_y = train[['credit']]
test_x = test

In [None]:
train_x.shape

(26457, 17)

In [None]:
train_y.shape

(26457, 1)

In [None]:
test_x.shape

(10000, 17)

# [ 데이터 불균형 처리 (Sampling) ]
- Oversampling (SMOTE)
- 결과적으로 엄청 높은 log loss를 얻음 .(..)
- https://bandibell.tistory.com/366
- https://ichi.pro/ko/paisseon-eseo-bulgyunhyeong-deiteoleul-daluneun-bangbeob-274564822750187

In [None]:
from collections import Counter
from imblearn.over_sampling import SMOTE

from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score

In [None]:
print("Before oversampling: ",Counter(train_y))

Before oversampling:  Counter({2.0: 5104, 1.0: 1894, 0.0: 995})


In [None]:
sm = SMOTE()
train_x, train_y = sm.fit_resample(train_x, train_y)

In [None]:
print("After oversampling: ",Counter(train_y))

After oversampling:  Counter({1.0: 5104, 2.0: 5104, 0.0: 5104})


In [None]:
model=SVC()
clf_sm = model.fit(train_x, train_y)
pred_sm = clf_sm.predict(test_x)

SVC()

# [ Training ]
- https://data-make.tistory.com/84

### (1) Randomforest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.fit(train_x, train_y)

RandomForestClassifier()

In [None]:
train_proba = clf.predict_proba(train_x)
train_proba[:10]

array([[0.05      , 0.74666667, 0.20333333],
       [0.4865    , 0.4335    , 0.08      ],
       [0.        , 0.4475    , 0.5525    ],
       [0.61      , 0.055     , 0.335     ],
       [0.        , 0.07      , 0.93      ],
       [0.025     , 0.75      , 0.225     ],
       [0.        , 0.03      , 0.97      ],
       [0.7805    , 0.062     , 0.1575    ],
       [0.05      , 0.        , 0.95      ],
       [0.05      , 0.03      , 0.92      ]])

In [None]:
np.argmax(train_proba, axis=1)[:10]

array([1, 0, 2, 0, 2, 1, 2, 0, 2, 2], dtype=int64)

In [None]:
clf.predict_proba(test_x)

array([[0.04      , 0.20333333, 0.75666667],
       [0.43      , 0.18      , 0.39      ],
       [0.03      , 0.05      , 0.92      ],
       ...,
       [0.02      , 0.02      , 0.96      ],
       [0.54      , 0.335     , 0.125     ],
       [0.14      , 0.2085    , 0.6515    ]])

In [None]:
submission.iloc[:,1:] = clf.predict_proba(test_x)

In [None]:
submission.to_csv('sub_RF.csv', index=False)

### (2) RF + Stratified K-fold crossvalidation
- https://wooono.tistory.com/103
- https://ek-koh.github.io/data%20analysis/cv/

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss

In [None]:
x_train, x_val, y_train, y_val = train_test_split(train_x, train_y,
                 stratify = train_y, # train_y에 있는 요소별 비율을 고려해 검증set과 훈련set 분할
                 test_size = 0.25, # 전체 data의 25%를 검증에 사용, 75%를 학습에 사용
                 random_state = 10086 # 복원을 용이하게 함
                 )

In [None]:
print(x_train.shape, x_val.shape, y_train.shape, y_val.shape)

(19842, 17) (6615, 17) (19842, 1) (6615, 1)


In [None]:
clf = RandomForestClassifier()
clf.fit(x_train, y_train)

RandomForestClassifier()

In [None]:
y_proba = clf.predict_proba(x_val)
y_val_onehot = pd.get_dummies(y_val)

log_loss(y_val_onehot, y_proba)

0.8556582434030607

In [None]:
## k-fold crossvalidation
from sklearn.model_selection import StratifiedKFold

folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=55)
outcomes = []
sub = np.zeros((test_x.shape[0], 3))
for n_fold, (train_index, val_index) in enumerate(folds.split(train_x, train_y)):
    x_train, x_val = train_x.iloc[train_index], train_x.iloc[val_index]
    y_train, y_val = train_y.iloc[train_index], train_y.iloc[val_index]
    clf = RandomForestClassifier()
    clf.fit(x_train, y_train)
    predictions = clf.predict_proba(x_val)
    y_val_onehot = pd.get_dummies(y_val)

    sub += clf.predict_proba(test_x)

    logloss = log_loss(y_val_onehot, predictions)
    outcomes.append(logloss)

sub = sub / 10
np.mean(outcomes)  # k-fold crossvalidation score

0.8618152610867135

In [None]:
submission.iloc[:, 1:] = sub
submission.to_csv("sub_RF_SKfold.csv", index = False)

### (3) LGBM

In [None]:
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier

In [None]:
# training
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
lgb_models={}
sub = np.zeros((test_x.shape[0], 3))
for n_fold, (train_index, val_index) in enumerate(folds.split(train_x, train_y)):
    x_train, x_val = train_x.iloc[train_index].values, train_x.iloc[val_index].values
    y_train, y_val = train_y.iloc[train_index].values, train_y.iloc[val_index].values
    lgb = LGBMClassifier(n_estimators=1000)
    lgb.fit(x_train, y_train, 
            eval_set=[(x_train, y_train), (x_val, y_val)], 
            early_stopping_rounds=30,
            verbose=100)
    lgb_models[n_fold]=lgb
    
    sub += lgb_models[n_fold].predict_proba(test_x)

sub = sub / 10
np.mean(logloss)

Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.658965	valid_1's multi_logloss: 0.737418
[200]	training's multi_logloss: 0.574012	valid_1's multi_logloss: 0.721376
[300]	training's multi_logloss: 0.512297	valid_1's multi_logloss: 0.713299
Early stopping, best iteration is:
[365]	training's multi_logloss: 0.478119	valid_1's multi_logloss: 0.710691
Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.655344	valid_1's multi_logloss: 0.762817
[200]	training's multi_logloss: 0.571433	valid_1's multi_logloss: 0.7446
[300]	training's multi_logloss: 0.510335	valid_1's multi_logloss: 0.733752
[400]	training's multi_logloss: 0.460337	valid_1's multi_logloss: 0.729854
Early stopping, best iteration is:
[390]	training's multi_logloss: 0.464862	valid_1's multi_logloss: 0.729408
Training until validation scores don't improve for 30 rounds
[100]	training's multi_logloss: 0.654246	valid_1's multi_logloss: 0.765106

0.7810018705059704

In [None]:
submission.iloc[:, 1:] = sub
submission.to_csv("sub_LGBM10_7.csv", index = False)

### (5) Ridge

In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score,mean_absolute_error, mean_squared_error

In [None]:
# training

ridge = Ridge(alpha = 0.05, normalize=True)
ridge.fit(train_x, train_y)

Ridge(alpha=0.05, normalize=True)

In [None]:
pred = ridge.predict(test_x)

In [None]:
ridge_r2 = r2_score(test_x, pred)
ridge_MSE = mean_squared_error(test_x, pred)
ridge_MAE = mean_absolute_error(test_x, pred)

ValueError: y_true and y_pred have different number of output (16!=1)

### (6) AutoLGB

In [None]:
from kaggler.preprocessing import LabelEncoder
from kaggler.model import AutoLGB

ModuleNotFoundError: No module named 'kaggler'

### (7) Pycaret

### KNN

### SVM