In [51]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings, random
warnings.filterwarnings(action='ignore')

from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from category_encoders.ordinal import OrdinalEncoder
from sklearn.model_selection import StratifiedKFold

from sklearn.cluster import KMeans
from catboost import CatBoostClassifier, Pool

In [22]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [23]:
train.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,,2.0,-6.0,1.0
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0


In [24]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26457 entries, 0 to 26456
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          26457 non-null  int64  
 1   gender         26457 non-null  object 
 2   car            26457 non-null  object 
 3   reality        26457 non-null  object 
 4   child_num      26457 non-null  int64  
 5   income_total   26457 non-null  float64
 6   income_type    26457 non-null  object 
 7   edu_type       26457 non-null  object 
 8   family_type    26457 non-null  object 
 9   house_type     26457 non-null  object 
 10  DAYS_BIRTH     26457 non-null  int64  
 11  DAYS_EMPLOYED  26457 non-null  int64  
 12  FLAG_MOBIL     26457 non-null  int64  
 13  work_phone     26457 non-null  int64  
 14  phone          26457 non-null  int64  
 15  email          26457 non-null  int64  
 16  occyp_type     18286 non-null  object 
 17  family_size    26457 non-null  float64
 18  begin_

In [25]:
train.describe()

Unnamed: 0,index,child_num,income_total,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,family_size,begin_month,credit
count,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0
mean,13228.0,0.428658,187306.5,-15958.053899,59068.750728,1.0,0.224742,0.294251,0.09128,2.196848,-26.123294,1.51956
std,7637.622372,0.747326,101878.4,4201.589022,137475.427503,0.0,0.41742,0.455714,0.288013,0.916717,16.55955,0.702283
min,0.0,0.0,27000.0,-25152.0,-15713.0,1.0,0.0,0.0,0.0,1.0,-60.0,0.0
25%,6614.0,0.0,121500.0,-19431.0,-3153.0,1.0,0.0,0.0,0.0,2.0,-39.0,1.0
50%,13228.0,0.0,157500.0,-15547.0,-1539.0,1.0,0.0,0.0,0.0,2.0,-24.0,2.0
75%,19842.0,1.0,225000.0,-12446.0,-407.0,1.0,0.0,1.0,0.0,3.0,-12.0,2.0
max,26456.0,19.0,1575000.0,-7705.0,365243.0,1.0,1.0,1.0,1.0,20.0,0.0,2.0


In [26]:
test.describe()

Unnamed: 0,index,child_num,income_total,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,family_size,begin_month
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,31456.5,0.4347,185043.3,-16020.4664,59776.6904,1.0,0.2276,0.2963,0.0856,2.2027,-26.2724
std,2886.89568,0.729102,101539.8,4197.672887,138121.224504,0.0,0.419304,0.456648,0.279786,0.898272,16.348557
min,26457.0,0.0,27000.0,-25152.0,-15661.0,1.0,0.0,0.0,0.0,1.0,-60.0
25%,28956.75,0.0,121500.0,-19483.25,-3153.0,1.0,0.0,0.0,0.0,2.0,-39.0
50%,31456.5,0.0,157500.0,-15606.0,-1577.0,1.0,0.0,0.0,0.0,2.0,-25.0
75%,33956.25,1.0,225000.0,-12539.0,-410.0,1.0,0.0,1.0,0.0,3.0,-12.0
max,36456.0,5.0,1575000.0,-7489.0,365243.0,1.0,1.0,1.0,1.0,7.0,0.0


## 불필요 특성 제거

In [27]:
train.drop(['index', 'FLAG_MOBIL'], axis=1, inplace=True)
test.drop(['index', 'FLAG_MOBIL'], axis=1, inplace=True)

## 결측치 채우기

In [28]:
train.fillna('NaN', inplace=True) 
test.fillna('NaN', inplace=True)

## 컬럼명 변경

In [29]:
train.columns = ['성별', '차량여부', '부동산여부', '자녀수', '연간소득', '소득분류',
                '교육수준', '결혼여부', '생활방식', '출생일', '업무시작일', '업무용 핸드폰',
                 '집전화', '이메일', '직업유형', '가족수', '카드발급월', '신용도']
test.columns = ['성별', '차량여부', '부동산여부', '자녀수', '연간소득', '소득분류',
                '교육수준', '결혼여부', '생활방식', '출생일', '업무시작일', '업무용 핸드폰',
                '집전화', '이메일', '직업유형', '가족수', '카드발급월']

## 전처리

### 자녀 수, 가족 수

test데이터 기준으로 train 맞추기

In [30]:
train = train[(train['가족수'] <= 7) & (train['자녀수'] <= 5)]
train = train.reset_index(drop=True)

### 업무시작일 > 0 이면 무직자

In [31]:
def to_zero(x):
    if x > 0:
        x = 0
    return x

In [32]:
train['업무시작일'] = train['업무시작일'].apply(to_zero)
test['업무시작일'] = test['업무시작일'].apply(to_zero)

### 출생일, 업무시작일, 카드 발급월
음수 => 양수

In [33]:
feats = ['출생일', '카드발급월', '업무시작일']
for i in feats:
    train[i] = abs(train[i])
    test[i] = abs(test[i])

### 특정 사람 만들기
(성별, 차량, 부동산, 자녀수, 소득, 소득분류, 교육수준, 출생일)들로 새로운 변수 만들기

In [35]:
for i in [train, test]:
    i['특정인'] = i['성별'] + i['차량여부'] + i['부동산여부'] + i['자녀수'].astype(str) + \
                  i['연간소득'].astype(str) + i['소득분류'] + i['교육수준'] + i['출생일'].astype(str)

### 나이, 근무년수 컬럼 만들기
1. 파생 컬럼 = 기존 수치 / 365 \
2. 기존 컬럼 삭제

In [37]:
train['나이'] = train['출생일'] // 365
test['나이'] = test['출생일'] // 365
train['근무년수'] = train['업무시작일'] // 365
test['근무년수'] = test['업무시작일'] // 365

In [38]:
train.drop(['출생일', '업무시작일'], axis=1, inplace=True)
test.drop(['출생일', '업무시작일'], axis=1, inplace=True)

### 수치형 스케일링 및 카테고리형 라벨링

In [40]:
num = ['자녀수', '나이', '근무년수', '카드발급월', '가족수']

In [43]:
cate = train.dtypes[train.dtypes == 'object'].index.tolist()
cate

['성별', '차량여부', '부동산여부', '소득분류', '교육수준', '결혼여부', '생활방식', '직업유형', '특정인']

### 로그스케일링
소득은 수치가 커서 로그스케일링 사용

In [45]:
train['연간소득'] = np.log1p(train['연간소득'])
test['연간소득'] = np.log1p(test['연간소득'])

### OrdinalEncoder
카테고리형 변수들 정수형으로 변경

In [48]:
encoder = OrdinalEncoder(cate)
train[cate] = encoder.fit_transform(train[cate], train['신용도'])
test[cate] = encoder.transform(test[cate])

train['특정인'] = train['특정인'].astype('int64')
test['특정인'] = test['특정인'].astype('int64')



### MinMaxScale
연간소득을 제외한 수치형 데이터 정규화

In [52]:
minmax = MinMaxScaler()
train[num] = minmax.fit_transform(train[num])
test[num] = minmax.fit_transform(test[num])

## 모델링

In [55]:
target = '신용도'
X = train.drop(target, axis=1)
y = train[target]
X_test = test

In [56]:
train_data = Pool(data=X, label=y, cat_features=cate)
model_cat = CatBoostClassifier()
model_cat.fit(train_data, use_best_model=True, early_stopping_rounds=500, verbose=100)
cat_pred_test = model_cat.predict_proba(X_test)

You should provide test set for use best model. use_best_model parameter has been switched to false value.


Learning rate set to 0.093512
0:	learn: 1.0467333	total: 272ms	remaining: 4m 32s
100:	learn: 0.7090016	total: 4.99s	remaining: 44.5s
200:	learn: 0.6944437	total: 9.84s	remaining: 39.1s
300:	learn: 0.6836049	total: 14.4s	remaining: 33.5s
400:	learn: 0.6713131	total: 19.2s	remaining: 28.6s
500:	learn: 0.6612449	total: 24s	remaining: 23.9s
600:	learn: 0.6501275	total: 28.6s	remaining: 19s
700:	learn: 0.6389110	total: 33.6s	remaining: 14.3s
800:	learn: 0.6276405	total: 38.7s	remaining: 9.6s
900:	learn: 0.6167805	total: 43.3s	remaining: 4.76s
999:	learn: 0.6068659	total: 48.1s	remaining: 0us


In [58]:
sub = pd.read_csv('sample_submission.csv')
sub.loc[:,['0','1','2']] = cat_pred_test
sub.to_csv('sub_last3.csv', index=False)