# **신용카드 사용자 연체 예측 AI 경진대회**

---



# **0. 라이브러리 설치 및 불러오기**

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go

from keras.models import Model
from keras.models import Sequential
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, add, concatenate
from keras.layers import Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import LSTM
from keras.layers import BatchNormalization
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, Concatenate, Dropout
from tensorflow.keras import Input, Model
from tensorflow.keras import optimizers
from keras.preprocessing import text, sequence
from keras.callbacks import LearningRateScheduler
from keras.optimizers import RMSprop, Adadelta, Adam
from sklearn import preprocessing


In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
# Change the working directory so the relative CSV file names used below resolve here.
os.chdir('/content/drive/MyDrive/data/credit')

In [None]:
# Read the three competition CSV files from the current working directory.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Work on copies so the exploration below does not modify the frames as loaded.
df_train, df_test = train.copy(), test.copy()

# **1. 데이터 전처리 & EDA**







## 1.1 데이터 살펴보기 


In [None]:
# Preview the first rows of the training copy.
df_train.head()

In [None]:
# Column dtypes and non-null counts of the training copy.
df_train.info()

In [None]:
# Preview the first rows of the test copy.
df_test.head()

In [None]:
# Column dtypes and non-null counts of the test copy.
df_test.info()

## 1.2 데이터 전처리

In [None]:
# Number of missing values per column in the training copy.
df_train.isnull().sum()

'occyp_type' 항목에서 결측치가 30% 존재.   
단순 제거하기엔 너무 많은 수치라 판단하여 대체할 수 있는 방안을 모색함



### 1.2.1 imputation of missing values


#### (1) pension

'occyp_type' 항목에서 결측치의 특성을 알아보기 위해 non_occyp 및 occyp 지정

In [None]:
# Split the training copy by whether 'occyp_type' is missing.
# Both halves are explicit copies because later cells assign into them; assigning
# into a slice of df_train would be chained assignment (SettingWithCopyWarning).
missing_occyp = df_train['occyp_type'].isnull()
non_occyp = df_train.loc[missing_occyp].copy()   # rows without an occupation type
# Select on 'occyp_type' only, so a missing value in some other column cannot
# silently drop a row from the "has an occupation type" group.
occyp = df_train.loc[~missing_occyp].copy()      # rows with an occupation type

In [None]:
# Dtypes and non-null counts for the rows that have an occupation type.
occyp.info()

In [None]:
# Dtypes and non-null counts for the rows whose occupation type is missing.
non_occyp.info()

In [None]:
# 4438 rows have a positive value -> this group contains both employed and non-employed people.
non_occyp['DAYS_EMPLOYED'].value_counts()

In [None]:
# Replace every non-negative DAYS_EMPLOYED value in the missing-occupation rows with 0.
non_negative = non_occyp['DAYS_EMPLOYED'] >= 0
non_occyp.loc[non_negative, 'DAYS_EMPLOYED'] = 0

In [None]:
# Label the missing-occupation rows whose DAYS_EMPLOYED is non-negative as 'Retired'.
non_negative = non_occyp['DAYS_EMPLOYED'] >= 0
non_occyp.loc[non_negative, 'occyp_type'] = 'Retired'
non_occyp.head()

In [None]:
# Floor-divide DAYS_BIRTH by -365 in both groups, turning the (presumably negative)
# day offsets into positive whole-year counts.
for frame in (occyp, non_occyp):
    frame['DAYS_BIRTH'] = frame['DAYS_BIRTH'] // -365

In [None]:
# Distribution of the year-converted DAYS_BIRTH for rows with an occupation type.
sns.displot(occyp['DAYS_BIRTH'])

In [None]:
# Distribution of the year-converted DAYS_BIRTH for rows without an occupation type.
sns.displot(non_occyp['DAYS_BIRTH'])

In [None]:
# Decade edges 0..70 used to group the year values, plus one "<edge>s" label per edge.
bins = [decade * 10 for decade in range(8)]
print(bins)
bins_label = [f"{edge}s" for edge in bins]

In [None]:
# Bucket the year values into left-closed decade intervals. There is one label
# fewer than there are edges, hence the slice that drops the last label.
decade_labels = bins_label[:-1]
for frame in (occyp, non_occyp):
    frame['DAYS_BIRTH'] = pd.cut(frame['DAYS_BIRTH'], bins, right=False, labels=decade_labels)

In [None]:
# Count of rows per decade bucket for the group that has an occupation type.
sns.countplot(x="DAYS_BIRTH", data=occyp)
plt.title("DAYS_BIRTH distribution")
plt.show()

In [None]:
# Count of rows per decade bucket for the group whose occupation type is missing.
sns.countplot(x="DAYS_BIRTH", data=non_occyp)
plt.title("DAYS_BIRTH distribution")
plt.show()

In [None]:
# Frequency of each (income_type, occyp_type) pair among the missing-occupation rows.
non_occyp[['income_type','occyp_type']].value_counts()

위에 지정한 'Retired'는 전부 pensioner임을 알 수 있음.

In [None]:
# Frequency of each (income_type, occyp_type) pair among the rows that have an occupation type.
occyp[['income_type','occyp_type']].value_counts()

In [None]:
# Apply the same rule to the full training copy: a non-negative DAYS_EMPLOYED
# gets the occupation type 'Retired'.
retired_rows = df_train['DAYS_EMPLOYED'] >= 0
df_train.loc[retired_rows, 'occyp_type'] = 'Retired'
df_train.info()
# df_test is probably similar?

#### (2) regression

In [None]:
# Income-type distribution among the employed people within the missing-occupation rows.
# No 'employeed' column is ever created, so the mask is derived from DAYS_EMPLOYED:
# this notebook treats DAYS_EMPLOYED >= 0 as "not employed" (those rows are labelled
# 'Retired'), which leaves the negative values as the employed rows.
employed = non_occyp['DAYS_EMPLOYED'] < 0
non_occyp.loc[employed, 'income_type'].value_counts()

### 1.2.2 중복데이터 문제

# **2. 인코딩(추후 논의 필요)**

## 2.1 Binary variables

In [None]:
# One entry per binary column: (column, header printed before its counts,
# raw labels to recode as 0/1, or None when the column is only printed).
binary_columns = [
    ('gender', 'gender :', ['F', 'M']),
    ('car', 'Having a car or not : ', ['N', 'Y']),
    ('reality', 'Having house reality or not: ', ['N', 'Y']),
    ('phone', 'Having a phone or not: ', None),
    ('email', 'Having a email or not: ', None),
    ('work_phone', 'Having a work phone or not: ', None),
]

for column, header, raw_labels in binary_columns:
    if raw_labels is not None:
        # Recode train and test together so both share the same 0/1 meaning.
        train[column] = train[column].replace(raw_labels, [0, 1])
        test[column] = test[column].replace(raw_labels, [0, 1])
    print(header)
    print(train[column].value_counts())
    print('--------------')

## 2.2 continuous variables

### (1) child_num

In [None]:
# Bar chart of how often each child_num value occurs in train (unsorted counts).
train['child_num'].value_counts(sort=False).plot.bar()

In [None]:
# Collapse every child_num of 2 or more into the single value 2, in both frames.
for frame in (train, test):
    frame.loc[frame['child_num'] >= 2, 'child_num'] = 2

### (2) inc

In [None]:
# Rescale income_total to units of 10,000 in both frames.
# The column must stay numeric: casting it to object first leaves an object-dtype
# result, which value_counts(bins=...) and the histogram plot cannot handle.
train['income_total'] = train['income_total'] / 10000
test['income_total'] = test['income_total'] / 10000

# Row counts over 10 equal-width intervals, then a density histogram of the rescaled values.
print(train['income_total'].value_counts(bins=10, sort=False))
train['income_total'].plot(kind='hist', bins=50, density=True)

In [None]:
# Cut income_total into 7 equal-width bins whose edges are learned from train only,
# then apply those same edges to test.
income_train = train['income_total'].astype(float)  # guarantee a numeric dtype for histogram/cut
count, bin_dividers = np.histogram(income_train, bins=7)
bin_names = ['소득' + str(i) for i in range(7)]
train['income_total'] = pd.cut(x=income_train, bins=bin_dividers, labels=bin_names, include_lowest=True)

# Test incomes outside train's [min, max] would fall outside every bin and become NaN.
# Clamp them to the outer edges so they land in the nearest edge bin instead.
income_test = test['income_total'].astype(float).clip(lower=bin_dividers[0], upper=bin_dividers[-1])
test['income_total'] = pd.cut(x=income_test, bins=bin_dividers, labels=bin_names, include_lowest=True)

### (3) type

In [None]:
# Show the distinct raw values of each categorical "type" column in train.
for column in ('income_type', 'edu_type', 'family_type', 'house_type'):
    print(train[column].unique())

In [None]:
from sklearn import preprocessing

# Shared encoder; it is refit for every column, so after this cell it holds the
# classes of the last column only.
label_encoder = preprocessing.LabelEncoder()

# For each column, learn the label -> integer mapping on train and reuse that same
# mapping on test. Test must go through transform(), never fit_transform():
# refitting on test assigns codes from test's own label set, so the same label
# could get different integers in train and test.
# transform() raises ValueError if test contains a label that train never had.
for column in ('income_type', 'edu_type', 'family_type', 'house_type', 'income_total'):
    train[column] = label_encoder.fit_transform(train[column])
    test[column] = label_encoder.transform(test[column])

## 2.3 minus continuous variable

In [None]:
# Flip the sign of a column, then bin and label-encode it.
def make_bin(variable, n):
    """Negate, bin and label-encode column ``variable`` of the global ``train``/``test``.

    Both frames are modified in place. The column is negated, cut into ``n``
    equal-width bins whose edges are computed from ``train`` only, and the bin
    labels are converted to integers with the module-level ``label_encoder``
    (fit on ``train``, reused for ``test``).

    Args:
        variable: name of a numeric column present in both ``train`` and ``test``.
        n: number of equal-width bins.

    Returns:
        None.

    Note:
        The negation makes this non-idempotent: calling it twice on the same
        column would bin the already-encoded values.
    """
    train[variable] = -train[variable]
    test[variable] = -test[variable]

    # Bin edges come from train so that train and test share identical intervals.
    _, bin_dividers = np.histogram(train[variable], bins=n)
    bin_names = [str(i) for i in range(n)]
    train[variable] = pd.cut(x=train[variable], bins=bin_dividers, labels=bin_names, include_lowest=True)

    # Test values outside train's [min, max] would fall outside every bin. Clamp them
    # to the outer edges so they land in the nearest edge bin (both edge bins always
    # contain a train row, so their labels are known to the encoder).
    clipped = test[variable].astype(float).clip(lower=bin_dividers[0], upper=bin_dividers[-1])
    binned = pd.cut(x=clipped, bins=bin_dividers, labels=bin_names, include_lowest=True)
    # Anything still NaN was missing in the input; keep the original fallback of bin '0'.
    # The result is assigned back instead of using fillna(inplace=True) on a column
    # selection, which is chained assignment and may not update the frame.
    test[variable] = binned.fillna(str(0))

    train[variable] = label_encoder.fit_transform(train[variable])
    test[variable] = label_encoder.transform(test[variable])

In [None]:
# Bin each negative-valued column with its own bin count.
for column, n_bins in (('DAYS_BIRTH', 10), ('DAYS_EMPLOYED', 6), ('begin_month', 4)):
    make_bin(column, n=n_bins)

In [None]:
# Display the encoded training frame.
train

In [None]:
# Display the encoded test frame.
test

# **3. 모델링(추후 논의 필요)**

## 3.1 decision tree

## 3.2 CNN

## 3.3 LSTM

## 3.4 XGBoost / Catboost / randomforest 등등

# **4. 결과비교**