# **신용카드 사용자 연체 예측 AI 경진대회**

---



※※ 변경사항 ※※

1.   데이터 전처리/인코딩 완료(train & test)
2.   regression 과정에서 inocc -> Retired 변경
3.   regression 과정에서 소수 부분 반올림 처리 
4.   데이터 전처리 과정에서 생략된 EDA는 추후에 추가 예정
5.   인코딩의 경우 베이스코드 그대로 적용



# **0. 라이브러리 설치 및 불러오기**

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go

from keras.models import Model
from keras.models import Sequential
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, add, concatenate
from keras.layers import Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import LSTM
from keras.layers import BatchNormalization
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, Concatenate, Dropout
from tensorflow.keras import Input, Model
from tensorflow.keras import optimizers
from keras.preprocessing import text, sequence
from keras.callbacks import LearningRateScheduler
from keras.optimizers import RMSprop, Adadelta, Adam
from sklearn import preprocessing


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
os.chdir('/content/drive/MyDrive/data/credit')

In [None]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
submission=pd.read_csv('sample_submission.csv')

In [None]:
df_train=train.copy()
df_test=test.copy()

# **1. 데이터 전처리 & EDA**







## 1.1 데이터 살펴보기 


In [None]:
df_train.head()

In [None]:
df_train.info()

In [None]:
df_test.head()

In [None]:
df_test.info()

## 1.2 데이터 전처리

In [None]:
df_train.isnull().sum()

In [None]:
df_test.isnull().sum()

'occyp_type' 항목에서 결측치가 30% 존재.   
단순 제거하기엔 너무 많은 수치라 판단하여 대체할 수 있는 방안을 모색함



### 1.2.1 imputation of missing values


#### (1) pension

'occyp_type' 항목에서 결측치의 특성을 알아보기 위해 non_occyp 및 occyp 지정

In [None]:
# Split df_train on occyp_type only, so the two parts are exact complements.
# .copy() makes each part an independent frame: later cells add and overwrite
# columns on both, which must not be attempted on a view of df_train.
occyp_missing = df_train['occyp_type'].isnull()
non_occyp = df_train.loc[occyp_missing].copy()  # rows without an occupation type
occyp = df_train.loc[~occyp_missing].copy()  # rows with an occupation type

In [None]:
non_occyp.info()

In [None]:
occyp.info()

In [None]:
occyp.loc[occyp['DAYS_EMPLOYED']>0]#1이 없음
#전부다 고용된 상태임

In [None]:
#양수인 부분이 4438개->고용된 사람과 그렇지 않은 사람이 존재한다.
non_occyp['DAYS_EMPLOYED'].value_counts()

In [None]:
# Flag employment status: 1 when DAYS_EMPLOYED is negative, otherwise 0.
non_occyp['employeed'] = np.where(non_occyp['DAYS_EMPLOYED'] < 0, 1, 0)

In [None]:
non_occyp.head()#양수면 0,음수면 1

In [None]:
occyp['employeed']=1

In [None]:
occyp.head()

'occyp_type'이 결측인 행을 조사한 결과, 고용된 사람과 고용되지 않은 사람이 모두 포함되어 있다. 반면 결측치가 없는 행은 모두 고용된 사람들이다.

In [None]:
non_occyp.loc[non_occyp['DAYS_EMPLOYED']>=0, 'DAYS_EMPLOYED'] = 0

In [None]:
# The preceding cell reset every non-negative DAYS_EMPLOYED to 0, so the >= 0
# mask still selects those same rows; give them the occupation 'Retired'.
non_occyp.loc[non_occyp['DAYS_EMPLOYED']>=0, 'occyp_type'] = 'Retired'
non_occyp.head()

In [None]:
#occyp['DAYS_BIRTH'] = occyp['DAYS_BIRTH'] // -365
#non_occyp['DAYS_BIRTH'] = non_occyp['DAYS_BIRTH'] // -365

In [None]:
#sns.displot(occyp['DAYS_BIRTH'])

In [None]:
#sns.displot(non_occyp['DAYS_BIRTH'])

In [None]:
#bins = list(range(0,80,10))
#print(bins)
#bins_label = [str(x)+"s"  for x in bins]

In [None]:
#occyp['DAYS_BIRTH'] = pd.cut(occyp['DAYS_BIRTH'],bins, right=False, labels = bins_label[:-1])
#non_occyp['DAYS_BIRTH'] = pd.cut(non_occyp['DAYS_BIRTH'],bins, right=False, labels = bins_label[:-1])

In [None]:
#sns.countplot(x="DAYS_BIRTH", data=occyp)
#plt.title("DAYS_BIRTH distribution")
#plt.show()

In [None]:
#sns.countplot(x="DAYS_BIRTH", data=non_occyp)
#plt.title("DAYS_BIRTH distribution")
#plt.show()

In [None]:
non_occyp[['income_type','occyp_type']].value_counts()

위에 지정한 'Retired'는 전부 pensioner임을 알 수 있음.

In [None]:
occyp[['income_type','occyp_type']].value_counts()

In [None]:
#df_train.loc[df_train['DAYS_EMPLOYED']>=0, 'occyp_type'] = 'Retired'
#df_train.info()
# df_test도 아마 비슷할 듯?

In [None]:
non_occyp.isnull().sum() #3733개 남음

In [None]:
non_occyp.head()

In [None]:
occyp.head()

#### (2) regression

In [None]:
data=pd.concat([occyp,non_occyp])

In [None]:
data.head()

In [None]:
# Label-encode the nominal columns.
label_encoder = preprocessing.LabelEncoder()

# Two-level flags become 0/1.
for flag_col, levels in (('gender', ['F', 'M']),
                         ('car', ['N', 'Y']),
                         ('reality', ['N', 'Y'])):
    data[flag_col] = data[flag_col].replace(levels, [0, 1])

# Multi-level categories get integer codes; the shared encoder is refit per column.
for cat_col in ('income_type', 'edu_type', 'family_type', 'house_type'):
    data[cat_col] = label_encoder.fit_transform(data[cat_col])

In [None]:
data.head()

In [None]:
# Rows whose occyp_type is still missing. Take a copy: occyp_type is overwritten
# on this frame in later cells, which must not be done on a slice of `data`.
null_data = data.loc[data['occyp_type'].isnull()].copy()

In [None]:
null_data.head()#inocc 대체 후 결측치가 있는 것들

In [None]:
fill_data=data.dropna() #결측치 제거한 데이터

In [None]:
print(fill_data.shape,null_data.shape)

In [None]:
fill_data['occyp_type_l']=label_encoder.fit_transform(fill_data['occyp_type'])
#결측치 없는 데이터 중 occyp_type를 라벨링한 변수를 추가(라벨링과 원래값 비교를 위해)

In [None]:
fill_data.columns

In [None]:
null_data.columns

In [None]:
# Fit a linear regression that predicts the label-encoded occupation from the
# remaining columns, using only the rows whose occyp_type is known.
# NOTE(review): occyp_type_l holds LabelEncoder codes of a nominal variable, so the
# regression treats them as an ordered numeric scale — a classifier may fit better; confirm.
from sklearn import linear_model 
lin_reg = linear_model.LinearRegression()
x=fill_data[fill_data.columns.drop(['index','occyp_type','credit',
                                   'employeed','occyp_type_l'])]
# Retry without the employeed (0/1) flag, with positive DAYS_EMPLOYED values already set to 0.
# The label-encoded occyp_type_l is used as the dependent variable.
y=fill_data['occyp_type_l']# label-encoded target
lin_reg_model = lin_reg.fit(x, y)

In [None]:
x.shape

In [None]:
# Select exactly the columns the regression was fitted on, in the same order,
# rather than re-deriving the list by hand.
x_test = null_data[x.columns]
# Continuous predicted occupation codes for the rows with a missing occyp_type.
# (The earlier placeholder assignment to y_test was overwritten immediately, so it is gone.)
y_test = lin_reg.predict(x_test)

In [None]:
null_data['occyp_type']=y_test

In [None]:
null_data.head()

In [None]:
null_data['occyp_type'].value_counts()

In [None]:
print(y[:10])

In [None]:
fill_data['occyp_type'].value_counts()

In [None]:
#fill_data['occyp_type_enc']=y_encoded
fill_data['occyp_type_l'].value_counts()

In [None]:
fill_data['occyp_type_l'].unique()#라벨링값

In [None]:
fill_data['occyp_type'].unique()#라벨링값 순서대로 8:Laborers,10:Managers,...

In [None]:
np.round(y_test,0)

In [None]:
null_data['occyp_type']=np.round(null_data['occyp_type'],0)#추후 회의 후 결정

In [None]:
null_data['occyp_type'].value_counts()

In [None]:
# Convert the rounded regression output back to occupation names.
# The codes come from the LabelEncoder fitted on fill_data['occyp_type'], so decode
# with that encoder instead of a hand-written table. The table skipped codes 0 and 1
# and, because 'Retired' is now an encoded class that sorts between 'Realty agents'
# and 'Sales staff', was off by one from code 14 onwards.
occyp_codes = pd.to_numeric(null_data['occyp_type']).round()
# Predictions outside the encoder's range are clamped to the nearest valid code;
# the table left such values in the column as raw numbers.
occyp_codes = occyp_codes.clip(0, len(label_encoder.classes_) - 1).astype(int)
null_data['occyp_type'] = label_encoder.inverse_transform(occyp_codes.to_numpy())

In [None]:
fill_data['occyp_type'].value_counts().plot(kind='bar')

In [None]:
null_data['occyp_type'].value_counts().plot(kind='bar')

In [None]:
null_data.columns

In [None]:
fill_data.columns

In [None]:
fill_data=fill_data.drop('occyp_type_l',axis=1)

In [None]:
df=pd.concat([null_data,fill_data]).sort_values(by='index')
df.head()

In [None]:
df.isnull().sum()

In [None]:
df.info()

In [None]:
df['occyp_type'].value_counts()

train & test 파일에 동일하게 전처리 수행

In [None]:
# Split df_test on occyp_type only, so the two parts are exact complements.
# .copy() makes each part an independent frame: later cells add and overwrite
# columns on both, which must not be attempted on a view of df_test.
occyp_missing_test = df_test['occyp_type'].isnull()
non_occyp_test = df_test.loc[occyp_missing_test].copy()  # rows without an occupation type
occyp_test = df_test.loc[~occyp_missing_test].copy()  # rows with an occupation type

In [None]:
non_occyp_test.info()

In [None]:
occyp_test.info()

In [None]:
# Flag employment status: 1 when DAYS_EMPLOYED is negative, otherwise 0.
non_occyp_test['employeed'] = np.where(non_occyp_test['DAYS_EMPLOYED'] < 0, 1, 0)

In [None]:
occyp_test['employeed']=1

In [None]:
non_occyp_test.loc[non_occyp_test['DAYS_EMPLOYED']>=0, 'DAYS_EMPLOYED'] = 0

In [None]:
non_occyp_test.loc[non_occyp_test['DAYS_EMPLOYED']>=0, 'occyp_type'] = 'Retired'

In [None]:
data_test=pd.concat([occyp_test,non_occyp_test])

In [None]:
# Label-encode the nominal columns of the test set.
label_encoder = preprocessing.LabelEncoder()

data_test['gender'] = data_test['gender'].replace(['F', 'M'], [0, 1])
data_test['car'] = data_test['car'].replace(['N', 'Y'], [0, 1])
data_test['reality'] = data_test['reality'].replace(['N', 'Y'], [0, 1])

# Fit each encoder on the raw training column (df_train still holds the original
# strings) and only transform the test column, so a category gets the same integer
# in train and test. Refitting on test would renumber the codes whenever the two
# sets do not contain the same categories.
# transform() raises ValueError if test contains a category absent from train.
for cat_col in ('income_type', 'edu_type', 'family_type', 'house_type'):
    label_encoder.fit(df_train[cat_col])
    data_test[cat_col] = label_encoder.transform(data_test[cat_col])

In [None]:
# Test rows whose occyp_type is still missing. Take a copy: occyp_type is overwritten
# on this frame in later cells, which must not be done on a slice of `data_test`.
null_data_test = data_test.loc[data_test['occyp_type'].isnull()].copy()

In [None]:
fill_data_test=data_test.dropna()

In [None]:
fill_data_test['occyp_type_l']=label_encoder.fit_transform(fill_data_test['occyp_type'])

In [None]:
fill_data_test.columns

In [None]:
null_data_test.columns

In [None]:
# Fit the imputation regression for the test set on the test rows whose occyp_type
# is known. Note that x and y are reused here and replace the train-side values.
lin_reg_test = linear_model.LinearRegression()
x=fill_data_test[fill_data_test.columns.drop(['index','occyp_type','employeed','occyp_type_l'])]
# The test file has no 'credit' column.
y=fill_data_test['occyp_type_l']
lin_reg_test_model = lin_reg_test.fit(x, y)

In [None]:
# Select exactly the columns the test regression was fitted on, in the same order,
# rather than re-deriving the list by hand.
x_test = null_data_test[x.columns]
# Continuous predicted occupation codes for the test rows with a missing occyp_type.
# (The earlier placeholder assignment to y_test was overwritten immediately, so it is gone.)
y_test = lin_reg_test.predict(x_test)

In [None]:
null_data_test['occyp_type']=y_test

In [None]:
null_data_test['occyp_type'].value_counts()

In [None]:
print(y[:10])

In [None]:
fill_data_test['occyp_type'].value_counts()

In [None]:
np.round(y_test,0)

In [None]:
null_data_test['occyp_type']=np.round(null_data_test['occyp_type'],0)

In [None]:
null_data_test['occyp_type'].value_counts()

In [None]:
# Convert the rounded regression output back to occupation names.
# The codes come from the LabelEncoder fitted on fill_data_test['occyp_type'], so
# decode with that encoder instead of a hand-written table. The table skipped codes
# 0 and 1 and, because 'Retired' is now an encoded class that sorts between
# 'Realty agents' and 'Sales staff', was off by one from code 14 onwards.
occyp_codes_test = pd.to_numeric(null_data_test['occyp_type']).round()
# Predictions outside the encoder's range are clamped to the nearest valid code;
# the table left such values in the column as raw numbers.
occyp_codes_test = occyp_codes_test.clip(0, len(label_encoder.classes_) - 1).astype(int)
null_data_test['occyp_type'] = label_encoder.inverse_transform(occyp_codes_test.to_numpy())

In [None]:
fill_data_test=fill_data_test.drop('occyp_type_l',axis=1)

In [None]:
df_test=pd.concat([null_data_test,fill_data_test]).sort_values(by='index')
df_test.head()

In [None]:
df_test.isnull().sum()

In [None]:
train = df.drop('employeed', axis=1)

In [None]:
train.info()

In [None]:
test = df_test.drop('employeed', axis=1)

In [None]:
test.info()

# **2. 인코딩(베이스 코드 라인)**

## 2.1 Binary variables (앞서 regression에서 수행했기에 생략가능)

In [None]:
# Binary columns: map the two-level string flags to 0/1 in train and test, then
# print the train distribution of every binary column.
divider = '--------------'
binary_specs = [
    # (heading, column, string levels to map to 0/1, or None if already numeric)
    ('gender :', 'gender', ['F', 'M']),
    ('Having a car or not : ', 'car', ['N', 'Y']),
    ('Having house reality or not: ', 'reality', ['N', 'Y']),
    ('Having a phone or not: ', 'phone', None),
    ('Having a email or not: ', 'email', None),
    ('Having a work phone or not: ', 'work_phone', None),
]
for heading, col, levels in binary_specs:
    if levels is not None:
        train[col] = train[col].replace(levels, [0, 1])
        test[col] = test[col].replace(levels, [0, 1])
    print(heading)
    print(train[col].value_counts())
    print(divider)

## 2.2 continuous variables

### (1) child_num

In [None]:
train['child_num'].value_counts(sort=False).plot.bar()

In [None]:
# Cap child_num at 2 in both sets, so 2 stands for "two or more".
train['child_num'] = train['child_num'].clip(upper=2)
test['child_num'] = test['child_num'].clip(upper=2)

### (2) inc

In [None]:
# Express income in units of 10,000 in both sets.
# The column is kept numeric: the earlier cast to object was applied to train only
# and left an object-dtype column for the binning and histogram steps below.
train['income_total'] = train['income_total'] / 10000
test['income_total'] = test['income_total'] / 10000

print(train['income_total'].value_counts(bins=10, sort=False))
train['income_total'].plot(kind='hist', bins=50, density=True)

In [None]:
# Cut income into 7 equal-width bins whose edges are computed from the train values.
count, bin_dividers =np.histogram(train['income_total'], bins=7)
bin_names=['소득'+str(i) for i in range(7) ]
# bin_dividers are based on train only; the same edges are applied to test.
# NOTE(review): pd.cut returns NaN for values outside the edges, so a test income
# beyond the train range becomes missing — confirm this is handled downstream.
train['income_total']=pd.cut(x=train['income_total'], bins=bin_dividers, labels=bin_names, include_lowest=True)
test['income_total']=pd.cut(x=test['income_total'], bins=bin_dividers, labels=bin_names, include_lowest=True)

### (3) type 

In [None]:
print(train['income_type'].unique())
print(train['edu_type'].unique())
print(train['family_type'].unique())
print(train['house_type'].unique())

In [None]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()

# Fit on train and only transform test, so that a category receives the same
# integer in both sets. occyp_type used to be fit separately on test, which
# renumbers the codes whenever the two sets do not contain the same categories.
# transform() raises ValueError if test contains a category absent from train.
for cat_col in ('income_type', 'edu_type', 'family_type', 'house_type', 'occyp_type'):
    train[cat_col] = label_encoder.fit_transform(train[cat_col])
    test[cat_col] = label_encoder.transform(test[cat_col])

# income_total is a categorical produced by pd.cut with shared bin edges, so its
# category position is already a consistent ordinal code for train and test,
# even when a bin is empty in one of them. Rows that fell outside every bin get -1.
train['income_total'] = train['income_total'].cat.codes
test['income_total'] = test['income_total'].cat.codes

## 2.3 minus continuous variable

In [None]:
#minus 변경하고
#구간화 함수
def make_bin(variable, n):
    """Negate ``variable`` in train and test and replace it with ordinal bin codes.

    Both global frames (``train`` and ``test``) are modified in place. The column
    is multiplied by -1, cut into ``n`` equal-width bins whose edges are computed
    from the train values only, and stored as the integer bin number (0 .. n-1).

    Test values outside the train range belong to no bin; they are assigned
    bin 0, the same fill value that was used before.

    Args:
        variable: name of the column to negate and bin.
        n: number of equal-width bins.
    """
    train[variable] = -train[variable]
    test[variable] = -test[variable]

    # Bin edges come from train and are reused for test.
    _, bin_dividers = np.histogram(train[variable], bins=n)
    bin_names = [str(i) for i in range(n)]
    train_bins = pd.cut(x=train[variable], bins=bin_dividers,
                        labels=bin_names, include_lowest=True)
    test_bins = pd.cut(x=test[variable], bins=bin_dividers,
                       labels=bin_names, include_lowest=True)

    # Use the category position as the code. It is identical for train and test
    # even when a bin is empty in train, and it keeps the bins in numeric order.
    # A LabelEncoder fitted on train would fail on a bin seen only in test.
    train[variable] = train_bins.cat.codes.astype(int)
    # cat.codes is -1 for values that fell into no bin; map those to bin 0.
    # Assigning the result back avoids fillna(inplace=True) on a selected column,
    # which is chained assignment and is not guaranteed to update the frame.
    test[variable] = test_bins.cat.codes.clip(lower=0).astype(int)

In [None]:
make_bin('DAYS_BIRTH', n=10)
make_bin('DAYS_EMPLOYED', n=6)
make_bin('begin_month', n=4)

## train & test 준비완료

In [None]:
train

In [None]:
test

# **3. 모델링(추후 논의 필요)**

## 3.0 RandomForestClassifier (베이스코드라인)

In [None]:
# 'index' is only the row identifier used to restore row order after imputation
# (it was excluded from the imputation features as well), so it is not a feature.
train_x = train.drop(['credit', 'index'], axis=1)
train_y = train[['credit']]
test_x = test.drop('index', axis=1)

In [None]:
print(train_x.shape, train_y.shape, test.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.metrics import f1_score
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Hold out 25% of train, stratified on the target, for a quick validation score.
X_train, X_val, y_train, y_val = train_test_split(
    train_x, train_y, stratify=train_y, test_size=0.25, random_state=10086)

print("Train set: ")
print(X_train.shape)
print(y_train.shape)
print("===========")
print("Validation set: ")
print(X_val.shape)
print(y_val.shape)


clf = RandomForestClassifier()
# Pass the target as a 1-D Series: a one-column DataFrame is a column vector,
# which scikit-learn warns about and reshapes.
clf.fit(X_train, y_train['credit'])
y_pred = clf.predict_proba(X_val)

# log_loss accepts the raw labels; labels=clf.classes_ ties the probability columns
# to the classes without one-hot encoding the target first.
print(f"log_loss: {log_loss(y_val['credit'], y_pred, labels=clf.classes_)}")

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold

def run_kfold(clf):
    """Cross-validate ``clf`` and return its fold-averaged test probabilities.

    Runs a 5-fold stratified split over the global ``train_x`` / ``train_y``.
    For every fold the classifier is refit on the training part, its log loss on
    the validation part is printed, and its class probabilities for the global
    ``test_x`` are accumulated. The mean log loss is printed at the end.

    Args:
        clf: a classifier providing ``fit``, ``predict_proba`` and ``classes_``.
            It is refit in place and is left fitted on the last fold.

    Returns:
        Array of shape (number of test rows, number of classes) holding the
        predicted probabilities averaged over the folds.
    """
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=55)
    # Work with the target as a 1-D Series; a one-column DataFrame triggers a
    # column-vector warning in fit().
    target = train_y['credit']
    # Size the accumulator from the data instead of hard-coding three classes.
    n_classes = len(np.unique(target))
    outcomes = []
    sub = np.zeros((test_x.shape[0], n_classes))
    for n_fold, (train_index, val_index) in enumerate(folds.split(train_x, target)):
        X_train, X_val = train_x.iloc[train_index], train_x.iloc[val_index]
        y_train, y_val = target.iloc[train_index], target.iloc[val_index]
        clf.fit(X_train, y_train)

        predictions = clf.predict_proba(X_val)

        # labels=clf.classes_ ties the probability columns to the classes, so the
        # target does not have to be one-hot encoded first.
        logloss = log_loss(y_val, predictions, labels=clf.classes_)
        outcomes.append(logloss)
        print(f"FOLD {n_fold} : logloss:{logloss}")

        sub += clf.predict_proba(test_x)

    mean_outcome = np.mean(outcomes)

    print("Mean:{}".format(mean_outcome))
    return sub / folds.n_splits

my_submission = run_kfold(clf)

In [None]:
my_submission

In [None]:
submission

In [None]:
# Write the probabilities into every column after the first one. The slice is
# positional, so it needs .iloc: .loc slices by label, and an integer slice on
# string column labels raises a TypeError in current pandas.
submission.iloc[:, 1:] = my_submission

In [None]:
submission

## 3.1 decision tree

## 3.2 CNN

## 3.3 LSTM

## 3.4 XGBoost / Catboost / randomforest 등등

# **4. 결과비교**