In [16]:
# Mount Google Drive into the Colab runtime; the CSV files read and written
# by this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
import numpy as np
import os
import pandas as pd
import random


def seed_everything(seed):
    """Seed the random number sources used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    exports ``PYTHONHASHSEED`` into the process environment.

    Args:
        seed: Integer seed applied to every source.

    Note:
        ``PYTHONHASHSEED`` is read when an interpreter starts, so exporting it
        here affects subprocesses launched afterwards rather than the hash
        randomisation of the already-running kernel.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Single seed value shared by every random number source seeded below.
random_seed = 42
seed_everything(random_seed) # Fix the seed

In [18]:
# Load the raw train / test CSV files from the mounted Drive folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/고객 대출등급 분류 해커톤/data/train.csv',
        '/content/drive/MyDrive/고객 대출등급 분류 해커톤/data/test.csv',
    )
)

## **데이터 전처리**

#### 데이터 통합 및 제거

In [19]:
### Drop the training rows whose 주택소유상태 is 'ANY'.
### Original author's rationale: with one-hot encoding in mind, giving up a
### single training sample was judged better than carrying an extra column.
is_any_ownership = train['주택소유상태'] == 'ANY'
train = train.drop(train.loc[is_any_ownership].index)

In [20]:
### Collapse 대출목적 into four buckets: '부채 통합', '신용 카드' and '주택 개선'
### are kept as-is (per the original note, they occur far more often than the
### rest, and one-hot encoding was planned); every other value becomes '그외'.
major_purposes = ('부채 통합', '신용 카드', '주택 개선')
observed_purposes = set(train['대출목적']) | set(test['대출목적'])

# Map every value seen in either frame to its bucket.
loan_purpose = {
    purpose: purpose if purpose in major_purposes else '그외'
    for purpose in observed_purposes
}

# The original column is left untouched; the bucketed value goes in a new one.
for frame in (train, test):
  frame['대출목적_통합'] = frame['대출목적'].replace(loan_purpose)

In [21]:
# Canonical spellings for the inconsistent 근로기간 labels, applied in this order.
work_period_fixes = {
    '< 1 year': '0 year',
    '<1 year': '0 year',
    '1 years': '1 year',
    '3': '3 years',
    '10+ years': '10 years',
    '10+years': '10 years',
}

# Apply the same clean-up to both frames so their label sets stay aligned.
for frame in (train, test):
  for raw_label, clean_label in work_period_fixes.items():
    frame.loc[frame['근로기간'] == raw_label, '근로기간'] = clean_label

In [22]:
# 대출기간 labels with a leading space, mapped to their trimmed spelling.
loan_term_fixes = {
    ' 36 months': '36 months',
    ' 60 months': '60 months',
}

# Apply the same clean-up to both frames.
for frame in (train, test):
  for raw_label, clean_label in loan_term_fixes.items():
    frame.loc[frame['대출기간'] == raw_label, '대출기간'] = clean_label

### 범주형 데이터 처리

In [23]:
from sklearn.preprocessing import LabelEncoder

# Categorical columns that are replaced by integer codes.
cols = ['대출기간','주택소유상태','대출목적_통합','근로기간']

# One fitted encoder per column. Re-fitting a single shared encoder would
# overwrite its classes_ on every iteration, leaving no way to decode or audit
# the codes of any column except the last one.
label_encoders = {}

for col in cols:
  # `enc` is rebound each iteration, so after the loop it still refers to the
  # encoder fitted on the last column, as it did with a single shared instance.
  enc = LabelEncoder()
  # Fit on train only; test is encoded with the same classes. transform()
  # raises ValueError if test contains a label that was not seen in train.
  train[col] = enc.fit_transform(train[col])
  test[col] = enc.transform(test[col])
  label_encoders[col] = enc

# NOTE(review): LabelEncoder numbers classes in sorted order, so string labels
# such as '10 years' sort before '2 years' and the 근로기간 codes are not ordered
# by length of employment — confirm this is acceptable for the downstream model.

In [24]:
# from sklearn.preprocessing import OneHotEncoder

# enc=OneHotEncoder(handle_unknown='ignore')
# col=['주택소유상태','대출목적_통합','근로기간']
# enc.fit(train[col])

# encoded_data = enc.fit_transform(train[col]).toarray()
# encoded_df = pd.DataFrame(encoded_data, columns=enc.get_feature_names_out(col))
# train = pd.concat([train.drop(columns=col).reset_index(), encoded_df], axis=1).drop('index',axis=1)

# encoded_data = enc.transform(test[col]).toarray()
# encoded_df = pd.DataFrame(encoded_data, columns=enc.get_feature_names_out(col))
# test = pd.concat([test.drop(columns=col).reset_index(), encoded_df], axis=1).drop('index',axis=1)

In [25]:
# work_period_dic = {'0 year':0,'1 year':1, '2 years':2, '3 years':3, '4 years':4, '5 years':5, '6 years': 6, '7 years': 7, '8 years': 8, '9 years': 9, '10 years': 10, 'Unknown': 11}
# train['근로기간'] = train['근로기간'].replace(work_period_dic)
# test['근로기간'] = test['근로기간'].replace(work_period_dic)

### 수치형 데이터 처리

In [26]:
# Columns replaced by log(1 + x). Going by the column names these are loan
# amount, annual income, total repaid principal and total repaid interest.
cols = ['대출금액','연간소득','총상환원금','총상환이자']

# Same transform on both frames; log1p keeps zero values at zero.
for frame in (train, test):
  for col in cols:
    frame[col] = np.log1p(frame[col])

#### 스케일링(트리 기반 알고리즘에서는 큰 영향 X)

In [27]:
# from sklearn.preprocessing import StandardScaler
# scaler=StandardScaler()
# cols = ['총계좌수']

# train_scaled = scaler.fit_transform(train[cols])
# train_scaled_data = pd.DataFrame(train_scaled, columns=cols)
# test_sacled = scaler.transform(test[cols])
# test_scaled_data = pd.DataFrame(test_sacled, columns=cols)
# for col in cols:
#   train[col] = np.abs(train_scaled_data[col])
#   test[col] = np.abs(test_scaled_data[col])

In [28]:
# Columns removed from both frames before saving: the row identifier, the raw
# loan purpose (its bucketed version 대출목적_통합 is kept) and 부채_대비_소득_비율.
dropped_columns = ['ID','대출목적','부채_대비_소득_비율']
train, test = [frame.drop(columns=dropped_columns) for frame in (train, test)]

In [29]:
# Display the processed training frame as a visual sanity check.
train

Unnamed: 0,대출금액,대출기간,근로기간,주택소유상태,연간소득,총계좌수,최근_2년간_연체_횟수,총상환원금,총상환이자,총연체금액,연체계좌수,대출등급,대출목적_통합
0,16.339638,0,7,2,18.092177,15,0,0.000000,0.000000,0.0,0.0,C,1
1,16.482739,1,2,0,18.689180,21,0,12.830869,12.363337,0.0,0.0,B,3
2,16.300417,0,6,0,18.379859,14,0,13.741482,11.931274,0.0,0.0,A,1
3,16.482739,0,9,0,18.698312,15,0,12.694116,11.938905,0.0,0.0,C,1
4,16.705882,1,11,2,18.088503,19,0,12.339471,11.911413,0.0,0.0,B,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
96289,16.482739,0,2,0,19.162618,33,0,13.789763,13.106577,0.0,0.0,C,2
96290,17.175886,1,2,0,18.698312,25,0,13.277192,13.658956,0.0,0.0,E,3
96291,16.482739,0,1,0,18.246327,22,0,14.213702,12.393535,0.0,0.0,A,2
96292,16.562782,0,6,0,18.010153,21,2,14.136411,13.614712,0.0,0.0,D,1


### 전처리 완료 후 csv 형태로 저장

In [30]:
# Write both processed frames next to the raw data, without the index column.
for frame, csv_path in (
    (train, '/content/drive/MyDrive/고객 대출등급 분류 해커톤/data/processed_train.csv'),
    (test, '/content/drive/MyDrive/고객 대출등급 분류 해커톤/data/processed_test.csv'),
):
  frame.to_csv(csv_path, index=False)