# 라이브러리

In [1]:
import numpy as np
from scipy import stats
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
import matplotlib.pyplot as plt
import re
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from scipy.stats import ttest_ind
from scipy.stats import chi2_contingency
from itertools import combinations

# 데이터 로드

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# 파생변수 생성

In [3]:
train['대출기간'] = train['대출기간'].apply(lambda x: int(re.search(r'\d+', x).group()))
test['대출기간'] = test['대출기간'].apply(lambda x: int(re.search(r'\d+', x).group()))

train['월상환액_대출금액_비율'] = (train['총상환원금'] + train['총상환이자']) / (train['대출금액'] / (train['대출기간']))
test['월상환액_대출금액_비율'] = (test['총상환원금'] + test['총상환이자']) / (test['대출금액'] / (test['대출기간']))

train['상환_대비_대출금_비율'] = train['총상환원금'] / train['대출금액']
test['상환_대비_대출금_비율'] = test['총상환원금'] / test['대출금액']

train['대출_상환_비율'] = (train['총상환원금'] + train['총상환이자']) / train['대출금액']
test['대출_상환_비율'] = (test['총상환원금'] + test['총상환이자']) / test['대출금액']

# 통계 검정

In [4]:
df = train
def chi2_test(dependent_var, independent_var):
    contingency_table = pd.crosstab(df[dependent_var], df[independent_var])
    chi2, p, _, _ = chi2_contingency(contingency_table)
    return chi2, p

# 예시: 대출등급과 주택소유상태 간의 카이제곱 검정 수행
cols = df.select_dtypes(include = object).columns
for col in cols:
    print(f'>>>>>>>>>>>>>>>>>{col} 검정 시작>>>>>>>>>>>>>>>>>')
    chi2_stat, p_value = chi2_test('대출등급', col)
    print(f"Chi2 Statistic: {chi2_stat}")
    print(f"P-value: {p_value}")

def t_test(dependent_var, independent_var):
    group1 = df[df[dependent_var] == 'A'][independent_var]
    group2 = df[df[dependent_var] == 'B'][independent_var]
    t_stat, p_value = ttest_ind(group1, group2, equal_var=False)  # 등분산 가정 안 함
    return t_stat, p_value

cols = df.select_dtypes(exclude = object).columns
for col in cols:
    print(f'>>>>>>>>>>>>>>>>>{col} 검정 시작>>>>>>>>>>>>>>>>>')
    t_stat, p_value = t_test('대출등급', col)
    print(f"T-statistic: {t_stat}")
    print(f"P-value: {p_value}")

>>>>>>>>>>>>>>>>>ID>>>>>>>>>>>>>>>>>
Chi2 Statistic: 577763.9999999998
P-value: 0.49752584642206804
>>>>>>>>>>>>>>>>>근로기간>>>>>>>>>>>>>>>>>
Chi2 Statistic: 187.56633181905977
P-value: 7.714129711631081e-09
>>>>>>>>>>>>>>>>>주택소유상태>>>>>>>>>>>>>>>>>
Chi2 Statistic: 663.4327372511038
P-value: 3.225488826409481e-129
>>>>>>>>>>>>>>>>>대출목적>>>>>>>>>>>>>>>>>
Chi2 Statistic: 5744.213056799291
P-value: 0.0
>>>>>>>>>>>>>>>>>대출등급>>>>>>>>>>>>>>>>>
Chi2 Statistic: 577764.0000000001
P-value: 0.0
>>>>>>>>>>>>>>>>>대출금액>>>>>>>>>>>>>>>>>
T-statistic: 11.279409144817087
P-value: 1.8557439688926076e-29
>>>>>>>>>>>>>>>>>대출기간>>>>>>>>>>>>>>>>>
T-statistic: -54.028589761779614
P-value: 0.0
>>>>>>>>>>>>>>>>>연간소득>>>>>>>>>>>>>>>>>
T-statistic: 13.149245118776847
P-value: 2.2068624911263098e-39
>>>>>>>>>>>>>>>>>부채_대비_소득_비율>>>>>>>>>>>>>>>>>
T-statistic: -20.88524996571793
P-value: 2.593439782473503e-96
>>>>>>>>>>>>>>>>>총계좌수>>>>>>>>>>>>>>>>>
T-statistic: 14.747932193446172
P-value: 4.45498589649121e-49
>>>>>>>>>>>>>>>

# 변수 선택

In [5]:
train.drop(columns = ['연체계좌수', '최근_2년간_연체_횟수', '주택소유상태', '대출목적', '총계좌수', '근로기간', '총연체금액'], inplace=True)
test.drop(columns = ['연체계좌수', '최근_2년간_연체_횟수', '주택소유상태', '대출목적', '총계좌수', '근로기간', '총연체금액'], inplace=True)

train.set_index('ID', inplace=True)
test.set_index('ID', inplace=True)

# 이상치 처리 및 스케일링

In [6]:
def remove_outliers(data, column_name, grade_column='대출등급', alpha=0.00044):
  data_no_outliers = pd.DataFrame()
  for grade in data[grade_column].unique():
      subset = data[data[grade_column] == grade]
      lower_limit = subset[column_name].quantile(alpha / 2)
      upper_limit = subset[column_name].quantile(1 - alpha / 2)
      subset_no_outliers = subset[(subset[column_name] >= lower_limit) & (subset[column_name] <= upper_limit)]
      data_no_outliers = pd.concat([data_no_outliers, subset_no_outliers])
  return data_no_outliers
cols = train.select_dtypes(exclude=object).columns
for col in cols:
  train = remove_outliers(train, col, '대출등급')

cols = train.select_dtypes(exclude=object).columns
scaler = MinMaxScaler()
train[cols] = scaler.fit_transform(train[cols])
test[cols] = scaler.transform(test[cols])

# tenure_mapping = {
#     '10+ years': 11,
#     '2 years': 3,
#     '< 1 year': 1,
#     '3 years': 4,
#     '1 year': 2,
#     'Unknown': 0,
#     '5 years': 6,
#     '4 years': 5,
#     '8 years': 9,
#     '6 years': 7,
#     '7 years': 8,
#     '9 years': 10,
#     '10+years': 11,
#     '<1 year': 1,
#     '3': 4,
#     '1 years': 2
# }
# train['근로기간'] = train['근로기간'].map(tenure_mapping)
# test['근로기간'] = test['근로기간'].map(tenure_mapping)

# 라벨링

In [7]:
if '대출목적' in test.columns:
  test['대출목적'] = test['대출목적'].replace('결혼', '휴가')

# cols = test.select_dtypes(include=object).columns
# for col in cols:
#     le = LabelEncoder()
#     train[col] = le.fit_transform(train[col])
#     test[col] = le.transform(test[col])

map_list = {'A' : 6, 'B' : 5, 'C' : 4, 'D' : 3,  'E' : 2,  'F' : 1, 'G' : 0}
for key, item in map_list.items():
  train['대출등급'] = train['대출등급'].replace(key, item)

In [None]:
X = train.drop(columns=['대출등급'])
y = train['대출등급']

# 클래스 불균형 문제 해결 (사용안함)

In [8]:
# sampling_strategy_under = {
#   0: y.value_counts()[0],
#   1: y.value_counts()[1],
#   2: y.value_counts()[2]-(i*100),
#   3: y.value_counts()[3]-(i*450),
#   4: y.value_counts()[4]-(i*1000),
#   5: y.value_counts()[5]-(i*1000),
#   6: y.value_counts()[6]-(i*500)
# }

# sampling_strategy_over = {
#     0: y.value_counts()[0]+(10*i),
#     1: y.value_counts()[1]+(20*i),
#     2: y.value_counts()[2]+(30*i),
#     3: y.value_counts()[3],
#     4: y.value_counts()[4],
#     5: y.value_counts()[5],
#     6: y.value_counts()[6]
# }

# 오버샘플링
# smote = SMOTE(sampling_strategy=sampling_strategy_over, random_state=1111)
# smote = SMOTE(random_state=1111)
# smote = SMOTETomek(tomek=TomekLinks(sampling_strategy='auto'))
# X, y = smote.fit_resample(X, y)


# 언더샘플링
# under_sampler = RandomUnderSampler(sampling_strategy=sampling_strategy_under,random_state=1111)
# X, y = under_sampler.fit_resample(X, y)
# print(y.value_counts())

# 5-fold 교차 검증

In [9]:
# # 5-fold 교차 검증을 위한 StratifiedKFold 객체 생성
# mac_f1score = []
# best_model = None
# best_score = 0.0
# kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# for i, (train_index, val_index) in enumerate(kfold.split(X, y)):
#   X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
#   y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

#   # 모델 생성 및 훈련
#   model = RandomForestClassifier(n_estimators= 190,
#                                 max_depth=15,
#                                 min_samples_split = 4,
#                                 # max_features  = None,
#                                 random_state=1111
#                                 )

#   model.fit(X_train_fold, y_train_fold)

#   # 검증 세트에 대한 예측
#   pred_val = model.predict(X_val_fold)

#   # 검증 세트 정확도 평가
#   # pred_val = np.round(pred_val) # regressor
#   mac_f1 = f1_score(y_val_fold, pred_val, average='macro')
#   print(f'{i+1} Fold macro f1 score: {mac_f1}')
#   mac_f1score.append(mac_f1)

# print('5-fold 평균 macro f1 score:', sum(mac_f1score) / len(mac_f1score))

# 결과

In [10]:
# 모델 생성 및 훈련
model = RandomForestClassifier(n_estimators= 190,
                              max_depth=15,
                              min_samples_split = 4,
                              max_features  = None,
                              random_state=1111
                              )

model.fit(X, y)
pred = model.predict(test)

sub = pd.read_csv('sample_submission.csv')
sub['대출등급'] = pred
for key, item in map_list.items():
    sub['대출등급'] = sub['대출등급'].replace(item, key)
sub

Unnamed: 0,ID,대출등급
0,TEST_00000,B
1,TEST_00001,B
2,TEST_00002,A
3,TEST_00003,C
4,TEST_00004,C
...,...,...
64192,TEST_64192,D
64193,TEST_64193,D
64194,TEST_64194,D
64195,TEST_64195,C
