In [2]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score


In [85]:
df = pd.read_csv("train.csv", index_col='id')
test = pd.read_csv("test.csv", index_col='id')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 58645 entries, 0 to 58644
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  58645 non-null  int64  
 1   person_income               58645 non-null  int64  
 2   person_home_ownership       58645 non-null  object 
 3   person_emp_length           58645 non-null  float64
 4   loan_intent                 58645 non-null  object 
 5   loan_grade                  58645 non-null  object 
 6   loan_amnt                   58645 non-null  int64  
 7   loan_int_rate               58645 non-null  float64
 8   loan_percent_income         58645 non-null  float64
 9   cb_person_default_on_file   58645 non-null  object 
 10  cb_person_cred_hist_length  58645 non-null  int64  
 11  loan_status                 58645 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 5.8+ MB


In [4]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 39098 entries, 58645 to 97742
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  39098 non-null  int64  
 1   person_income               39098 non-null  int64  
 2   person_home_ownership       39098 non-null  object 
 3   person_emp_length           39098 non-null  float64
 4   loan_intent                 39098 non-null  object 
 5   loan_grade                  39098 non-null  object 
 6   loan_amnt                   39098 non-null  int64  
 7   loan_int_rate               39098 non-null  float64
 8   loan_percent_income         39098 non-null  float64
 9   cb_person_default_on_file   39098 non-null  object 
 10  cb_person_cred_hist_length  39098 non-null  int64  
dtypes: float64(3), int64(4), object(4)
memory usage: 3.6+ MB


이상치 있을 수 있을 수 있음 
person_income, age 쪽 

In [89]:
#정규화 X 범주형 데이터 변환 X

# 범주형 변수를 'category' 타입으로 변환
categorical_features = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
df[categorical_features] = df[categorical_features].astype('category')

# XGBoost 모델 훈련 시 enable_categorical=True 설정
X = df.drop('loan_status', axis=1)
y = df['loan_status']

X_train_origin, X_test_origin, y_train_origin, y_test_origin = train_test_split(X, y, test_size=0.25, random_state=42)

In [90]:
#정규화만 진행

numerical_features = df.select_dtypes(include=['number']).columns
X = df[numerical_features].drop('loan_status', axis=1)
y = df['loan_status']

X_train, X_test, y_train_final, y_test_final = train_test_split(X, y, test_size=0.25, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 범주형 변수를 'category' 타입으로 변환
categorical_features = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
categorical_data  = df[categorical_features].astype('category')

# 숫자형 데이터 정규화된 결과를 DataFrame으로 변환
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

# 정규화된 숫자형 데이터에 범주형 데이터 병합
X_train_final = pd.concat([X_train_scaled_df, categorical_data.loc[X_train.index]], axis=1)
X_test_final = pd.concat([X_test_scaled_df, categorical_data.loc[X_test.index]], axis=1)

In [80]:
# 범주형 처리 추가 / 정규화 X 
df_encoded = pd.get_dummies(df, columns=['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file'])
df_encoded

X = df_encoded[numerical_features].drop('loan_status', axis=1)
y = df_encoded['loan_status']

X_train_encoded, X_test_encoded, y_train_encoded, y_test_encoded = train_test_split(X, y, test_size=0.25, random_state=42)

In [71]:
# 범주형 처리 추가 / 정규화 O
df_encoded = pd.get_dummies(df, columns=['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file'])
df_encoded

X = df_encoded[numerical_features].drop('loan_status', axis=1)
y = df_encoded['loan_status']

X_train, X_test, y_train_scaled, y_test_scaled = train_test_split(X, y, test_size=0.25, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Logistic Regression

In [7]:
# 로지스틱 회귀 모델 훈련
model = LogisticRegression(solver='saga', max_iter=100)
model.fit(X_train_scaled, y_train)

# 예측 및 성능 평가
y_pred = model.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9120174601009412
              precision    recall  f1-score   support

           0       0.93      0.97      0.95     12613
           1       0.77      0.52      0.62      2049

    accuracy                           0.91     14662
   macro avg       0.85      0.75      0.79     14662
weighted avg       0.91      0.91      0.90     14662



RandomForestRegressor

In [None]:
# SVM 해보기
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=100)

rf.fit(X_train, y_train)

print(rf.score(X_test, y_test))
y_pred = rf.predict(X_test)
y_pred = y_pred.round()
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [15]:
rf.score(X_test_scaled, y_test)

0.642563542692653

In [17]:
y_pred = rf.predict(X_test_scaled)
y_pred = y_pred.round()
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9513708907379621
              precision    recall  f1-score   support

           0       0.96      0.99      0.97     12613
           1       0.92      0.72      0.81      2049

    accuracy                           0.95     14662
   macro avg       0.94      0.85      0.89     14662
weighted avg       0.95      0.95      0.95     14662



SVM

In [60]:
from sklearn.svm import SVC

df_encoded = pd.get_dummies(df, columns=['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file'])

X = df_encoded.drop('loan_status', axis=1)
y = df_encoded['loan_status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
# 스케일링 적용
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

svm_model = SVC(kernel='rbf')
svm_model.fit(X_train_scaled, y_train)

y_pred_svm = svm_model.predict(X_test_scaled)
print("F1-score (SVM):", f1_score(y_test, y_pred_svm))

F1-score (SVM): 0.7654879773691655


In [20]:
print(classification_report(y_test, y_pred_svm))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97     12613
           1       0.91      0.66      0.77      2049

    accuracy                           0.94     14662
   macro avg       0.93      0.82      0.87     14662
weighted avg       0.94      0.94      0.94     14662



Gradient Boosting

In [81]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=55)
xgb_model.fit(X_train, y_train)

# 예측 및 평가
y_pred = xgb_model.predict(X_test)
print("F1-score (XGBoost):", f1_score(y_test, y_pred))

F1-score (XGBoost): 0.6622055674518201


In [72]:
xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=55)
xgb_model.fit(X_train_scaled, y_train)

# 예측 및 평가
y_pred = xgb_model.predict(X_test_scaled)
print("F1-score (XGBoost):", f1_score(y_test, y_pred))

F1-score (XGBoost): 0.6622055674518201


In [79]:
xgb_model = xgb.XGBClassifier(eval_metric='logloss', enable_categorical=True, random_state=55)
xgb_model.fit(X_train_final, y_train)

# 예측 및 평가
y_pred = xgb_model.predict(X_test_final)
print("F1-score (XGBoost):", f1_score(y_test, y_pred))

F1-score (XGBoost): 0.8135411069317571


In [84]:
xgb_model = xgb.XGBClassifier(eval_metric='logloss' , enable_categorical=True, random_state=55)
xgb_model.fit(X_train, y_train)

# 예측 및 평가
y_pred = xgb_model.predict(X_test)
print("F1-score (XGBoost):", f1_score(y_test, y_pred))

F1-score (XGBoost): 0.8135411069317571


In [None]:
# 범주형 처리 / 정규화 O


In [9]:
test_encoded = pd.get_dummies(test, columns=['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file'])
test_encoded_scaled = scaler.fit_transform(test_encoded)