### 데이터 살펴보기

패키지 불러오기

In [1]:
# 기본 패키지
import pandas as pd
import numpy as np

# 시각화 패키지
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
plt.rc('font', family='NanumBarunGothic') 
plt.style.use(['ggplot'])

# 한글 시각화시 필요 모듈
import platform # 사용자 운영체제 확인
platform.system()
## 운영체제별 한글 폰트 설정
if platform.system() == 'Darwin': # Mac 환경 폰트 설정
    plt.rc('font', family='AppleGothic')
elif platform.system() == 'Windows': # Windows 환경 폰트 설정
    plt.rc('font', family='Malgun Gothic')

plt.rc('axes', unicode_minus=False) # 마이너스 폰트 설정
## 글씨 선명하게 출력하는 설정
%config InlineBackend.figure_format = 'retina'

# 경고 메시지 무시
import warnings
warnings.filterwarnings(action='ignore') 

# 버전 확인
print('Pandas : %s'%(pd.__version__))
print('seaborn : %s'%(sns.__version__))
print('matplotlib : %s'%(matplotlib.__version__))
!python --version

Pandas : 1.4.3
seaborn : 0.11.2
matplotlib : 3.5.1
Python 3.8.8


- 데이터 불러오기

In [2]:
# Read the feature table (make_feature.csv) into `data`.
data = pd.read_csv(filepath_or_buffer='../data/make_feature.csv')

## Preprocessing

In [3]:
# 모델링 패키지
import sklearn
from sklearn.model_selection import train_test_split

# split
from sklearn.model_selection import train_test_split

# encoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# eval
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score, accuracy_score

    - 데이터 분리

In [4]:
# Remove duplicated rows: for every (application_id, product_id) pair only the
# first occurrence is kept.
_repeated_pair = data.duplicated(subset=['application_id', 'product_id'])
data = data[~_repeated_pair]

In [5]:
# Read the evaluation file; the final cells write `is_applied` into this
# frame and save it.
test_ = pd.read_csv(filepath_or_buffer='../data/데이터분석분야_퓨처스부문_평가데이터.csv')

In [6]:
# Build the test features so that they line up with the evaluation file.
# Rows with month == 6 are taken from `data` (label removed) and joined onto
# the evaluation rows by (application_id, product_id).
# NOTE(review): the default inner join drops evaluation rows that have no
# match in `data`; the later positional assignment of predictions back into
# `test_` assumes no row is dropped - confirm.
_june_rows = data.loc[data['month'] == 6]
test = _june_rows.drop(columns='is_applied').reset_index(drop=True)
x_test = test_.merge(test, on=['application_id', 'product_id'])
x_test = x_test.drop(columns='is_applied')

In [7]:
# Make 'index' the first column of x_test; all other columns keep their
# relative order.
x_test_index = x_test['index']
_remaining_columns = [name for name in x_test.columns if name != 'index']
x_test = x_test[['index'] + _remaining_columns]

In [8]:
# Every row whose month is not 6 goes into the modelling set (month 6 rows are
# the test rows): features in `x`, the `is_applied` label in `y`.
_modelling_rows = data[data['month'] != 6]
x = _modelling_rows.drop(columns='is_applied').reset_index(drop=True)
y = _modelling_rows['is_applied'].reset_index(drop=True)

In [9]:
# 80/20 shuffled train/validation split, stratified on the label, with a
# fixed seed.
_split_options = {
    'test_size': 0.2,
    'shuffle': True,
    'stratify': y,
    'random_state': 3,
}
x_train, x_valid, y_train, y_valid = train_test_split(x, y, **_split_options)

    - Preprocessing

In [10]:
def _feature_names_by_dtype(frame):
    """Return ``(categorical, numeric)`` column-name lists for *frame*.

    The first column is skipped. Columns of dtype ``object`` are reported as
    categorical, columns of every other dtype as numeric.
    """
    candidates = frame.iloc[:, 1:]
    categorical = candidates.select_dtypes(include=['object']).columns.to_list()
    numeric = candidates.select_dtypes(exclude='object').columns.to_list()
    return categorical, numeric


# Categorical / numeric column names for the train, validation and test frames.
cat_feature_tr, num_feature_tr = _feature_names_by_dtype(x_train)
cat_feature_val, num_feature_val = _feature_names_by_dtype(x_valid)
cat_feature_te, num_feature_te = _feature_names_by_dtype(x_test)

    - Encoding

In [11]:
# Label-encode the categorical columns.
#
# One LabelEncoder per column is fitted on the complete `data` frame, so the
# encoder knows every value of that column that occurs in `data`
# (LabelEncoder.transform raises on labels it has not seen during fit).
# The fitted encoders stay available in `encoders`, keyed by column name,
# instead of being overwritten on each loop iteration.
encoders = {name: LabelEncoder().fit(data[name]) for name in cat_feature_tr}


def _encode_categoricals(frame):
    """Return the label-encoded categorical columns of *frame*.

    The result has a fresh 0..n-1 index and one column per entry of
    ``cat_feature_tr``, in that order. With no categorical columns it is an
    empty frame (the previous ``pd.concat`` over an empty list raised).
    """
    return pd.DataFrame(
        {name: fitted.transform(frame[name]) for name, fitted in encoders.items()}
    )


cat_tr = _encode_categoricals(x_train)   # train
cat_val = _encode_categoricals(x_valid)  # validation
cat_te = _encode_categoricals(x_test)    # test

    - scaling

In [12]:
# scaling
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MaxAbsScaler
# Model
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [13]:
# Scale the numeric columns with RobustScaler.
#
# The scaler is fitted on the training split only, so its statistics contain
# no information from the validation or test rows (it used to be fitted on
# all of `data`).
scaler = RobustScaler()
scaler.fit(x_train[num_feature_tr])


def _scale_numeric(frame):
    """Return the scaled numeric columns of *frame* with a fresh 0..n-1 index.

    Columns are always selected with ``num_feature_tr`` - the list and order
    the scaler was fitted on - so every column is scaled with its own
    statistics. A frame lacking one of these columns raises ``KeyError``
    rather than being scaled with mismatched columns.
    """
    scaled = scaler.transform(frame[num_feature_tr])
    return pd.DataFrame(scaled, columns=num_feature_tr)


num_tr = _scale_numeric(x_train)   # train
num_val = _scale_numeric(x_valid)  # validation
num_te = _scale_numeric(x_test)    # test

    - concat

In [14]:
def _assemble_features(raw, categorical_part, numeric_part):
    """Join the 'index' column of *raw* with the encoded and scaled columns.

    The 'index' column is re-indexed from 0 before the column-wise concat.
    Column order of the result: 'index', categorical columns, numeric columns.
    """
    row_ids = raw['index'].reset_index(drop=True)
    return pd.concat([row_ids, categorical_part, numeric_part], axis=1)


# Replace the raw frames by their encoded + scaled versions.
x_train = _assemble_features(x_train, cat_tr, num_tr)
x_valid = _assemble_features(x_valid, cat_val, num_val)
x_test = _assemble_features(x_test, cat_te, num_te)

## Modeling

    - LGBM

In [15]:
# Model inputs: every column except the first one.
_lgbm_train_x = x_train.iloc[:, 1:]
_lgbm_valid_x = x_valid.iloc[:, 1:]
_lgbm_test_x = x_test.iloc[:, 1:]

# LightGBM with default hyper-parameters and a fixed seed.
lgbm = LGBMClassifier(random_state=42)
lgbm.fit(_lgbm_train_x, y_train)

# Validation labels and probabilities, plus test probabilities.
pred_lgbm = lgbm.predict(_lgbm_valid_x)
proba_lgbm = lgbm.predict_proba(_lgbm_valid_x)
proba_test_lgbm = lgbm.predict_proba(_lgbm_test_x)

# F1 on the validation split (shown as the cell output).
f1_score(y_valid, pred_lgbm)

0.45977762079180295

    - catboost

In [18]:
# cat = CatBoostClassifier(random_state=42, verbose=0)

# # valid 예측
# cat.fit(x_train.iloc[:,1:], y_train)
# pred_cat = cat.predict(x_valid.iloc[:,1:])
# proba_cat = cat.predict_proba(x_valid.iloc[:,1:])
# proba_test_cat = cat.predict_proba(x_test.iloc[:,1:])

# # catboost F1
# f1_score(y_valid, pred_cat)

0.4981404569163005

    - randomforest -> 오래 걸림

In [14]:
# rf = RandomForestClassifier(random_state=42)
# # valid 예측
# rf.fit(x_train.iloc[:,1:], y_train)
# pred_rf = rf.predict(x_valid.iloc[:,1:])
# proba_rf = rf.predict_proba(x_valid.iloc[:,1:])

# # randomforest F1
# f1_score(y_valid, pred_rf)

    - Decision Tree

In [22]:
# Model inputs: every column except the first one.
_dt_train_x = x_train.iloc[:, 1:]
_dt_valid_x = x_valid.iloc[:, 1:]
_dt_test_x = x_test.iloc[:, 1:]

# Decision tree with default hyper-parameters and a fixed seed.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(_dt_train_x, y_train)

# Validation labels and probabilities, plus test probabilities.
pred_dt = dt.predict(_dt_valid_x)
proba_dt = dt.predict_proba(_dt_valid_x)
proba_test_dt = dt.predict_proba(_dt_test_x)

# Decision tree F1 on the validation split (shown as the cell output).
f1_score(y_valid, pred_dt)

0.4518845494835365

    - XGBClassifier

In [23]:
# Model inputs: every column except the first one.
_xgb_train_x = x_train.iloc[:, 1:]
_xgb_valid_x = x_valid.iloc[:, 1:]
_xgb_test_x = x_test.iloc[:, 1:]

# XGBoost with default hyper-parameters and a fixed seed.
xgb = XGBClassifier(random_state=42)
xgb.fit(_xgb_train_x, y_train)

# Validation labels and probabilities, plus test probabilities.
pred_xgb = xgb.predict(_xgb_valid_x)
proba_xgb = xgb.predict_proba(_xgb_valid_x)
proba_test_xgb = xgb.predict_proba(_xgb_test_x)

# F1 on the validation split (shown as the cell output).
f1_score(y_valid, pred_xgb)

0.48016071159004614

## Feature Importance

In [None]:
# from lightgbm import plot_importance
# import eli5
# from eli5.sklearn import PermutationImportance

- CART Feature Importance

In [None]:
# # LGBM 모델을 통한 Feature Importance 파악
# plot_importance(lgbm, figsize=(10, 8), max_num_features=10)

- permutation importance
    - 위의 Feature importance 방식과 다르게 변수의 독립적인 중요도를 판단하는 방식
    - 다른 변수와의 상호작용으로 인하여 변수의 중요도를 높게 판단하는 경우를 방지해줌 ( 독립 변수 자체의 변수 중요도를 판단할 수 있음 )

In [19]:
# # LGBM 모델의 permutation importance 파악
# model = lgbm
# perm = PermutationImportance(model, scoring = "f1", random_state = 42).fit(x_train.iloc[:,1:], y_train)
# eli5.show_weights(perm,top=10, feature_names = x_train.iloc[:,1:].columns.tolist())

## ensemble

    - weighted sum

In [22]:
# # dnn 모델의 앙상블을 위해 결과를 불러옴
# prob_dnn = pd.read_csv('../data/val_prob_dnn3.csv')['pred']

In [23]:
# proba_dt = pd.DataFrame(proba_dt)[1]
# proba_lgbm = pd.DataFrame(proba_lgbm)[1]
# proba_xgb = pd.DataFrame(proba_xgb)[1]

In [24]:
# proba = (proba_dt*0.4+ proba_lgbm*0.4+ proba_xgb*0.1 + prob_dnn*0.1)

In [25]:
# # ensemble F1
# f1_score(y_valid, [1.0 if i > 0.3 else 0.0 for i in proba])

## 최종 결과값 도출

In [16]:
# Load the DNN model's test probabilities, intended for ensembling.
# NOTE(review): `test_dnn` is not referenced by any active cell below - confirm
# whether it is still needed.
test_dnn = pd.read_csv(filepath_or_buffer='../data/test_prob_dnn.csv')

In [17]:
# # lgbm의 결과를 가져옴
# test_lgbm = pd.DataFrame(proba_test_lgbm)[1]

In [19]:
# Use the LightGBM test probabilities for the final prediction.
# `test_lgbm` is derived here because its only other definition is commented
# out, which made this cell fail with NameError on a clean run.
# Column 1 of predict_proba is the second class in `lgbm.classes_`
# (presumably is_applied == 1 - confirm).
test_lgbm = pd.DataFrame(proba_test_lgbm)[1]
test_proba = test_lgbm

In [20]:
# Turn probabilities into labels: 1.0 when the probability exceeds 0.1,
# otherwise 0.0. Values are assigned by position, so `test_proba` must have
# exactly one entry per row of `test_`, in the same order.
test_['is_applied'] = np.where(np.asarray(test_proba) > 0.1, 1.0, 0.0)

In [21]:
# Write the evaluation frame, now holding the predicted `is_applied`, to disk.
# NOTE(review): with the default index=True the row index is written as an
# extra first column - confirm that the submission format expects it.
test_.to_csv(path_or_buf='../data/데이터분석분야_퓨처스부문_import_finda_평가데이터.csv')

## END