# 고객 구매 데이터로 성별 예측 모델링
백화점 고객(3,500명)의 1년 간 구매 데이터

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import hyperopt-sklearn

import warnings
warnings.filterwarnings(action='ignore')

In [83]:
x = pd.read_csv('./data1/X.csv', encoding='euc-kr')
y = pd.read_csv('./data1/y.csv', encoding='euc-kr')

In [84]:
# 결측치 확인
x.isnull()

Unnamed: 0,cust_id,총구매액,최대구매액,환불금액,주구매상품,주구매지점,내점일수,내점당구매건수,주말방문비율,구매주기
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,True,False,False,False,False,False,False
3,False,False,False,True,False,False,False,False,False,False
4,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
3495,False,False,False,True,False,False,False,False,False,False
3496,False,False,False,False,False,False,False,False,False,False
3497,False,False,False,True,False,False,False,False,False,False
3498,False,False,False,True,False,False,False,False,False,False


In [85]:
y.isnull()

Unnamed: 0,cust_id,gender
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
3495,False,False
3496,False,False
3497,False,False
3498,False,False


In [86]:
# x 에 환불 금액 NaN 은 환불이 없는 것 이기 때문에 0으로 변경

x['환불금액'] = x['환불금액'].fillna('0')
x

Unnamed: 0,cust_id,총구매액,최대구매액,환불금액,주구매상품,주구매지점,내점일수,내점당구매건수,주말방문비율,구매주기
0,0,68282840,11264000,6860000.0,기타,강남점,19,3.894737,0.527027,17
1,1,2136000,2136000,300000.0,스포츠,잠실점,2,1.500000,0.000000,1
2,2,3197000,1639000,0,남성 캐주얼,관악점,2,2.000000,0.000000,1
3,3,16077620,4935000,0,기타,광주점,18,2.444444,0.318182,16
4,4,29050000,24000000,0,보석,본 점,2,1.500000,0.000000,85
...,...,...,...,...,...,...,...,...,...,...
3495,3495,3175200,3042900,0,골프,본 점,1,2.000000,1.000000,0
3496,3496,29628600,7200000,6049600.0,시티웨어,부산본점,8,1.625000,0.461538,40
3497,3497,75000,75000,0,주방용품,창원점,1,1.000000,0.000000,0
3498,3498,1875000,1000000,0,화장품,본 점,2,1.000000,0.000000,39


In [87]:
y

Unnamed: 0,cust_id,gender
0,0,0
1,1,0
2,2,1
3,3,1
4,4,0
...,...,...
3495,3495,1
3496,3496,1
3497,3497,0
3498,3498,0


In [88]:
# cust_id 는 단순 인덱싱으로 볼 수 있기 때문에 제거

x = x.drop("cust_id", axis=1)
y = y.drop("cust_id", axis=1)

In [89]:
x

Unnamed: 0,총구매액,최대구매액,환불금액,주구매상품,주구매지점,내점일수,내점당구매건수,주말방문비율,구매주기
0,68282840,11264000,6860000.0,기타,강남점,19,3.894737,0.527027,17
1,2136000,2136000,300000.0,스포츠,잠실점,2,1.500000,0.000000,1
2,3197000,1639000,0,남성 캐주얼,관악점,2,2.000000,0.000000,1
3,16077620,4935000,0,기타,광주점,18,2.444444,0.318182,16
4,29050000,24000000,0,보석,본 점,2,1.500000,0.000000,85
...,...,...,...,...,...,...,...,...,...
3495,3175200,3042900,0,골프,본 점,1,2.000000,1.000000,0
3496,29628600,7200000,6049600.0,시티웨어,부산본점,8,1.625000,0.461538,40
3497,75000,75000,0,주방용품,창원점,1,1.000000,0.000000,0
3498,1875000,1000000,0,화장품,본 점,2,1.000000,0.000000,39


In [92]:
# 데이터 표준화를 위해 카테고리형 데이터를 숫자형으로 변환

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

x['주구매상품'] = le.fit_transform(x['주구매상품'])
x['주구매지점'] = le.fit_transform(x['주구매지점'])

In [94]:
merge = pd.concat([x, y], axis=1)

merge.corr()


Unnamed: 0,총구매액,최대구매액,환불금액,주구매상품,주구매지점,내점일수,내점당구매건수,주말방문비율,구매주기,gender
총구매액,1.0,0.70008,0.467686,-0.082916,-0.038724,0.659084,0.090022,0.014396,-0.212944,-0.150141
최대구매액,0.70008,1.0,0.429504,-0.090729,-0.024819,0.374147,0.01898,0.022277,-0.115837,-0.114323
환불금액,0.467686,0.429504,1.0,-0.056604,-0.045686,0.37757,-0.003871,-0.024707,-0.137362,-0.114327
주구매상품,-0.082916,-0.090729,-0.056604,1.0,0.015874,-0.185275,-0.274178,-0.010018,0.032469,-0.038668
주구매지점,-0.038724,-0.024819,-0.045686,0.015874,1.0,-0.05957,-0.080804,0.01199,0.035344,0.015876
내점일수,0.659084,0.374147,0.37757,-0.185275,-0.05957,1.0,0.225264,-0.010325,-0.2932,-0.155
내점당구매건수,0.090022,0.01898,-0.003871,-0.274178,-0.080804,0.225264,1.0,0.007659,-0.091151,-0.043917
주말방문비율,0.014396,0.022277,-0.024707,-0.010018,0.01199,-0.010325,0.007659,1.0,0.003372,0.073598
구매주기,-0.212944,-0.115837,-0.137362,0.032469,0.035344,-0.2932,-0.091151,0.003372,1.0,0.04145
gender,-0.150141,-0.114323,-0.114327,-0.038668,0.015876,-0.155,-0.043917,0.073598,0.04145,1.0


In [96]:
x = x[["주말방문비율", "구매주기"]]
x

Unnamed: 0,주말방문비율,구매주기
0,0.527027,17
1,0.000000,1
2,0.000000,1
3,0.318182,16
4,0.000000,85
...,...,...
3495,1.000000,0
3496,0.461538,40
3497,0.000000,0
3498,0.000000,39


In [97]:
# 데이터 표준화

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaler.fit(x, y)
x_scaled = scaler.transform(x)

In [98]:
x_scaled

array([[ 0.75862274, -0.15996211],
       [-1.06053002, -0.80655356],
       [-1.06053002, -0.80655356],
       ...,
       [-1.06053002, -0.84696552],
       [-1.06053002,  0.72910114],
       [ 0.55277658, -0.5236698 ]], shape=(3500, 2))

In [99]:
x_scaled.mean(), x_scaled.std()

(np.float64(1.7256037868459575e-17), np.float64(1.0))

In [100]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.2, random_state=42)


## LogisticRegression

In [101]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=100000000000)
lr.fit(x_train, y_train)


## LogisticRegression 성능 평가

In [150]:
from sklearn.metrics import accuracy_score, classification_report
# 기본값

lr_pred = lr.predict(x_test)
lr_acc = accuracy_score(y_test, lr_pred)
lr_acc

0.6057142857142858

In [151]:
print(classification_report(y_test, lr_pred))

              precision    recall  f1-score   support

           0       0.61      0.99      0.75       427
           1       0.33      0.01      0.02       273

    accuracy                           0.61       700
   macro avg       0.47      0.50      0.39       700
weighted avg       0.50      0.61      0.47       700



## DecisionTreeClassifier

In [104]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()

dtc.fit(x_train, y_train)
dtc_pred = dtc.predict(x_test)
dtc_acc = accuracy_score(y_test, dtc_pred)
dtc_acc

0.5557142857142857

## DecisionTreeClassifier 성능 평가 및 최적 파라미터 설정

In [105]:
# 기본값
print(classification_report(y_test, lr_pred))

              precision    recall  f1-score   support

           0       0.61      0.99      0.75       427
           1       0.33      0.01      0.02       273

    accuracy                           0.61       700
   macro avg       0.47      0.50      0.39       700
weighted avg       0.50      0.61      0.47       700



In [None]:
from sklearn.model_selection import GridSearchCV

params = {
    'max_depth': [0, 1, 2, 3, 5, 6, 7, None],
    'min_samples_leaf': [0, 1, 2, 3, 4, 5, 6, 7, 8, None],
    'max_leaf_nodes': [4, 5, 6, 7, 8, 9, None]
}

dtc_clf = DecisionTreeClassifier()
dtc_cv = GridSearchCV(dtc_clf, param_grid=params, cv=2, n_jobs=-1)
dtc_cv.fit(x_train, y_train)


In [149]:
dtc_cv.best_score_, dtc_cv.best_params_

(np.float64(0.6292857142857142),
 {'max_depth': 3, 'max_leaf_nodes': 7, 'min_samples_leaf': 1})

In [108]:
dtc = DecisionTreeClassifier(max_depth= 3, max_leaf_nodes= 5, min_samples_leaf= 1)

dtc.fit(x_train, y_train)
dtc_pred = dtc.predict(x_test)

print(classification_report(y_test, lr_pred))

              precision    recall  f1-score   support

           0       0.61      0.99      0.75       427
           1       0.33      0.01      0.02       273

    accuracy                           0.61       700
   macro avg       0.47      0.50      0.39       700
weighted avg       0.50      0.61      0.47       700



## RandomForestClassifier

In [109]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

rf.fit(x_train, y_train)
rf_pred = rf.predict(x_test)
accuracy_score(y_test, rf_pred)

0.5514285714285714

## RandomForestClassifier 성능 평가 및 최적 파라미터 설정

In [110]:
# 기본값
print(classification_report(y_test, lr_pred))

              precision    recall  f1-score   support

           0       0.61      0.99      0.75       427
           1       0.33      0.01      0.02       273

    accuracy                           0.61       700
   macro avg       0.47      0.50      0.39       700
weighted avg       0.50      0.61      0.47       700



In [111]:
params = {
    'n_estimators': [130, 135, 140, None],
    'max_depth': [7, 8, 9, 10, 11, 12, None],
    'min_samples_leaf': [3, 4, 5, 6, 7, 8, 9, 10, None],
    'max_leaf_nodes': [8, 9, 10, None]
}

rf_clf = RandomForestClassifier()
rf_cv = GridSearchCV(rf_clf, param_grid=params, n_jobs=-1)
rf_cv.fit(x_train, y_train)

In [148]:
rf_cv.best_score_, rf_cv.best_params_

(np.float64(0.635),
 {'max_depth': 10,
  'max_leaf_nodes': 8,
  'min_samples_leaf': 9,
  'n_estimators': 135})

In [113]:
rf = RandomForestClassifier()

rf.fit(x_train, y_train)
rf_pred = rf.predict(x_test)

print(classification_report(y_test, lr_pred))

              precision    recall  f1-score   support

           0       0.61      0.99      0.75       427
           1       0.33      0.01      0.02       273

    accuracy                           0.61       700
   macro avg       0.47      0.50      0.39       700
weighted avg       0.50      0.61      0.47       700



## XGBoost

In [114]:
from xgboost import XGBClassifier

evals = [(x_test, y_test)]
xgb = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3, use_label_encoder=False)
xgb.fit(x_train, y_train)
xgb_pred = xgb.predict(x_test)
accuracy_score(y_test, xgb_pred)


0.6057142857142858

## XGBoost 성능 평가 및 최적 파라미터 설정

In [115]:
# 기본 값
print(classification_report(y_test, lr_pred))

              precision    recall  f1-score   support

           0       0.61      0.99      0.75       427
           1       0.33      0.01      0.02       273

    accuracy                           0.61       700
   macro avg       0.47      0.50      0.39       700
weighted avg       0.50      0.61      0.47       700



In [146]:
params = {
    'n_estimators': [0, 10, 25, 50, 80, 90, None],
    'max_depth': [0, 1, 2, 3, 4, 5, 6, None],
    'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.2, None],
    'min_split_loss': [1, 2, 3, 4, 5, 6]
}

xgb_clf = XGBClassifier()
xgb_cv = GridSearchCV(xgb_clf, param_grid=params, n_jobs=-1)
xgb_cv.fit(x_train, y_train)

In [153]:
xgb_cv.best_score_, xgb_cv.best_params_

(np.float64(0.6371428571428571),
 {'learning_rate': 0.2,
  'max_depth': 1,
  'min_split_loss': 1,
  'n_estimators': 50})

In [155]:
li = {'XGBOOST': xgb_cv.best_score_, 'RandomForest':rf_cv.best_score_, 'DecisionTree':dtc_cv.best_score_, 'LogisticRegression':lr_acc}

print(f"최고 성능: {max(li.items())}")

최고 성능: ('XGBOOST', np.float64(0.6371428571428571))
