# 고객 구매 데이터로 성별 예측 모델링
백화점 고객(3,500명)의 1년 간 구매 데이터

In [162]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action='ignore')

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

from sklearn.metrics import accuracy_score, classification_report

In [163]:
x = pd.read_csv('./data1/X.csv', encoding='euc-kr')
y = pd.read_csv('./data1/y.csv', encoding='euc-kr')

In [164]:
# 결측치 확인
x.isnull()

Unnamed: 0,cust_id,총구매액,최대구매액,환불금액,주구매상품,주구매지점,내점일수,내점당구매건수,주말방문비율,구매주기
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,True,False,False,False,False,False,False
3,False,False,False,True,False,False,False,False,False,False
4,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
3495,False,False,False,True,False,False,False,False,False,False
3496,False,False,False,False,False,False,False,False,False,False
3497,False,False,False,True,False,False,False,False,False,False
3498,False,False,False,True,False,False,False,False,False,False


In [165]:
y.isnull()

Unnamed: 0,cust_id,gender
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
3495,False,False
3496,False,False
3497,False,False
3498,False,False


In [166]:
# x 에 환불 금액 NaN 은 환불이 없는 것 이기 때문에 0으로 변경

x['환불금액'] = x['환불금액'].fillna('0')
x

Unnamed: 0,cust_id,총구매액,최대구매액,환불금액,주구매상품,주구매지점,내점일수,내점당구매건수,주말방문비율,구매주기
0,0,68282840,11264000,6860000.0,기타,강남점,19,3.894737,0.527027,17
1,1,2136000,2136000,300000.0,스포츠,잠실점,2,1.500000,0.000000,1
2,2,3197000,1639000,0,남성 캐주얼,관악점,2,2.000000,0.000000,1
3,3,16077620,4935000,0,기타,광주점,18,2.444444,0.318182,16
4,4,29050000,24000000,0,보석,본 점,2,1.500000,0.000000,85
...,...,...,...,...,...,...,...,...,...,...
3495,3495,3175200,3042900,0,골프,본 점,1,2.000000,1.000000,0
3496,3496,29628600,7200000,6049600.0,시티웨어,부산본점,8,1.625000,0.461538,40
3497,3497,75000,75000,0,주방용품,창원점,1,1.000000,0.000000,0
3498,3498,1875000,1000000,0,화장품,본 점,2,1.000000,0.000000,39


In [167]:
y

Unnamed: 0,cust_id,gender
0,0,0
1,1,0
2,2,1
3,3,1
4,4,0
...,...,...
3495,3495,1
3496,3496,1
3497,3497,0
3498,3498,0


In [168]:
# cust_id 는 단순 인덱싱으로 볼 수 있기 때문에 제거

x = x.drop("cust_id", axis=1)
y = y.drop("cust_id", axis=1)

In [169]:
x

Unnamed: 0,총구매액,최대구매액,환불금액,주구매상품,주구매지점,내점일수,내점당구매건수,주말방문비율,구매주기
0,68282840,11264000,6860000.0,기타,강남점,19,3.894737,0.527027,17
1,2136000,2136000,300000.0,스포츠,잠실점,2,1.500000,0.000000,1
2,3197000,1639000,0,남성 캐주얼,관악점,2,2.000000,0.000000,1
3,16077620,4935000,0,기타,광주점,18,2.444444,0.318182,16
4,29050000,24000000,0,보석,본 점,2,1.500000,0.000000,85
...,...,...,...,...,...,...,...,...,...
3495,3175200,3042900,0,골프,본 점,1,2.000000,1.000000,0
3496,29628600,7200000,6049600.0,시티웨어,부산본점,8,1.625000,0.461538,40
3497,75000,75000,0,주방용품,창원점,1,1.000000,0.000000,0
3498,1875000,1000000,0,화장품,본 점,2,1.000000,0.000000,39


In [170]:
# 데이터 표준화를 위해 카테고리형 데이터를 숫자형으로 변환

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

x['주구매상품'] = le.fit_transform(x['주구매상품'])
x['주구매지점'] = le.fit_transform(x['주구매지점'])

In [171]:
merge = pd.concat([x, y], axis=1)

merge.corr()


Unnamed: 0,총구매액,최대구매액,환불금액,주구매상품,주구매지점,내점일수,내점당구매건수,주말방문비율,구매주기,gender
총구매액,1.0,0.70008,0.467686,-0.082916,-0.038724,0.659084,0.090022,0.014396,-0.212944,-0.150141
최대구매액,0.70008,1.0,0.429504,-0.090729,-0.024819,0.374147,0.01898,0.022277,-0.115837,-0.114323
환불금액,0.467686,0.429504,1.0,-0.056604,-0.045686,0.37757,-0.003871,-0.024707,-0.137362,-0.114327
주구매상품,-0.082916,-0.090729,-0.056604,1.0,0.015874,-0.185275,-0.274178,-0.010018,0.032469,-0.038668
주구매지점,-0.038724,-0.024819,-0.045686,0.015874,1.0,-0.05957,-0.080804,0.01199,0.035344,0.015876
내점일수,0.659084,0.374147,0.37757,-0.185275,-0.05957,1.0,0.225264,-0.010325,-0.2932,-0.155
내점당구매건수,0.090022,0.01898,-0.003871,-0.274178,-0.080804,0.225264,1.0,0.007659,-0.091151,-0.043917
주말방문비율,0.014396,0.022277,-0.024707,-0.010018,0.01199,-0.010325,0.007659,1.0,0.003372,0.073598
구매주기,-0.212944,-0.115837,-0.137362,0.032469,0.035344,-0.2932,-0.091151,0.003372,1.0,0.04145
gender,-0.150141,-0.114323,-0.114327,-0.038668,0.015876,-0.155,-0.043917,0.073598,0.04145,1.0


In [172]:
# x = x[["주말방문비율", "구매주기"]]
# x

In [173]:
# 데이터 표준화

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaler.fit(x, y)
x_scaled = scaler.transform(x)

In [174]:
x_scaled

array([[-0.14458009, -0.26260786, -0.04750476, ...,  0.55424743,
         0.75862274, -0.15996211],
       [-0.54918957, -0.54796686, -0.26546133, ..., -0.69816782,
        -1.06053002, -0.80655356],
       [-0.5426996 , -0.56350404, -0.27542885, ..., -0.43667453,
        -1.06053002, -0.80655356],
       ...,
       [-0.56179637, -0.61239772, -0.27542885, ..., -0.95966112,
        -1.06053002, -0.84696552],
       [-0.55078606, -0.58348042, -0.27542885, ..., -0.95966112,
        -1.06053002,  0.72910114],
       [ 1.04709431,  0.46792117, -0.07697541, ..., -0.21646965,
         0.55277658, -0.5236698 ]], shape=(3500, 9))

In [175]:
x_scaled.mean(), x_scaled.std()

(np.float64(-4.1504718533288394e-17), np.float64(1.0))

In [176]:
from sklearn.model_selection import train_test_split, cross_val_score

x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.2, random_state=42)


## LogisticRegression

In [177]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=100000000000)
lr.fit(x_train, y_train)


## LogisticRegression 성능 평가

In [178]:
from sklearn.metrics import accuracy_score, classification_report
# 기본값

lr_pred = lr.predict(x_test)
lr_acc = accuracy_score(y_test, lr_pred)
lr_acc

0.6171428571428571

In [179]:
print(classification_report(y_test, lr_pred))

              precision    recall  f1-score   support

           0       0.62      0.94      0.75       427
           1       0.55      0.11      0.18       273

    accuracy                           0.62       700
   macro avg       0.59      0.53      0.46       700
weighted avg       0.59      0.62      0.53       700



## DecisionTreeClassifier

In [180]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()

dtc.fit(x_train, y_train)
dtc_pred = dtc.predict(x_test)
dtc_acc = accuracy_score(y_test, dtc_pred)
dtc_acc

0.5557142857142857

## DecisionTreeClassifier 성능 평가 및 최적 파라미터 설정

In [181]:
# 기본값
print(classification_report(y_test, lr_pred))

              precision    recall  f1-score   support

           0       0.62      0.94      0.75       427
           1       0.55      0.11      0.18       273

    accuracy                           0.62       700
   macro avg       0.59      0.53      0.46       700
weighted avg       0.59      0.62      0.53       700



In [182]:
from sklearn.model_selection import GridSearchCV

params = {
    'max_depth': [0, 1, 2, 3, 5, 6, 7, None],
    'min_samples_leaf': [0, 1, 2, 3, 4, 5, 6, 7, 8, None],
    'max_leaf_nodes': [4, 5, 6, 7, 8, 9, None]
}

dtc_clf = DecisionTreeClassifier()
dtc_cv = GridSearchCV(dtc_clf, param_grid=params, cv=2, n_jobs=-1)
dtc_cv.fit(x_train, y_train)


In [183]:
dtc_cv.best_score_, dtc_cv.best_params_

(np.float64(0.6485714285714286),
 {'max_depth': 3, 'max_leaf_nodes': 5, 'min_samples_leaf': 1})

In [184]:
dtc = DecisionTreeClassifier(max_depth= 3, max_leaf_nodes= 5, min_samples_leaf= 1)

dtc.fit(x_train, y_train)
dtc_pred = dtc.predict(x_test)

print(classification_report(y_test, lr_pred))

              precision    recall  f1-score   support

           0       0.62      0.94      0.75       427
           1       0.55      0.11      0.18       273

    accuracy                           0.62       700
   macro avg       0.59      0.53      0.46       700
weighted avg       0.59      0.62      0.53       700



## RandomForestClassifier

In [185]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

rf.fit(x_train, y_train)
rf_pred = rf.predict(x_test)
accuracy_score(y_test, rf_pred)

0.6214285714285714

## RandomForestClassifier 성능 평가 및 최적 파라미터 설정

In [186]:
# 기본값
print(classification_report(y_test, lr_pred))

              precision    recall  f1-score   support

           0       0.62      0.94      0.75       427
           1       0.55      0.11      0.18       273

    accuracy                           0.62       700
   macro avg       0.59      0.53      0.46       700
weighted avg       0.59      0.62      0.53       700



In [187]:
params = {
    'n_estimators': [130, 135, 140, None],
    'max_depth': [7, 8, 9, 10, 11, 12, None],
    'min_samples_leaf': [3, 4, 5, 6, 7, 8, 9, 10, None],
    'max_leaf_nodes': [8, 9, 10, None]
}

rf_clf = RandomForestClassifier()
rf_cv = GridSearchCV(rf_clf, param_grid=params, n_jobs=-1)
rf_cv.fit(x_train, y_train)

KeyboardInterrupt: 

In [None]:
rf_cv.best_score_, rf_cv.best_params_

(np.float64(0.635),
 {'max_depth': 10,
  'max_leaf_nodes': 8,
  'min_samples_leaf': 9,
  'n_estimators': 135})

In [None]:
rf = RandomForestClassifier()

rf.fit(x_train, y_train)
rf_pred = rf.predict(x_test)

print(classification_report(y_test, lr_pred))

              precision    recall  f1-score   support

           0       0.61      0.99      0.75       427
           1       0.33      0.01      0.02       273

    accuracy                           0.61       700
   macro avg       0.47      0.50      0.39       700
weighted avg       0.50      0.61      0.47       700



## XGBoost

In [None]:
from xgboost import XGBClassifier

evals = [(x_test, y_test)]
xgb = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3, use_label_encoder=False)
xgb.fit(x_train, y_train)
xgb_pred = xgb.predict(x_test)
accuracy_score(y_test, xgb_pred)


0.6057142857142858

## XGBoost 성능 평가 및 최적 파라미터 설정

In [None]:
# 기본 값
print(classification_report(y_test, lr_pred))

              precision    recall  f1-score   support

           0       0.61      0.99      0.75       427
           1       0.33      0.01      0.02       273

    accuracy                           0.61       700
   macro avg       0.47      0.50      0.39       700
weighted avg       0.50      0.61      0.47       700



In [None]:
params = {
    'n_estimators': [0, 10, 25, 50, 80, 90, None],
    'max_depth': [0, 1, 2, 3, 4, 5, 6, None],
    'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.2, None],
    'min_split_loss': [1, 2, 3, 4, 5, 6]
}

xgb_clf = XGBClassifier()
xgb_cv = GridSearchCV(xgb_clf, param_grid=params, n_jobs=-1)
xgb_cv.fit(x_train, y_train)

In [None]:
xgb_cv.best_score_, xgb_cv.best_params_

(np.float64(0.6371428571428571),
 {'learning_rate': 0.2,
  'max_depth': 1,
  'min_split_loss': 1,
  'n_estimators': 50})

In [None]:
li = {'XGBOOST': xgb_cv.best_score_, 'RandomForest':rf_cv.best_score_, 'DecisionTree':dtc_cv.best_score_, 'LogisticRegression':lr_acc}

print(f"최고 성능: {max(li.items())}")

최고 성능: ('XGBOOST', np.float64(0.6371428571428571))


---------------------

In [188]:
def objective_lr(params):
    model = LogisticRegression(
        C=params['C'],
        penalty=params['penalty'],
        solver=params['solver'],
        max_iter=1000,
        random_state=42
    )
    
    # 교차검증 점수 계산
    scores = cross_val_score(model, x_train, y_train, cv=5, scoring='accuracy')
    accuracy = scores.mean()
    
    # hyperopt는 최소화를 목표로 하므로 음수 반환
    return {'loss': -accuracy, 'status': STATUS_OK}

space_lr = {
    'C': hp.loguniform('C', np.log(0.01), np.log(100)),
    'penalty': hp.choice('penalty', ['l1', 'l2']),
    'solver': hp.choice('solver', ['liblinear', 'saga'])
}

In [189]:
trials = Trials()

best = fmin(
    fn=objective_lr,
    space=space_lr,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,  # trials 객체 전달
    verbose=False
)

In [190]:
penalty_list = ['l1', 'l2']
solver_list = ['liblinear', 'saga']

best_model = LogisticRegression(
    C=best['C'],
    penalty=penalty_list[best['penalty']],
    solver=solver_list[best['solver']],
    max_iter=1000,
    random_state=42
)

# 모델 학습 및 평가
best_model.fit(x_train, y_train)
y_pred = best_model.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred)

# 최적 CV 점수
best_cv_score_lr = -trials.best_trial['result']['loss']


In [191]:
best_cv_score_lr

0.6389285714285714

In [200]:
def objective_xgb(params):
    model = XGBClassifier(
        n_estimators=int(params['n_estimators']),
        max_depth=int(params['max_depth']),
        learning_rate=params['learning_rate'],
        subsample=params['subsample'],
        # colsample_bytree=params['colsample_bytree'],
        random_state=42,
        eval_metric='logloss'
    )
    
    # 교차검증 점수 계산
    scores = cross_val_score(model, x_train, y_train, cv=5, scoring='accuracy')
    accuracy = scores.mean()
    
    # hyperopt는 최소화를 목표로 하므로 음수 반환
    return {'loss': -accuracy, 'status': STATUS_OK}

# XGBClassifier 탐색 공간 정의
space_xgb = {
    'n_estimators': hp.quniform('n_estimators', 50, 300, 50),
    'max_depth': hp.quniform('max_depth', 3, 10, 1),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'subsample': hp.uniform('subsample', 0.6, 1.0)
    # 'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0)
}

In [201]:
best = fmin(
    fn=objective_xgb,
    space=space_xgb,
    algo=tpe.suggest,
    max_evals=100,  # 최대 평가 횟수
    trials=trials,
    verbose=True
)

# 최적 파라미터로 모델 학습
best_model = XGBClassifier(
    n_estimators=int(best['n_estimators']),
    max_depth=int(best['max_depth']),
    learning_rate=best['learning_rate'],
    subsample=best['subsample'],
    colsample_bytree=best['colsample_bytree'],
    random_state=42,
    eval_metric='logloss'
)

# 모델 학습 및 평가
best_model.fit(x_train, y_train)
y_pred = best_model.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred)

# 최적 CV 점수
best_cv_xgb = -trials.best_trial['result']['loss']

 50%|█████     | 50/100 [00:00<?, ?trial/s, best loss=?]


KeyError: 'learning_rate'