In [1]:
# 필요한 라이브러리 불러오기
import os

import pandas as pd
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import KFold

In [4]:
# Load the customer CSV (UTF-8 encoded) into a DataFrame.
data = pd.read_csv(
    '../data/현대_고객_데이터_완성.csv',
    encoding='utf-8',
)

In [5]:
# Bare expression: renders the DataFrame as this cell's output.
# NOTE(review): the rendered rows include names, phone numbers, e-mail
# addresses and street addresses — consider showing a smaller,
# non-identifying view or clearing the output before sharing.
data

Unnamed: 0,이름,생년월일,연령,성별,휴대폰번호,이메일,주소,아이디,가입일,차량구분,...,친환경차,제품 구매 날짜,거래 금액,거래 방식,제품 구매 빈도,제품 구매 경로,제품 출시년월,지역,고객 세그먼트,Cluster
0,Robert Brown,1971-10-28,54,남,4332181960,robinsonwilliam@hoffman.net,"402 Peterson Drives Apt. 511, Davisstad, KS 06196",yherrera,2022-07-31,준대형 SUV,...,False,2024-12-04,21100000,신용카드,2,온라인,2020-09,미국,2,5
1,Michael Wilson,1984-03-12,41,남,+1-959-310-3413x1647,mitchellclark@yahoo.com,"283 Steven Groves, Lake Mark, WI 07832",lydiatrujillo,2023-12-14,준중형 SUV,...,True,2024-07-08,80410000,현금,1,오프라인,2023-01,미국,1,1
2,Robert Johnson,1968-10-01,57,남,953.767.2423,zhurst@yahoo.com,"710 Eric Estate, Carlsonfurt, PA 61849",tasha01,2025-02-22,준중형 세단,...,False,2025-03-20,63160000,현금,2,온라인,2021-04,미국,1,2
3,David Moore,1982-05-27,43,남,001-514-627-0482x81489,nathanielmartin@sellers.com,"USNV Allen, FPO AE 74865",millertodd,2022-07-16,픽업트럭,...,True,2024-10-20,88060000,신용카드,1,오프라인,2022-05,미국,1,1
4,David Wilson,1972-10-20,53,여,(782)489-6383,pcarney@yahoo.com,"33150 Brianna Avenue Apt. 031, Port Markhaven,...",ilewis,2024-09-03,중형 SUV,...,False,2025-03-05,26010000,신용카드,2,온라인,2020-09,미국,2,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1887,신시우,1958-07-11,67,여,422076955,gyeonghyi78@gmail.com,전라북도 전주시 완산구 덕진동 33-7,jihungim,2023-03-06,준중형 세단,...,False,2024-12-20,86770000,카드,2,온라인,2020-03,대한민국,1,2
1888,최춘자,1976-06-27,49,여,311707739,hancunja@yu.kr,충청남도 천안시 동남구 성성동 22-3,jinho50,2024-08-26,준대형 세단,...,False,2023-08-24,69510000,현금,3,오프라인,2020-05,대한민국,3,3
1889,차서준,1968-01-22,57,여,647413740,qgim@simryujo.com,경상남도 창원시 의창구 봉곡동 48-6,jihun41,2023-01-02,준대형 세단,...,True,2024-01-12,39890000,현금,2,온라인,2018-01,대한민국,2,5
1890,안정희,1984-01-10,41,남,514061803,donghyeonbae@joyunson.com,대구광역시 수성구 범어동 55-3,yeongil47,2023-04-08,소형 해치백,...,False,2024-01-30,44530000,현금,2,온라인,2020-03,대한민국,2,5


In [6]:
# Split the feature columns into categorical and numerical groups.
# NOTE(review): '제품 구매 날짜' and '제품 출시년월' hold date-like values in the
# displayed data; treating them as categorical means one encoded column per
# distinct value — confirm this is intended.
categorical_cols = [
    '성별',
    '차량구분',
    '거래 방식',
    '제품 출시년월',
    '제품 구매 날짜',
    '고객 세그먼트',
    '친환경차',
]
numerical_cols = [
    '연령',
    '거래 금액',
    '제품 구매 빈도',
]

In [7]:
# Feature matrix: the categorical columns followed by the numerical columns.
X = data[[*categorical_cols, *numerical_cols]]
# Target: the 'Cluster' column.
y = data['Cluster']

In [8]:
# Per-type preprocessing steps used by the pipelines.
# Categorical columns: one-hot encode; handle_unknown='ignore' keeps transform
# from raising on categories that were not present at fit time.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
# Numerical columns: standardize.
numeric_transformer = StandardScaler()

In [9]:
# Route each column group to its transformer: numerical columns go to the
# scaler, categorical columns go to the one-hot encoder.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [10]:
# Model 1: Random Forest
# The pipeline gets its own clone of the preprocessor: Pipeline.fit fits its
# steps in place, so sharing one ColumnTransformer instance across several
# pipelines lets a later fit silently overwrite the state of an earlier one.
# A fixed random_state makes the reported metrics reproducible across runs
# (42 matches the seed used for train_test_split).
rf_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [11]:
# Model 2: Gradient Boosting
# The pipeline gets its own clone of the preprocessor: Pipeline.fit fits its
# steps in place, so sharing one ColumnTransformer instance across several
# pipelines lets a later fit silently overwrite the state of an earlier one.
# This matters here because this pipeline is the one persisted to disk.
# A fixed random_state makes the reported metrics reproducible across runs
# (42 matches the seed used for train_test_split).
gb_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', GradientBoostingClassifier(random_state=42))
])

In [12]:
# Model 3: SVM
# The pipeline gets its own clone of the preprocessor: Pipeline.fit fits its
# steps in place, so sharing one ColumnTransformer instance across several
# pipelines lets a later fit silently overwrite the state of an earlier one.
svm_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', SVC())
])

In [13]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible.
# NOTE(review): the split is not stratified on y — consider stratify=y if
# per-class proportions should match between train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Train and evaluate: Random Forest
# Fit on the training split, then predict the held-out test split
# (Pipeline.fit returns the pipeline itself, so the calls can be chained).
y_pred_rf = rf_pipeline.fit(X_train, y_train).predict(X_test)

# Report overall accuracy and the per-class precision/recall/F1 table.
print("Random Forest:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.3f}")
print("Classification Report:\n", classification_report(y_test, y_pred_rf))
print("\n")

Random Forest:
Accuracy: 0.984
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        61
           1       0.99      0.96      0.97        78
           2       0.95      0.97      0.96        63
           3       0.98      1.00      0.99        50
           4       1.00      0.97      0.99        40
           5       0.99      1.00      0.99        87

    accuracy                           0.98       379
   macro avg       0.98      0.98      0.98       379
weighted avg       0.98      0.98      0.98       379





In [15]:
# Train and evaluate: Gradient Boosting
# Fit on the training split, then predict the held-out test split
# (Pipeline.fit returns the pipeline itself, so the calls can be chained).
y_pred_gb = gb_pipeline.fit(X_train, y_train).predict(X_test)

# Report overall accuracy and the per-class precision/recall/F1 table.
print("Gradient Boosting:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_gb):.3f}")
print("Classification Report:\n", classification_report(y_test, y_pred_gb))
print("\n")

Gradient Boosting:
Accuracy: 0.992
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        61
           1       0.99      0.99      0.99        78
           2       0.98      0.98      0.98        63
           3       1.00      1.00      1.00        50
           4       1.00      1.00      1.00        40
           5       0.99      0.99      0.99        87

    accuracy                           0.99       379
   macro avg       0.99      0.99      0.99       379
weighted avg       0.99      0.99      0.99       379





In [16]:
# Train and evaluate: SVM
# Fit on the training split, then predict the held-out test split
# (Pipeline.fit returns the pipeline itself, so the calls can be chained).
y_pred_svm = svm_pipeline.fit(X_train, y_train).predict(X_test)

# Report overall accuracy and the per-class precision/recall/F1 table.
print("SVM:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_svm):.3f}")
print("Classification Report:\n", classification_report(y_test, y_pred_svm))

SVM:
Accuracy: 0.984
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        61
           1       0.97      0.97      0.97        78
           2       0.97      0.95      0.96        63
           3       0.98      1.00      0.99        50
           4       1.00      0.97      0.99        40
           5       0.99      1.00      0.99        87

    accuracy                           0.98       379
   macro avg       0.99      0.98      0.98       379
weighted avg       0.98      0.98      0.98       379



In [17]:
import joblib

In [18]:
# Persist the Gradient Boosting pipeline (preprocessor + classifier) to disk.
# Create the target directory first so the dump does not fail with
# FileNotFoundError when 'model/' does not exist yet (e.g. a fresh checkout).
os.makedirs('model', exist_ok=True)
# Kept as the last expression so the cell still displays the written path(s).
joblib.dump(gb_pipeline, 'model/gb_pipeline_c.pkl')

['model/gb_pipeline_c.pkl']