In [1]:
# 필요한 라이브러리 불러오기
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC

In [2]:
# Load the customer CSV; the path is relative to the notebook's working directory.
data = pd.read_csv(
    "../data/기아_고객_데이터_완성.csv",
    encoding="utf-8",
)

In [3]:
# Preview the loaded frame (the rendered output elides the middle rows).
# NOTE(review): the rendered output includes phone, e-mail and address columns;
# consider clearing it or dropping those columns before sharing the notebook.
data

Unnamed: 0,생년월일,연령,성별,휴대폰번호,이메일,주소,아이디,가입일,차량구분,구매한 제품,친환경차,제품 구매 날짜,거래 금액,거래 방식,제품 구매 빈도,제품 구매 경로,제품 출시년월,지역,고객 세그먼트,Cluster
0,1987-05-25,38,남,+1-983-243-5951,jeremyberger@newton-watson.com,인천광역시 부평구 부평대로 88,user762756,2023-09-21,픽업트럭,K3,False,2023-10-13,55987533,현금,6,온라인,2020-05,미국,0,3
1,1979-11-24,46,여,001-718-884-2716,robert55@hotmail.com,경기도 고양시 일산동구 중앙로 1036,user787282,2025-01-12,준대형 SUV,Pegas,False,2025-01-30,26135305,계좌이체,1,온라인,2020-12,미국,3,1
2,1984-08-25,41,남,(532)836-9308,garciaelizabeth@burns.net,인천광역시 부평구 부평대로 88,user293846,2024-12-17,준대형 세단,Optima / K5,False,2025-01-05,20425328,현금,3,온라인,2021-01,미국,2,0
3,1971-08-07,54,여,640.695.8051,phillipstroy@yahoo.com,전라북도 군산시 백릉로 300,user935434,2024-01-17,중형 세단,Accent (HC),False,2024-02-14,28976142,신용카드,6,오프라인,2020-06,미국,0,3
4,1991-12-26,34,여,+1-023-521-7921x18289,anthonygraves@smith.com,세종특별자치시 나성북로 21,user384834,2024-12-07,준중형 SUV,EV6,True,2024-12-29,53143909,계좌이체,1,온라인,2019-03,미국,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1895,1972-07-18,53,남,010-7863-2944,user5781@example.com,서울특별시 강남구 테헤란로 123,user452994,2021-09-12,준중형 해치백,Carnival,False,2021-10-09,25020000,현금,2,오프라인,2020-08,대한민국,2,0
1896,1965-07-21,60,여,010-5926-8982,user1371@example.com,대구광역시 수성구 범어동 789,user333338,2015-09-20,준중형 해치백,K4,False,2015-10-12,44660000,신용카드,3,오프라인,2014-04,대한민국,0,3
1897,1986-04-28,39,여,010-3164-6681,user7652@example.com,서울특별시 강남구 테헤란로 123,user292930,2020-06-10,대형 세단,Optima / K5,False,2020-07-04,24570000,신용카드,1,오프라인,2019-12,대한민국,2,2
1898,1977-12-20,48,여,010-1747-7304,user9677@example.com,경기도 성남시 분당구 정자동 202,user973159,2015-09-22,준중형 SUV,KX3,False,2015-10-09,16210000,현금,2,오프라인,2015-03,대한민국,2,2


In [4]:
# Split the model inputs into categorical and numerical feature groups.
# NOTE(review): '제품 구매 날짜' and '제품 출시년월' are date-like columns that get
# one-hot encoded as categories — confirm this is intended rather than deriving
# numeric date features.
# NOTE(review): every model below reports 1.000 accuracy; verify that none of
# these features (e.g. '고객 세그먼트') leaks how 'Cluster' was produced.
categorical_cols = [
    "성별",
    "차량구분",
    "거래 방식",
    "제품 출시년월",
    "제품 구매 날짜",
    "고객 세그먼트",
    "친환경차",
]
numerical_cols = [
    "연령",
    "거래 금액",
    "제품 구매 빈도",
]

In [5]:
# Build the feature matrix (categorical columns first, then numerical) and
# take the 'Cluster' column as the classification target.
X = data[[*categorical_cols, *numerical_cols]]
y = data["Cluster"]

In [6]:
# Preprocessing building blocks for the pipelines.
# Numerical features are standardized.
numeric_transformer = StandardScaler()
# Categorical features are one-hot encoded; handle_unknown="ignore" keeps
# transform from raising on categories that were not seen during fit.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

In [7]:
# Route each column group to its transformer: scaling for the numerical
# columns, one-hot encoding for the categorical columns.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols),
])

In [8]:
# Model 1: Random Forest
# The pipeline gets its own clone of the preprocessor. Pipeline stores the step
# object as given and fits it in place, so sharing a single ColumnTransformer
# instance across pipelines would let a later fit of another pipeline overwrite
# the fitted state of this one (which is the pipeline that gets pickled).
rf_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    # Fixed seed so reruns build the same forest (same value as the split seed).
    ('classifier', RandomForestClassifier(random_state=545))
])

In [9]:
# Model 2: Gradient Boosting
# Use an independent clone of the preprocessor so fitting this pipeline does not
# refit the ColumnTransformer object held by any other pipeline (Pipeline keeps
# the step object as given and fits it in place).
gb_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    # Fixed seed so reruns produce the same boosted model (same value as the split seed).
    ('classifier', GradientBoostingClassifier(random_state=545))
])

In [10]:
# Model 3: SVM
# Use an independent clone of the preprocessor so fitting this pipeline does not
# refit the ColumnTransformer object held by any other pipeline (Pipeline keeps
# the step object as given and fits it in place).
svm_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', SVC())
])

In [13]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=545,
)

In [26]:
# Model training and evaluation
# Random Forest: fit on the training split, predict the held-out split, then
# print accuracy and the per-class report.
y_pred_rf = rf_pipeline.fit(X_train, y_train).predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)

print("Random Forest:")
print(f"Accuracy: {rf_accuracy:.3f}")
print("Classification Report:\n", rf_report)
print("\n")

Random Forest:
Accuracy: 1.000
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       1.00      1.00      1.00        71
           2       1.00      1.00      1.00        72
           3       1.00      1.00      1.00        79
           4       1.00      1.00      1.00        61
           5       1.00      1.00      1.00        54

    accuracy                           1.00       380
   macro avg       1.00      1.00      1.00       380
weighted avg       1.00      1.00      1.00       380





In [27]:
# Gradient Boosting: fit on the training split, predict the held-out split,
# then print accuracy and the per-class report.
y_pred_gb = gb_pipeline.fit(X_train, y_train).predict(X_test)
gb_accuracy = accuracy_score(y_test, y_pred_gb)
gb_report = classification_report(y_test, y_pred_gb)

print("Gradient Boosting:")
print(f"Accuracy: {gb_accuracy:.3f}")
print("Classification Report:\n", gb_report)
print("\n")

Gradient Boosting:
Accuracy: 1.000
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       1.00      1.00      1.00        71
           2       1.00      1.00      1.00        72
           3       1.00      1.00      1.00        79
           4       1.00      1.00      1.00        61
           5       1.00      1.00      1.00        54

    accuracy                           1.00       380
   macro avg       1.00      1.00      1.00       380
weighted avg       1.00      1.00      1.00       380





In [28]:
# SVM: fit on the training split, predict the held-out split, then print
# accuracy and the per-class report.
y_pred_svm = svm_pipeline.fit(X_train, y_train).predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_report = classification_report(y_test, y_pred_svm)

print("SVM:")
print(f"Accuracy: {svm_accuracy:.3f}")
print("Classification Report:\n", svm_report)

SVM:
Accuracy: 1.000
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       1.00      1.00      1.00        71
           2       1.00      1.00      1.00        72
           3       1.00      1.00      1.00        79
           4       1.00      1.00      1.00        61
           5       1.00      1.00      1.00        54

    accuracy                           1.00       380
   macro avg       1.00      1.00      1.00       380
weighted avg       1.00      1.00      1.00       380



In [29]:
import joblib

In [30]:
# Persist the fitted Random Forest pipeline (preprocessing + classifier) to disk.
joblib.dump(value=rf_pipeline, filename="rf_pipeline_kc.pkl")

['rf_pipeline_kc.pkl']