In [1]:
# Load the required libraries
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import KFold

In [2]:
# Load the customer CSV (UTF-8 encoded) into a DataFrame.
data = pd.read_csv(
    '../data/현대_고객데이터_신규입력용.csv',
    encoding='utf-8',
)

In [3]:
# Column groups used for the data preview.
# 'Cluster' is appended to the numeric group here so that it is included
# whenever both lists are concatenated for display.
numerical_cols = ['연령', '거래 금액', '제품구매빈도'] + ['Cluster']
categorical_cols = [
    '성별',
    '차량구분',
    '거래방식',
    '제품출시년월',
    '제품 구매 날짜',
    '고객 세그먼트',
    '친환경차',
]

In [5]:
# Preview the first rows, categorical columns first and numerical columns after.
data[[*categorical_cols, *numerical_cols]].head()

Unnamed: 0,성별,차량구분,거래방식,제품출시년월,제품 구매 날짜,고객 세그먼트,친환경차,연령,거래 금액,제품구매빈도,Cluster
0,남,준대형 SUV,신용카드,2020-09,2024-12-04,2,False,54,21100000,2,5
1,남,준중형 SUV,현금,2023-01,2024-07-08,1,True,41,80410000,1,1
2,남,준중형 세단,현금,2021-04,2025-03-20,1,False,57,63160000,2,2
3,남,픽업트럭,신용카드,2022-05,2024-10-20,1,True,43,88060000,1,1
4,여,중형 SUV,신용카드,2020-09,2025-03-05,2,False,53,26010000,2,5


In [6]:
# Separate the model features into categorical and numerical columns.
# The names must match the DataFrame's column labels exactly: the dataset uses
# '거래방식', '제품출시년월' and '제품구매빈도' without inner spaces, so the spaced
# variants would raise a KeyError when the feature frame is selected.
# 'Cluster' is deliberately left out because it is the prediction target.
# NOTE(review): '제품출시년월' and '제품 구매 날짜' hold date-like strings and are
# treated as plain categories (one level per distinct date) — confirm this is intended.
categorical_cols = ['성별', '차량구분', '거래방식', '제품출시년월', '제품 구매 날짜', '고객 세그먼트', '친환경차']
numerical_cols = ['연령', '거래 금액', '제품구매빈도']

In [7]:
# Target: the 'Cluster' label. Features: the categorical columns followed by
# the numerical columns.
y = data['Cluster']
X = data[[*categorical_cols, *numerical_cols]]

In [8]:
# Building blocks for the preprocessing pipeline.
# One-hot encode categories; values not seen during fit encode as all zeros
# instead of raising an error.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
# Standardize numeric features (remove the mean, scale to unit variance).
numeric_transformer = StandardScaler()

In [9]:
# Route each column group to its transformer: scaling for the numeric columns,
# one-hot encoding for the categorical ones. Columns in neither list are
# dropped (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [10]:
# Model 1: Random Forest
# random_state is fixed (same seed as the train/test split) so that repeated
# runs build the same forest and report the same metrics.
# NOTE: `preprocessor` is the same object handed to the other pipelines;
# Pipeline does not copy its steps, so fitting any of them refits this shared instance.
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [11]:
# Model 2: Gradient Boosting
# random_state is fixed (same seed as the train/test split) so that repeated
# runs train the same model; this matters because this pipeline is the one
# that gets persisted to disk.
# NOTE: `preprocessor` is the same object handed to the other pipelines;
# Pipeline does not copy its steps, so fitting any of them refits this shared instance.
gb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=42))
])

In [12]:
# Model 3: support vector classifier (SVC with its default settings),
# fed by the shared preprocessing step.
svm_pipeline = Pipeline([('preprocessor', preprocessor), ('classifier', SVC())])

In [13]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Train and evaluate the models
# Random Forest: fit on the training split, then score on the held-out split.
rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_pipeline.predict(X_test)
print("Random Forest:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.3f}")
# The report is printed by its own call: passing it as a second print()
# argument inserts a separator space in front of the header row, which shifts
# the column headers one character out of line with the values.
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))
print("\n")

Random Forest:
Accuracy: 0.984
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        61
           1       0.99      0.96      0.97        78
           2       0.95      0.97      0.96        63
           3       0.98      1.00      0.99        50
           4       1.00      0.97      0.99        40
           5       0.99      1.00      0.99        87

    accuracy                           0.98       379
   macro avg       0.98      0.98      0.98       379
weighted avg       0.98      0.98      0.98       379





In [15]:
# Gradient Boosting: fit on the training split, then score on the held-out split.
gb_pipeline.fit(X_train, y_train)
y_pred_gb = gb_pipeline.predict(X_test)
print("Gradient Boosting:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_gb):.3f}")
# The report is printed by its own call: passing it as a second print()
# argument inserts a separator space in front of the header row, which shifts
# the column headers one character out of line with the values.
print("Classification Report:")
print(classification_report(y_test, y_pred_gb))
print("\n")

Gradient Boosting:
Accuracy: 0.992
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        61
           1       0.99      0.99      0.99        78
           2       0.98      0.98      0.98        63
           3       1.00      1.00      1.00        50
           4       1.00      1.00      1.00        40
           5       0.99      0.99      0.99        87

    accuracy                           0.99       379
   macro avg       0.99      0.99      0.99       379
weighted avg       0.99      0.99      0.99       379





In [16]:
# SVM: fit on the training split, then score on the held-out split.
svm_pipeline.fit(X_train, y_train)
y_pred_svm = svm_pipeline.predict(X_test)
print("SVM:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_svm):.3f}")
# The report is printed by its own call: passing it as a second print()
# argument inserts a separator space in front of the header row, which shifts
# the column headers one character out of line with the values.
print("Classification Report:")
print(classification_report(y_test, y_pred_svm))

SVM:
Accuracy: 0.984
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        61
           1       0.97      0.97      0.97        78
           2       0.97      0.95      0.96        63
           3       0.98      1.00      0.99        50
           4       1.00      0.97      0.99        40
           5       0.99      1.00      0.99        87

    accuracy                           0.98       379
   macro avg       0.99      0.98      0.98       379
weighted avg       0.98      0.98      0.98       379



In [17]:
import joblib

In [18]:
# Persist the fitted Gradient Boosting pipeline (preprocessing + classifier).
# The target directory is created first, because joblib.dump fails with
# FileNotFoundError when the parent directory does not exist.
# NOTE(review): the data is read from '../data' while the model is written
# under 'model/' relative to the working directory — confirm this is intended.
model_path = 'model/gb_pipeline_c.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(gb_pipeline, model_path)

['model/gb_pipeline_c.pkl']