In [None]:
# ==========================================
# STEP 0: Import libraries
# ==========================================
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report


# ==========================================
# STEP 1: Load & Preprocess Data
# ==========================================
# Load the data set from a pinned commit so the run is reproducible.

url = "https://raw.githubusercontent.com/pakornlee/ml_example/e4fe04e97b387f17aaabb53709391f4c364a06c0/customer_data_100.csv"
df = pd.read_csv(url)

# Ordinal encoding: the education levels have a natural order, so they are
# mapped to increasing integers instead of being one-hot encoded.
education_map = {
    "HighSchool": 1,
    "Bachelor": 2,
    "Master": 3,
    "PhD": 4
}
raw_education = df["education_level"]
encoded_education = raw_education.map(education_map)

# Series.map() turns every value that is missing from the mapping into NaN
# without any warning. Fail loudly here, naming the offending labels, instead
# of letting NaN leak into the feature matrix. Values that were already
# missing in the raw column are not flagged and pass through unchanged.
unmapped_levels = raw_education[
    encoded_education.isna() & raw_education.notna()
].unique()
if unmapped_levels.size:
    raise ValueError(
        "Unmapped education_level values: "
        f"{sorted(map(str, unmapped_levels))}"
    )
df["education_level"] = encoded_education

# One-hot encoding for the nominal (unordered) columns. drop_first=True drops
# one dummy column per feature to avoid a redundant, perfectly collinear column.
df = pd.get_dummies(
    df,
    columns=["job_type", "city"],
    drop_first=True
)

# Separate features (X) from the target (y).
X = df.drop("buy_product", axis=1)
y = df["buy_product"]

print("Data shape:", X.shape)


# ==========================================
# STEP 2: Split Evaluation Set (10%)
# ==========================================
# Hold out 10% of the rows for the final evaluation. The split is stratified
# on y so both parts keep the class proportions, and seeded so it is
# reproducible.
X_cv, X_eval, y_cv, y_eval = train_test_split(
    X,
    y,
    test_size=0.10,      # 10% evaluation set
    random_state=42,
    stratify=y
)

print("CV set size:", X_cv.shape)
print("Evaluation set size:", X_eval.shape)


# ==========================================
# STEP 3: 10-Fold Cross Validation on 90%
# ==========================================
# The scaler lives inside the pipeline so that, during cross validation, it is
# fitted on the training part of each fold only and never sees the validation
# part of that fold.
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier())
])

# Hyper-parameter grid; the "knn__" prefix routes each entry to the pipeline
# step named "knn".
param_grid = {
    "knn__n_neighbors": [3, 5, 7, 9, 11, 13, 15],
    "knn__weights": ["uniform", "distance"],
    "knn__metric": ["euclidean", "manhattan"]
}

grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=10,                 # 10-fold CV
    scoring="accuracy",
    n_jobs=-1              # use all available CPU cores
)

grid.fit(X_cv, y_cv)

print("\nBest parameters from 10-fold CV:")
print(grid.best_params_)

print("Best CV accuracy:")
print(grid.best_score_)


# ==========================================
# STEP 4: Final Evaluation on 10% Hold-out Set
# ==========================================
# With GridSearchCV's default refit behaviour, best_estimator_ is the pipeline
# with the best parameters refitted on the whole CV set (X_cv, y_cv).
best_model = grid.best_estimator_

# The hold-out rows were not used during the grid search above.
y_eval_pred = best_model.predict(X_eval)

print("\nEvaluation Set Accuracy:")
print(accuracy_score(y_eval, y_eval_pred))

print("\nClassification Report (Evaluation Set):")
print(classification_report(y_eval, y_eval_pred))


Data shape: (100, 6)
CV set size: (90, 6)
Evaluation set size: (10, 6)

Best parameters from 10-fold CV:
{'knn__metric': 'manhattan', 'knn__n_neighbors': 7, 'knn__weights': 'uniform'}
Best CV accuracy:
0.9555555555555555

Evaluation Set Accuracy:
1.0

Classification Report (Evaluation Set):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00         4

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

