In [115]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


In [116]:
import os
os.chdir('../data')

In [117]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
submission_df = pd.read_csv('sample_submission.csv')

In [118]:
selected_features = [
    'Administrative',
    'Informational',
    'ProductRelated',
    'PageValues',
    'BounceRates',
    'ExitRates',
    'Month',
    'VisitorType',
    'Weekend',
    'TrafficType'
]


In [119]:
X = train_df[selected_features].copy()
y = train_df['Revenue']
X_test = test_df[selected_features].copy()

In [120]:
categorical_cols = ['Month', 'VisitorType', 'Weekend', 'TrafficType']
for col in categorical_cols:
    X[col] = X[col].astype(str)
    X_test[col] = X_test[col].astype(str)

numeric_cols = [col for col in selected_features if col not in categorical_cols]

建立columnTransformers

In [121]:
preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numeric_cols),
    
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]), categorical_cols)
])

建模&訓練

In [122]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "LightGBM": LGBMClassifier(random_state=42)
}

results = {}
best_auc = -1
best_model = None
best_name = ""

## 7. 分割訓練/驗證
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# 8. 訓練與評估模型
results = {}
best_model = None
best_auc = -1
best_name = ""

for name, model in models.items():
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    
    pipe.fit(X_train_raw, y_train)
    val_probs = pipe.predict_proba(X_val_raw)[:, 1]
    
    auc = roc_auc_score(y_val, val_probs)
    acc = accuracy_score(y_val, val_probs > 0.5)
    results[name] = {"AUC": auc, "Accuracy": acc}
    
    print(f"{name}: AUC = {auc:.4f}, Accuracy = {acc:.4f}")
    
    if auc > best_auc:
        best_auc = auc
        best_model = pipe
        best_name = name
 
print(f"\n Best Model: {best_name} (AUC = {best_auc:.4f})")

Logistic Regression: AUC = 0.8686, Accuracy = 0.8778
Random Forest: AUC = 0.9160, Accuracy = 0.8975
XGBoost: AUC = 0.9111, Accuracy = 0.8864
[LightGBM] [Info] Number of positive: 1022, number of negative: 5458
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000401 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1041
[LightGBM] [Info] Number of data points in the train set: 6480, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.157716 -> initscore=-1.675321
[LightGBM] [Info] Start training from score -1.675321


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


LightGBM: AUC = 0.9195, Accuracy = 0.8907

 Best Model: LightGBM (AUC = 0.9195)




In [123]:
# 訓練完的模型變數
final_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', models[best_name])
])
final_model.fit(X, y)  # 使用完整訓練資料進行訓練

# 預測完的儲存變數
test_probs = final_model.predict_proba(X_test)[:, 1]  # 預測消費的機率

# 將預測結果寫入 CSV，並按照 ID 排序
submission_df = test_df[['ID']].copy()  # 確保與 test.csv 的 ID 一致
submission_df['HasRevenue'] = test_probs  # 儲存預測機率
submission_df = submission_df.sort_values('ID', ascending=True)  # 按 ID 排序
submission_df.to_csv('my_submission1.csv', index=False)  # 輸出 CSV 檔案

print("預測完成，輸出檔案：my_submission.csv")



[LightGBM] [Info] Number of positive: 1277, number of negative: 6823
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000772 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1111
[LightGBM] [Info] Number of data points in the train set: 8100, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.157654 -> initscore=-1.675786
[LightGBM] [Info] Start training from score -1.675786
預測完成，輸出檔案：my_submission.csv


In [125]:

# 先定義數值與類別欄位
categorical_cols = ['Month', 'VisitorType', 'Weekend', 'TrafficType']
numeric_cols = [col for col in selected_features if col not in categorical_cols]

# 建立前處理器
preprocessor = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numeric_cols),
    
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]), categorical_cols)
])

# 擬合並轉換訓練與測試資料
X_cleaned = preprocessor.fit_transform(X)
X_test_cleaned = preprocessor.transform(X_test)

# 取得經 OneHotEncoder 編碼後的欄位名稱
ohe = preprocessor.named_transformers_['cat'].named_steps['onehot']
cat_feature_names = ohe.get_feature_names_out(categorical_cols)

# 合併所有欄位名稱
final_feature_names = numeric_cols + list(cat_feature_names)

# 建立 DataFrame 並加上標籤欄
train_cleaned_df = pd.DataFrame(X_cleaned, columns=final_feature_names)
train_cleaned_df['Revenue'] = y.values

test_cleaned_df = pd.DataFrame(X_test_cleaned, columns=final_feature_names)

# 匯出為 CSV
train_cleaned_df.to_csv("train_cleaned.csv", index=False)
test_cleaned_df.to_csv("test_cleaned.csv", index=False)

print("✅ 匯出完成：train_cleaned.csv 與 test_cleaned.csv")


ValueError: Shape of passed values is (8100, 1), indices imply (8100, 45)