In [1]:
# =============================================================================
# 步驟 0：導入必要的函式庫
# =============================================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error, r2_score
import lightgbm as lgb
import warnings

# Hide FutureWarning messages so deprecation notices raised by the imported
# libraries do not clutter the output below.
warnings.filterwarnings(action='ignore', category=FutureWarning)
print("函式庫導入完成。\n")


# =============================================================================
# 步驟 1：資料載入與視覺化設定
# =============================================================================
# --- Configure Matplotlib so that Chinese text renders correctly ---
# Assigning to plt.rcParams never raises when a font is missing, so a
# try/except around the assignment cannot select a fallback font. Instead,
# register every candidate as an ordered fallback list and check explicitly
# whether at least one of them is actually installed.
from matplotlib import font_manager

cjk_font_candidates = ['Microsoft JhengHei', 'Arial Unicode MS']  # Windows, macOS

available_cjk_fonts = []
for candidate_font in cjk_font_candidates:
    try:
        # With fallback_to_default=False, findfont raises ValueError instead
        # of silently substituting the default font.
        font_manager.findfont(candidate_font, fallback_to_default=False)
    except ValueError:
        continue
    available_cjk_fonts.append(candidate_font)

if not available_cjk_fonts:
    print("警告：未找到指定的中文字體，圖表標題可能顯示為亂碼。")

# Put the CJK candidates first and keep the previously configured families
# behind them as fallbacks.
plt.rcParams['font.sans-serif'] = cjk_font_candidates + [
    family for family in plt.rcParams['font.sans-serif']
    if family not in cjk_font_candidates
]
# Render the minus sign with an ASCII hyphen rather than the Unicode minus.
plt.rcParams['axes.unicode_minus'] = False

# --- Load the data ---
file_path = '2023資料剖析後內容1~8月芝山到天母0609整理二版.csv'
try:
    # Try UTF-8 (with optional BOM) first.
    df_raw = pd.read_csv(file_path, encoding='utf-8-sig')
    print(f"成功使用 'utf-8-sig' 編碼讀取檔案: {file_path}")
except UnicodeDecodeError:
    # Fall back to Big5 when the file is not valid UTF-8.
    df_raw = pd.read_csv(file_path, encoding='big5')
    print(f"成功使用 'big5' 編碼讀取檔案: {file_path}")
except FileNotFoundError:
    print(f"錯誤：找不到檔案 '{file_path}'。請確認檔案名稱和路徑是否正確。")
    # Raise SystemExit directly instead of calling the interactive-only
    # `exit()` helper from the `site` module, and report a failing status.
    raise SystemExit(1)

print(f"\n1. 原始資料載入成功，共 {len(df_raw)} 筆。")


# =============================================================================
# 步驟 2 & 3：資料清理、預處理與特徵工程
# =============================================================================
print("\n--- 開始進行資料處理與特徵工程 ---")

# --- 2.1 Work on an independent copy and drop rows that have no 'Date' ---
rows_with_date = df_raw.dropna(subset=['Date'])
df = rows_with_date.copy()
print(f"2. 移除非資料行後，剩餘 {len(df)} 筆。")

# --- 2.2 轉換核心欄位 ---
def parse_time_to_seconds(time_str):
    """Convert a duration-like value to a number of seconds.

    The value is converted to ``str``, stripped of surrounding whitespace and
    parsed with ``pd.to_timedelta``.

    Args:
        time_str: The raw duration value (any type accepted by ``str``).

    Returns:
        The duration in seconds as a float, or ``np.nan`` when parsing raises
        ``ValueError`` or ``TypeError``.
    """
    try:
        cleaned = str(time_str).strip()
        seconds = pd.to_timedelta(cleaned).total_seconds()
    except (ValueError, TypeError):
        seconds = np.nan
    return seconds

# Ride duration in seconds, parsed from the 'Time2' column (NaN when unparseable).
ride_seconds = df['Time2'].apply(parse_time_to_seconds)
df['TotalSeconds'] = ride_seconds

# Build a timestamp from the 'Date' and 'Time1' columns. 'Date' is cast to int
# first, then to str, before being joined with the stripped 'Time1' text.
df['Date_str'] = df['Date'].astype(int).astype(str)
df['Time1_str'] = df['Time1'].astype(str).str.strip()
timestamp_text = df['Date_str'] + ' ' + df['Time1_str']
# Rows that do not match the expected format become NaT (errors='coerce').
df['Datetime'] = pd.to_datetime(
    timestamp_text,
    format='%Y%m%d %H:%M:%S',
    errors='coerce',
)

# --- 2.3 Apply all row filters once and build the final clean DataFrame ---
# Keep rows with a valid timestamp and a duration within [180, 3600] seconds.
# Series.between is inclusive on both ends and evaluates to False for NaN, so
# it also removes rows whose duration could not be parsed.
duration_in_range = df['TotalSeconds'].between(180, 3600)
has_timestamp = df['Datetime'].notna()
df_clean = df[has_timestamp & duration_in_range].copy()

print(f"3. 經過所有條件篩選後，最終用於建模的乾淨資料共 {len(df_clean)} 筆。")

if df_clean.empty:
    print("\n錯誤：所有資料都在清理過程中被移除了。請檢查原始 CSV 檔案的 'Time2' 和 'Time1' 欄位格式。")
    # Raise SystemExit directly instead of calling the interactive-only
    # `exit()` helper from the `site` module, and report a failing status.
    raise SystemExit(1)

# --- 2.4 Post-processing and feature engineering on the clean data ---
df_clean['TotalSeconds'] = df_clean['TotalSeconds'].astype(int)

# Calendar features derived from the ride timestamp.
ride_timestamp = df_clean['Datetime'].dt
df_clean['Hour'] = ride_timestamp.hour
df_clean['DayOfWeek'] = ride_timestamp.dayofweek
df_clean['Month'] = ride_timestamp.month

# 1 when DayOfWeek is 5 or 6, otherwise 0.
is_weekend = df_clean['DayOfWeek'] >= 5
df_clean['IsWeekend'] = is_weekend.astype('int64')

# 1 when the hour falls in 7-9 or 17-19 (both ranges inclusive), otherwise 0.
ride_hour = df_clean['Hour']
is_rush_hour = ride_hour.between(7, 9) | ride_hour.between(17, 19)
df_clean['IsRushHour'] = is_rush_hour.astype('int64')

def normalize_exit(stp_name):
    """Map a stop name to one of two exit labels.

    Args:
        stp_name: The raw stop name; it is converted with ``str`` before the
            check, so non-string values are accepted.

    Returns:
        ``'Exit_1'`` when the text contains ``'1號'``, otherwise ``'Exit_2'``.
    """
    if '1號' in str(stp_name):
        return 'Exit_1'
    return 'Exit_2'
# Collapse the raw 'Stp1' stop name into one of the two exit labels.
df_clean['StartExit'] = df_clean['Stp1'].apply(normalize_exit)

# One-hot encode the exit label into dense int64 indicator columns.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False, dtype=np.int64)
encoded_features = ohe.fit_transform(df_clean[['StartExit']])
new_feature_names = ohe.get_feature_names_out(['StartExit'])
# Reuse df_clean's index so the encoded columns line up row-for-row.
encoded_df = pd.DataFrame(
    encoded_features,
    index=df_clean.index,
    columns=new_feature_names,
)
df_final = pd.concat([df_clean, encoded_df], axis=1)

# Select the model inputs and the target, then renumber the rows from 0.
target = 'TotalSeconds'
features = ['Hour', 'DayOfWeek', 'Month', 'IsWeekend', 'IsRushHour', *new_feature_names]
df_model = df_final[[*features, target]].reset_index(drop=True)

print("4. 特徵工程完成。")
print("最終用於建模的 DataFrame 預覽：")
print(df_model.head())

# =============================================================================
# 步驟 4：模型訓練 (Model Training)
# =============================================================================
print("\n--- 開始訓練模型 ---")

# Separate the predictors from the target and hold out 20% of the rows for
# testing; the fixed random_state keeps the split reproducible.
X, y = df_model[features], df_model[target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"訓練集大小: {X_train.shape[0]} 筆, 測試集大小: {X_test.shape[0]} 筆。")

# Fit a LightGBM regressor with its default hyperparameters.
lgbm = lgb.LGBMRegressor(random_state=42)
lgbm.fit(X_train, y_train)
print("模型訓練完成！")

# =============================================================================
# 步驟 5：模型評估與視覺化 (Model Evaluation & Visualization)
# =============================================================================
print("\n--- 模型評估結果 ---")
# Score the held-out test set.
y_pred = lgbm.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"平均絕對誤差 (MAE): {mae:.2f} 秒 (約 {mae/60:.2f} 分鐘)")
print(f"R-squared (R²): {r2:.2f}")

# Scatter plot of predicted versus actual values.
plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.6)
# Diagonal reference line spanning the observed range: points on this line
# are predictions that equal the actual value.
diagonal_range = [y_test.min(), y_test.max()]
plt.plot(
    diagonal_range,
    diagonal_range,
    '--',
    color='red',
    linewidth=2,
    label='完美預測線',
)
plt.title('模型預測時間 vs. 實際騎乘時間', fontsize=16)
plt.xlabel('實際時間 (秒)', fontsize=12)
plt.ylabel('預測時間 (秒)', fontsize=12)
plt.legend()
plt.grid(True)
plt.show()

# Plot the feature importances.
print("\n--- 特徵重要性分析 ---")
# Pair each importance score with its feature name; sorted() orders the
# (score, name) tuples ascending by score, with ties broken by name.
feature_imp = pd.DataFrame(sorted(zip(lgbm.feature_importances_, X.columns)), columns=['Value','Feature'])
plt.figure(figsize=(12, 8))
sns.barplot(x="Value", y="Feature

SyntaxError: unterminated string literal (detected at line 149) (2578213291.py, line 149)