In [6]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsRegressor  # 补充K近邻回归
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


#Reading all the file
yello_taxi_2023_01 = pd.read_parquet("Data/2023_01.parquet")
yello_taxi_2023_02 = pd.read_parquet("Data/2023_02.parquet")
yello_taxi_2023_03 = pd.read_parquet("Data/2023_03.parquet")
yello_taxi_2023_04 = pd.read_parquet("Data/2023_04.parquet")
yello_taxi_2023_05 = pd.read_parquet("Data/2023_05.parquet")
yello_taxi_2023_06 = pd.read_parquet("Data/2023_06.parquet")
yello_taxi_2023_07 = pd.read_parquet("Data/2023_07.parquet")
yello_taxi_2023_08 = pd.read_parquet("Data/2023_08.parquet")
yello_taxi_2023_09 = pd.read_parquet("Data/2023_09.parquet")
yello_taxi_2023_10 = pd.read_parquet("Data/2023_10.parquet")
yello_taxi_2023_11 = pd.read_parquet("Data/2023_11.parquet")
yello_taxi_2023_12 = pd.read_parquet("Data/2023_12.parquet")

yello_taxi_2024_01 = pd.read_parquet("Data/2024_01.parquet")
yello_taxi_2024_02 = pd.read_parquet("Data/2024_02.parquet")
yello_taxi_2024_03 = pd.read_parquet("Data/2024_03.parquet")
yello_taxi_2024_04 = pd.read_parquet("Data/2024_04.parquet")
yello_taxi_2024_05 = pd.read_parquet("Data/2024_05.parquet")
yello_taxi_2024_06 = pd.read_parquet("Data/2024_06.parquet")
yello_taxi_2024_07 = pd.read_parquet("Data/2024_07.parquet")
yello_taxi_2024_08 = pd.read_parquet("Data/2024_08.parquet")
yello_taxi_2024_09 = pd.read_parquet("Data/2024_09.parquet")
yello_taxi_2024_10 = pd.read_parquet("Data/2024_10.parquet")
yello_taxi_2024_11 = pd.read_parquet("Data/2024_11.parquet")


sum_dataset = pd.concat([yello_taxi_2023_01,yello_taxi_2023_02,yello_taxi_2023_03,
            yello_taxi_2023_04,yello_taxi_2023_05,yello_taxi_2023_06,
            yello_taxi_2023_07,yello_taxi_2023_08,yello_taxi_2023_09, 
            yello_taxi_2023_10, yello_taxi_2023_11, yello_taxi_2023_12,
            yello_taxi_2024_01, yello_taxi_2024_02, yello_taxi_2024_03,
            yello_taxi_2024_04, yello_taxi_2024_05, yello_taxi_2024_06,
            yello_taxi_2024_07, yello_taxi_2024_08, yello_taxi_2024_09, 
            yello_taxi_2024_10, yello_taxi_2024_11
            ])


数据预处理：

In [7]:
def safe_preprocess(df):
    # 保留原始数据副本
    raw_df = df.copy()
    
    # 逐步过滤（添加调试输出）
    print("\n[1] 初始数据量:", len(raw_df))
    
    # 第一轮过滤：基础有效性过滤
    cond = (
        raw_df["passenger_count"].between(1, 6) &
        raw_df["trip_distance"].between(0.1, 100) &
        (raw_df["total_amount"] > 0) &
        (raw_df["tip_amount"] >= 0) &
        raw_df["tpep_pickup_datetime"].notna() &
        raw_df["tpep_dropoff_datetime"].notna()
    )
    stage1 = raw_df[cond]
    print("[2] 基础过滤后:", len(stage1))
    
    # 时间逻辑过滤
    time_cond = (stage1["tpep_dropoff_datetime"] > stage1["tpep_pickup_datetime"])
    stage1 = stage1[time_cond]
    print("[3] 时间过滤后:", len(stage1))
    
    # 计算时间相关特征
    stage1["trip_duration"] = (stage1["tpep_dropoff_datetime"] - stage1["tpep_pickup_datetime"]).dt.total_seconds() / 3600
    stage1["speed_kmh"] = stage1["trip_distance"] / stage1["trip_duration"]
    
    # 速度过滤（放宽限制）
    speed_cond = stage1["speed_kmh"].between(0.5, 150)  # 包含低速和高速公路速度
    stage1 = stage1[speed_cond]
    print("[4] 速度过滤后:", len(stage1))
    
    # 小费比例计算与过滤
    stage1["tip_ratio"] = stage1["tip_amount"] / stage1["total_amount"]
    ratio_cond = stage1["tip_ratio"].between(0, 1)  # 允许100%小费
    final_df = stage1[ratio_cond]
    print("[5] 最终数据量:", len(final_df))
    
    # 时间特征（确保列存在）
    final_df["dropoff_hour"] = final_df["tpep_dropoff_datetime"].dt.hour
    final_df["dropoff_dayofweek"] = final_df["tpep_dropoff_datetime"].dt.dayofweek
    final_df["is_weekend"] = final_df["dropoff_dayofweek"].isin([5, 6]).astype(int)
    
    # 时段分箱（确保包含所有小时）
    final_df["time_period"] = pd.cut(
        final_df["dropoff_hour"],
        bins=[0, 6, 9, 16, 20, 24],
        labels=["Early_Morning", "Morning", "Day", "Evening", "Night"],
        include_lowest=True
    )
    
    return final_df

# 正确调用方式：从原始数据开始处理
positive_pay_sum_dataset = safe_preprocess(sum_dataset)


[1] 初始数据量: 75811575
[2] 基础过滤后: 67854154
[3] 时间过滤后: 67851501
[4] 速度过滤后: 67781859
[5] 最终数据量: 67781856


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df["dropoff_hour"] = final_df["tpep_dropoff_datetime"].dt.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df["dropoff_dayofweek"] = final_df["tpep_dropoff_datetime"].dt.dayofweek
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df["is_weekend"] = final_df["dropoff_dayofweek"].is

In [None]:
print(positive_pay_sum_dataset.count())

列出考虑因素和特征，以及裁剪数据

In [19]:
# 基础特征列表（不包含需要编码的类别特征）
base_features = [
    "passenger_count",
    "trip_distance",
    "speed_kmh",
    "dropoff_hour",
    "is_weekend",       # 已经是二值特征
    "dropoff_dayofweek" # 作为有序类别处理
]

# 原始数据获取（包含需要编码的time_period和特征）
processed_df = positive_pay_sum_dataset[base_features + ["time_period", "tip_ratio"]].copy()

# 对类别特征进行独热编码（关键修改点）
processed_df = pd.get_dummies(
    processed_df, 
    columns=["time_period"], 
    drop_first=True,
    prefix="period"
)

# 安全抽样（先抽样再划分数据集）
sample_df = processed_df.sample(frac=0.01, random_state=42)

# 划分数据集（必须在此步骤之后进行标准化）
X = sample_df[base_features]
y = sample_df["tip_ratio"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42
)

# 定义需要标准化的连续特征（注意排除类别特征）
continuous_features = [
    "passenger_count",   # 离散但数值范围较大
    "trip_distance",     # 连续
    "speed_kmh",         # 连续
    "dropoff_hour"       # 周期性特征
]

# 创建数据副本避免修改原始数据
X_train_processed = X_train.copy()
X_test_processed = X_test.copy()

# 初始化标准化器（仅在训练集上fit）
scaler = StandardScaler()

# 对训练集连续特征进行拟合和转换
X_train_processed[continuous_features] = scaler.fit_transform(
    X_train[continuous_features]
)

# 对测试集使用训练集的参数进行转换
X_test_processed[continuous_features] = scaler.transform(
    X_test[continuous_features]
)
print(X_train_processed.count())

passenger_count      542255
trip_distance        542255
speed_kmh            542255
dropoff_hour         542255
is_weekend           542255
dropoff_dayofweek    542255
dtype: int64


模型评估函数：

In [9]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    return {
        "MAE": mean_absolute_error(y_test, y_pred),
        "MSE": mean_squared_error(y_test, y_pred),
        "R2": r2_score(y_test, y_pred)
    }

线性回归模型训练

In [10]:
# 线性回归
lr_model = LinearRegression()
lr_model.fit(X_train_processed, y_train)

#评估模型
lr_metrics = evaluate_model(lr_model, X_test_processed, y_test)
print(f"MAE: {lr_metrics['MAE']:.4f}")
print(f"MSE: {lr_metrics['MSE']:.4f}")
print(f"R²: {lr_metrics['R2']:.4f}")

MAE: 0.0634
MSE: 0.0055
R²: 0.0048


In [11]:
import torch
print(torch.__version__)          # 应显示 2.3.0+
print(torch.backends.mps.is_available())  # 应返回 True
import sys
print(sys.version)
# 输出示例：3.11.7 (main, Dec 15 2023, 18:24:52) [Clang 15.0.0 (clang-1500.1.0.2.5)]

2.5.1
True
3.10.16 (main, Dec 11 2024, 10:22:29) [Clang 14.0.6 ]


Kmean

In [9]:
knn_model = KNeighborsRegressor(n_neighbors=5)
knn_model.fit(X_train_processed, y_train)

#评估模型
knn_metrics = evaluate_model(knn_model, X_test_processed, y_test)
print(f"MAE: {knn_metrics['MAE']:.4f}")
print(f"MSE: {knn_metrics['MSE']:.4f}")
print(f"R²: {knn_metrics['R2']:.4f}")

MAE: 0.0656
MSE: 0.0065
R²: -0.1699
