In [1]:
!unzip nyc-taxi-trip-duration.zip && rm nyc-taxi-trip-duration.zip
!unzip sample_submission.zip && rm sample_submission.zip
!unzip test.zip && rm test.zip
!unzip train.zip && rm train.zip

Archive:  nyc-taxi-trip-duration.zip
  inflating: sample_submission.zip   
  inflating: test.zip                
  inflating: train.zip               
Archive:  sample_submission.zip
  inflating: sample_submission.csv   
Archive:  test.zip
  inflating: test.csv                
Archive:  train.zip
  inflating: train.csv               


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import autogluon as ag
from autogluon.tabular import TabularDataset, TabularPredictor

In [14]:
#特征工程

def simple_taxi_features(df):
    """
    最简单的出租车特征工程 - 直接替换原函数
    基于原始代码改进，保持简单但更有效
    """
    df = df.copy()

    # 时间特征
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
    df['hour'] = df['pickup_datetime'].dt.hour
    df['weekday'] = df['pickup_datetime'].dt.weekday
    df['is_weekend'] = (df['weekday'] >= 5).astype(int)

    # 距离特征
    df['lat_diff'] = df['dropoff_latitude'] - df['pickup_latitude']
    df['lon_diff'] = df['dropoff_longitude'] - df['pickup_longitude']

    # 真实地理距离（Haversine公式）
    R = 6371  # 地球半径
    lat1, lon1 = np.radians(df['pickup_latitude']), np.radians(df['pickup_longitude'])
    lat2, lon2 = np.radians(df['dropoff_latitude']), np.radians(df['dropoff_longitude'])
    dlat, dlon = lat2 - lat1, lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    df['distance'] = R * 2 * np.arcsin(np.sqrt(a))

    # 曼哈顿距离（城市道路距离）
    df['manhattan_distance'] = np.abs(df['lat_diff']) + np.abs(df['lon_diff'])

    # 方向特征
    df['direction'] = np.arctan2(df['lat_diff'], df['lon_diff'])
    df['direction_simplified'] = ((df['direction'] + np.pi/4) // (np.pi/2)).astype(int) % 4

    # 简单方向（南北/东西）
    df['is_north'] = (df['lat_diff'] > 0).astype(int)
    df['is_east'] = (df['lon_diff'] > 0).astype(int)

    # 时间段特征
    df['time_period'] = pd.cut(df['hour'],
                              bins=[0, 6, 12, 18, 24],
                              labels=[0, 1, 2, 3],
                              include_lowest=True).astype(int)  # 0:夜晚, 1:早晨, 2:下午, 3:晚上

    return df

# 数据处理
print("处理数据...")
data = pd.read_csv('train.csv').drop(['id','dropoff_datetime'], axis=1)
test = pd.read_csv('test.csv').drop('id', axis=1)
test_id = pd.read_csv('test.csv')['id']


# 移除极端异常值
data = data[
    (data['trip_duration'] >= 60) &
    (data['trip_duration'] <= 3600)  # 1小时内
]

print(f"清洗后数据量: {len(data)}")

# 特征工程
data = simple_taxi_features(data)
test = simple_taxi_features(test)

# 打印数据信息
print(f"特征数量: {data.shape[1]}")
print(test.shape)

# 分割数据
data_train, data_eval = train_test_split(data, test_size=0.2, random_state=42)

处理数据...
清洗后数据量: 1437732
特征数量: 21
(625134, 20)


In [15]:
#构建模型
predictor = TabularPredictor(label='trip_duration', eval_metric='mean_squared_error')
model = predictor.fit(train_data = data_train, time_limit = 1800, presets='best_quality')

No path specified. Models will be saved in: "AutogluonModels/ag-20250614_034324"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.3.1
Python Version:     3.11.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Sun Mar 30 16:01:29 UTC 2025
CPU Count:          8
Memory Avail:       45.77 GB / 50.99 GB (89.8%)
Disk Space Avail:   182.11 GB / 225.83 GB (80.6%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validatio

In [16]:
y_pred  = model.predict(data_eval)
y_pred.head()

Unnamed: 0,trip_duration
814813,482.271393
1316143,509.040131
1170504,641.817871
337034,1298.302002
605181,400.448517


In [17]:
model.evaluate(data_eval)

{'mean_squared_error': -57291.17578125,
 'root_mean_squared_error': np.float64(-239.3557507574018),
 'mean_absolute_error': -159.6666717529297,
 'r2': 0.8322453498840332,
 'pearsonr': 0.9122951513128346,
 'median_absolute_error': np.float64(-109.73553466796875)}

In [18]:
model.leaderboard(data_eval)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBMXT_BAG_L2,-57171.148438,-58135.488281,mean_squared_error,334.685431,1748.753594,996.593174,95.442217,313.293764,316.771775,2,True,3
1,WeightedEnsemble_L3,-57291.175781,-58035.714844,mean_squared_error,337.082276,1756.005178,1023.662806,0.004374,0.012382,0.750655,3,True,5
2,LightGBM_BAG_L2,-57830.953125,-58335.539062,mean_squared_error,241.635685,1442.699032,706.140376,2.392471,7.239202,26.318977,2,True,4
3,LightGBMXT_BAG_L1,-58406.292969,-58813.894531,mean_squared_error,239.243214,1435.459829,679.821399,239.243214,1435.459829,679.821399,1,True,1
4,WeightedEnsemble_L2,-58406.292969,-58813.894531,mean_squared_error,239.247271,1435.468795,679.876172,0.004056,0.008965,0.054773,2,True,2


In [19]:
#测试集测试
test_pred = model.predict(test)
print(test_pred.shape)
test_pred.head()

(625134,)


Unnamed: 0,trip_duration
0,911.529053
1,641.767517
2,464.134705
3,1056.880615
4,439.296783


In [21]:
#提交数据
submission = pd.DataFrame({'id':test_id, 'trip_duration':test_pred})
submission.to_csv('submission.csv', index=False)