In [1]:
# Cell 1: 导入必要的库
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from statsmodels.tsa.statespace.sarimax import SARIMAX
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

In [6]:
# Cell 2: 数据加载和预处理
def load_and_preprocess_data(
    edge_nodes_path='Edge_Names_With_Nodes.csv',
    traffic_data_path='MDOT_SHA_Annual_Average_Daily_Traffic_Baltimore.csv',
    years=range(2014, 2023),
    progress_every=10,
):
    """Load the two CSV inputs and reshape the traffic counts to long format.

    The traffic table is grouped by ``'Station ID'``. For every station and
    every requested year, the ``'AADT <year>'`` and ``'AAWDT <year>'`` columns
    are averaged over the station's rows; one record is emitted when both
    averages are non-NaN.

    Args:
        edge_nodes_path: CSV path of the edge/node table (returned unchanged).
        traffic_data_path: CSV path of the per-station traffic table.
        years: Iterable of years whose ``AADT``/``AAWDT`` columns are read.
            Years without both columns in the file are skipped.
        progress_every: Print a progress line every this many stations;
            a falsy value disables progress output.

    Returns:
        tuple: ``(df, edge_nodes)`` where ``df`` has one row per
        (station, year) with the columns ``Station_ID``, ``Road_Name``,
        ``Year``, ``AADT``, ``AAWDT`` plus station metadata, and
        ``edge_nodes`` is the edge table as read from disk. ``df`` keeps
        these columns even when no record was produced.

    Raises:
        KeyError: If the traffic table has no ``'Station ID'`` column.
        Errors raised by ``pd.read_csv`` (e.g. missing file) propagate.
    """
    print("开始加载数据...")

    edge_nodes = pd.read_csv(edge_nodes_path)
    traffic_data = pd.read_csv(traffic_data_path)

    station_groups = traffic_data.groupby('Station ID')
    total_stations = len(station_groups)
    print(f"总计需要处理 {total_stations} 个测站")

    # Column availability is identical for every station, so resolve it once
    # instead of re-checking inside the station loop.
    available = set(traffic_data.columns)
    year_columns = [
        (year, f'AADT {year}', f'AAWDT {year}')
        for year in years
        if f'AADT {year}' in available and f'AAWDT {year}' in available
    ]

    output_columns = [
        'Station_ID', 'Road_Name', 'Year', 'AADT', 'AAWDT',
        'Route_Number', 'Road_Section', 'Station_Description',
        'County_Name', 'Municipality_Name', 'node_start', 'node_end',
    ]

    records = []
    for processed, (station_id, station_data) in enumerate(station_groups, start=1):
        if progress_every and processed % progress_every == 0:
            print(f"进度: {processed}/{total_stations} ({(processed/total_stations*100):.1f}%)")

        try:
            # The first row of the group supplies the station's descriptive fields.
            base_info = station_data.iloc[0]
            road_name = str(base_info['Road Name']).strip()
            station_meta = {
                'Route_Number': base_info.get('Route Number'),
                'Road_Section': base_info.get('Road Section'),
                'Station_Description': base_info.get('Station Description'),
                'County_Name': base_info.get('County Name'),
                'Municipality_Name': base_info.get('Municipality Name'),
                'node_start': base_info.get('node start'),
                'node_end': base_info.get('node(s) end'),
            }

            for year, aadt_col, aawdt_col in year_columns:
                aadt_value = station_data[aadt_col].mean()
                aawdt_value = station_data[aawdt_col].mean()

                # Keep the year only when both measures are available.
                if pd.notna(aadt_value) and pd.notna(aawdt_value):
                    records.append({
                        'Station_ID': station_id,
                        'Road_Name': road_name,
                        'Year': year,
                        'AADT': aadt_value,
                        'AAWDT': aawdt_value,
                        **station_meta,
                    })

        except Exception as e:
            # Best effort: a malformed station is reported and skipped.
            print(f"处理测站 {station_id} 时出错: {str(e)}")
            continue

    # Passing the column list keeps the schema stable even with zero records.
    df = pd.DataFrame(records, columns=output_columns)
    df = df.dropna(subset=['AADT', 'AAWDT'])

    successful_matches = len(records)
    # Guard the average: an empty traffic table has zero stations.
    average_records = successful_matches / total_stations if total_stations else 0.0

    print(f"\n处理完成:")
    print(f"- 总计测站数: {total_stations}")
    print(f"- 成功匹配记录数: {successful_matches}")
    print(f"- 平均每个测站的记录数: {average_records:.1f}")

    return df, edge_nodes

In [7]:
# Cell 3: 预测模型
def train_and_predict(data, location_info):
    """Fit AADT and AAWDT regressors on ``Year`` and forecast 2023 and 2024.

    Args:
        data: DataFrame with at least the columns ``Year``, ``AADT`` and
            ``AAWDT``. Rows missing any of these three values are ignored;
            NaNs in other columns do not discard a row.
        location_info: Mapping (dict or pandas Series) providing
            ``'Street_Name'`` and ``'Nodes'``. ``'Nodes'`` is either a
            sequence or the text of a Python literal sequence such as
            ``"[1, 2, 3]"``.

    Returns:
        dict with ``Street_Name``, ``Nodes``, ``node_start``, ``node_end`` and
        the four forecasts ``AADT_2023``, ``AADT_2024``, ``AAWDT_2023``,
        ``AAWDT_2024``; ``None`` when no usable rows remain or when fitting,
        parsing or prediction fails.

    Raises:
        KeyError: If ``data`` lacks one of the three modelling columns.

    NOTE(review): tree ensembles produce piecewise-constant outputs, so with
    ``Year`` as the only feature the 2023 and 2024 forecasts are likely to be
    identical to each other — confirm this is acceptable for the analysis.
    """
    import ast  # local import: the block must not depend on file-level edits

    # Only the modelling columns decide whether a row is usable; a blanket
    # dropna() would discard rows because of missing descriptive metadata.
    data = data.dropna(subset=['Year', 'AADT', 'AAWDT'])

    if data.empty:
        return None

    X = data[['Year']].copy()
    # Resolved up front so the error handler can never raise on a missing key.
    street_label = location_info.get('Street_Name', '<unknown>')

    try:
        # NOTE(security): this used to call eval() on the 'Nodes' text, which
        # executes arbitrary code if the CSV is tampered with. literal_eval
        # accepts only Python literals; non-literal content now ends up in the
        # except branch below instead of being executed.
        raw_nodes = location_info['Nodes']
        nodes = ast.literal_eval(raw_nodes) if isinstance(raw_nodes, str) else raw_nodes
        node_start, node_end = nodes[0], nodes[-1]

        aadt_model = XGBRegressor(n_estimators=100)
        aadt_model.fit(X, data['AADT'])

        aawdt_model = XGBRegressor(n_estimators=100)
        aawdt_model.fit(X, data['AAWDT'])

        # One batched predict per model covers both forecast years.
        future_years = pd.DataFrame({'Year': [2023, 2024]})
        aadt_forecast = aadt_model.predict(future_years)
        aawdt_forecast = aawdt_model.predict(future_years)

        return {
            'Street_Name': location_info['Street_Name'],
            'Nodes': raw_nodes,
            'node_start': node_start,  # first node of the edge
            'node_end': node_end,      # last node of the edge
            'AADT_2023': aadt_forecast[0],
            'AADT_2024': aadt_forecast[1],
            'AAWDT_2023': aawdt_forecast[0],
            'AAWDT_2024': aawdt_forecast[1],
        }
    except Exception as e:
        print(f"Error processing {street_label}: {str(e)}")
        return None

In [10]:
# Cell 4: 生成预测
# Years for which every station receives a forecast.
_FORECAST_YEARS = (2023, 2024)


def _predict_station(station_id, station_data):
    """Fit AADT/AAWDT models for one station and return its forecast records."""
    features = station_data[['Year']].values

    model_aadt = XGBRegressor(objective='reg:squarederror')
    model_aadt.fit(features, station_data['AADT'].values)

    model_aawdt = XGBRegressor(objective='reg:squarederror')
    model_aawdt.fit(features, station_data['AAWDT'].values)

    future_years = np.array([[year] for year in _FORECAST_YEARS])
    aadt_predictions = model_aadt.predict(future_years)
    aawdt_predictions = model_aawdt.predict(future_years)

    # Descriptive fields are taken from the station's last historical row.
    latest_info = station_data.iloc[-1]

    return [
        {
            'Station_ID': station_id,
            'Road_Name': latest_info['Road_Name'],
            'Year': year,
            'AADT_Prediction': aadt_predictions[i],
            'AAWDT_Prediction': aawdt_predictions[i],
            'Route_Number': latest_info['Route_Number'],
            'Road_Section': latest_info['Road_Section'],
            'Station_Description': latest_info['Station_Description'],
            'County_Name': latest_info['County_Name'],
            'Municipality_Name': latest_info['Municipality_Name'],
        }
        for i, year in enumerate(_FORECAST_YEARS)
    ]


def _save_predictions(predictions_df):
    """Write the predictions to a timestamped CSV; return its name or None on failure."""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f'traffic_predictions_{timestamp}.csv'

    try:
        predictions_df.to_csv(filename, index=False)
        print(f"\n预测结果已保存到 {filename}")
        return filename
    except PermissionError:
        alt_filename = f'traffic_predictions_{timestamp}_alt.csv'
        try:
            predictions_df.to_csv(alt_filename, index=False)
            print(f"\n由于权限问题，预测结果已保存到备用文件: {alt_filename}")
            return alt_filename
        except OSError as e:
            print(f"\n保存预测结果失败: {str(e)}")
    except OSError as e:
        print(f"\n保存预测结果失败: {str(e)}")
    return None


def generate_predictions():
    """Run the full pipeline: load history, forecast each station, save a CSV.

    Historical data comes from ``load_and_preprocess_data()``. One pair of
    models is fitted per station, and one record per station and forecast
    year is collected. A non-empty result is written to
    ``traffic_predictions_<timestamp>.csv`` in the working directory.

    Returns:
        pandas.DataFrame: Long-format predictions with the columns
        ``Station_ID``, ``Road_Name``, ``Year``, ``AADT_Prediction``,
        ``AAWDT_Prediction`` and station metadata. An empty DataFrame is
        returned when no history is available or an unexpected error occurs.
        A failure to write the CSV is reported but does not discard the
        computed predictions.

    NOTE(review): tree ensembles produce piecewise-constant outputs, so with
    ``Year`` as the only feature the forecasts for the two years are likely
    identical per station — confirm this is acceptable.
    """
    try:
        historical_data, _ = load_and_preprocess_data()

        if historical_data.empty:
            print("没有找到有效的历史数据")
            return pd.DataFrame()

        station_count = len(historical_data['Station_ID'].unique())

        print("\n开始生成预测...")
        print(f"历史数据中包含 {station_count} 个唯一测站")

        all_predictions = []

        # A single groupby pass replaces one boolean-mask scan of the whole
        # frame per station; sort=False keeps first-appearance order.
        for station_id, station_data in historical_data.groupby('Station_ID', sort=False):
            try:
                all_predictions.extend(_predict_station(station_id, station_data))
            except Exception as e:
                # Best effort: one failing station must not stop the others.
                print(f"预测测站 {station_id} 时出错: {str(e)}")
                continue

        predictions_df = pd.DataFrame(all_predictions)

        if not predictions_df.empty:
            _save_predictions(predictions_df)

        # Every successful station contributes one record per forecast year.
        predicted_stations = len(predictions_df) // len(_FORECAST_YEARS)
        success_rate = len(predictions_df) / len(_FORECAST_YEARS) / station_count * 100

        print(f"\n预测完成:")
        print(f"- 总计测站数: {station_count}")
        print(f"- 成功预测数: {predicted_stations}")
        print(f"- 预测成功率: {success_rate:.1f}%")

        return predictions_df

    except Exception as e:
        print(f"生成预测时发生错误: {str(e)}")
        return pd.DataFrame()

In [11]:
# Cell 5: 可视化结果
def plot_predictions(predictions_df, historical_data=None):
    """Draw box plots of the predicted AADT and AAWDT distributions.

    Two layouts of ``predictions_df`` are accepted:

    * long format — columns ``Year``, ``AADT_Prediction`` and
      ``AAWDT_Prediction`` (the shape returned by ``generate_predictions``);
      one box per year.
    * wide format — columns ``AADT_2023``, ``AADT_2024``, ``AAWDT_2023`` and
      ``AAWDT_2024`` (the keys produced by ``train_and_predict``); one box
      per column.

    Args:
        predictions_df: DataFrame in one of the layouts above.
        historical_data: Currently unused; kept so existing callers that pass
            it keep working.

    Raises:
        ValueError: If ``predictions_df`` matches neither layout.
    """
    metrics = ('AADT', 'AAWDT')
    columns = set(predictions_df.columns)

    long_columns = {'Year'} | {f'{metric}_Prediction' for metric in metrics}
    wide_columns = {f'{metric}_{year}' for metric in metrics for year in (2023, 2024)}

    # Validate before creating the figure so a bad frame leaves no empty plot.
    if long_columns <= columns:
        long_format = True
    elif wide_columns <= columns:
        long_format = False
    else:
        raise ValueError(
            "predictions_df must contain either "
            f"{sorted(long_columns)} or {sorted(wide_columns)}"
        )

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 12))

    for ax, metric in zip((ax1, ax2), metrics):
        if long_format:
            # Column names are passed as strings; seaborn looks them up in `data`.
            sns.boxplot(data=predictions_df, x='Year', y=f'{metric}_Prediction', ax=ax)
        else:
            # Wide-form input: seaborn draws one box per selected column.
            sns.boxplot(data=predictions_df[[f'{metric}_2023', f'{metric}_2024']], ax=ax)
        ax.set_title(f'{metric} Predictions Distribution')
        ax.set_ylabel(f'{metric} Value')

    plt.tight_layout()
    plt.show()

# Run the prediction pipeline and keep the returned DataFrame in predictions_df.
predictions_df = generate_predictions()

开始加载数据...
总计需要处理 2221 个测站
进度: 10/2221 (0.5%)
进度: 20/2221 (0.9%)
进度: 30/2221 (1.4%)
进度: 40/2221 (1.8%)
进度: 50/2221 (2.3%)
进度: 60/2221 (2.7%)
进度: 70/2221 (3.2%)
进度: 80/2221 (3.6%)
进度: 90/2221 (4.1%)
进度: 100/2221 (4.5%)
进度: 110/2221 (5.0%)
进度: 120/2221 (5.4%)
进度: 130/2221 (5.9%)
进度: 140/2221 (6.3%)
进度: 150/2221 (6.8%)
进度: 160/2221 (7.2%)
进度: 170/2221 (7.7%)
进度: 180/2221 (8.1%)
进度: 190/2221 (8.6%)
进度: 200/2221 (9.0%)
进度: 210/2221 (9.5%)
进度: 220/2221 (9.9%)
进度: 230/2221 (10.4%)
进度: 240/2221 (10.8%)
进度: 250/2221 (11.3%)
进度: 260/2221 (11.7%)
进度: 270/2221 (12.2%)
进度: 280/2221 (12.6%)
进度: 290/2221 (13.1%)
进度: 300/2221 (13.5%)
进度: 310/2221 (14.0%)
进度: 320/2221 (14.4%)
进度: 330/2221 (14.9%)
进度: 340/2221 (15.3%)
进度: 350/2221 (15.8%)
进度: 360/2221 (16.2%)
进度: 370/2221 (16.7%)
进度: 380/2221 (17.1%)
进度: 390/2221 (17.6%)
进度: 400/2221 (18.0%)
进度: 410/2221 (18.5%)
进度: 420/2221 (18.9%)
进度: 430/2221 (19.4%)
进度: 440/2221 (19.8%)
进度: 450/2221 (20.3%)
进度: 460/2221 (20.7%)
进度: 470/2221 (21.2%)
进度: 480/2221 (21.6