In [None]:
##
##合并所有航班
##
import os
import pandas as pd

# 文件夹路径
processed_flight_folder = 'processed_flight'
output_file = 'all_flights.csv'

# 初始化一个空的 DataFrame 来存储所有航班数据
all_flights = pd.DataFrame()

# 遍历所有处理后的航班文件，将它们合并
for flight_file in os.listdir(processed_flight_folder):
    if flight_file.endswith('.csv'):
        flight_path = os.path.join(processed_flight_folder, flight_file)
        
        # 读取航班数据文件
        flights_data = pd.read_csv(flight_path, parse_dates=['DEP_DATETIME_CST'])
        
        # 将数据添加到 `all_flights` DataFrame 中
        all_flights = pd.concat([all_flights, flights_data], ignore_index=True)
all_flights = all_flights.sort_values('DEP_DATETIME_CST')
# 将合并后的数据保存成 `all_flights.csv`
all_flights.to_csv(output_file, index=False)
print(f"All flights data has been saved to {output_file}.")

In [None]:
#第一步merge ORIGIN#
import os
import pandas as pd

# 文件夹路径
combined_weather_folder = 'combined_weather'
all_flights_file = 'processed_flight/all_flights.csv'
output_folder = 'final_output'
os.makedirs(output_folder, exist_ok=True)

# 读取所有航班数据
all_flights = pd.read_csv(all_flights_file, parse_dates=['DEP_DATETIME_CST'])

# 遍历每个机场的天气文件并处理出发地天气数据
for weather_file in os.listdir(combined_weather_folder):
    if weather_file.startswith("weather_") and weather_file.endswith(".csv"):
        airport_id = int(weather_file.split('_')[1].split('.')[0])  # 提取机场ID
        print(f"正在处理机场ID: {airport_id}")

        # 设置输出文件路径
        output_file_path = os.path.join(output_folder, f"all_flights_with_origin_weather_{airport_id}.csv")
        
        # 检查文件是否已经存在，如果存在则跳过
        if os.path.exists(output_file_path):
            print(f"{output_file_path} 已存在，跳过处理...")
            continue

        # 读取机场的天气数据
        weather_file_path = os.path.join(combined_weather_folder, weather_file)
        weather_data = pd.read_csv(weather_file_path, parse_dates=['DATE_CST'])

        # 提取出发地为该机场的所有航班
        origin_flights = all_flights[all_flights['ORIGIN_AIRPORT_ID'] == airport_id].copy()

        # 使用 merge_asof 合并出发地天气数据
        origin_aligned = pd.merge_asof(
            origin_flights, weather_data,
            left_on='DEP_DATETIME_CST', right_on='DATE_CST',
            direction='nearest'
        )

        # 为所有天气数据列加上前缀 ORIGIN_
        weather_columns = weather_data.columns  # 包括 DATE_CST
        origin_aligned.rename(columns={col: f"ORIGIN_{col}" for col in weather_columns}, inplace=True)

        # 保存处理后的出发地天气数据为独立的 CSV 文件
        origin_aligned.to_csv(output_file_path, index=False)
        print(f"Processed and saved: {output_file_path}")

print("所有机场的出发地数据已处理并保存！")



In [2]:
##把所有含有ORIGIN数据的文件合并
import os
import pandas as pd

# 文件夹路径
origin_output_folder = 'final_output'
final_output_file = 'all_flights_with_origin.csv'

# 获取所有包含出发地天气数据的文件路径
all_origin_files = [os.path.join(origin_output_folder, file) for file in os.listdir(origin_output_folder) if file.startswith("all_flights_with_origin_weather_")]

# 读取并合并所有出发地天气数据文件，去除全为空的列
all_origin_data = pd.concat(
    [pd.read_csv(file, parse_dates=['DEP_DATETIME_CST']).dropna(how='all', axis=1) for file in all_origin_files],
    ignore_index=True
)
all_origin_data = all_origin_data.sort_values('DEP_DATETIME_CST')
# 保存合并后的数据
all_origin_data.to_csv(final_output_file, index=False)
print(f"所有出发地天气数据已合并并保存到: {final_output_file}")


  [pd.read_csv(file, parse_dates=['DEP_DATETIME_CST']).dropna(how='all', axis=1) for file in all_origin_files],
  [pd.read_csv(file, parse_dates=['DEP_DATETIME_CST']).dropna(how='all', axis=1) for file in all_origin_files],
  [pd.read_csv(file, parse_dates=['DEP_DATETIME_CST']).dropna(how='all', axis=1) for file in all_origin_files],
  [pd.read_csv(file, parse_dates=['DEP_DATETIME_CST']).dropna(how='all', axis=1) for file in all_origin_files],
  [pd.read_csv(file, parse_dates=['DEP_DATETIME_CST']).dropna(how='all', axis=1) for file in all_origin_files],
  [pd.read_csv(file, parse_dates=['DEP_DATETIME_CST']).dropna(how='all', axis=1) for file in all_origin_files],
  [pd.read_csv(file, parse_dates=['DEP_DATETIME_CST']).dropna(how='all', axis=1) for file in all_origin_files],
  [pd.read_csv(file, parse_dates=['DEP_DATETIME_CST']).dropna(how='all', axis=1) for file in all_origin_files],
  [pd.read_csv(file, parse_dates=['DEP_DATETIME_CST']).dropna(how='all', axis=1) for file in all_origin_

所有出发地天气数据已合并并保存到: all_flights_with_origin.csv


In [None]:
#并入DEST
import os
import pandas as pd

# 文件夹路径
combined_weather_folder = 'combined_weather'
all_flights_with_origin_file = 'all_flights_with_origin.csv'
output_folder = 'final_output_dest'
os.makedirs(output_folder, exist_ok=True)

# 读取包含出发地天气数据的航班文件
all_flights_with_origin = pd.read_csv(all_flights_with_origin_file, parse_dates=['DEP_DATETIME_CST'])

# 遍历每个机场的天气文件并处理到达地天气数据
for weather_file in os.listdir(combined_weather_folder):
    if weather_file.startswith("weather_") and weather_file.endswith(".csv"):
        airport_id = int(weather_file.split('_')[1].split('.')[0])  # 提取机场ID
        print(f"正在处理到达机场天气数据，机场ID: {airport_id}")

        # 设置输出文件路径
        output_file_path = os.path.join(output_folder, f"all_flights_with_dest_weather_{airport_id}.csv")
        
        # 检查文件是否已经存在，如果存在则跳过
        if os.path.exists(output_file_path):
            print(f"{output_file_path} 已存在，跳过处理...")
            continue

        # 读取机场的天气数据
        weather_file_path = os.path.join(combined_weather_folder, weather_file)
        weather_data = pd.read_csv(weather_file_path, parse_dates=['DATE_CST'])

        # 提取到达地为该机场的所有航班
        dest_flights = all_flights_with_origin[all_flights_with_origin['DEST_AIRPORT_ID'] == airport_id].copy()

        # 使用 merge_asof 合并到达地天气数据，并加上前缀 DEST_
        dest_aligned = pd.merge_asof(
            dest_flights, weather_data,
            left_on='DEP_DATETIME_CST', right_on='DATE_CST',
            direction='nearest'
        )
        weather_columns = weather_data.columns  # 包括 DATE_CST
        dest_aligned.rename(columns={col: f"DEST_{col}" for col in weather_columns}, inplace=True)
        # 保存处理后的到达地天气数据为独立的 CSV 文件
        dest_aligned.to_csv(output_file_path, index=False)
        print(f"Processed and saved: {output_file_path}")

print("所有到达地天气数据已处理并保存！")

In [2]:
import os
import pandas as pd

# 文件夹路径
dest_output_folder = 'final_output_dest'
final_output_file = 'all_flights_with_dest_weather.csv'

# 获取所有包含到达地天气数据的文件路径
all_dest_files = [
    os.path.join(dest_output_folder, file) 
    for file in os.listdir(dest_output_folder) 
    if file.startswith("all_flights_with_dest_weather_")
]

# 读取并合并所有到达地天气数据文件，去除全为空的列
all_dest_data = pd.concat(
    [
        pd.read_csv(file, parse_dates=['DEP_DATETIME_CST']).dropna(how='all', axis=1) 
        for file in all_dest_files
    ],
    ignore_index=True
)

# 保存合并后的数据
all_dest_data.to_csv(final_output_file, index=False)
print(f"所有到达地天气数据已合并并保存到: {final_output_file}")


  pd.read_csv(file, parse_dates=['DEP_DATETIME_CST']).dropna(how='all', axis=1)
  pd.read_csv(file, parse_dates=['DEP_DATETIME_CST']).dropna(how='all', axis=1)
  pd.read_csv(file, parse_dates=['DEP_DATETIME_CST']).dropna(how='all', axis=1)
  pd.read_csv(file, parse_dates=['DEP_DATETIME_CST']).dropna(how='all', axis=1)
  pd.read_csv(file, parse_dates=['DEP_DATETIME_CST']).dropna(how='all', axis=1)
  pd.read_csv(file, parse_dates=['DEP_DATETIME_CST']).dropna(how='all', axis=1)
  pd.read_csv(file, parse_dates=['DEP_DATETIME_CST']).dropna(how='all', axis=1)
  pd.read_csv(file, parse_dates=['DEP_DATETIME_CST']).dropna(how='all', axis=1)
  pd.read_csv(file, parse_dates=['DEP_DATETIME_CST']).dropna(how='all', axis=1)
  pd.read_csv(file, parse_dates=['DEP_DATETIME_CST']).dropna(how='all', axis=1)
  pd.read_csv(file, parse_dates=['DEP_DATETIME_CST']).dropna(how='all', axis=1)
  pd.read_csv(file, parse_dates=['DEP_DATETIME_CST']).dropna(how='all', axis=1)
  pd.read_csv(file, parse_dates=['DEP_DA

所有到达地天气数据已合并并保存到: all_flights_with_dest_weather.csv


In [2]:
import pandas as pd
df = pd.read_csv('all_flights_with_dest_weather.csv', parse_dates=['DEP_DATETIME_CST'])
df.tail(5)

  df = pd.read_csv('all_flights_with_dest_weather.csv', parse_dates=['DEP_DATETIME_CST'])


Unnamed: 0,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,MKT_UNIQUE_CARRIER,BRANDED_CODE_SHARE,MKT_CARRIER_AIRLINE_ID,MKT_CARRIER,...,DEST_HourlyPressureChange,DEST_HourlyPressureTendency,DEST_HourlyRelativeHumidity,DEST_HourlySeaLevelPressure,DEST_HourlyStationPressure,DEST_HourlyVisibility,DEST_HourlyWetBulbTemperature,DEST_HourlyWindDirection,DEST_HourlyWindSpeed,DEST_DATE_CST
7413249,2024,1,1,31,3,1/31/2024 12:00:00 AM,UA,UA_CODESHARE,19977,UA,...,1.5,1.0,61.0,983.6,931.2,16.093,4.6,280.0,6.2,2019-10-18 07:53:00
7413250,2024,1,1,31,3,1/31/2024 12:00:00 AM,DL,DL_CODESHARE,19790,DL,...,1.5,1.0,61.0,983.6,931.2,16.093,4.6,280.0,6.2,2019-10-18 07:53:00
7413251,2024,1,1,31,3,1/31/2024 12:00:00 AM,UA,UA_CODESHARE,19977,UA,...,1.5,1.0,61.0,983.6,931.2,16.093,4.6,280.0,6.2,2019-10-18 07:53:00
7413252,2024,1,1,31,3,1/31/2024 12:00:00 AM,DL,DL_CODESHARE,19790,DL,...,1.5,1.0,61.0,983.6,931.2,16.093,4.6,280.0,6.2,2019-10-18 07:53:00
7413253,2024,1,1,31,3,1/31/2024 12:00:00 AM,UA,UA_CODESHARE,19977,UA,...,1.5,1.0,61.0,983.6,931.2,16.093,4.6,280.0,6.2,2019-10-18 07:53:00
