In [16]:
# Mount Google Drive at /content/drive; the cells below read the input CSV from
# and write their outputs to paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
import pandas as pd
import os
import numpy as np

# Delay columns that this cell imputes and bins, and the bin counts it uses.
DELAY_COLS = ["Arrival Delay in Minutes", "Departure Delay in Minutes"]
N_WIDTH_BINS = 10  # number of equal-width bins
N_DEPTH_BINS = 5   # requested number of equal-depth (quantile) bins

# 1. Load the data
df = pd.read_csv('/content/drive/MyDrive/A2FDA/32130_AT2_25971060.csv')

# Resolve once which delay columns are actually present, so that every step
# below is guarded the same way (previously only the imputation step was, and
# a missing column raised KeyError in the binning loops).
delay_cols = [col for col in DELAY_COLS if col in df.columns]
for col in DELAY_COLS:
    if col not in df.columns:
        # Message: "column not found; imputation and binning skipped for it".
        print(f"⚠️ 未找到列 {col}，已跳过该列的填充与分箱")

# 2. Missing values: fill with the column median
for col in delay_cols:
    df[col] = df[col].fillna(df[col].median(skipna=True))

# 3. Equal-width binning
for col in delay_cols:
    min_val = df[col].min()
    max_val = df[col].max()
    if min_val < max_val:
        # N bins need N + 1 evenly spaced edges.
        bins = np.linspace(min_val, max_val, N_WIDTH_BINS + 1)
    else:
        # No positive range (e.g. a constant column): linspace would return
        # identical edges and pd.cut would raise "Bin edges must be unique".
        # Passing the bin count instead lets pandas widen the range itself.
        bins = N_WIDTH_BINS
    df[f"{col}_width_bin"] = pd.cut(df[col], bins=bins, include_lowest=True)

# 4. Equal-depth (quantile) binning, same strategy for every delay column
for col in delay_cols:
    # duplicates="drop" discards repeated quantile edges, so a column with
    # many tied values can end up with fewer than N_DEPTH_BINS bins.
    df[f"{col}_depth_bin"] = pd.qcut(df[col], q=N_DEPTH_BINS, duplicates="drop")

# 5. Create the PartB folder and save the main data set
output_dir = "/content/drive/MyDrive/A2FDA/PartB"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "flights_binned.csv")
df.to_csv(output_file, index=False)
print(f"✅ 主数据已保存为 {output_file}")

# 6. Count the samples in each equal-width bin and save the table
for col in delay_cols:
    # sort_index() orders the rows by bin interval rather than by frequency.
    counts_width = df[f"{col}_width_bin"].value_counts().sort_index().reset_index()
    counts_width.columns = [f"{col}_width_bin", "sample_count"]
    counts_width.to_csv(os.path.join(output_dir, f"{col}_width_bin_counts.csv"), index=False)
    print(f"✅ {col} 等宽分箱样本数量已保存")

# 7. Count the samples in each equal-depth bin and save the table
for col in delay_cols:
    counts_depth = df[f"{col}_depth_bin"].value_counts().sort_index().reset_index()
    counts_depth.columns = [f"{col}_depth_bin", "sample_count"]
    counts_depth.to_csv(os.path.join(output_dir, f"{col}_depth_bin_counts.csv"), index=False)
    print(f"✅ {col} 等深分箱样本数量已保存")


✅ 主数据已保存为 /content/drive/MyDrive/A2FDA/PartB/flights_binned.csv
✅ Arrival Delay in Minutes 等宽分箱样本数量已保存
✅ Departure Delay in Minutes 等宽分箱样本数量已保存
✅ Arrival Delay in Minutes 等深分箱样本数量已保存
✅ Departure Delay in Minutes 等深分箱样本数量已保存


In [18]:
# Normalise Flight Distance (adds a min-max column and a z-score column)
if "Flight Distance" in df.columns:
    distance = df["Flight Distance"]

    # Min-max normalisation to [0, 1]
    min_val = distance.min()
    max_val = distance.max()
    value_range = max_val - min_val
    # A constant column has zero range, which would make every value 0/0 = NaN.
    # Dividing by 1 instead maps such a column to 0.0; missing values stay NaN.
    df["flight_distance_minmax"] = (distance - min_val) / (value_range if value_range > 0 else 1.0)

    # Z-score normalisation
    mean_val = distance.mean()
    std_val = distance.std()  # pandas default: sample standard deviation (ddof=1)
    # Same zero-division guard: a zero (or undefined) spread yields 0.0, not NaN.
    df["flight_distance_zscore"] = (distance - mean_val) / (std_val if std_val > 0 else 1.0)

# Make sure the PartB folder exists, then save the data with the new columns
output_dir = "/content/drive/MyDrive/A2FDA/PartB"
os.makedirs(output_dir, exist_ok=True)   # no-op when the folder already exists
output_file = os.path.join(output_dir, "flights_binned_normalised.csv")

df.to_csv(output_file, index=False)
print(f"✅ 新数据已保存为 {output_file}")

✅ 新数据已保存为 /content/drive/MyDrive/A2FDA/PartB/flights_binned_normalised.csv


In [19]:
# Discretise Age into labelled categories and tabulate their frequencies.
# The column check is evaluated once and reused for the save/print steps.
has_age = "Age" in df.columns

if has_age:
    labels = ["Young", "Early Adulthood", "Early Middle Age", "Late Middle Age", "Late Adulthood"]
    bins = [0, 21, 34, 44, 64, np.inf]
    # Right-closed intervals: (0, 21], (21, 34], (34, 44], (44, 64], (64, inf).
    # NOTE(review): an Age of 0 or below falls outside the first interval and
    # becomes NaN — confirm that this is intended.
    df["age_category"] = pd.cut(df["Age"], bins=bins, right=True, labels=labels)

    # Frequency of each category (value_counts orders rows by count, descending)
    age_freq = (
        df["age_category"]
        .value_counts()
        .reset_index()
    )
    age_freq.columns = ["Age Category", "Frequency"]

# Make sure the PartB folder exists before writing
output_dir = "/content/drive/MyDrive/A2FDA/PartB"
os.makedirs(output_dir, exist_ok=True)

# Save the main data set (written whether or not Age was present)
output_file_main = os.path.join(output_dir, "flights_binned_normalised_age.csv")
df.to_csv(output_file_main, index=False)

# Save the Age category frequency table
if has_age:
    output_file_agefreq = os.path.join(output_dir, "age_category_frequency.csv")
    age_freq.to_csv(output_file_agefreq, index=False)

# Report only after every file has been written
print(f"✅ 主数据已保存为 {output_file_main}")
if has_age:
    print(f"✅ 年龄类别频数已保存为 {output_file_agefreq}")


✅ 主数据已保存为 /content/drive/MyDrive/A2FDA/PartB/flights_binned_normalised_age.csv
✅ 年龄类别频数已保存为 /content/drive/MyDrive/A2FDA/PartB/age_category_frequency.csv


In [20]:
# Binarise satisfaction: 1 = "satisfied", 0 = "neutral or dissatisfied"
if "satisfaction" in df.columns:
    mapping = {
        "satisfied": 1,
        "neutral or dissatisfied": 0
    }
    df["satisfaction_bin"] = df["satisfaction"].map(mapping)

    # Series.map turns any label that is not a key of `mapping` into NaN
    # without complaint, so report labels that were present but not mapped.
    unmapped = df.loc[
        df["satisfaction_bin"].isna() & df["satisfaction"].notna(), "satisfaction"
    ].unique()
    if len(unmapped) > 0:
        # Message: "satisfaction column has unmapped values (set to NaN): [...]".
        print(f"⚠️ satisfaction 列中有未映射的取值（已置为 NaN）: {list(unmapped)}")

# Declare and create the output folder here, as the other saving cells do,
# instead of relying on `output_dir` left over from an earlier cell.
output_dir = "/content/drive/MyDrive/A2FDA/PartB"
os.makedirs(output_dir, exist_ok=True)

# Save the final main data set (including every derived column)
output_file_final = os.path.join(output_dir, "flights_binned_normalised_age_satisfaction.csv")
df.to_csv(output_file_final, index=False)

print(f"✅ 最终数据已保存为 {output_file_final}")

✅ 最终数据已保存为 /content/drive/MyDrive/A2FDA/PartB/flights_binned_normalised_age_satisfaction.csv
