In [None]:
# Mount Google Drive at /content/drive (Colab-only helper from google.colab).
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Read the assignment CSV from the mounted Drive into `df`.
# The 'utf-8-sig' codec discards a leading UTF-8 byte-order mark if the file starts with one.
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/A2FDA/32130_AT2_25971060.csv', encoding='utf-8-sig')


In [None]:
# ========== 2) Load data, declare variable types, clean ==========

import numpy as np

# Trim leading/trailing whitespace from every column header.
df.columns = [name.strip() for name in df.columns]

# Drop the 'id' column (when present) so the identifier cannot leak into the analysis.
df = df.drop(columns=['id']) if 'id' in df.columns else df

# Each list below keeps only the candidate names that actually exist in df.

# Nominal (unordered) attributes.
nominal_cols = [
    name
    for name in ['Gender', 'Customer Type', 'Type of Travel', 'satisfaction']
    if name in df.columns
]

# Ordinal attributes: cabin class plus the 0-5 rating items.
ordinal_cols = [
    name
    for name in [
        'Class',
        'Inflight wifi service',
        'Departure/Arrival time convenient',
        'Ease of Online booking',
        'Gate location',
        'Food and drink',
        'Online boarding',
        'Seat comfort',
        'Inflight entertainment',
        'On-board service',
        'Leg room service',
        'Baggage handling',
        'Checkin service',
        'Inflight service',
        'Cleanliness',
    ]
    if name in df.columns
]

# Ratio attributes (continuous, with a true zero).
ratio_cols = [
    name
    for name in [
        'Age',
        'Flight Distance',
        'Departure Delay in Minutes',
        'Arrival Delay in Minutes',
    ]
    if name in df.columns
]

# Show the resulting grouping.
print("Nominal:", nominal_cols)
print("Ordinal:", ordinal_cols)
print("Ratio:", ratio_cols)

# === Declare ordered categories ===
from pandas.api.types import CategoricalDtype

# Cabin class: keep only the levels observed in the data, in the order listed here.
observed_classes = df.get('Class', pd.Series(dtype=str)).unique().tolist()
class_order = [
    level for level in ['Eco', 'Eco Plus', 'Business', 'First'] if level in observed_classes
]
if class_order:
    df['Class'] = df['Class'].astype(CategoricalDtype(categories=class_order, ordered=True))

# Rating items: every ordinal column except 'Class'.
likert_cols = [name for name in ordinal_cols if name != 'Class']
likert_dtype = CategoricalDtype(categories=[0, 1, 2, 3, 4, 5], ordered=True)
for score_col in likert_cols:
    # Coerce to numbers; anything that cannot be parsed becomes NaN.
    scores = pd.to_numeric(df[score_col], errors='coerce')
    # Values outside 0..5 are treated as invalid and blanked out.
    scores = scores.where(scores.isin([0, 1, 2, 3, 4, 5]))
    # Go through nullable integers, then replace the column in place with an
    # ordered categorical so plots follow the 0..5 order.
    df[score_col] = scores.astype('Int64').astype(likert_dtype)

# Clean the satisfaction label (trim surrounding whitespace) and derive a binary flag.
if 'satisfaction' in df.columns:
    raw_label = df['satisfaction']
    # astype(str) alone would turn a missing label into the literal text 'nan',
    # which would then be counted as a real category; restore those cells to
    # missing after stripping.
    df['satisfaction'] = raw_label.astype(str).str.strip().where(raw_label.notna())
    # Binary label for grouping: 1 = 'satisfied' (case-insensitive), 0 = any other label.
    # Nullable Int64 keeps rows without a label as <NA> instead of counting them as 0.
    is_satisfied = df['satisfaction'].str.lower() == 'satisfied'
    df['satisfaction_bin'] = is_satisfied.astype('Int64').mask(df['satisfaction'].isna())


In [None]:
import os
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's 'whitegrid' style to the plots drawn in this notebook.
sns.set_theme(style='whitegrid')

# Helper: build a file-system-friendly name (no spaces or punctuation).
def sanitize(s: str) -> str:
    """Return ``str(s)`` with every non-alphanumeric character replaced by '_'.

    Leading and trailing underscores are stripped afterwards; runs of
    underscores in the middle are kept as they are.
    """
    safe_chars = []
    for ch in str(s):
        safe_chars.append(ch if ch.isalnum() else '_')
    return ''.join(safe_chars).strip('_')

# Output folders for the saved plots and tables; both are created if they do not exist.
BASE_OUT_DIR = '/content/drive/MyDrive/A2FDA'
PLOTS_DIR, TABLES_DIR = (
    os.path.join(BASE_OUT_DIR, subfolder) for subfolder in ('plots', 'tables')
)
os.makedirs(PLOTS_DIR, exist_ok=True)
os.makedirs(TABLES_DIR, exist_ok=True)


## Nominal 和 Ordinal 变量的频数和百分比统计

In [None]:

# ========== 3.1 Compute summary statistics ==========
# Accumulator: the per-column loops further down append one summary dict per attribute.
attr_summary_rows = []

# --- Nominal / Ordinal: frequencies and percentages ---
def categorical_summary(col):
    """Tabulate the values of ``df[col]`` and return headline facts about the column.

    Side effect: writes ``<sanitized col>_frequencies.csv`` (columns Value,
    Count, Percent) into TABLES_DIR. Missing values are left out of the table,
    but Percent is computed against ``len(df)`` (all rows), so every column
    shares the same denominator.

    Returns a dict with the keys attribute, type, n (total rows), n_missing,
    n_unique, top_value, top_count and table_path. top_value/top_count come
    from the first entry of ``value_counts()``.
    """
    column = df[col]
    observed = column.dropna()
    counts = observed.value_counts(dropna=False)
    shares = counts / len(df) * 100

    # Persist the full frequency table next to the other tables.
    table_file = os.path.join(TABLES_DIR, f'{sanitize(col)}_frequencies.csv')
    pd.DataFrame({
        'Value': counts.index.astype(str),
        'Count': counts.values,
        'Percent': shares.values,
    }).to_csv(table_file, index=False)

    # Headline figures; fall back to None / 0 when nothing was counted.
    if len(counts):
        top_val, top_cnt = counts.index[0], int(counts.iloc[0])
    else:
        top_val, top_cnt = None, 0

    return {
        'attribute': col,
        'type': 'categorical',
        'n': len(df),
        'n_missing': int(column.isna().sum()),
        'n_unique': int(observed.nunique()),
        'top_value': str(top_val),
        'top_count': int(top_cnt),
        'table_path': table_file,
    }


In [None]:
# --- Ordinal: median and quartiles (interpret the median and the spread, not the mean) ---
def ordinal_extras(col):
    """Return the median and the 25th/75th percentiles of ``df[col]``.

    The column is coerced to numbers first (rating items are stored as
    categories 0..5). Values that cannot be read as numbers become NaN; when
    nothing numeric remains, all three entries are None.
    """
    values = pd.to_numeric(df[col], errors='coerce')
    extras = {'median': None, 'p25': None, 'p75': None}
    if values.notna().any():
        extras['median'] = float(values.median())
        extras['p25'] = float(values.quantile(0.25))
        extras['p75'] = float(values.quantile(0.75))
    return extras


In [None]:
# --- Ratio: mean, median, standard deviation, range, quartiles, skewness, kurtosis ---
def ratio_summary(col):
    """Summarise the numeric column ``df[col]`` as a flat dict.

    The column is coerced with ``pd.to_numeric(errors='coerce')``, so
    unparseable entries count as missing. Keys, in order: attribute, type,
    n (non-missing count), missing_idx (index labels of missing rows),
    n_missing, mean, median, std, min, p25, p75, max, n_zeros, skew, kurt.
    A statistic that ``describe()`` reports as NaN is returned as None; skew
    needs more than 2 and kurt more than 3 non-missing values, else None.
    """
    values = pd.to_numeric(df[col], errors='coerce')
    stats = values.describe(percentiles=[0.25, 0.5, 0.75])
    is_missing = values.isna()
    n_valid = int((~is_missing).sum())

    def stat_or_none(label):
        # describe() yields NaN for statistics it cannot compute; report those as None.
        raw = stats.get(label, np.nan)
        return None if np.isnan(raw) else float(raw)

    summary = {
        'attribute': col,
        'type': 'ratio',
        'n': int(stats.get('count', 0)),
        'missing_idx': values.index[is_missing.to_numpy()].tolist(),
        'n_missing': int(is_missing.sum()),
    }
    # Output key -> label used by describe().
    for key, label in (
        ('mean', 'mean'),
        ('median', '50%'),
        ('std', 'std'),
        ('min', 'min'),
        ('p25', '25%'),
        ('p75', '75%'),
        ('max', 'max'),
    ):
        summary[key] = stat_or_none(label)

    summary['n_zeros'] = int((values == 0).sum()) if n_valid > 0 else 0
    summary['skew'] = float(values.skew()) if n_valid > 2 else None
    summary['kurt'] = float(values.kurt()) if n_valid > 3 else None
    return summary


In [None]:
# Build the per-attribute summary table, one row per column.
# Start from an empty list here so that re-running this cell rebuilds the table
# instead of appending a second copy of every row.
attr_summary_rows = []

for col in nominal_cols:
    base = categorical_summary(col)
    base['subtype'] = 'nominal'
    attr_summary_rows.append(base)

for col in ordinal_cols:
    base = categorical_summary(col)
    base['subtype'] = 'ordinal'
    # Ordinal columns additionally report median and quartiles.
    base.update(ordinal_extras(col))
    attr_summary_rows.append(base)

for col in ratio_cols:
    attr_summary_rows.append(ratio_summary(col))

# Persist and show the combined table.
attr_summary = pd.DataFrame(attr_summary_rows)
attr_summary_path = os.path.join(TABLES_DIR, 'attribute_summary.csv')
attr_summary.to_csv(attr_summary_path, index=False)
display(attr_summary)
print("Saved:", attr_summary_path)

In [None]:
# ========== 3.2 Visualisation ==========
def save_fig(fig, path):
    """Tighten the layout of ``fig``, write it to ``path`` and close it.

    Parameters
    ----------
    fig : figure object providing ``tight_layout()`` and ``savefig()``.
    path : destination file; its parent directory is created when missing.

    The figure is closed even if laying out or saving raises, so a failed
    save does not leave an open figure behind. The exception itself still
    propagates to the caller.
    """
    try:
        target_dir = os.path.dirname(path)
        if target_dir:  # a bare file name has no directory part to create
            os.makedirs(target_dir, exist_ok=True)
        fig.tight_layout()
        fig.savefig(path, dpi=150, bbox_inches='tight')
    finally:
        plt.close(fig)

In [None]:
# Bar charts for every nominal and ordinal attribute.
for col in nominal_cols + ordinal_cols:
    col_dtype = df[col].dtype
    is_ordinal = col in ordinal_cols
    # Ordinal attributes are drawn in category order (Class levels or scores 0..5);
    # everything else is passed order=None.
    if is_ordinal and col == 'Class' and hasattr(col_dtype, 'categories') and col_dtype.ordered:
        order = col_dtype.categories
    elif is_ordinal and col in likert_cols:
        order = [0, 1, 2, 3, 4, 5]
    else:
        order = None

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(data=df, x=col, order=order, ax=ax, color='#4C78A8')
    ax.set(title=f'Frequency of {col}', xlabel=col, ylabel='Count')
    # Slant the tick labels so long category names do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=30, horizontalalignment='right')
    save_fig(fig, os.path.join(PLOTS_DIR, f'{sanitize(col)}_bar.png'))

In [None]:
# ------------------ Shared implementation ------------------
def _plot_hist_and_box(df, out_dir, col, stem, title, xlabel,
                       hist_color, box_color, bin_width=None):
    """Save a histogram and a box plot of the numeric column ``df[col]``.

    Parameters
    ----------
    df : DataFrame holding the column.
    out_dir : directory that receives ``<stem>_hist.png`` and ``<stem>_box.png``.
    col : column name; coerced with ``pd.to_numeric`` and stripped of NaN.
    stem : file-name prefix for the two images.
    title : prefix for the plot titles ('<title> Distribution' / '<title> Boxplot').
    xlabel : x-axis label used on both plots.
    hist_color, box_color : fill colours of the two plots.
    bin_width : None -> automatic bins with a KDE overlay; a number -> fixed-width
        bins starting at 0 without KDE.

    When the column has no numeric values nothing is drawn and a notice is printed.
    """
    s = pd.to_numeric(df[col], errors='coerce').dropna()
    if s.empty:
        # s.max() would be NaN here and the bin computation below would fail.
        print(f"No numeric values in '{col}'; skipping its plots.")
        return

    if bin_width is None:
        hist_kwargs = {'bins': 'auto', 'kde': True}
    else:
        edges = np.arange(0, s.max() + bin_width, bin_width)
        if len(edges) < 2:
            # A maximum of 0 yields a single edge; a histogram needs at least one full bin.
            edges = np.array([0, bin_width])
        hist_kwargs = {'bins': edges, 'kde': False}

    # Histogram
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(s, color=hist_color, ax=ax, **hist_kwargs)
    ax.set_title(f'{title} Distribution')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Count')
    save_fig(fig, os.path.join(out_dir, f'{stem}_hist.png'))

    # Box plot
    fig, ax = plt.subplots(figsize=(6, 2.8))
    sns.boxplot(x=s, color=box_color, ax=ax, showfliers=True)
    ax.set_title(f'{title} Boxplot')
    ax.set_xlabel(xlabel)
    save_fig(fig, os.path.join(out_dir, f'{stem}_box.png'))


# ------------------ Age ------------------
def plot_age(df, out_dir):
    """Save 'Age_hist.png' (histogram + KDE) and 'Age_box.png' into ``out_dir``."""
    _plot_hist_and_box(df, out_dir, col='Age', stem='Age', title='Age', xlabel='Age',
                       hist_color='#72B7B2', box_color='#E45756')


# ------------------ Flight Distance ------------------
def plot_flight_distance(df, out_dir):
    """Save 'Flight_Distance_hist.png' (histogram + KDE) and 'Flight_Distance_box.png'."""
    _plot_hist_and_box(df, out_dir, col='Flight Distance', stem='Flight_Distance',
                       title='Flight Distance', xlabel='Flight Distance',
                       hist_color='#4C78A8', box_color='#F28E2B')


# ------------------ Departure Delay ------------------
def plot_departure_delay(df, out_dir):
    """Save 'Departure_Delay_hist.png' (5-minute bins from 0) and 'Departure_Delay_box.png'."""
    _plot_hist_and_box(df, out_dir, col='Departure Delay in Minutes', stem='Departure_Delay',
                       title='Departure Delay', xlabel='Minutes',
                       hist_color='#59A14F', box_color='#E45756', bin_width=5)


# ------------------ Arrival Delay ------------------
def plot_arrival_delay(df, out_dir):
    """Save 'Arrival_Delay_hist.png' (5-minute bins from 0) and 'Arrival_Delay_box.png'."""
    _plot_hist_and_box(df, out_dir, col='Arrival Delay in Minutes', stem='Arrival_Delay',
                       title='Arrival Delay', xlabel='Minutes',
                       hist_color='#F28E2B', box_color='#E45756', bin_width=5)

In [None]:
# Draw and save the plots for each ratio variable.
for draw_plots in (plot_age, plot_flight_distance, plot_departure_delay, plot_arrival_delay):
    draw_plots(df, PLOTS_DIR)

print("All ratio variable plots have been saved to:", PLOTS_DIR)


In [None]:
# Rating items used for the correlation analysis. As in the cleaning cell, only
# names that exist in df are kept, so a missing column cannot raise a KeyError later.
likert_cols = [c for c in [
    'Inflight wifi service','Departure/Arrival time convenient','Ease of Online booking',
    'Gate location','Food and drink','Online boarding','Seat comfort','Inflight entertainment',
    'On-board service','Leg room service','Baggage handling','Checkin service',
    'Inflight service','Cleanliness'
] if c in df.columns]

# Encode satisfaction as a number (0 = neutral or dissatisfied, 1 = satisfied).
# Labels are trimmed and lower-cased before the lookup so differences in case or
# surrounding spaces do not produce NaN; labels outside the mapping still map to NaN.
df['satisfaction_num'] = (
    df['satisfaction']
    .str.strip()
    .str.lower()
    .map({'neutral or dissatisfied': 0, 'satisfied': 1})
)


In [None]:
def plot_spearman_corr(df, cols, out_dir):
    """
    Draw a Spearman correlation heatmap for ``cols`` and save it.

    df: DataFrame holding the columns
    cols: list of column names to correlate
    out_dir: directory that receives 'Spearman_Correlation_Heatmap.png'
             (created when missing)
    """
    # Convert every column to plain numbers first. The rating columns may be stored
    # with a categorical dtype, and whether DataFrame.corr accepts, drops or rejects
    # non-numeric dtypes depends on the pandas version; explicit coercion makes the
    # result independent of that. Unparseable values become NaN.
    numeric = df[cols].apply(pd.to_numeric, errors='coerce')

    # Spearman correlation matrix
    corr = numeric.corr(method='spearman')

    # Make sure the output directory exists
    os.makedirs(out_dir, exist_ok=True)

    # Heatmap centred on 0 so positive and negative correlations get opposite colours
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(corr, annot=False, cmap="RdBu_r", center=0, ax=ax)
    ax.set_title("Spearman Correlation Heatmap")

    # Save the figure
    save_fig(fig, os.path.join(out_dir, 'Spearman_Correlation_Heatmap.png'))


In [None]:
# Correlate the rating items together with the numeric satisfaction label and save the heatmap to PLOTS_DIR.
plot_spearman_corr(df, likert_cols + ['satisfaction_num'], out_dir=PLOTS_DIR)
