In this project, you will be tasked with predicting the 'Status' of flights based on historical flight data.

In [3]:
# You might use this block to download the training data
import os
import gdown

if not os.path.exists("flights-data.tgz"):
    data_url = "https://drive.google.com/file/d/10d1zrylGKQQp_Sl-5ejuphn4Q1vKDIo8/view?usp=share_link"
    gdown.download(data_url, fuzzy=True)
    !tar -xvzf "flights-data.tgz"

Downloading...
From (original): https://drive.google.com/uc?id=10d1zrylGKQQp_Sl-5ejuphn4Q1vKDIo8
From (redirected): https://drive.google.com/uc?id=10d1zrylGKQQp_Sl-5ejuphn4Q1vKDIo8&confirm=t&uuid=471200df-2839-47e4-9518-264d5535e164
To: /Users/yushan/workplace/DSAD1001/flights-data.tgz
100%|██████████| 32.9M/32.9M [00:17<00:00, 1.92MB/s]

x flights-data/2016_10.csv
x flights-data/2016_11.csv
x flights-data/2016_12.csv
x flights-data/2016_1.csv
x flights-data/2016_2.csv
x flights-data/2016_3.csv
x flights-data/2016_4.csv
x flights-data/2016_5.csv





x flights-data/2016_6.csv
x flights-data/2016_7.csv
x flights-data/2016_8.csv
x flights-data/2016_9.csv
x flights-data/2017_10.csv
x flights-data/2017_1.csv
x flights-data/2017_2.csv
x flights-data/2017_3.csv
x flights-data/2017_4.csv
x flights-data/2017_5.csv
x flights-data/2017_6.csv
x flights-data/2017_7.csv
x flights-data/2017_8.csv
x flights-data/2017_9.csv


Now, you need to implement your training code

In [5]:
import os
import glob
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')

# 学生信息
student_id = "50012962"
student_name = "Bowen_LIU"

# 读取训练数据
def load_train_data():
    all_files = glob.glob("flights-data/*.csv")
    df_list = []
    for file in all_files:
        df = pd.read_csv(file)
        df_list.append(df)
    return pd.concat(df_list, axis=0, ignore_index=True)

# 数据预处理
def preprocess_data(df):
    # 选择有用特征
    features = ['Quarter', 'Month', 'DayOfWeek', 'UniqueCarrier', 'Origin', 
                'Dest', 'CRSDepTime', 'CRSArrTime', 'Distance', 'DistanceGroup']
    
    df_processed = df[features].copy()
    
    # 处理分类变量
    le_dict = {}
    for col in ['UniqueCarrier', 'Origin', 'Dest']:
        le = LabelEncoder()
        df_processed[col] = le.fit_transform(df_processed[col])
        le_dict[col] = le
        
    # 时间特征处理
    df_processed['CRSDepTime'] = df_processed['CRSDepTime'].apply(lambda x: int(str(int(x)).zfill(4)[:2]))
    df_processed['CRSArrTime'] = df_processed['CRSArrTime'].apply(lambda x: int(str(int(x)).zfill(4)[:2]))
    
    return df_processed, le_dict

# 主程序
def main():
    # 加载训练数据
    print("Loading training data...")
    train_data = load_train_data()
    X_train, le_dict = preprocess_data(train_data)
    y_train = train_data['Status']
    
    # 5折交叉验证
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    f1_scores = []
    
    print("\nTraining with 5-fold cross validation...")
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
        X_fold_train, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_fold_train, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
        rf.fit(X_fold_train, y_fold_train)
        
        y_pred = rf.predict(X_fold_val)
        f1 = f1_score(y_fold_val, y_pred, average='macro')
        f1_scores.append(f1)
        print(f"Fold {fold+1} F1-score (macro): {f1:.4f}")
    
    print(f"\nAverage F1-score (macro): {np.mean(f1_scores):.4f}")
    
    # 加载测试数据并预测
    print("\nPredicting test data...")
    if not os.path.exists("test_data.csv"):
        data_url = "https://drive.google.com/file/d/1eLZRSP9zb9KkdeRg-8CGuak3a_xLS6YF/view?usp=drive_link"
        gdown.download(data_url, fuzzy=True)
    
    test_data = pd.read_csv("test_data.csv")
    X_test, _ = preprocess_data(test_data)
    
    # 使用全量训练数据训练最终模型
    final_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    final_model.fit(X_train, y_train)
    
    test_data['Status'] = final_model.predict(X_test)
    test_data.to_csv(f"{student_id}_{student_name}.csv", index=False)
    print("Prediction completed and saved!")

if __name__ == "__main__":
    main()

Loading training data...

Training with 5-fold cross validation...
Fold 1 F1-score (macro): 0.3671
Fold 2 F1-score (macro): 0.3671
Fold 3 F1-score (macro): 0.3665
Fold 4 F1-score (macro): 0.3662
Fold 5 F1-score (macro): 0.3661

Average F1-score (macro): 0.3666

Predicting test data...
Prediction completed and saved!


Then, it's time to predict and submit the results.

In [18]:
# 导入必要的库
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score
import torch
import torch.nn as nn
import torch.optim as optim

# 检查是否可使用MPS加速
device = torch.device("cpu" if torch.backends.mps.is_available() else "cpu")
print(f"使用设备：{device}")

# 设置学生信息
student_name = 'Your Name'
student_id = 'Your Student ID'

# 1. 读取并合并2016-2017年的航班数据
csv_files = glob.glob('flights-data/*.csv')
data_list = []
print("开始读取CSV文件...")
for file in tqdm(csv_files, desc="读取CSV文件"):
    df = pd.read_csv(file)
    data_list.append(df)
print(f"成功读取{len(csv_files)}个CSV文件，正在合并数据...")
data = pd.concat(data_list, ignore_index=True)
print(f"数据合并完成，共{data.shape[0]}行，{data.shape[1]}列。")

# 2. 处理缺失值
print("开始处理缺失值...")
initial_rows = data.shape[0]
data.dropna(inplace=True)
removed_rows = initial_rows - data.shape[0]
print(f"缺失值处理完成，删除了{removed_rows}行，剩余{data.shape[0]}行。")

# 3. 编码分类变量
print("开始编码分类变量...")
categorical_features = ['UniqueCarrier', 'Origin', 'Dest', 'DepTimeBlk', 'ArrTimeBlk']
label_encoders = {}
for col in tqdm(categorical_features, desc="编码分类变量"):
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# 4. 处理时间特征（无需额外处理，已是数值型）
time_features = ['Quarter', 'Month', 'DayofMonth', 'DayOfWeek']

# 5. 选择有效特征
features = time_features + categorical_features + ['Flights', 'Distance', 'DistanceGroup']
X = data[features]
y = data['Status']

# 将目标变量编码
print("编码目标变量Status...")
status_le = LabelEncoder()
y = status_le.fit_transform(y)

# 转换为NumPy数组
X = X.values
y = y.reshape(-1, 1)

# 转换为Tensor并移动到设备上
X_tensor = torch.tensor(X, dtype=torch.float32).to(device)
y_tensor = torch.tensor(y, dtype=torch.long).to(device)

# 6. 定义神经网络模型
class FlightDelayModel(nn.Module):
    def __init__(self, input_size, num_classes):
        super(FlightDelayModel, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.bn1 = nn.BatchNorm1d(64)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(64, 32)
        self.bn2 = nn.BatchNorm1d(32)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(32, num_classes)
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.bn1(out)
        out = self.relu1(out)
        out = self.fc2(out)
        out = self.bn2(out)
        out = self.relu2(out)
        out = self.fc3(out)
        return out

# 参数设置
input_size = X.shape[1]
num_classes = len(np.unique(y))
num_epochs = 5  # 为了加快示例运行次数，可根据需要增加
batch_size = 1024
learning_rate = 0.001

# 定义模型、损失函数和优化器
model = FlightDelayModel(input_size, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# 7. 进行5折交叉验证
print("开始5折交叉验证训练...")
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold = 1
f1_scores = []
accuracies = []

for train_index, val_index in skf.split(X, y):
    print(f"\nFold {fold}训练中...")
    X_train, X_val = X_tensor[train_index], X_tensor[val_index]
    y_train, y_val = y_tensor[train_index], y_tensor[val_index]
    
    train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
    val_dataset = torch.utils.data.TensorDataset(X_val, y_val)
    
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size)
    
    # 重置模型参数
    def reset_weights(m):
        if isinstance(m, nn.Linear):
            m.reset_parameters()
    model.apply(reset_weights)
    
    # 训练
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        for inputs, labels in tqdm(train_loader, desc=f"Fold {fold} Epoch {epoch+1}/{num_epochs}", leave=False):
            inputs = inputs.to(device)
            labels = labels.view(-1).to(device)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels.squeeze())
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        avg_loss = epoch_loss / len(train_loader)
        print(f"Fold {fold} Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")
    
    # 验证
    model.eval()
    y_val_pred = []
    with torch.no_grad():
        for inputs, _ in val_loader:
            inputs = inputs.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            y_val_pred.extend(predicted.cpu().numpy())
    y_val_cpu = y_val.cpu().numpy()
    f1 = f1_score(y_val_cpu, y_val_pred, average='macro')
    acc = accuracy_score(y_val_cpu, y_val_pred)
    f1_scores.append(f1)
    accuracies.append(acc)
    print(f"Fold {fold} 验证集 F1 分数: {f1:.4f}, 准确率: {acc:.4f}")
    fold += 1

# 输出平均F1分数和准确率
print(f"\n平均 F1 分数: {np.mean(f1_scores):.4f}")
print(f"平均 准确率: {np.mean(accuracies):.4f}")

# 8. 特征重要性分析（由于神经网络无法直接获取特征重要性，可使用Permuation Importance等方法）
# 此处仅作为示例，不具体实现

# 9. 对测试数据进行同样的预处理
print("\n开始处理测试数据...")
test_data = pd.read_csv('test_data.csv')
print(f"测试数据共{test_data.shape[0]}行，{test_data.shape[1]}列。")
test_data.dropna(inplace=True)
print(f"处理缺失值后，测试数据剩余{test_data.shape[0]}行。")

# 编码测试数据的分类变量
for col in tqdm(categorical_features, desc="编码测试数据分类变量"):
    le = label_encoders[col]
    test_data[col] = le.transform(test_data[col])

# 提取特征并转换为Tensor
X_test = test_data[features].values
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)

# 预测测试数据的Status
print("开始预测测试数据...")
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, y_test_pred = torch.max(outputs.data, 1)
y_test_pred_labels = status_le.inverse_transform(y_test_pred.cpu().numpy())

# 将预测结果保存为规定格式的csv文件
test_data['Status'] = y_test_pred_labels
test_data[['Status']].to_csv('prediction_results.csv', index=False)
print("预测结果已保存至 prediction_results.csv")

使用设备：cpu
开始读取CSV文件...


读取CSV文件: 100%|██████████| 22/22 [00:01<00:00, 17.44it/s]


成功读取22个CSV文件，正在合并数据...
数据合并完成，共1034931行，36列。
开始处理缺失值...
缺失值处理完成，删除了2499行，剩余1032432行。
开始编码分类变量...


编码分类变量: 100%|██████████| 5/5 [00:00<00:00, 17.31it/s]


编码目标变量Status...
开始5折交叉验证训练...

Fold 1训练中...


                                                                    

Fold 1 Epoch [1/5], Loss: 0.5666


                                                                    

Fold 1 Epoch [2/5], Loss: 0.5178


                                                                    

Fold 1 Epoch [3/5], Loss: 0.5174


                                                                    

Fold 1 Epoch [4/5], Loss: 0.5170


                                                                    

Fold 1 Epoch [5/5], Loss: 0.5168
Fold 1 验证集 F1 分数: 0.2981, 准确率: 0.8086

Fold 2训练中...


                                                                    

Fold 2 Epoch [1/5], Loss: 0.5249


                                                                    

Fold 2 Epoch [2/5], Loss: 0.5177


                                                                    

Fold 2 Epoch [3/5], Loss: 0.5173


                                                                    

Fold 2 Epoch [4/5], Loss: 0.5170


                                                                    

Fold 2 Epoch [5/5], Loss: 0.5167
Fold 2 验证集 F1 分数: 0.2981, 准确率: 0.8086

Fold 3训练中...


                                                                    

Fold 3 Epoch [1/5], Loss: 0.5242


                                                                    

Fold 3 Epoch [2/5], Loss: 0.5175


                                                                    

Fold 3 Epoch [3/5], Loss: 0.5172


                                                                    

Fold 3 Epoch [4/5], Loss: 0.5169


                                                                    

Fold 3 Epoch [5/5], Loss: 0.5167
Fold 3 验证集 F1 分数: 0.2981, 准确率: 0.8086

Fold 4训练中...


                                                                    

Fold 4 Epoch [1/5], Loss: 0.5233


                                                                    

Fold 4 Epoch [2/5], Loss: 0.5175


                                                                    

Fold 4 Epoch [3/5], Loss: 0.5172


                                                                    

Fold 4 Epoch [4/5], Loss: 0.5169


                                                                    

Fold 4 Epoch [5/5], Loss: 0.5166
Fold 4 验证集 F1 分数: 0.2981, 准确率: 0.8086

Fold 5训练中...


                                                                    

Fold 5 Epoch [1/5], Loss: 0.5257


                                                                    

Fold 5 Epoch [2/5], Loss: 0.5176


                                                                    

Fold 5 Epoch [3/5], Loss: 0.5174


                                                                    

Fold 5 Epoch [4/5], Loss: 0.5170


                                                                    

Fold 5 Epoch [5/5], Loss: 0.5169
Fold 5 验证集 F1 分数: 0.2981, 准确率: 0.8086

平均 F1 分数: 0.2981
平均 准确率: 0.8086

开始处理测试数据...
测试数据共94379行，35列。
处理缺失值后，测试数据剩余94133行。


编码测试数据分类变量: 100%|██████████| 5/5 [00:00<00:00, 222.27it/s]

开始预测测试数据...
预测结果已保存至 prediction_results.csv





In [20]:
# 导入必要的库
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix

# 设置学生信息
student_name = 'Your Name'
student_id = 'Your Student ID'

# 1. 读取并合并2016-2017年的航班数据
csv_files = glob.glob('flights-data/*.csv')
data_list = []
print("开始读取CSV文件...")
for file in tqdm(csv_files, desc="读取CSV文件"):
    df = pd.read_csv(file)
    data_list.append(df)
print(f"成功读取{len(csv_files)}个CSV文件，正在合并数据...")
data = pd.concat(data_list, ignore_index=True)
print(f"数据合并完成，共{data.shape[0]}行，{data.shape[1]}列。")

# 2. 数据清洗和预处理
print("开始处理缺失值...")
initial_rows = data.shape[0]
data.dropna(inplace=True)
removed_rows = initial_rows - data.shape[0]
print(f"缺失值处理完成，删除了{removed_rows}行，剩余{data.shape[0]}行。")

# 确认Status列的取值
print("Status列的唯一值：", data['Status'].unique())

# 将Status列编码为数字
status_le = LabelEncoder()
data['Status'] = status_le.fit_transform(data['Status'])

# 3. 编码分类变量
print("开始编码分类变量...")
categorical_features = ['UniqueCarrier', 'Origin', 'Dest', 'DepTimeBlk', 'ArrTimeBlk']
label_encoders = {}
for col in tqdm(categorical_features, desc="编码分类变量"):
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# 4. 处理时间特征
# CRSDepTime和CRSArrTime可能需要处理为小时或分钟数
def convert_time(x):
    x = int(x)
    if x == 2400:
        x = 0
    hours = x // 100
    minutes = x % 100
    return hours * 60 + minutes

data['CRSDepTime'] = data['CRSDepTime'].apply(convert_time)
data['CRSArrTime'] = data['CRSArrTime'].apply(convert_time)

# 5. 选择特征和目标变量
features = ['Quarter', 'Month', 'DayofMonth', 'DayOfWeek',
            'CRSDepTime', 'CRSArrTime', 'UniqueCarrier', 'Origin', 'Dest',
            'DepTimeBlk', 'ArrTimeBlk', 'Distance', 'DistanceGroup']

X = data[features]
y = data['Status']

# 6. 拆分训练集和测试集
print("拆分训练集和测试集...")
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print(f"训练集大小：{X_train.shape[0]}，验证集大小：{X_valid.shape[0]}")

# 7. 使用随机森林进行模型训练
print("开始模型训练和调优...")
rfc = RandomForestClassifier(random_state=42)

# 定义参数网格
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5]
}

# 进行5折交叉验证
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=cv, scoring='f1_macro', n_jobs=-1)
grid_search.fit(X_train, y_train)

# 输出最佳参数
print("最佳参数：", grid_search.best_params_)

# 在验证集上预测
print("在验证集上进行预测...")
best_rfc = grid_search.best_estimator_
y_pred = best_rfc.predict(X_valid)

# 输出F1分数和准确率
f1 = f1_score(y_valid, y_pred, average='macro')
acc = accuracy_score(y_valid, y_pred)
print(f"验证集 F1 分数: {f1:.4f}")
print(f"验证集 准确率: {acc:.4f}")

# 混淆矩阵和分类报告
print("分类报告：")
print(classification_report(y_valid, y_pred, target_names=status_le.classes_))

# 8. 特征重要性分析
importances = best_rfc.feature_importances_
feature_importance = pd.Series(importances, index=features).sort_values(ascending=False)
print("特征重要性：")
print(feature_importance)

# 可视化特征重要性
import matplotlib.pyplot as plt
plt.figure(figsize=(10,6))
feature_importance.plot(kind='bar')
plt.title('Feature Importance')
plt.tight_layout()
plt.show()

# 9. 对测试数据进行同样的预处理，并进行预测
print("开始处理测试数据并进行预测...")
test_data = pd.read_csv('test_data.csv')
print(f"测试数据共{test_data.shape[0]}行，{test_data.shape[1]}列。")
test_data.dropna(inplace=True)
print(f"处理缺失值后，测试数据剩余{test_data.shape[0]}行。")

# 编码测试数据的分类变量
for col in categorical_features:
    le = label_encoders[col]
    test_data[col] = le.transform(test_data[col])

# 处理时间特征
test_data['CRSDepTime'] = test_data['CRSDepTime'].apply(convert_time)
test_data['CRSArrTime'] = test_data['CRSArrTime'].apply(convert_time)

# 提取特征
X_test = test_data[features]

# 对测试数据进行预测
print("对测试数据进行预测...")
y_test_pred = best_rfc.predict(X_test)

# 将预测结果转换回原始标签
test_data['Status'] = status_le.inverse_transform(y_test_pred)

# 将预测结果保存为规定格式的csv文件
test_data[['Status']].to_csv('prediction_results.csv', index=False)
print("预测结果已保存至 prediction_results.csv")

开始读取CSV文件...


读取CSV文件: 100%|██████████| 22/22 [00:01<00:00, 17.39it/s]


成功读取22个CSV文件，正在合并数据...
数据合并完成，共1034931行，36列。
开始处理缺失值...
缺失值处理完成，删除了2499行，剩余1032432行。
Status列的唯一值： ['normal' 'delay' 'cancel']
开始编码分类变量...


编码分类变量: 100%|██████████| 5/5 [00:00<00:00, 16.98it/s]


拆分训练集和测试集...
训练集大小：825945，验证集大小：206487
开始模型训练和调优...


python(23943) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(23944) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(23945) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(23946) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(23947) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(23948) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(23949) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(23950) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(23951) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(23952) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


KeyboardInterrupt: 

In [22]:
# 导入必要的库
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

# 设置学生信息
student_name = 'Your Name'
student_id = 'Your Student ID'

# 1. 读取并合并2016-2017年的航班数据
csv_files = glob.glob('flights-data/*.csv')
data_list = []
print("开始读取CSV文件...")
for file in tqdm(csv_files, desc="读取CSV文件"):
    df = pd.read_csv(file)
    data_list.append(df)
print(f"成功读取{len(csv_files)}个CSV文件，正在合并数据...")
data = pd.concat(data_list, ignore_index=True)
print(f"数据合并完成，共{data.shape[0]}行，{data.shape[1]}列。")

# 2. 数据清洗和预处理
print("开始处理缺失值...")
initial_rows = data.shape[0]
data.dropna(inplace=True)
removed_rows = initial_rows - data.shape[0]
print(f"缺失值处理完成，删除了{removed_rows}行，剩余{data.shape[0]}行。")

# 确认Status列的取值
print("Status列的唯一值：", data['Status'].unique())

# 将Status列编码为数字
status_le = LabelEncoder()
data['Status'] = status_le.fit_transform(data['Status'])

# 3. 编码分类变量
print("开始编码分类变量...")
categorical_features = ['UniqueCarrier', 'Origin', 'Dest', 'DepTimeBlk', 'ArrTimeBlk']
label_encoders = {}
for col in tqdm(categorical_features, desc="编码分类变量"):
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# 4. 处理时间特征
# CRSDepTime和CRSArrTime转换为分钟数
def convert_time(x):
    x = int(x)
    if x == 2400:
        x = 0
    hours = x // 100
    minutes = x % 100
    return hours * 60 + minutes

data['CRSDepTime'] = data['CRSDepTime'].apply(convert_time)
data['CRSArrTime'] = data['CRSArrTime'].apply(convert_time)

# 5. 选择特征和目标变量
features = ['Quarter', 'Month', 'DayofMonth', 'DayOfWeek',
            'CRSDepTime', 'CRSArrTime', 'UniqueCarrier', 'Origin', 'Dest',
            'DepTimeBlk', 'ArrTimeBlk', 'Distance', 'DistanceGroup']

X = data[features]
y = data['Status']

# 6. 拆分训练集和验证集
print("拆分训练集和验证集...")
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print(f"训练集大小：{X_train.shape[0]}，验证集大小：{X_valid.shape[0]}")

# 7. 使用随机森林进行模型训练和调优
print("开始模型训练和调优...")

# 定义参数网格
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5]
}

# 初始化随机森林分类器
rfc = RandomForestClassifier(random_state=42)

# 初始化 GridSearchCV，设置 verbose=3 以获取详细输出
grid_search = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    scoring='f1_macro',
    n_jobs=-1,  # 使用所有可用的CPU核心
    verbose=3
)

# 进行网格搜索
grid_search.fit(X_train, y_train)

# 输出最佳参数
print("最佳参数：", grid_search.best_params_)

# 在验证集上预测
print("在验证集上进行预测...")
best_rfc = grid_search.best_estimator_
y_pred = best_rfc.predict(X_valid)

# 输出F1分数和准确率
f1 = f1_score(y_valid, y_pred, average='macro')
acc = accuracy_score(y_valid, y_pred)
print(f"验证集 F1 分数: {f1:.4f}")
print(f"验证集 准确率: {acc:.4f}")

# 混淆矩阵和分类报告
print("分类报告：")
print(classification_report(y_valid, y_pred, target_names=status_le.classes_))

# 8. 特征重要性分析
importances = best_rfc.feature_importances_
feature_importance = pd.Series(importances, index=features).sort_values(ascending=False)
print("特征重要性：")
print(feature_importance)

# 可视化特征重要性
plt.figure(figsize=(10,6))
feature_importance.plot(kind='bar')
plt.title('Feature Importance')
plt.tight_layout()
plt.show()

# 9. 对测试数据进行同样的预处理，并进行预测
print("开始处理测试数据并进行预测...")
test_data = pd.read_csv('test_data.csv')
print(f"测试数据共{test_data.shape[0]}行，{test_data.shape[1]}列。")
test_data.dropna(inplace=True)
print(f"处理缺失值后，测试数据剩余{test_data.shape[0]}行。")

# 编码测试数据的分类变量
for col in categorical_features:
    le = label_encoders[col]
    test_data[col] = le.transform(test_data[col])

# 处理时间特征
test_data['CRSDepTime'] = test_data['CRSDepTime'].apply(convert_time)
test_data['CRSArrTime'] = test_data['CRSArrTime'].apply(convert_time)

# 提取特征
X_test = test_data[features]

# 对测试数据进行预测
print("对测试数据进行预测...")
y_test_pred = best_rfc.predict(X_test)

# 将预测结果转换回原始标签
test_data['Status'] = status_le.inverse_transform(y_test_pred)

# 将预测结果保存为规定格式的csv文件
test_data[['Status']].to_csv('prediction_results.csv', index=False)
print("预测结果已保存至 prediction_results.csv")

开始读取CSV文件...


读取CSV文件: 100%|██████████| 22/22 [00:01<00:00, 16.02it/s]


成功读取22个CSV文件，正在合并数据...
数据合并完成，共1034931行，36列。
开始处理缺失值...
缺失值处理完成，删除了2499行，剩余1032432行。
Status列的唯一值： ['normal' 'delay' 'cancel']
开始编码分类变量...


编码分类变量: 100%|██████████| 5/5 [00:00<00:00, 16.31it/s]


拆分训练集和验证集...
训练集大小：825945，验证集大小：206487
开始模型训练和调优...
Fitting 5 folds for each of 12 candidates, totalling 60 fits


python(24625) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(24626) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(24627) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(24628) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(24629) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(24630) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(24631) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(24632) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(24634) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(24635) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


[CV 1/5] END max_depth=10, min_samples_split=2, n_estimators=100;, score=0.298 total time= 1.3min
[CV 3/5] END max_depth=10, min_samples_split=2, n_estimators=100;, score=0.298 total time= 1.3min
[CV 5/5] END max_depth=10, min_samples_split=2, n_estimators=100;, score=0.298 total time= 1.3min
[CV 4/5] END max_depth=10, min_samples_split=2, n_estimators=100;, score=0.298 total time= 1.3min
[CV 2/5] END max_depth=10, min_samples_split=2, n_estimators=100;, score=0.298 total time= 1.3min
[CV 1/5] END max_depth=10, min_samples_split=2, n_estimators=200;, score=0.298 total time= 2.6min
[CV 3/5] END max_depth=10, min_samples_split=2, n_estimators=200;, score=0.298 total time= 2.6min
[CV 2/5] END max_depth=10, min_samples_split=5, n_estimators=100;, score=0.298 total time= 1.3min
[CV 1/5] END max_depth=10, min_samples_split=5, n_estimators=100;, score=0.298 total time= 1.3min
[CV 5/5] END max_depth=10, min_samples_split=5, n_estimators=100;, score=0.298 total time= 1.3min
[CV 3/5] END max_dep

python(25514) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25520) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25522) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25523) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


[CV 1/5] END max_depth=10, min_samples_split=5, n_estimators=200;, score=0.298 total time= 2.5min
[CV 4/5] END max_depth=10, min_samples_split=5, n_estimators=200;, score=0.298 total time= 2.5min
[CV 5/5] END max_depth=10, min_samples_split=5, n_estimators=200;, score=0.298 total time= 2.5min
[CV 2/5] END max_depth=10, min_samples_split=5, n_estimators=200;, score=0.298 total time= 2.6min
[CV 3/5] END max_depth=10, min_samples_split=5, n_estimators=200;, score=0.298 total time= 2.6min
[CV 1/5] END max_depth=20, min_samples_split=5, n_estimators=100;, score=0.318 total time= 2.2min
[CV 2/5] END max_depth=20, min_samples_split=5, n_estimators=100;, score=0.317 total time= 2.2min
[CV 3/5] END max_depth=20, min_samples_split=5, n_estimators=100;, score=0.318 total time= 2.2min
[CV 4/5] END max_depth=20, min_samples_split=5, n_estimators=100;, score=0.317 total time= 2.2min
[CV 5/5] END max_depth=20, min_samples_split=5, n_estimators=100;, score=0.317 total time= 2.2min


KeyboardInterrupt: 

In [None]:
# 导入必要的库
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
import lightgbm as lgb
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
import optuna
from optuna.samplers import TPESampler
import matplotlib.pyplot as plt
import seaborn as sns
import multiprocessing

student_id = "50012962"
student_name = "Bowen_LIU"

# 读取并合并数据
csv_files = glob.glob('flights-data/*.csv')
data_list = []
for file in tqdm(csv_files, desc="读取CSV文件"):
    df = pd.read_csv(file)
    data_list.append(df)
data = pd.concat(data_list, ignore_index=True)
data.dropna(inplace=True)

# 编码分类变量
categorical_features = ['UniqueCarrier', 'Origin', 'Dest', 'DepTimeBlk', 'ArrTimeBlk', 'Status']
label_encoders = {}
for col in tqdm(categorical_features, desc="编码分类变量"):
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# 处理时间特征
def convert_time(x):
    try:
        x = int(x)
    except:
        x = 0
    if x == 2400:
        x = 0
    hours = x // 100
    minutes = x % 100
    return hours * 60 + minutes

data['CRSDepTime'] = data['CRSDepTime'].apply(convert_time)
data['CRSArrTime'] = data['CRSArrTime'].apply(convert_time)

# 选择特征和目标变量
features = ['Month', 'DayofMonth', 'DayOfWeek',
            'CRSDepTime', 'CRSArrTime', 'UniqueCarrier', 'Origin', 'Dest',
            'DepTimeBlk', 'ArrTimeBlk', 'Distance']
X = data[features]
y = data['Status']

# 拆分训练集和验证集
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 定义Optuna的目标函数
# 定义Optuna的目标函数
def objective(trial):
    param = {
        'objective': 'multiclass',
        'num_class': len(np.unique(y_train)),
        'metric': 'multi_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 31, 256),
        'max_depth': trial.suggest_int('max_depth', 5, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 10.0, log=True),
    }
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    f1_scores = []
    accuracies = []
    
    for train_idx, val_idx in cv.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        model = lgb.LGBMClassifier(**param)
        model.fit(X_tr, y_tr)
        
        y_pred = model.predict(X_val)
        
        f1 = f1_score(y_val, y_pred, average='macro')
        acc = accuracy_score(y_val, y_pred)
        
        f1_scores.append(f1)
        accuracies.append(acc)
    
    mean_f1 = np.mean(f1_scores)
    mean_acc = np.mean(accuracies)
    
    # 将 F1 分数和准确率保存为用户属性，便于在 optuna-dashboard 中查看
    trial.set_user_attr('f1_score', mean_f1)
    trial.set_user_attr('accuracy', mean_acc)
    
    # 返回平均 F1 分数（最大化）
    return mean_f1

# 创建Optuna的study并优化，设置优化方向为最大化
study = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(seed=42),
    storage='sqlite:///optuna_study.db',
    study_name='lightgbm_f1_optimization',
    load_if_exists=True
)

# 使用多进程并行优化
if __name__ == '__main__':
    n_trials = 50
    n_jobs = multiprocessing.cpu_count()
    study.optimize(objective, n_trials=n_trials, n_jobs=n_jobs)

# 输出最佳参数
print("最佳参数：", study.best_params)

# 使用最佳参数训练最终模型
best_params = study.best_params
best_params['objective'] = 'multiclass'
best_params['num_class'] = len(np.unique(y_train))
best_params['metric'] = 'multi_logloss'
best_params['verbosity'] = -1
best_params['boosting_type'] = 'gbdt'

model = lgb.LGBMClassifier(**best_params)
model.fit(X_train, y_train)

# 在验证集上进行预测
y_pred = model.predict(X_valid)

# 计算F1分数和准确率
f1 = f1_score(y_valid, y_pred, average='macro')
acc = accuracy_score(y_valid, y_pred)
print(f"验证集 F1 分数: {f1:.4f}")
print(f"验证集 准确率: {acc:.4f}")

# 使用optuna-dashboard可视化优化过程
# 在命令行中运行以下命令启动dashboard（需提前安装optuna-dashboard库）
# optuna-dashboard sqlite:///optuna_study.db

# 处理测试数据并进行预测
print("开始处理测试数据并进行预测...")
test_data = pd.read_csv('test_data.csv')

# 编码测试数据的分类变量
for col in categorical_features:
    if col != 'Status':  # 测试数据中没有 'Status' 列
        le = label_encoders[col]
        test_data[col] = le.transform(test_data[col])

# 处理时间特征
test_data['CRSDepTime'] = test_data['CRSDepTime'].apply(convert_time)
test_data['CRSArrTime'] = test_data['CRSArrTime'].apply(convert_time)

# 提取特征
X_test = test_data[features]

# 对测试数据进行预测
print("对测试数据进行预测...")
y_test_pred = model.predict(X_test)

# 将预测结果转换回原始标签并添加为新列
test_data['Status'] = label_encoders['Status'].inverse_transform(y_test_pred)

# 保存预测结果
test_data[['Status']].to_csv(f"{student_id}_{student_name}.csv", index=False)
print(f"预测结果已保存至 {student_id}_{student_name}.csv")

  from .autonotebook import tqdm as notebook_tqdm
读取CSV文件: 100%|██████████| 22/22 [00:01<00:00, 16.07it/s]
编码分类变量: 100%|██████████| 6/6 [00:00<00:00, 15.66it/s]
[I 2024-12-18 15:54:28,298] A new study created in RDB with name: lightgbm_f1_optimization
[I 2024-12-18 15:55:58,727] Trial 5 finished with value: 0.29805591742113247 and parameters: {'learning_rate': 0.0012080838994751929, 'num_leaves': 129, 'max_depth': 5, 'min_child_samples': 83, 'subsample': 0.7428959954998875, 'colsample_bytree': 0.6954751184670124, 'reg_alpha': 0.00015503101038965013, 'reg_lambda': 0.06352619468795004}. Best is trial 5 with value: 0.29805591742113247.
[I 2024-12-18 15:56:02,210] Trial 3 finished with value: 0.29805591742113247 and parameters: {'learning_rate': 0.007001980240422246, 'num_leaves': 160, 'max_depth': 5, 'min_child_samples': 51, 'subsample': 0.8190883558506318, 'colsample_bytree': 0.531384115194945, 'reg_alpha': 0.10669206394352647, 'reg_lambda': 7.573636652659845}. Best is trial 3 with value

In [None]:

# 导入必要的库
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
import optuna
from optuna.samplers import TPESampler
import matplotlib.pyplot as plt
import seaborn as sns

student_id = "50012962"
student_name = "Bowen_LIU"

# 读取并合并数据
csv_files = glob.glob('flights-data/*.csv')
data_list = []
for file in tqdm(csv_files, desc="读取CSV文件"):
    df = pd.read_csv(file)
    data_list.append(df)
data = pd.concat(data_list, ignore_index=True)
data.dropna(inplace=True)

# 编码分类变量
categorical_features = ['UniqueCarrier', 'Origin', 'Dest', 'DepTimeBlk', 'ArrTimeBlk', 'Status']
label_encoders = {}
for col in tqdm(categorical_features, desc="编码分类变量"):
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# 处理时间特征
def convert_time(x):
    try:
        x = int(x)
    except:
        x = 0
    if x == 2400:
        x = 0
    hours = x // 100
    minutes = x % 100
    return hours * 60 + minutes

data['CRSDepTime'] = data['CRSDepTime'].apply(convert_time)
data['CRSArrTime'] = data['CRSArrTime'].apply(convert_time)

# 选择特征和目标变量
features = ['Month', 'DayofMonth', 'DayOfWeek',
            'CRSDepTime', 'CRSArrTime', 'UniqueCarrier', 'Origin', 'Dest',
            'DepTimeBlk', 'ArrTimeBlk', 'Distance']
X = data[features]
y = data['Status']

# 拆分训练集和验证集
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 定义 Optuna 的目标函数
def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 5, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
    }
    
    clf = RandomForestClassifier(**param, random_state=42, n_jobs=-1)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    f1_scores = []
    
    for train_idx, val_idx in cv.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        clf.fit(X_tr, y_tr)
        y_pred = clf.predict(X_val)
        f1 = f1_score(y_val, y_pred, average='macro')
        f1_scores.append(f1)
        
    return np.mean(f1_scores)

# 8. 创建Optuna的study并优化，使用RDB存储
study = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(seed=42),
    storage='sqlite:///opt_1218_1.db',  # 使用SQLite数据库存储
    study_name='random_forest_f1macro_optimization',
    load_if_exists=True
)
study.optimize(objective, n_trials=50)

# 输出最佳参数
print("最佳参数：", study.best_params)

# 使用最佳参数训练最终模型
best_params = study.best_params
best_params['random_state'] = 42
best_params['n_jobs'] = -1

best_rfc = RandomForestClassifier(**best_params)
best_rfc.fit(X_train, y_train)

# 在验证集上进行预测
y_pred = best_rfc.predict(X_valid)

# 计算 F1 分数和准确率
f1 = f1_score(y_valid, y_pred, average='macro')
acc = accuracy_score(y_valid, y_pred)
print(f"验证集 F1 分数: {f1:.4f}")
print(f"验证集 准确率: {acc:.4f}")

# 处理测试数据并进行预测
print("开始处理测试数据并进行预测...")
test_data = pd.read_csv('test_data.csv')

# 编码测试数据的分类变量
for col in categorical_features:
    if col != 'Status':  # 测试数据中没有 'Status' 列
        le = label_encoders[col]
        test_data[col] = le.transform(test_data[col])

# 处理时间特征
test_data['CRSDepTime'] = test_data['CRSDepTime'].apply(convert_time)
test_data['CRSArrTime'] = test_data['CRSArrTime'].apply(convert_time)

# 提取特征
X_test = test_data[features]

# 对测试数据进行预测
print("对测试数据进行预测...")
y_test_pred = best_rfc.predict(X_test)

# 将预测结果转换回原始标签并添加为新列
test_data['Status'] = label_encoders['Status'].inverse_transform(y_test_pred)

# 保存预测结果
test_data[['Status']].to_csv(f"{student_id}_{student_name}.csv", index=False)
print(f"预测结果已保存至 {student_id}_{student_name}.csv")

  from .autonotebook import tqdm as notebook_tqdm
读取CSV文件: 100%|██████████| 22/22 [00:01<00:00, 16.71it/s]
编码分类变量: 100%|██████████| 6/6 [00:00<00:00, 17.98it/s]
[I 2024-12-18 15:37:12,239] A new study created in RDB with name: random_forest_f1macro_optimization
[W 2024-12-18 15:39:49,564] Trial 0 failed with parameters: {'n_estimators': 437, 'max_depth': 48, 'min_samples_split': 8, 'min_samples_leaf': 6, 'bootstrap': True} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/envs/DSAAnew/lib/python3.9/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/ky/cqbmc7gj63z4z812w3fwdrf80000gp/T/ipykernel_47714/1323954868.py", line 77, in objective
    clf.fit(X_tr, y_tr)
  File "/opt/homebrew/anaconda3/envs/DSAAnew/lib/python3.9/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/opt/homebrew/anacon

KeyboardInterrupt: 

In [4]:
import os
import gdown
import numpy as np
import pandas as pd

# input your student ID and name here
student_id = "50012962"
student_name = "Bowen_LIU"

if not os.path.exists("test_data.csv"):
    data_url = "https://drive.google.com/file/d/1eLZRSP9zb9KkdeRg-8CGuak3a_xLS6YF/view?usp=drive_link"
    gdown.download(data_url, fuzzy=True)
data = pd.read_csv("test_data.csv")

# WARNING: Change this line to your prediction code!!! Do not submit this random result!!!
data["Status"] = np.random.choice(np.array(["normal", "cancel", "delay"]), size=len(data), replace=True)

# Submit this ipynb and the csv file, you will be graded according to the f1 score (macro) and the quality of your code
data.to_csv(f"{student_id}_{student_name}.csv")

In [1]:
# 1. 导入必要的库
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
import lightgbm as lgb
from sklearn.metrics import f1_score, accuracy_score
import optuna
from optuna.samplers import TPESampler
import multiprocessing

student_id = "50012962"
student_name = "Bowen_LIU"

# 2. 读取并合并数据
csv_files = glob.glob('flights-data/*.csv')
data_list = []
for file in tqdm(csv_files, desc="读取CSV文件"):
    df = pd.read_csv(file)
    data_list.append(df)
data = pd.concat(data_list, ignore_index=True)
data.dropna(inplace=True)

# 3. 编码分类变量
categorical_features = ['UniqueCarrier', 'Origin', 'Dest', 'DepTimeBlk', 'ArrTimeBlk', 'Status']
label_encoders = {}
for col in tqdm(categorical_features, desc="编码分类变量"):
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# 4. 处理时间特征
def convert_time(x):
    try:
        x = int(x)
    except:
        x = 0
    if x == 2400:
        x = 0
    hours = x // 100
    minutes = x % 100
    return hours * 60 + minutes

data['CRSDepTime'] = data['CRSDepTime'].apply(convert_time)
data['CRSArrTime'] = data['CRSArrTime'].apply(convert_time)

# 5. 选择特征和目标变量
features = ['Month', 'DayofMonth', 'DayOfWeek',
            'CRSDepTime', 'CRSArrTime', 'UniqueCarrier', 'Origin', 'Dest',
            'DepTimeBlk', 'ArrTimeBlk', 'Distance']
X = data[features]
y = data['Status']

# 6. 拆分训练集和验证集
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 7. 定义Optuna的目标函数
def objective(trial):
    param = {
        'objective': 'multiclass',
        'num_class': len(np.unique(y_train)),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
        'num_leaves': trial.suggest_int('num_leaves', 31, 256),
        'max_depth': trial.suggest_int('max_depth', 5, 50, step=5),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 10.0, log=True),
    }
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    f1_scores = []
    accuracies = []
    
    for train_idx, val_idx in cv.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        model = lgb.LGBMClassifier(
            **param,
            random_state=42,
            n_jobs=-1
        )
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_val)
        
        f1 = f1_score(y_val, y_pred, average='macro')
        acc = accuracy_score(y_val, y_pred)
        
        f1_scores.append(f1)
        accuracies.append(acc)
    
    trial.set_user_attr("accuracy", np.mean(accuracies))
    return np.mean(f1_scores)

# 8. 创建Optuna的study并优化
study = optuna.create_study(
    direction='maximize',
    sampler=TPESampler(seed=42),
    study_name='lightgbm_f1_optimization',
    storage='sqlite:///optuna_lgbm_study.db',
    load_if_exists=True
)

if __name__ == '__main__':
    n_trials = 100
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

# 9. 输出最佳参数
print("最佳参数：", study.best_params)

# 10. 使用最佳参数训练最终模型
best_params = study.best_params
best_params['objective'] = 'multiclass'
best_params['num_class'] = len(np.unique(y_train))
best_params['random_state'] = 42
best_params['n_jobs'] = -1

model = lgb.LGBMClassifier(**best_params)
model.fit(X_train, y_train)

# 11. 在验证集上进行预测
y_pred = model.predict(X_valid)

# 12. 计算F1分数和准确率
f1 = f1_score(y_valid, y_pred, average='macro')
acc = accuracy_score(y_valid, y_pred)
print(f"验证集 F1 分数: {f1:.4f}")
print(f"验证集 准确率: {acc:.4f}")

# 13. 使用optuna-dashboard可视化优化过程
# 在命令行中运行以下命令启动dashboard（需提前安装optuna-dashboard库）
# optuna-dashboard sqlite:///optuna_lgbm_study.db

# 14. 处理测试数据并进行预测
print("开始处理测试数据并进行预测...")
test_data = pd.read_csv('test_data.csv')

# 编码测试数据的分类变量
for col in categorical_features:
    if col != 'Status':  # 测试数据中没有 'Status' 列
        le = label_encoders[col]
        test_data[col] = le.transform(test_data[col])

# 处理时间特征
test_data['CRSDepTime'] = test_data['CRSDepTime'].apply(convert_time)
test_data['CRSArrTime'] = test_data['CRSArrTime'].apply(convert_time)

# 提取特征
X_test = test_data[features]

# 对测试数据进行预测
print("对测试数据进行预测...")
y_test_pred = model.predict(X_test)

# 将预测结果转换回原始标签并添加为新列
test_data['Status'] = label_encoders['Status'].inverse_transform(y_test_pred)

# 保存预测结果
test_data[['Status']].to_csv(f"{student_id}_{student_name}.csv", index=False)
print(f"预测结果已保存至 {student_id}_{student_name}.csv")

  from .autonotebook import tqdm as notebook_tqdm
读取CSV文件: 100%|██████████| 22/22 [00:01<00:00, 16.15it/s]
编码分类变量: 100%|██████████| 6/6 [00:00<00:00, 16.65it/s]
[I 2024-12-18 16:01:58,761] A new study created in RDB with name: lightgbm_f1_optimization
  0%|          | 0/100 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001243 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1243
[LightGBM] [Info] Number of data points in the train set: 660756, number of used features: 11
[LightGBM] [Info] Start training from score -4.484366
[LightGBM] [Info] Start training from score -1.714110
[LightGBM] [Info] Start training from score -0.212461
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001806 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1240
[LightGBM] [Info] Number of data points in the train set: 660756, number of used features: 11
[LightGBM] [Info] Start training from score -4.484500
[LightGBM] [Info] Start training from score -1.714101
[LightGBM] [Info] Star

  0%|          | 0/100 [02:12<?, ?it/s]


[W 2024-12-18 16:04:11,095] Trial 0 failed with parameters: {'learning_rate': 0.005611516415334507, 'n_estimators': 1000, 'num_leaves': 196, 'max_depth': 30, 'min_child_samples': 19, 'subsample': 0.5779972601681014, 'colsample_bytree': 0.5290418060840998, 'reg_alpha': 2.142302175774105, 'reg_lambda': 0.10129197956845731} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/envs/DSAAnew/lib/python3.9/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/ky/cqbmc7gj63z4z812w3fwdrf80000gp/T/ipykernel_53374/3460865889.py", line 90, in objective
    model.fit(X_tr, y_tr)
  File "/opt/homebrew/anaconda3/envs/DSAAnew/lib/python3.9/site-packages/lightgbm/sklearn.py", line 1284, in fit
    super().fit(
  File "/opt/homebrew/anaconda3/envs/DSAAnew/lib/python3.9/site-packages/lightgbm/sklearn.py", line 955, in fit
    self._Booster = train(
  File "/opt/homebrew/an

KeyboardInterrupt: 