In [9]:
def get_time_period_code(time):
    hour = time // 100  # 提取小时部分
    if 6 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 18:
        return 'Afternoon'
    elif 18 <= hour < 24:
        return 'Evening'
    else:
        return 'Night'


#读取training_data
import pandas as pd

training_data = pd.read_csv('training_data.csv')
# 应用函数创建新的编码后的时间段列
training_data['DEP_CateTime'] = training_data['CRS_DEP_TIME'].apply(get_time_period_code)
training_data['ARR_CateTime'] = training_data['CRS_ARR_TIME'].apply(get_time_period_code)

training_data = training_data[training_data['CANCELLED'] != 1].fillna(0)

training_data['target'] = (training_data['ARR_DELAY'] >= 0).astype(int)
# 设置权重，假设以延迟时间的平方根作为权重
training_data['weight'] = training_data['WEATHER_DELAY'].apply(lambda x: max(1, x ** 0.5))


In [10]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

# 提取特征和目标变量
X = training_data.drop(columns=['ARR_DELAY', 'target'])
y = training_data['target']

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
sample_weight = X_train['weight']

# 删除不需要的列
X_train = X_train.drop(columns=['MKT_CARRIER_FL_NUM', 'YEAR', 'DEP_DELAY', 'WEATHER_DELAY', 'CANCELLED', 'weight', 'CRS_DEP_TIME', 'CRS_ARR_TIME','CRS_ELAPSED_TIME', 'ORIGIN_HourlyPressureChange', 'ORIGIN_HourlyPressureTendency','DEST_HourlyPressureChange', 'DEST_HourlyPressureTendency','ORIGIN_HourlyWetBulbTemperature','DEST_HourlyWetBulbTemperature' , 'ORIGIN_HourlyDewPointTemperature','DEST_HourlyDewPointTemperature'], errors='ignore')
X_test = X_test.drop(columns=['MKT_CARRIER_FL_NUM', 'YEAR', 'DEP_DELAY', 'WEATHER_DELAY', 'CANCELLED', 'weight', 'CRS_DEP_TIME', 'CRS_ARR_TIME','CRS_ELAPSED_TIME','ORIGIN_HourlyPressureChange', 'ORIGIN_HourlyPressureTendency','DEST_HourlyPressureChange', 'DEST_HourlyPressureTendency','ORIGIN_HourlyWetBulbTemperature','DEST_HourlyWetBulbTemperature','ORIGIN_HourlyDewPointTemperature','DEST_HourlyDewPointTemperature' ], errors='ignore')

# 1. 对指定特征进行独热编码，不删除任何类别
onehot_categorical_columns = ['OP_CARRIER', 'DEP_CateTime', 'ARR_CateTime', 'DAY_OF_WEEK', 'Holiday_Period']
X_train_onehot = pd.get_dummies(X_train[onehot_categorical_columns], drop_first=False)
X_test_onehot = pd.get_dummies(X_test[onehot_categorical_columns], drop_first=False)

# 确保测试集和训练集具有相同的列
X_test_onehot = X_test_onehot.reindex(columns=X_train_onehot.columns, fill_value=0)

# 2. 对指定特征进行频次编码
freq_categorical_columns = ['ORIGIN', 'DEST']
X_train_freq = X_train[freq_categorical_columns].copy()
X_test_freq = X_test[freq_categorical_columns].copy()

for col in freq_categorical_columns:
    freq_encoding = X_train[col].value_counts().to_dict()
    X_train_freq[col] = X_train[col].map(freq_encoding)
    X_test_freq[col] = X_test[col].map(freq_encoding)

# 3. 将独热编码和频次编码的数据集与其他数值特征合并
X_train_encoded = pd.concat([X_train_onehot, X_train_freq, X_train.drop(columns=onehot_categorical_columns + freq_categorical_columns)], axis=1)
X_test_encoded = pd.concat([X_test_onehot, X_test_freq, X_test.drop(columns=onehot_categorical_columns + freq_categorical_columns)], axis=1)

# 转换为浮点数
X_train_encoded = X_train_encoded.astype(float)
X_test_encoded = X_test_encoded.astype(float)

# 标准化数据
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

# 设置逻辑回归模型并去掉截距
model = LogisticRegression(max_iter=2500, solver='saga', fit_intercept=False)

# 训练逻辑回归模型，使用样本权重
model.fit(X_train_scaled, y_train,sample_weight=sample_weight)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# 计算特征之间的相关系数矩阵
correlation_matrix = X_train_encoded.corr()


# 绘制热力图，不显示数值
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', cbar_kws={'label': 'Correlation Coefficient'})
plt.title("Correlation Heatmap of Features")
plt.show()


In [22]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score, log_loss, cohen_kappa_score, matthews_corrcoef

# 在训练集上进行预测
y_train_pred = model.predict(X_train_scaled)
y_train_pred_proba = model.predict_proba(X_train_scaled)

# 在测试集上进行预测
y_test_pred = model.predict(X_test_scaled)
y_test_pred_proba = model.predict_proba(X_test_scaled)

# 计算训练集上的指标
train_metrics = {
    'Accuracy': accuracy_score(y_train, y_train_pred),
    'Precision': precision_score(y_train, y_train_pred),
    'Recall': recall_score(y_train, y_train_pred),
    'F1 Score': f1_score(y_train, y_train_pred),
    'ROC AUC': roc_auc_score(y_train, y_train_pred_proba[:, 1]),
    'Average Precision': average_precision_score(y_train, y_train_pred_proba[:, 1]),
    'Log Loss': log_loss(y_train, y_train_pred_proba),
    'Cohen Kappa': cohen_kappa_score(y_train, y_train_pred),
    'MCC': matthews_corrcoef(y_train, y_train_pred)
}

# 计算测试集上的指标
test_metrics = {
    'Accuracy': accuracy_score(y_test, y_test_pred),
    'Precision': precision_score(y_test, y_test_pred),
    'Recall': recall_score(y_test, y_test_pred),
    'F1 Score': f1_score(y_test, y_test_pred),
    'ROC AUC': roc_auc_score(y_test, y_test_pred_proba[:, 1]),
    'Average Precision': average_precision_score(y_test, y_test_pred_proba[:, 1]),
    'Log Loss': log_loss(y_test, y_test_pred_proba),
    'Cohen Kappa': cohen_kappa_score(y_test, y_test_pred),
    'MCC': matthews_corrcoef(y_test, y_test_pred)
}

# 输出结果
print("训练集指标:")
for metric, value in train_metrics.items():
    print(f"{metric}: {value:.4f}")

print("\n测试集指标:")
for metric, value in test_metrics.items():
    print(f"{metric}: {value:.4f}")


训练集指标:
Accuracy: 0.5885
Precision: 0.4485
Recall: 0.5678
F1 Score: 0.5011
ROC AUC: 0.6189
Average Precision: 0.4829
Log Loss: 0.6733
Cohen Kappa: 0.1590
MCC: 0.1622

测试集指标:
Accuracy: 0.5887
Precision: 0.4487
Recall: 0.5679
F1 Score: 0.5013
ROC AUC: 0.6192
Average Precision: 0.4833
Log Loss: 0.6731
Cohen Kappa: 0.1594
MCC: 0.1626


In [12]:
coefficients = model.coef_.flatten()  # 将系数展平为一维数组
feature_names = X_train_encoded.columns  # 使用编码后的特征名称

# 创建一个 DataFrame 将特征名称与对应的系数对应起来
coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# 打印系数表格，按系数大小排序（绝对值）
coef_df = coef_df.reindex(coef_df.Coefficient.abs().sort_values(ascending=False).index)
coef_df.to_csv('coef_df.csv', index=False)

In [18]:
import joblib

# 保存模型和 scaler
joblib.dump(model, 'logistic_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
model_columns = X_train_encoded.columns  # 获取训练数据的列名
joblib.dump(model_columns, 'model_columns.pkl')  # 保存列名

['model_columns.pkl']

In [19]:
# 加载模型和 scaler
model = joblib.load('logistic_model.pkl')
scaler = joblib.load('scaler.pkl')
model_columns = joblib.load('model_columns.pkl')  # 加载训练数据的列名

In [21]:
import pandas as pd
import joblib

# 生成测试样本数据
sample_data = {
    'MONTH': [1],
    'DAY_OF_WEEK': [3],
    'DEP_CateTIME': ['MORNING'],
    'ARR_CateTIME': ['AFTERNOON'],
    'Holiday_Period': [0],
    'OP_UNIQUE_CARRIER': ['AA'],
    'ORIGIN': ['JFK'],
    'DEST': ['LAX'],
    'CANCELLED': [0],
    'CRS_ELAPSED_TIME': [300],
    'DISTANCE': [2475],
    'ORIGIN_HourlyDewPointTemperature': [10.0],
    'ORIGIN_HourlyDryBulbTemperature': [15.0],
    'ORIGIN_HourlyPrecipitation': [0.1],
    'ORIGIN_HourlyPressureChange': [0.05],
    'ORIGIN_HourlyPressureTendency': [1],
    'ORIGIN_HourlyRelativeHumidity': [75.0],
    'ORIGIN_HourlySeaLevelPressure': [1013.25],
    'ORIGIN_HourlyVisibility': [10.0],
    'ORIGIN_HourlyWetBulbTemperature': [12.0],
    'ORIGIN_HourlyWindSpeed': [5.0],
    'DEST_HourlyDewPointTemperature': [8.0],
    'DEST_HourlyDryBulbTemperature': [14.0],
    'DEST_HourlyPrecipitation': [0.0],
    'DEST_HourlyPressureChange': [0.02],
    'DEST_HourlyPressureTendency': [0],
    'DEST_HourlyRelativeHumidity': [70.0],
    'DEST_HourlySeaLevelPressure': [1012.5],
    'DEST_HourlyStationPressure': [1010.0],
    'DEST_HourlyVisibility': [9.5],
    'DEST_HourlyWetBulbTemperature': [11.5],
    'DEST_HourlyWindSpeed': [6.0]
}

# 创建 DataFrame
sample_df = pd.DataFrame(sample_data)

# 加载模型、scaler 和列信息

scaler = joblib.load('scaler.pkl')  # 加载训练好的 scaler
model = joblib.load('logistic_model.pkl')  # 加载训练好的模型

# 对测试数据进行 one-hot 编码
sample_df_encoded = pd.get_dummies(sample_df)

# 找出缺失的列
missing_columns = [col for col in model_columns if col not in sample_df_encoded.columns]

# 创建一个包含所有缺失列的 DataFrame，所有值设为0
missing_df = pd.DataFrame(0, index=sample_df_encoded.index, columns=missing_columns)

# 将缺失的列添加到原数据中
sample_df_encoded = pd.concat([sample_df_encoded, missing_df], axis=1)

# 重新排列列顺序，使其与训练数据一致
sample_df_encoded = sample_df_encoded[model_columns]

# 使用 scaler 对测试数据进行标准化
sample_df_scaled = scaler.transform(sample_df_encoded)

# 使用模型预测取消的概率
cancelled_probability = model.predict_proba(sample_df_scaled)[:, 1]  # 预测取消的概率（标签为1的概率）
print(f"预测的延迟概率: {cancelled_probability[0]:.4f}")


预测的延迟概率: 0.5980
