In [71]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, file_name) for file_name in file_names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/equity-post-HCT-survival-predictions/sample_submission.csv
/kaggle/input/equity-post-HCT-survival-predictions/data_dictionary.csv
/kaggle/input/equity-post-HCT-survival-predictions/train.csv
/kaggle/input/equity-post-HCT-survival-predictions/test.csv


In [72]:
# Print the kernel's current working directory (shell command run through IPython's `!` escape).
!pwd

/kaggle/input/equity-post-HCT-survival-predictions


In [73]:
cd ../input/equity-post-HCT-survival-predictions/

[Errno 2] No such file or directory: '../input/equity-post-HCT-survival-predictions/'
/kaggle/input/equity-post-HCT-survival-predictions


In [74]:
# Re-check the working directory after the `cd` attempt in the previous cell.
!pwd

/kaggle/input/equity-post-HCT-survival-predictions


# 方案1

## 1. 数据加载与预处理
首先，我们需要加载数据并进行预处理，包括处理缺失值、编码类别型特征、标准化数值型特征等。

In [75]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

# Load the competition files by absolute path so this cell does not depend on the
# kernel's current working directory (the relative `cd` in an earlier cell fails
# once the kernel is already inside the data directory).
DATA_DIR = '/kaggle/input/equity-post-HCT-survival-predictions'
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
data_dict = pd.read_csv(f'{DATA_DIR}/data_dictionary.csv')

# Preview the first rows of the training and test frames.
print(train.head())
print(test.head())

   ID                       dri_score psych_disturb    cyto_score diabetes  \
0   0  N/A - non-malignant indication            No           NaN       No   
1   1                    Intermediate            No  Intermediate       No   
2   2  N/A - non-malignant indication            No           NaN       No   
3   3                            High            No  Intermediate       No   
4   4                            High            No           NaN       No   

   hla_match_c_high  hla_high_res_8          tbi_status arrhythmia  \
0               NaN             NaN              No TBI         No   
1               2.0             8.0  TBI +- Other, >cGy         No   
2               2.0             8.0              No TBI         No   
3               2.0             8.0              No TBI         No   
4               2.0             8.0              No TBI         No   

   hla_low_res_6  ...          tce_div_match donor_related  \
0            6.0  ...                    NaN    

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


## 2. 数据预处理
### 2.1 处理缺失值
对于数值型特征，使用中位数填充缺失值，然后进行标准化；对于类别型特征，使用众数填充缺失值，然后进行独热编码（One-Hot）。

In [76]:
# Features are every training column except the row ID and the two outcome
# columns; the classifiers in this notebook are trained on `efs` as the target.
X = train.drop(['ID', 'efs', 'efs_time'], axis=1)
y = train['efs']

# Partition the feature columns by dtype: 64-bit numeric vs. object/category.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

# Numeric columns: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Fit on the training features and reuse the fitted transformer for the test set.
# NOTE(review): the preprocessor is fitted on every training row, before the
# train/validation split made in a later cell, so validation rows contribute to
# the imputation and scaling statistics.
X_processed = preprocessor.fit_transform(X)
test_processed = preprocessor.transform(test.drop('ID', axis=1))

### 2.2 数据探索

In [77]:
# # 统计分析
# print(train_data.describe())

# # 可视化分析
# import seaborn as sns
# import matplotlib.pyplot as plt

# # 绘制特征分布图
# for col in train_data.columns:
#     if train_data[col].dtype == 'number':
#         sns.histplot(train_data[col], kde=True)
#         plt.title(f'Distribution of {col}')
#         plt.show()

# # 绘制相关性热力图
# plt.figure(figsize=(10, 8))
# sns.heatmap(train_data.corr(), annot=True, cmap='coolwarm')
# plt.title('Correlation Heatmap')
# plt.show()

## 3. 模型训练与调优
### 3.1 基线模型
使用随机森林作为基线模型。

In [78]:
# Hold out 20% of the preprocessed rows for validation (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X_processed,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline model: a 100-tree random forest trained on the training split.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Score the positive-class probabilities on the validation split.
rf_val_proba = rf.predict_proba(X_val)
y_pred = rf_val_proba[:, 1]
rf_val_auc = roc_auc_score(y_val, y_pred)
print(f'Validation ROC AUC: {rf_val_auc}')

Validation ROC AUC: 0.7258494879409604


### 3.2 高级模型
使用XGBoost、LightGBM和CatBoost进行训练和调优。

In [79]:
# # XGBoost模型
# xgb_model = xgb.XGBClassifier(objective='binary:logistic', n_estimators=100, random_state=42)
# xgb_model.fit(X_train, y_train)
# y_pred_xgb = xgb_model.predict_proba(X_val)[:, 1]
# print(f'XGBoost Validation ROC AUC: {roc_auc_score(y_val, y_pred_xgb)}')

# # LightGBM模型
# lgb_model = lgb.LGBMClassifier(objective='binary', n_estimators=100, random_state=42)
# lgb_model.fit(X_train, y_train)
# y_pred_lgb = lgb_model.predict_proba(X_val)[:, 1]
# print(f'LightGBM Validation ROC AUC: {roc_auc_score(y_val, y_pred_lgb)}')



In [80]:
# # 指定一个已存在的目录作为 train_dir
# train_dir = '/kaggle/working/catboost_info'

# # 确保目录存在
# import os
# if not os.path.exists(train_dir):
#     os.makedirs(train_dir)

# # 使用指定的 train_dir
# cb_model = cb.CatBoostClassifier(random_state=42, verbose=0, train_dir=train_dir)
# cb_model.fit(X_train, y_train)
# y_pred_cb = cb_model.predict_proba(X_val)[:, 1]
# print(f'CatBoost Validation ROC AUC: {roc_auc_score(y_val, y_pred_cb)}')

In [81]:
# import numpy as np
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.impute import SimpleImputer
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.metrics import roc_auc_score
# import xgboost as xgb
# import lightgbm as lgb
# import catboost as cb

# # 加载数据
# train = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/train.csv')
# test = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/test.csv')

# # 分离特征和目标变量
# X = train.drop(columns=['ID', 'efs', 'efs_time'])
# y = train['efs']

# # 定义数值型和类别型特征
# numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
# categorical_features = X.select_dtypes(include=['object', 'category']).columns

# # 定义预处理管道
# numeric_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='median')),
#     ('scaler', StandardScaler())
# ])

# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))
# ])

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numeric_transformer, numeric_features),
#         ('cat', categorical_transformer, categorical_features)
#     ])

# # 应用预处理
# X_processed = preprocessor.fit_transform(X)
# test_processed = preprocessor.transform(test.drop(columns=['ID']))

# # 划分训练集和验证集
# X_train, X_val, y_train, y_val = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Directory passed to CatBoost as `train_dir`; it lives under /kaggle/working,
# which the notebook header describes as the writable output location.
train_dir = '/kaggle/working/catboost_info'

# Create the directory if it is missing. `exist_ok=True` makes the call idempotent
# and avoids the check-then-create pattern, so re-running the cell is always safe.
import os
os.makedirs(train_dir, exist_ok=True)

# XGBoost: 100 boosted trees with a logistic objective, scored on the validation split.
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,
    random_state=42,
)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict_proba(X_val)[:, 1]
xgb_val_auc = roc_auc_score(y_val, y_pred_xgb)
print(f'XGBoost Validation ROC AUC: {xgb_val_auc}')

# LightGBM: same tree count and seed, binary objective.
lgb_model = lgb.LGBMClassifier(
    objective='binary',
    n_estimators=100,
    random_state=42,
)
lgb_model.fit(X_train, y_train)
y_pred_lgb = lgb_model.predict_proba(X_val)[:, 1]
lgb_val_auc = roc_auc_score(y_val, y_pred_lgb)
print(f'LightGBM Validation ROC AUC: {lgb_val_auc}')

# CatBoost: library defaults apart from the seed, silenced output, and the
# training directory prepared above.
cb_model = cb.CatBoostClassifier(
    random_state=42,
    verbose=0,
    train_dir=train_dir,
)
cb_model.fit(X_train, y_train)
y_pred_cb = cb_model.predict_proba(X_val)[:, 1]
cb_val_auc = roc_auc_score(y_val, y_pred_cb)
print(f'CatBoost Validation ROC AUC: {cb_val_auc}')

XGBoost Validation ROC AUC: 0.7307544063168827
[LightGBM] [Info] Number of positive: 12455, number of negative: 10585
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011795 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 943
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 178
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.540582 -> initscore=0.162684
[LightGBM] [Info] Start training from score 0.162684
LightGBM Validation ROC AUC: 0.7468795389694087
CatBoost Validation ROC AUC: 0.7543280668821892


## 4. 超参数调优
使用GridSearchCV对XGBoost进行超参数调优。

In [82]:
# Search space: 3 depths x 3 learning rates x 3 tree counts = 27 candidates.
param_grid = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    n_estimators=[100, 200, 300],
)

# Exhaustive 5-fold cross-validated search over the grid, ranked by ROC AUC.
xgb_base = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
grid_search = GridSearchCV(
    xgb_base,
    param_grid,
    scoring='roc_auc',
    cv=5,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print(f'Best parameters: {grid_search.best_params_}')
print(f'Best ROC AUC: {grid_search.best_score_}')

# Take the estimator GridSearchCV exposes for the best parameters (`refit` is
# left at its default) and score it on the held-out validation split.
best_xgb_model = grid_search.best_estimator_
y_pred_best_xgb = best_xgb_model.predict_proba(X_val)[:, 1]
best_xgb_val_auc = roc_auc_score(y_val, y_pred_best_xgb)
print(f'Best XGBoost Validation ROC AUC: {best_xgb_val_auc}')

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200}
Best ROC AUC: 0.7445535529227847
Best XGBoost Validation ROC AUC: 0.7503479157337131


## 5. 模型融合
使用模型融合技术提高预测性能。

In [83]:
# Model blending: average the predicted probabilities of the four models.
from sklearn.ensemble import VotingClassifier

# (name, estimator) pairs for the random forest, tuned XGBoost, LightGBM and CatBoost models.
base_estimators = [
    ('rf', rf),
    ('xgb', best_xgb_model),
    ('lgb', lgb_model),
    ('cb', cb_model),
]
ensemble_model = VotingClassifier(estimators=base_estimators, voting='soft')

# Fit the ensemble on the training split, then score it on the validation split.
ensemble_model.fit(X_train, y_train)
y_pred_ensemble = ensemble_model.predict_proba(X_val)[:, 1]
ensemble_val_auc = roc_auc_score(y_val, y_pred_ensemble)
print(f'Ensemble Validation ROC AUC: {ensemble_val_auc}')

[LightGBM] [Info] Number of positive: 12455, number of negative: 10585
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011482 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 943
[LightGBM] [Info] Number of data points in the train set: 23040, number of used features: 178
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.540582 -> initscore=0.162684
[LightGBM] [Info] Start training from score 0.162684
Ensemble Validation ROC AUC: 0.752519232117967


## 6. 生成提交文件
使用融合模型（VotingClassifier）对测试集进行预测，并生成提交文件。

In [84]:
# Predict the positive-class probability for every test row with the ensemble.
test_proba = ensemble_model.predict_proba(test_processed)
test_predictions = test_proba[:, 1]

# Assemble the submission table: one row per test ID with its predicted score.
submission = pd.DataFrame(
    {
        'ID': test['ID'],
        'prediction': test_predictions,
    }
)

# Write the file into /kaggle/working/ without the DataFrame index column.
submission.to_csv('/kaggle/working/submission.csv', index=False)

print('Submission file saved as /kaggle/working/submission.csv')

Submission file saved as /kaggle/working/submission.csv


## 7. 评估与优化
根据比赛的评估指标（Stratified C-index），进一步优化模型。下面的实现是一个近似：在每个 race_group 内用 ROC AUC 代替 C-index（未使用 efs_time），并以各组得分的均值减去标准差作为最终得分。

In [85]:
# 计算分层一致性指数（Stratified C-index）
from sklearn.metrics import roc_auc_score

# 假设我们有一个函数来计算分层C-index
def stratified_c_index(y_true, y_pred, race_groups):
    """Return the mean minus the standard deviation of per-group ROC AUC scores.

    This is a proxy for a stratified C-index: each group's score is computed
    with ``roc_auc_score`` on the labels and predictions of that group only.

    Args:
        y_true: Labels, indexable by a boolean mask built from ``race_groups``.
        y_pred: Prediction scores, indexable by the same boolean mask.
        race_groups: Group label per row; must support ``.unique()`` and
            element-wise ``==`` (a pandas Series where this notebook calls it).

    Returns:
        ``np.mean(scores) - np.std(scores)`` over the per-group scores.
    """
    per_group_scores = [
        roc_auc_score(y_true[race_groups == label], y_pred[race_groups == label])
        for label in race_groups.unique()
    ]
    return np.mean(per_group_scores) - np.std(per_group_scores)

# Score the ensemble with the stratified metric on the held-out validation rows only.
# Scoring on all of X_processed would include the rows the ensemble was fitted on
# (X_train) and therefore overstate performance.
race_groups = train['race_group']

# y_val carries the row labels of `train` for the validation rows, in the same
# order as the rows of X_val, so use them to select the matching race groups.
race_groups_val = race_groups.loc[y_val.index]

val_scores = ensemble_model.predict_proba(X_val)[:, 1]
stratified_c_index_score = stratified_c_index(y_val, val_scores, race_groups_val)
print(f'Stratified C-index: {stratified_c_index_score}')

## 下述是改进的内容

In [86]:
# !pip install lifelines

In [87]:
# !pip install lifelines
# 终端rm -r catboost_info