In [31]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
sns.set_style("whitegrid")

In [32]:
train_path = os.path.join("..", "data","train.csv")
test_path = os.path.join("..","data","test.csv")
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [33]:
train_x = train.drop('Transported',axis=1)
train_y = train['Transported']
pred_x = test.copy()
train_x['is_train'] = True
pred_x['is_train'] = False
full_data = pd.concat([train_x,pred_x],ignore_index=True)
full_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   12970 non-null  object 
 1   HomePlanet    12682 non-null  object 
 2   CryoSleep     12660 non-null  object 
 3   Cabin         12671 non-null  object 
 4   Destination   12696 non-null  object 
 5   Age           12700 non-null  float64
 6   VIP           12674 non-null  object 
 7   RoomService   12707 non-null  float64
 8   FoodCourt     12681 non-null  float64
 9   ShoppingMall  12664 non-null  float64
 10  Spa           12686 non-null  float64
 11  VRDeck        12702 non-null  float64
 12  Name          12676 non-null  object 
 13  is_train      12970 non-null  bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 1.3+ MB


In [34]:
full_data.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,12700.0,12707.0,12681.0,12664.0,12686.0,12702.0
mean,28.771969,222.897852,451.961675,174.906033,308.476904,306.789482
std,14.387261,647.596664,1584.370747,590.55869,1130.279641,1180.097223
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,49.0,77.0,29.0,57.0,42.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [35]:
full_data['HomePlanet'].value_counts(dropna=False)

HomePlanet
Earth     6865
Europa    3133
Mars      2684
NaN        288
Name: count, dtype: int64

1. HomePlanet： 同一组的成员大概率是家庭成员，而homeplanet是永居地，所以按照同一组的人来填充homeplanet的NaN值（如果存在），如果同一个组的人的homeplanet信息都缺失，则直接设置Unknown。
2. Cryosleep ： 五项消费金额都为0的乘客自然Cryosleep=True，反之，如果有消费金额，那么证明乘客Cryosleep=False
3. Cabin ： 甲板编号/房间号/侧舷，其中只有甲板编号和侧舷的信息与飞船的位置有比较大的关系，房间号是随即的编号，所以我们不需要房间号。对于缺失的Cabin项，我们为其标记缺失即可。
4. VIP ： 处理VIP缺失项，由于VIP乘客只占总乘客的一小部分，所以我们可以认为VIP信息缺失的乘客基本上都是非VIP乘客。
5. age ： 通过统计可以得知，在同一个家庭中的人年龄相差都比较小，所以对于age缺失的乘客，首先使用统一家庭组的中位数来填充，还缺失的乘客，使用homeplanet+cryosleep相同的乘客的中位数来填充，最后使用所有乘客的年龄中位数来填充剩余仍然缺失的age信息。
6. Destination ： destination可以通过同一家庭组的乘客去往同一destination，其次同一homeplanet去往同一destination，最后如果仍然缺失，则使用去往人数最多的destination填充
7. name : name基本没有什么作用，所以在我们完成其余所有的信息补充之后，name项可以删除

In [36]:
def preprocess_data(df) :
    '''
    预处理数据，用于训练数据和测试数据

    :param df: DataFrame
    :return: 填充完整缺失值的DataFrame
    '''

    df_copy = df.copy()

    # 删除name项
    df_copy = df_copy.drop('Name', axis = 1)

    #处理HomePlanet缺失
    df_copy['GroupID'] = df_copy['PassengerId'].apply(lambda x : x.split("_")[0])
    group_modes = df_copy.groupby('GroupID')['HomePlanet'].apply(lambda x : x.mode().iloc[0] if not x.mode().empty and not pd.isna(x.mode().iloc[0]) else 'Unknown').to_dict()
    df_copy['HomePlanet'] = df_copy['HomePlanet'].fillna(df_copy['GroupID'].map(group_modes)).fillna('Unknown')
    #添加标志列 为后面模型的预测提供信息
    df_copy['hp_is_miss'] = (df_copy['HomePlanet'] == 'Unknown').astype(int)

    # 处理destination
    group_destination = df_copy.groupby('GroupID')['Destination'].apply(lambda x : x.mode().iloc[0] if not x.mode().empty else np.nan)
    df_copy['Destination'] = df_copy.apply(lambda x : group_destination[x['GroupID']] if pd.isna(x['Destination']) else x['Destination'], axis = 1)

    hp_destination = df_copy.groupby('HomePlanet')['Destination'].agg(lambda x : x.mode().iloc[0] if not x.mode().empty else np.nan)
    df_copy['Destination'] = df_copy.apply(lambda x : hp_destination[x['HomePlanet']] if pd.isna(x['Destination']) else x['Destination'], axis = 1)

    df_copy['Destination'] = df_copy['Destination'].fillna(full_data['Destination'].mode()[0])

    speed_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    #处理CryoSleep项 --> 如果五项消费项的金额都是0的话就认为该乘客选择了休眠
    mask = df_copy['CryoSleep'].isna()
    df_copy.loc[mask & (df_copy[speed_cols].sum(axis=1) == 0), 'CryoSleep'] = True
    df_copy.loc[mask & (df_copy[speed_cols].sum(axis=1) > 0), 'CryoSleep'] = False

    # 处理Cabin缺失
    df_copy[['Deck', 'CabinNum', 'Side']] = df_copy['Cabin'].str.split('/', expand=True)
    df_copy['cabin_is_miss'] = df_copy['Cabin'].isna().astype(int)
    df_copy['Deck'] = df_copy['Deck'].fillna('Unknown')
    df_copy['Side'] = df_copy['Side'].fillna('Unknown')
    df_copy = df_copy.drop(columns=['Cabin', 'CabinNum'])

    # 处理VIP缺失
    df_copy['VIP'] = df_copy['VIP'].fillna('False')

    # 处理age缺失
    group_median_age = df_copy.groupby('GroupID')['Age'].median()
    df_copy['Age'] = df_copy.apply(lambda x : group_median_age.get(x['GroupID'],np.nan) if pd.isna(x['Age']) else x['Age'], axis = 1)
    df_copy['Age'] = df_copy['Age'].fillna(df_copy.groupby(['HomePlanet','CryoSleep'])['Age'].transform('median'))
    df_copy['Age'] = df_copy['Age'].fillna(df_copy['Age'].median())

    # one-hot --> ['VIP', 'Deck', 'Side','HomePlanet']
    df_copy = df_copy.drop(columns=['GroupID'])
    df_copy = df_copy.drop(columns=['PassengerId'])

    cat_cols = ['Deck', 'Side','HomePlanet', 'Destination']
    df_copy = pd.get_dummies(df_copy, columns=cat_cols, drop_first=True)
    df_copy['CryoSleep'] = df_copy['CryoSleep'].astype(int)
    df_copy['VIP'] = df_copy['VIP'].astype(bool).astype(int)

    # 处理消费项缺失
    is_CryoSleep = df_copy['CryoSleep'] == True
    df_copy.loc[is_CryoSleep, speed_cols] = 0

    not_CryoSleep = df_copy[df_copy['CryoSleep'] == False]
    miss_rate = not_CryoSleep[speed_cols].isna().mean().sort_values(ascending = True)
    fill_order = miss_rate.index.tolist() # 把一个panda的series对象变成python的list

    num_features = ['Age', 'VIP', 'hp_is_miss', 'cabin_is_miss']
    one_hot_prefix = ['Deck', 'Side', 'HomePlanet']
    one_hot_features = [c for c in df_copy.columns if any(c.startswith(p + '_') for p in one_hot_prefix)]
    base_features = num_features + one_hot_features

    for target in fill_order:
        features = base_features + [c for c in speed_cols if c != target ]
        train = not_CryoSleep[not_CryoSleep[target].notna()]
        pred = not_CryoSleep[not_CryoSleep[target].isna()]
        if len(pred) == 0:
            continue
        model = RandomForestRegressor(
            n_estimators=200,
            random_state=42,
            max_depth=8,
            min_samples_leaf=10,
            n_jobs=-1
        )
        model.fit(train[features], train[target])
        df_copy.loc[pred.index, target] = model.predict(pred[features])

    return df_copy

full_data = preprocess_data(full_data)
full_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   CryoSleep                  12970 non-null  int64  
 1   Age                        12970 non-null  float64
 2   VIP                        12970 non-null  int64  
 3   RoomService                12970 non-null  float64
 4   FoodCourt                  12970 non-null  float64
 5   ShoppingMall               12970 non-null  float64
 6   Spa                        12970 non-null  float64
 7   VRDeck                     12970 non-null  float64
 8   is_train                   12970 non-null  bool   
 9   hp_is_miss                 12970 non-null  int64  
 10  cabin_is_miss              12970 non-null  int64  
 11  Deck_B                     12970 non-null  bool   
 12  Deck_C                     12970 non-null  bool   
 13  Deck_D                     12970 non-null  boo

In [37]:
train_x = full_data[full_data['is_train'] == True]
train_x = train_x.drop(columns=['is_train'], axis = 1)
train_y = train_y.astype(int)

pred_x = full_data[full_data['is_train'] == False]
pred_x = pred_x.drop(columns=['is_train'], errors='ignore')

In [38]:
xgb_model = xgb.XGBClassifier(
    n_estimators=300,       # 树的数量
    max_depth=6,            # 树深度
    learning_rate=0.05,     # 学习率
    subsample=0.8,          # 样本采样率
    colsample_bytree=0.8,   # 特征采样率
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'   # 避免warning
)

xgb_model.fit(train_x, train_y)
pred_y = xgb_model.predict(pred_x)
# print(pred_y)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [39]:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
submission = test[['PassengerId']].copy()
submission['Transported'] = pred_y.astype(bool)
filename = f'Space-Titanic-{timestamp}.csv'
submission.to_csv(f'../submissions/{filename}', index = False)
print(f"✅ 预测结果已保存到 ../submissions/{filename}")

✅ 预测结果已保存到 ../submissions/Space-Titanic-20260202_181902.csv
