In [39]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
sns.set_style("whitegrid")

In [40]:
train_path = os.path.join("..", "data","train.csv")
test_path = os.path.join("..","data","test.csv")
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [41]:
train_x = train.drop('Transported',axis=1)
train_y = train['Transported']
test_x = test.copy()
train_x['is_train'] = True
test_x['is_train'] = False
full_data = pd.concat([train_x,test_x],ignore_index=True)
full_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   12970 non-null  object 
 1   HomePlanet    12682 non-null  object 
 2   CryoSleep     12660 non-null  object 
 3   Cabin         12671 non-null  object 
 4   Destination   12696 non-null  object 
 5   Age           12700 non-null  float64
 6   VIP           12674 non-null  object 
 7   RoomService   12707 non-null  float64
 8   FoodCourt     12681 non-null  float64
 9   ShoppingMall  12664 non-null  float64
 10  Spa           12686 non-null  float64
 11  VRDeck        12702 non-null  float64
 12  Name          12676 non-null  object 
 13  is_train      12970 non-null  bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 1.3+ MB


In [42]:
full_data.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,12700.0,12707.0,12681.0,12664.0,12686.0,12702.0
mean,28.771969,222.897852,451.961675,174.906033,308.476904,306.789482
std,14.387261,647.596664,1584.370747,590.55869,1130.279641,1180.097223
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,49.0,77.0,29.0,57.0,42.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [43]:
full_data['HomePlanet'].value_counts(dropna=False)

HomePlanet
Earth     6865
Europa    3133
Mars      2684
NaN        288
Name: count, dtype: int64

1. HomePlanet： 同一组的成员大概率是家庭成员，而homeplanet是永居地，所以按照同一组的人来填充homeplanet的NaN值（如果存在），如果同一个组的人的homeplanet信息都缺失，则直接设置Unknown。
2.

In [44]:
def preprocess_data(df) :
    '''
    预处理数据，用于训练数据和测试数据

    :param df: DataFrame
    :param is_train: bool 是否拟合编码器
    :param label_encoders: dict 保存编码器
    :return:
    '''

    full_data = df.copy()

    #处理HomePlanet缺失
    full_data['GroupID'] = full_data['PassengerId'].apply(lambda x : x.split("_")[0])
    group_modes = full_data.groupby('GroupID')['HomePlanet'].apply(lambda x : x.mode().iloc[0] if not x.mode().empty and not pd.isna(x.mode().iloc[0]) else 'Unknown').to_dict()
    def fill_homeplanet(row):
        if pd.isna(row['HomePlanet']):
            return group_modes.get(row['GroupID'], "Unknown")
        return row['HomePlanet']
    full_data['HomePlanet'] =  full_data.apply(fill_homeplanet, axis = 1)
    #添加标志列 为后面模型的预测提供信息
    full_data['hp_is_miss'] = full_data['HomePlanet'].apply(lambda x : 1 if x == 'Unknown' else 0)
    full_data.drop('GroupID', axis = 1, inplace = True)

    speed_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    #处理CryoSleep项 --> 如果五项消费项的金额都是0的话就认为该乘客选择了休眠
    mask = full_data['CryoSleep'].isna()
    full_data.loc[mask & (full_data[speed_cols].sum(axis=1) == 0), 'CryoSleep'] = True
    full_data.loc[mask & (full_data[speed_cols].sum(axis=1) > 0), 'CryoSleep'] = False

    # 处理Cabin缺失
    full_data[['Deck', 'CabinNum', 'Side']] = full_data['Cabin'].str.split('/', expand=True)
    full_data['cabin_is_miss'] = full_data['Cabin'].isna().astype(int)
    full_data['Deck'] = full_data['Deck'].fillna('Unknown')
    full_data['Side'] = full_data['Side'].fillna('Unknown')
    full_data = full_data.drop(columns=['Cabin', 'CabinNum'])

    # 处理消费项缺失
    # 如果选择了休眠。则消费项金额都为0
    full_data.loc[full_data['CryoSleep'] == True, speed_cols] = 0
    # 填充未选择休眠，但是消费金额缺失
    # 训练一个小的线性回归模型去预测
    # 应该只选择没有选择休眠的人进行训练模型
    not_cryosleep = full_data.loc[full_data['CryoSleep'] == False]
    # 填充顺序是从缺失值小到大
    def fill_speed(df):
        full_data = df.copy()
        features = ['Age', 'VIP', 'Deck', 'Side','HomePlanet']
        if 'Cabin' in df.columns:

    return full_data

full_data = preprocess_data(full_data)[0]
full_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   12970 non-null  object 
 1   HomePlanet    12970 non-null  object 
 2   CryoSleep     12660 non-null  object 
 3   Cabin         12671 non-null  object 
 4   Destination   12696 non-null  object 
 5   Age           12700 non-null  float64
 6   VIP           12674 non-null  object 
 7   RoomService   12707 non-null  float64
 8   FoodCourt     12681 non-null  float64
 9   ShoppingMall  12664 non-null  float64
 10  Spa           12686 non-null  float64
 11  VRDeck        12702 non-null  float64
 12  Name          12676 non-null  object 
 13  is_train      12970 non-null  bool   
 14  hp_is_miss    12970 non-null  int64  
dtypes: bool(1), float64(6), int64(1), object(7)
memory usage: 1.4+ MB


In [46]:
full_data['CryoSleep'].

CryoSleep
False    8079
True     4581
Name: count, dtype: int64