In [8]:
# 数据清洗 - 修改后的版本
def handle_missing_values(df):
    # 创建副本以避免链式赋值
    df = df.copy()
    
    # 年龄用均值填充 - 修改后的方式
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    
    # 船舱号码用'Unknown'标记 - 修改后的方式
    df['Cabin'] = df['Cabin'].fillna('Unknown')
    
    # 登船港口用众数填充 - 修改后的方式
    if df['Embarked'].isna().any():
        df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    
    return df

# 生成训练和测试数据
train_df = generate_titanic_data(891)
test_df = generate_titanic_data(418)

# 应用修改后的函数
train_df = handle_missing_values(train_df)
test_df = handle_missing_values(test_df)

In [9]:
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,Name_1,male,39.834156,0,0,Ticket_1,64.105905,,S
1,2,1,3,Name_2,male,31.456970,1,0,Ticket_2,95.887200,,Q
2,3,0,2,Name_3,male,24.643464,0,0,Ticket_3,6.408659,,Q
3,4,0,3,Name_4,male,31.456970,2,0,Ticket_4,37.908794,,Q
4,5,0,2,Name_5,female,38.387389,2,0,Ticket_5,32.749639,,
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,3,Name_887,male,31.456970,0,0,Ticket_887,37.012273,B,S
887,888,1,2,Name_888,male,40.907486,0,0,Ticket_888,5.876060,E,S
888,889,1,3,Name_889,male,32.399473,0,0,Ticket_889,65.820879,B,S
889,890,0,2,Name_890,male,7.031579,0,0,Ticket_890,17.496935,,S


In [10]:
test_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,1,Name_1,female,30.521259,0,1,Ticket_1,49.999900,,S
1,2,1,3,Name_2,male,30.521259,0,0,Ticket_2,30.314310,B,S
2,3,0,3,Name_3,male,30.521259,0,1,Ticket_3,14.738814,,S
3,4,0,3,Name_4,female,30.705258,1,1,Ticket_4,33.637730,E,C
4,5,0,2,Name_5,female,48.347564,1,0,Ticket_5,9.029301,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
413,414,0,2,Name_414,male,36.803068,0,0,Ticket_414,0.664710,E,S
414,415,0,2,Name_415,female,48.368767,2,0,Ticket_415,10.745114,E,S
415,416,0,3,Name_416,male,30.521259,1,0,Ticket_416,20.952707,,S
416,417,0,2,Name_417,male,30.521259,2,1,Ticket_417,1.734470,D,S


In [17]:
# 显示 test_df 和 train_df 的前5行，便于快速查看数据内容
print("train_df 前5行：")
display(train_df.head())


print("test_df 前5行：")
display(test_df.head())

train_df 前5行：


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,Name_1,male,39.834156,0,0,Ticket_1,64.105905,,S
1,2,1,3,Name_2,male,31.45697,1,0,Ticket_2,95.8872,,Q
2,3,0,2,Name_3,male,24.643464,0,0,Ticket_3,6.408659,,Q
3,4,0,3,Name_4,male,31.45697,2,0,Ticket_4,37.908794,,Q
4,5,0,2,Name_5,female,38.387389,2,0,Ticket_5,32.749639,,


test_df 前5行：


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,1,Name_1,female,30.521259,0,1,Ticket_1,49.9999,,S
1,2,1,3,Name_2,male,30.521259,0,0,Ticket_2,30.31431,B,S
2,3,0,3,Name_3,male,30.521259,0,1,Ticket_3,14.738814,,S
3,4,0,3,Name_4,female,30.705258,1,1,Ticket_4,33.63773,E,C
4,5,0,2,Name_5,female,48.347564,1,0,Ticket_5,9.029301,,S


In [18]:
from sklearn.preprocessing import LabelEncoder

# 数据预处理 - 特征工程

def feature_engineering(df):
  df = df.copy()
  
  # 性别编码
  df['Sex'] = LabelEncoder().fit_transform(df['Sex'])
  
  # 登船港口编码
  df['Embarked'] = LabelEncoder().fit_transform(df['Embarked'])
  
  # 船舱首字母提取并编码（如有需要）
  df['Cabin_initial'] = df['Cabin'].astype(str).str[0]
  df['Cabin_initial'] = LabelEncoder().fit_transform(df['Cabin_initial'])
  
  # 删除无用特征
  df = df.drop(['Name', 'Ticket', 'Cabin'], axis=1)
  
  return df

# 对训练集和测试集进行特征工程
train_df_fe = feature_engineering(train_df)
test_df_fe = feature_engineering(test_df)

# 展示处理后的数据
display(train_df_fe.head())
display(test_df_fe.head())

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Cabin_initial
0,1,0,3,1,39.834156,0,0,64.105905,2,5
1,2,1,3,1,31.45697,1,0,95.8872,1,5
2,3,0,2,1,24.643464,0,0,6.408659,1,5
3,4,0,3,1,31.45697,2,0,37.908794,1,5
4,5,0,2,0,38.387389,2,0,32.749639,3,5


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Cabin_initial
0,1,0,1,0,30.521259,0,1,49.9999,2,5
1,2,1,3,1,30.521259,0,0,30.31431,2,1
2,3,0,3,1,30.521259,0,1,14.738814,2,5
3,4,0,3,0,30.705258,1,1,33.63773,0,4
4,5,0,2,0,48.347564,1,0,9.029301,2,5


In [19]:
# 查看 test_df_fe 的基本统计信息和缺失值情况
print("test_df_fe 的描述性统计信息：")

display(test_df_fe.describe())

print("\ntest_df_fe 缺失值统计：")
print(test_df_fe.isnull().sum())

test_df_fe 的描述性统计信息：


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Cabin_initial
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,209.5,0.507177,2.303828,0.677033,30.521259,0.511962,0.279904,30.493116,1.851675,3.509569
std,120.810458,0.500548,0.808252,0.46817,12.799087,0.703605,0.518833,29.126448,0.686987,1.810217
min,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.000349,0.0,0.0
25%,105.25,0.0,2.0,0.0,22.648049,0.0,0.0,9.07373,2.0,2.0
50%,209.5,1.0,3.0,1.0,30.521259,0.0,0.0,21.272595,2.0,5.0
75%,313.75,1.0,3.0,1.0,37.324072,1.0,0.0,40.717864,2.0,5.0
max,418.0,1.0,3.0,1.0,74.475155,3.0,2.0,157.884418,3.0,5.0



test_df_fe 缺失值统计：
PassengerId      0
Survived         0
Pclass           0
Sex              0
Age              0
SibSp            0
Parch            0
Fare             0
Embarked         0
Cabin_initial    0
dtype: int64


In [20]:
# 统计 test_df_fe 中各特征的唯一值数量，便于了解特征分布
print("test_df_fe 各特征唯一值数量：")
print(test_df_fe.nunique())

test_df_fe 各特征唯一值数量：
PassengerId      418
Survived           2
Pclass             3
Sex                2
Age              333
SibSp              4
Parch              3
Fare             418
Embarked           4
Cabin_initial      6
dtype: int64
