#尝试传统机器学习算法(随机森林)解决Kaggle Titanic挑战

In [None]:
!unzip titanic.zip

Archive:  titanic.zip
  inflating: gender_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [None]:
#分析数据准备数据预处理
import pandas as pd

data = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

print(data.shape,test.shape)

(891, 12) (418, 11)


In [None]:
from sklearn.model_selection import train_test_split

 #用sklearn把train数据分为train和eval两个子集
data_train,data_eval = train_test_split(data,test_size=0.2,shuffle=True)

print(data_train.shape,data_eval.shape)
data_train.head()

(712, 12) (179, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
575,576,0,3,"Patchett, Mr. George",male,19.0,0,0,358585,14.5,,S
845,846,0,3,"Abbing, Mr. Anthony",male,42.0,0,0,C.A. 5547,7.55,,S
524,525,0,3,"Kassem, Mr. Fared",male,,0,0,2700,7.2292,,C
850,851,0,3,"Andersson, Master. Sigvard Harald Elias",male,4.0,4,2,347082,31.275,,S
560,561,0,3,"Morrow, Mr. Thomas Rowan",male,,0,0,372622,7.75,,Q


In [24]:
import numpy as np

#特征工程和去噪方法
def enhanced_feature_engineering(data, is_train=True):
    """
    针对性的特征工程：只处理姓名和票号
    """
    df = data.copy()

    # 1. Tokenize the names
    if 'Name' in df.columns:
        # 使用正则表达式替换掉特殊符号，然后拆分
        df['Name_Tokens'] = df['Name'].str.replace('[,.]', ' ').str.split()

        # 提取头衔 (Mr., Mrs., Miss., etc.)
        df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

        # 将罕见头衔合并
        title_mapping = {
            'Mr': 'Mr',
            'Mrs': 'Mrs',
            'Miss': 'Miss',
            'Master': 'Master',
            'Dr': 'Rare',
            'Rev': 'Rare',
            'Col': 'Rare',
            'Major': 'Rare',
            'Mlle': 'Miss',
            'Countess': 'Rare',
            'Ms': 'Miss',
            'Lady': 'Rare',
            'Jonkheer': 'Rare',
            'Don': 'Rare',
            'Sir': 'Rare',
            'Capt': 'Rare',
            'Mme': 'Mrs',
            'Dona': 'Rare'
        }
        df['Title'] = df['Title'].map(title_mapping)
        df['Title'] = df['Title'].fillna('Unknown')

    # 2. Extract prefix from ticket
    if 'Ticket' in df.columns:
        # 使用正则表达式提取票号中的字母前缀部分
        df['Ticket_Prefix'] = df['Ticket'].str.extract('([A-Za-z./]+)', expand=False)
        df['Ticket_Prefix'] = df['Ticket_Prefix'].fillna('NONE')

        # 提取票号中的数字部分
        df['Ticket_Number'] = df['Ticket'].str.extract('(\d+)$', expand=False)
        df['Ticket_Number'] = df['Ticket_Number'].fillna(0).astype(int)

    # 基本特征处理（仍然需要基本清洗和填充）
    # 年龄异常值和缺失值处理
    if 'Age' in df.columns:
        df.loc[(df['Age'] < 0) | (df['Age'] > 100), 'Age'] = np.nan
        df['Age'] = df['Age'].fillna(df['Age'].median())

    # 票价异常值和缺失值处理
    if 'Fare' in df.columns:
        df.loc[df['Fare'] > 500, 'Fare'] = df['Fare'].median()
        df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    # 登船港口缺失值处理
    if 'Embarked' in df.columns:
        df['Embarked'] = df['Embarked'].fillna('S')

    # 选择建模特征
    base_features = [
        'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
        'Embarked', 'Title', 'Ticket_Prefix', 'Ticket_Number'
    ]

    # 根据是否为训练数据决定是否包含标签
    if is_train and 'Survived' in df.columns:
        final_features = base_features + ['Survived']
    else:
        final_features = base_features

    # 只选择存在的特征
    available_features = [f for f in final_features if f in df.columns]

    return df[available_features]

data_train = enhanced_feature_engineering(data_train)
data_eval = enhanced_feature_engineering(data_eval)
test = enhanced_feature_engineering(test, is_train=False)

print(data_train.shape,data_eval.shape,test.shape)
data_train.head()

(712, 11) (179, 11) (418, 10)


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Ticket_Prefix,Ticket_Number,Survived
575,3,male,19.0,0,0,14.5,S,Mr,NONE,358585,0
845,3,male,42.0,0,0,7.55,S,Mr,C.A.,5547,0
524,3,male,28.0,0,0,7.2292,C,Mr,NONE,2700,0
850,3,male,4.0,4,2,31.275,S,Master,NONE,347082,0
560,3,male,28.0,0,0,7.75,Q,Mr,NONE,372622,0


In [25]:
#把数据放入tensorflow标准数据处理方法中，准备用做训练输入
import tensorflow as tf
import tensorflow_decision_forests as tfdf

train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    data_train,
    label="Survived"
)

eval_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    data_eval,
    label="Survived"
)


test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test)

In [14]:
#随机模型构建用于训练
ramforest_model = tfdf.keras.RandomForestModel()
ramforest_model.fit(train_ds)
ramforest_model.summary()

Use /tmp/tmp04yca8b3 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:05.921321. Found 712 examples.
Training model...
Model trained in 0:00:00.674866
Compiling model...
Model compiled.


<tf_keras.src.callbacks.History at 0x788588836550>

In [18]:
predict = ramforest_model.predict(eval_ds)
print(predict)

[[0.20333321]
 [0.12666662]
 [0.83333266]
 [0.00666667]
 [0.99666584]
 [0.9899992 ]
 [0.73333275]
 [0.99666584]
 [0.00666667]
 [0.29333314]
 [0.11666662]
 [0.33666644]
 [0.5733329 ]
 [0.73333275]
 [0.14999993]
 [0.08666665]
 [0.3766664 ]
 [0.09999997]
 [0.02666667]
 [0.94333255]
 [0.43666634]
 [0.19666655]
 [0.01666667]
 [0.01      ]
 [0.7133328 ]
 [0.99999917]
 [0.08333332]
 [0.17333324]
 [0.00333333]
 [0.08333332]
 [0.37333307]
 [0.88666594]
 [0.07      ]
 [0.47333297]
 [0.09333331]
 [0.12666662]
 [0.18333323]
 [0.03666667]
 [0.12333328]
 [0.40666637]
 [0.7033328 ]
 [0.8699993 ]
 [0.41333303]
 [0.03333334]
 [0.32666644]
 [0.11999995]
 [0.03      ]
 [0.75666606]
 [0.03333334]
 [0.66666615]
 [0.06666667]
 [0.18666656]
 [0.4666663 ]
 [0.8933326 ]
 [0.02333334]
 [0.07      ]
 [0.01333333]
 [0.00666667]
 [0.06000001]
 [0.03333334]
 [0.31666645]
 [0.11666662]
 [0.10999996]
 [0.23666652]
 [0.03      ]
 [0.99999917]
 [0.02666667]
 [0.03333334]
 [0.10999996]
 [0.02666667]
 [0.97666585]
 [0.84

In [17]:
self_evaluation = ramforest_model.make_inspector().evaluation()
print(f"Accuracy: {self_evaluation.accuracy} Loss:{self_evaluation.loss}")

Accuracy: 0.8595505617977528 Loss:0.5945938356100383


In [26]:
predict_test = ramforest_model.predict(test_ds)
print(predict_test)

[[0.01666667]
 [0.48333296]
 [0.11333329]
 [0.07333333]
 [0.77666605]
 [0.04666667]
 [0.9066659 ]
 [0.10999996]
 [0.9066659 ]
 [0.06666667]
 [0.01333333]
 [0.25999984]
 [0.9933325 ]
 [0.06333334]
 [0.8399993 ]
 [0.81999934]
 [0.11333329]
 [0.17666657]
 [0.7733327 ]
 [0.71999943]
 [0.21666653]
 [0.78999937]
 [0.9499992 ]
 [0.3333331 ]
 [0.97666585]
 [0.05333334]
 [0.99666584]
 [0.16999991]
 [0.11999995]
 [0.31666645]
 [0.07999999]
 [0.07333333]
 [0.78999937]
 [0.11666662]
 [0.8833326 ]
 [0.1466666 ]
 [0.12999995]
 [0.14999993]
 [0.08333332]
 [0.73333275]
 [0.04      ]
 [0.42333302]
 [0.3233331 ]
 [0.9833325 ]
 [0.98666584]
 [0.14333327]
 [0.3533331 ]
 [0.05333334]
 [0.9899992 ]
 [0.6933328 ]
 [0.82333267]
 [0.14999993]
 [0.9933325 ]
 [0.9066659 ]
 [0.09333331]
 [0.05666667]
 [0.00333333]
 [0.06666667]
 [0.04      ]
 [0.99999917]
 [0.00333333]
 [0.15333326]
 [0.06666667]
 [0.9033326 ]
 [0.806666  ]
 [0.93333256]
 [0.78999937]
 [0.19333322]
 [0.6733328 ]
 [0.9566659 ]
 [0.846666  ]
 [0.00

In [27]:
#将概率转换为类别标签,tensorflow的输出方式不支持0，1输出需要手动转换
def convert_probabilities_to_classes(predictions, threshold=0.5):

    print(f"\n== : 阈值转换 (threshold={threshold}) ===")

    # 转换为类别
    predicted_classes = (predictions > threshold).astype(int)

    print(f"概率预测: {predictions.flatten()}")
    print(f"类别预测: {predicted_classes.flatten()}")

    # 显示转换逻辑
    print("\n转换逻辑:")
    for i, (prob, cls) in enumerate(zip(predictions.flatten(), predicted_classes.flatten())):
        print(f"  样本{i+1}: {prob:.3f} → {cls} ({'存活' if cls==1 else '死亡'})")

    return predicted_classes

#转换
converted1 = convert_probabilities_to_classes(predict)
converted2 = convert_probabilities_to_classes(predict_test)


== : 阈值转换 (threshold=0.5) ===
概率预测: [0.20333321 0.12666662 0.83333266 0.00666667 0.99666584 0.9899992
 0.73333275 0.99666584 0.00666667 0.29333314 0.11666662 0.33666644
 0.5733329  0.73333275 0.14999993 0.08666665 0.3766664  0.09999997
 0.02666667 0.94333255 0.43666634 0.19666655 0.01666667 0.01
 0.7133328  0.99999917 0.08333332 0.17333324 0.00333333 0.08333332
 0.37333307 0.88666594 0.07       0.47333297 0.09333331 0.12666662
 0.18333323 0.03666667 0.12333328 0.40666637 0.7033328  0.8699993
 0.41333303 0.03333334 0.32666644 0.11999995 0.03       0.75666606
 0.03333334 0.66666615 0.06666667 0.18666656 0.4666663  0.8933326
 0.02333334 0.07       0.01333333 0.00666667 0.06000001 0.03333334
 0.31666645 0.11666662 0.10999996 0.23666652 0.03       0.99999917
 0.02666667 0.03333334 0.10999996 0.02666667 0.97666585 0.8499993
 0.9733325  0.06666667 0.01666667 0.69999945 0.13666661 0.99999917
 0.04       0.8899993  0.8699993  0.99666584 0.07666666 0.00333333
 0.07999999 0.39666638 0.93333256 0

In [39]:
#修改为提交格式
test = pd.read_csv('test.csv')
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': converted2.flatten()
})

submission.to_csv('submission.csv', index=False)
print("\n 提交文件已保存: submission.csv")


 提交文件已保存: submission.csv
