In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.utils import resample
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, roc_auc_score, log_loss
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import neighbors
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from mlxtend.classifier import StackingClassifier

# 读取数据
titanic = pd.read_csv("./data/train.csv")



In [6]:
print(titanic.columns)
print(titanic.head(20))
print(f"数据总行数: {len(titanic)}")

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
    PassengerId  Survived  Pclass  \
0             1         0       3   
1             2         1       1   
2             3         1       3   
3             4         1       1   
4             5         0       3   
5             6         0       3   
6             7         0       1   
7             8         0       3   
8             9         1       3   
9            10         1       2   
10           11         1       3   
11           12         1       1   
12           13         0       3   
13           14         0       3   
14           15         0       3   
15           16         1       2   
16           17         0       3   
17           18         1       2   
18           19         0       3   
19           20         1       3   

                                                 Name     Sex   Age  

In [8]:
titanic_t = pd.read_csv("./data/test.csv")
print(titanic_t.columns)
print(titanic_t.head(20))
print(f"数据总行数: {len(titanic_t)}")

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
    PassengerId  Pclass                                               Name  \
0           892       3                                   Kelly, Mr. James   
1           893       3                   Wilkes, Mrs. James (Ellen Needs)   
2           894       2                          Myles, Mr. Thomas Francis   
3           895       3                                   Wirz, Mr. Albert   
4           896       3       Hirvonen, Mrs. Alexander (Helga E Lindqvist)   
5           897       3                         Svensson, Mr. Johan Cervin   
6           898       3                               Connolly, Miss. Kate   
7           899       2                       Caldwell, Mr. Albert Francis   
8           900       3          Abrahim, Mrs. Joseph (Sophie Halaut Easu)   
9           901       3                            Davies, Mr. John Samuel   
1

In [2]:
# 缺失值处理
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())
titanic["Cabin"] = titanic["Cabin"].fillna("U")
titanic["Embarked"] = titanic["Embarked"].fillna("S")
titanic["Deck"] = titanic["Cabin"].apply(lambda x: x[0])  # 提取 Deck

# 编码
ordinal_encoder = OrdinalEncoder()
label_encoder = LabelEncoder()
titanic["PassengerId"] = label_encoder.fit_transform(titanic["PassengerId"])
titanic["Ticket"] = ordinal_encoder.fit_transform(titanic[["Ticket"]])
titanic["Sex"] = titanic["Sex"].map({"male": 0, "female": 1})
titanic["Embarked"] = titanic["Embarked"].map({"S": 0, "C": 1, "Q": 2})
titanic["Deck"] = titanic["Deck"].map({
    "A": 0, "B": 0.4, "C": 0.8, "D": 1.2, "E": 1.6,
    "F": 2.0, "G": 2.4, "T": 2.8, "U": 1.5
})

# 删除无关字段
titanic = titanic.drop(["Cabin", "Name"], axis=1)


In [3]:
# 特征与目标
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked", "Deck", "Ticket"]
x_data = titanic[predictors].astype("float64")
y_data = titanic["Survived"]

# 标准化
x_data = StandardScaler().fit_transform(x_data)


In [4]:
print("—————— 单模型部分 ——————")

# 线性回归
lin_reg = LinearRegression()
scores = cross_val_score(lin_reg, x_data, y_data, cv=7)
print(f"线性回归交叉验证得分：{scores.mean():.4f}")

# 逻辑回归
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=0)
clf = LogisticRegression(max_iter=200)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print("逻辑回归模型评估：")
print("召回率：", recall_score(y_test, y_pred))
print("精确率：", precision_score(y_test, y_pred))
print("F1 得分：", f1_score(y_test, y_pred))
print("准确率：", accuracy_score(y_test, y_pred))
print("ROC AUC 得分：", roc_auc_score(y_test, y_pred))
print("对数损失：", log_loss(y_test, y_pred))


—————— 单模型部分 ——————
线性回归交叉验证得分：0.3728
逻辑回归模型评估：
召回率： 0.75
精确率： 0.7425742574257426
F1 得分： 0.746268656716418
准确率： 0.8097014925373134
ROC AUC 得分： 0.7976190476190477
对数损失： 6.859053443451399


In [5]:
# 自助法验证逻辑回归
x_boot, y_boot = resample(x_data, y_data, replace=True, n_samples=100)
LR = LogisticRegression()
LR.fit(x_boot, y_boot)
print(f"自助法的逻辑回归模型的精确度: {LR.score(x_data, y_data):.4f}")


自助法的逻辑回归模型的精确度: 0.7879


In [6]:
# 决策树
dt = DecisionTreeClassifier()
scores = cross_val_score(dt, x_data, y_data, cv=5)
print(f"决策树模型精确度: {scores.mean():.4f}")

# 神经网络
mlp = MLPClassifier(hidden_layer_sizes=(20, 10), max_iter=2000)
scores = cross_val_score(mlp, x_data, y_data, cv=3)
print(f"神经网络模型精确度: {scores.mean():.4f}")

# KNN
knn = neighbors.KNeighborsClassifier(21)
scores = cross_val_score(knn, x_data, y_data, cv=5)
print(f"KNN 模型精确度: {scores.mean():.4f}")

# 随机森林
RF1 = RandomForestClassifier(random_state=1, n_estimators=100)
scores = cross_val_score(RF1, x_data, y_data, cv=3)
print(f"随机森林模型精确度: {scores.mean():.4f}")


决策树模型精确度: 0.7958




神经网络模型精确度: 0.7744
KNN 模型精确度: 0.7980
随机森林模型精确度: 0.8260


In [7]:
# SVM 核函数对比
svms = {
    "线性核": SVC(kernel='linear', C=1, gamma='auto'),
    "多项式核": SVC(kernel='poly', degree=3, C=1, gamma='auto'),
    "RBF 核": SVC(kernel='rbf', C=1, gamma='auto'),
    "Sigmoid 核": SVC(kernel='sigmoid', C=1, gamma='auto')
}
for name, model in svms.items():
    score = cross_val_score(model, x_data, y_data, cv=5).mean()
    print(f"{name} SVM 模型精确度: {score:.4f}")


线性核 SVM 模型精确度: 0.7879
多项式核 SVM 模型精确度: 0.8103
RBF 核 SVM 模型精确度: 0.8193
Sigmoid 核 SVM 模型精确度: 0.7093


In [46]:
print("\n—————— 集成模型部分 ——————")

# Bagging (RandomForest)
bagging_clf_rf = BaggingClassifier(RF1, n_estimators=20)
scores = cross_val_score(bagging_clf_rf, x_data, y_data, cv=3)
print(f"Bagging 模型（随机森林）精确度: {scores.mean():.4f}")

# Bagging (KNN)
bagging_clf_knn = BaggingClassifier(knn, n_estimators=10)
scores = cross_val_score(bagging_clf_knn, x_data, y_data, cv=3)
print(f"Bagging 模型（KNN）精确度: {scores.mean():.4f}")

# AdaBoost (决策树)
dt_stump = DecisionTreeClassifier(max_depth=1)
adaboost_dt = AdaBoostClassifier(estimator=dt_stump, n_estimators=10)

scores = cross_val_score(adaboost_dt, x_data, y_data, cv=5)
print(f"AdaBoost 模型（决策树）精确度: {scores.mean():.4f}")



—————— 集成模型部分 ——————
Bagging 模型（随机森林）精确度: 0.8204
Bagging 模型（KNN）精确度: 0.8047
AdaBoost 模型（决策树）精确度: 0.7992


In [9]:
# Stacking
bagging_rf = BaggingClassifier(RF1, n_estimators=20)
adaboost_bagging = AdaBoostClassifier(estimator=bagging_rf, n_estimators=10)
rbf_svm = SVC(kernel='rbf', C=1, gamma='auto', probability=True)
lr_base = LogisticRegression()

sclf = StackingClassifier(
    classifiers=[bagging_rf, adaboost_bagging, rbf_svm, lr_base],
    meta_classifier=LogisticRegression(),
    use_probas=True,
    average_probas=False
)

scores = cross_val_score(sclf, x_data, y_data, cv=3)
print(f"Stacking 模型精确度: {scores.mean():.4f}")


Stacking 模型精确度: 0.8171


In [59]:
print(f"线性回归交叉验证得分：{scores.mean():.4f}")
print("逻辑回归模型评估：")
print("召回率：", recall_score(y_test, y_pred))
print("精确率：", precision_score(y_test, y_pred))
print("F1 得分：", f1_score(y_test, y_pred))
print("准确率：", accuracy_score(y_test, y_pred))
print("ROC AUC 得分：", roc_auc_score(y_test, y_pred))
print("对数损失：", log_loss(y_test, y_pred))
print(f"自助法的逻辑回归模型的精确度: {LR.score(x_data, y_data):.4f}")
print(f"决策树模型精确度: {scores.mean():.4f}")
print(f"神经网络模型精确度: {scores.mean():.4f}")
print(f"KNN 模型精确度: {scores.mean():.4f}")
print(f"随机森林模型精确度: {scores.mean():.4f}")
for name, model in svms.items():
    score = cross_val_score(model, x_data, y_data, cv=5).mean()
    print(f"{name} SVM 模型精确度: {score:.4f}")
print("\n—————— 集成模型部分 ——————")

# Bagging (RandomForest)
bagging_clf_rf = BaggingClassifier(RF1, n_estimators=20)
scores = cross_val_score(bagging_clf_rf, x_data, y_data, cv=3)
print(f"Bagging 模型（随机森林）精确度: {scores.mean():.4f}")

# Bagging (KNN)
bagging_clf_knn = BaggingClassifier(knn, n_estimators=10)
scores = cross_val_score(bagging_clf_knn, x_data, y_data, cv=3)
print(f"Bagging 模型（KNN）精确度: {scores.mean():.4f}")

# AdaBoost (决策树)
dt_stump = DecisionTreeClassifier(max_depth=1)
adaboost_dt = AdaBoostClassifier(estimator=dt_stump, n_estimators=10)

scores = cross_val_score(adaboost_dt, x_data, y_data, cv=5)
print(f"AdaBoost 模型（决策树）精确度: {scores.mean():.4f}")


print(f"Stacking 模型精确度: {scores.mean():.4f}")

线性回归交叉验证得分：0.7992
逻辑回归模型评估：
召回率： 0.75
精确率： 0.7425742574257426
F1 得分： 0.746268656716418
准确率： 0.8097014925373134
ROC AUC 得分： 0.7976190476190477
对数损失： 6.859053443451399
自助法的逻辑回归模型的精确度: 0.7879
决策树模型精确度: 0.7992
神经网络模型精确度: 0.7992
KNN 模型精确度: 0.7992
随机森林模型精确度: 0.7992
线性核 SVM 模型精确度: 0.7879
多项式核 SVM 模型精确度: 0.8103
RBF 核 SVM 模型精确度: 0.8193
Sigmoid 核 SVM 模型精确度: 0.7093

—————— 集成模型部分 ——————
Bagging 模型（随机森林）精确度: 0.8272
Bagging 模型（KNN）精确度: 0.8013
AdaBoost 模型（决策树）精确度: 0.7992
Stacking 模型精确度: 0.7992


In [60]:
import pandas as pd

# 读取 CSV 文件
file_path = "./data/titanic_submission_optimized.csv"
data = pd.read_csv(file_path)

# 打印内容
print(data)

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         0
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         0

[418 rows x 2 columns]
