In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.preprocessing import LabelEncoder

# 載入資料
file_path = r"C:\Users\90607\Downloads\titanic\train.csv"
titanic_data = pd.read_csv(file_path)

titanic_data['Age'].fillna(titanic_data['Age'].median(), inplace=True)
titanic_data['Embarked'].fillna(titanic_data['Embarked'].mode()[0], inplace=True)

# 編碼類別變數
label_encoders = {}
for col in ['Sex', 'Embarked']:
    le = LabelEncoder()
    titanic_data[col] = le.fit_transform(titanic_data[col])
    label_encoders[col] = le

# 選擇特徵
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = titanic_data[features]
y = titanic_data['Survived']

# 分割資料為訓練集與測試集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [40]:
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import numpy as np

# Decision Tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
train_acc_dt = accuracy_score(y_train, dt.predict(X_train))
test_acc_dt = accuracy_score(y_test, dt.predict(X_test))

# Bagging
bagging = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=100, random_state=42)
bagging.fit(X_train, y_train)
train_acc_bagging = accuracy_score(y_train, bagging.predict(X_train))
test_acc_bagging = accuracy_score(y_test, bagging.predict(X_test))

# AdaBoost
adaboost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), n_estimators=200, random_state=42)
adaboost.fit(X_train, y_train)
train_acc_adaboost = accuracy_score(y_train, adaboost.predict(X_train))
test_acc_adaboost = accuracy_score(y_test, adaboost.predict(X_test))

# Results with training and testing accuracy
results_with_train_test = pd.DataFrame({
    "Model": ["Decision Tree", "Bagging", "AdaBoost"],
    "Train Accuracy": [train_acc_dt, train_acc_bagging, train_acc_adaboost],
    "Test Accuracy": [test_acc_dt, test_acc_bagging, test_acc_adaboost]
})
results_with_train_test




Unnamed: 0,Model,Train Accuracy,Test Accuracy
0,Decision Tree,0.980738,0.753731
1,Bagging,0.980738,0.764925
2,AdaBoost,0.863563,0.809701


In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE

models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42,max_depth=3),
    
    "Bagging": BaggingClassifier(
        base_estimator=DecisionTreeClassifier(random_state=42,max_depth=3),
        oob_score=True,
        n_estimators=300,
        max_samples=0.8,
        max_features=0.8,
        bootstrap=True,
        random_state=42
    ),
    "AdaBoost": AdaBoostClassifier(
        base_estimator=DecisionTreeClassifier(random_state=42,max_depth=1),
        n_estimators=200,
        learning_rate=0.1,
        random_state=42
    )
}

results = []
# Fit each model before evaluating
for name, model in models.items():
    # Fit the model
    model.fit(X_train, y_train)
    
    # Make predictions
    preds = model.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    
    # Calculate AUC if the model supports `predict_proba`
    if hasattr(model, "predict_proba"):
        auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    else:
        auc = "N/A"
    
    # Confusion Matrix
    conf_matrix = confusion_matrix(y_test, preds)
    
    # Store the results
    results.append({"Model": name, "Accuracy": accuracy, "AUC": auc, "Confusion Matrix": conf_matrix})

# Print results
print("模型表現比較:")
for result in results:
    print(f"模型: {result['Model']}")
    print(f"準確率 (Accuracy): {result['Accuracy']:.3f}")
    print(f"AUC: {result['AUC']}")
    print(f"混淆矩陣:\n{result['Confusion Matrix']}\n")

# Compute training and testing scores
train_scores = {name: model.score(X_train, y_train) for name, model in models.items()}
test_scores = {name: model.score(X_test, y_test) for name, model in models.items()}

# Print train and test scores
for model_name in train_scores:
    print(f"{model_name}: training score = {train_scores[model_name]:.3f}, testing score = {test_scores[model_name]:.3f}")




模型表現比較:
模型: Decision Tree
準確率 (Accuracy): 0.810
AUC: 0.820307568715212
混淆矩陣:
[[139  18]
 [ 33  78]]

模型: Bagging
準確率 (Accuracy): 0.817
AUC: 0.8787226717163024
混淆矩陣:
[[147  10]
 [ 39  72]]

模型: AdaBoost
準確率 (Accuracy): 0.802
AUC: 0.8670740804498767
混淆矩陣:
[[134  23]
 [ 30  81]]

Decision Tree: training score = 0.831, testing score = 0.810
Bagging: training score = 0.844, testing score = 0.817
AdaBoost: training score = 0.844, testing score = 0.802


In [20]:
train_errors = []
test_errors = []

for n_estimators in range(1, 1001):  # 與 n_estimators 一致
    adaboost_clf = AdaBoostClassifier(
        base_estimator=DecisionTreeClassifier(max_depth=1),
        n_estimators=n_estimators,
        random_state=42
    )
    adaboost_clf.fit(X_train, y_train)

    train_pred = adaboost_clf.predict(X_train)
    train_error = 1 - accuracy_score(y_train, train_pred)
    train_errors.append(train_error)

    test_pred = adaboost_clf.predict(X_test)
    test_error = 1 - accuracy_score(y_test, test_pred)
    test_errors.append(test_error)

plt.figure(figsize=(10, 6))
plt.plot(range(1, 201), train_errors, label="Training Error", linestyle='--')
plt.plot(range(1, 201), test_errors, label="Testing Error", linestyle='-')
plt.xlabel("Number of Estimators")
plt.ylabel("Error Rate")
plt.title("Error Rate vs. Number of Estimators in AdaBoost")
plt.legend()
plt.grid()
plt.show()



KeyboardInterrupt: 