### AdaBoost

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import learning_curve
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, precision_recall_curve, classification_report
from sklearn.metrics import classification_report
# Load the data set, drop the 'id' and 'Unnamed: 32' columns, and encode the
# diagnosis label as 1 for 'M' and 0 for 'B' (presumably malignant / benign).
df = pd.read_csv('breast cancer.csv')
df = df.drop(['id', 'Unnamed: 32'], axis=1)
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})
X = df.drop('diagnosis', axis=1)
y = df['diagnosis']
# Use a font that can render the Chinese plot labels (SimHei).
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False    # keep the minus sign rendering correctly with this font
# Split first, then standardise. The scaler is fitted on the training split
# only, so the mean/variance of the held-out rows never influence training
# (fitting it on the full matrix before splitting leaks test-set statistics).
# The split depends only on the row count and random_state, so the same rows
# end up in each partition as when splitting the already-scaled matrix.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Keep `X` as a standardised array, now scaled with the train-only statistics.
X = scaler.transform(X)

print("数据预处理完成：X_train shape:", X_train.shape, "X_test shape:", X_test.shape)
print("训练集正负类分布:", pd.Series(y_train).value_counts())
print("测试集正负类分布:", pd.Series(y_test).value_counts())

数据预处理完成：X_train shape: (455, 30) X_test shape: (114, 30)
训练集正负类分布: diagnosis
0    286
1    169
Name: count, dtype: int64
测试集正负类分布: diagnosis
0    71
1    43
Name: count, dtype: int64


In [2]:
# Build the AdaBoost classifier and fit it on the training split in one step
# (`fit` returns the fitted estimator itself).
adaboost_model = AdaBoostClassifier(
    n_estimators=200,
    learning_rate=0.1,
    algorithm='SAMME',
    random_state=42,
).fit(X_train, y_train)

# Hard class predictions and the predicted probability of class 1 on the test split.
y_pred_adaboost = adaboost_model.predict(X_test)
y_prob_adaboost = adaboost_model.predict_proba(X_test)[:, 1]

# Test-set accuracy followed by the per-class classification report.
accuracy_adaboost = accuracy_score(y_test, y_pred_adaboost)
print(f"AdaBoost 准确率: {accuracy_adaboost:.4f}")
print("AdaBoost 分类报告:", classification_report(y_test, y_pred_adaboost), sep="\n")

AdaBoost 准确率: 0.9649
AdaBoost 分类报告:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97        71
           1       0.98      0.93      0.95        43

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114



In [None]:
# Both figures of this cell are written below this directory.
os.makedirs('picture/AdaBoost', exist_ok=True)

# Confusion matrix of the test-set predictions, drawn as an annotated heat map.
cm_adaboost = confusion_matrix(y_test, y_pred_adaboost)
fig_cm, ax_cm = plt.subplots(figsize=(6, 5))
sns.heatmap(cm_adaboost, annot=True, fmt='d', cmap='Blues', ax=ax_cm)
ax_cm.set_title('AdaBoost 混淆矩阵-宋傲操')
ax_cm.set_xlabel('预测标签')
ax_cm.set_ylabel('真实标签')
fig_cm.savefig('picture/AdaBoost/confusion_matrix.png')
plt.show()

# ROC curve built from the class-1 probabilities, with its area under the curve.
fpr_adaboost, tpr_adaboost, _ = roc_curve(y_test, y_prob_adaboost)
roc_auc_adaboost = auc(fpr_adaboost, tpr_adaboost)
fig_roc, ax_roc = plt.subplots(figsize=(6, 5))
ax_roc.plot(fpr_adaboost, tpr_adaboost, label=f'AdaBoost (AUC = {roc_auc_adaboost:.2f})')
ax_roc.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
ax_roc.set_xlabel('假阳性率')
ax_roc.set_ylabel('真阳性率')
ax_roc.set_title('AdaBoost ROC 曲线-宋傲操')
ax_roc.legend(loc='lower right')
ax_roc.grid(True)
fig_roc.savefig('picture/AdaBoost/roc_curve.png')
plt.show()

In [None]:
# Learning curve: 5-fold cross-validated scores of the AdaBoost model on
# growing subsets of the training split.
train_sizes, train_scores, val_scores = learning_curve(adaboost_model, X_train, y_train, cv=5, n_jobs=-1)
# Average the per-fold scores for each training-set size.
train_mean = np.mean(train_scores, axis=1)
val_mean = np.mean(val_scores, axis=1)

fig_lc, ax_lc = plt.subplots(figsize=(8, 6))
ax_lc.plot(train_sizes, train_mean, label='Training Score')
ax_lc.plot(train_sizes, val_mean, label='Validation Score')
ax_lc.set_title('AdaBoost 学习曲线-宋傲操')
ax_lc.set_xlabel('Training Examples')
ax_lc.set_ylabel('Score')
ax_lc.legend(loc='best')
ax_lc.grid(True)
# Create the output directory here as well, so this cell does not depend on
# the confusion-matrix cell having been run first.
os.makedirs('picture/AdaBoost', exist_ok=True)
# NOTE(review): this file name contains a space, unlike the other saved
# figures; it is kept unchanged so existing references to it stay valid.
fig_lc.savefig('picture/AdaBoost/learning curve.png')
plt.show()

### CatBoost

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, precision_recall_curve, classification_report
# Load the data set, drop the 'id' and 'Unnamed: 32' columns, and encode the
# diagnosis label as 1 for 'M' and 0 for 'B' (presumably malignant / benign).
df = pd.read_csv('breast cancer.csv')
df = df.drop(['id', 'Unnamed: 32'], axis=1)
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})
X = df.drop('diagnosis', axis=1)
y = df['diagnosis']
# Use a font that can render the Chinese plot labels (SimHei).
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False    # keep the minus sign rendering correctly with this font
# Split first, then standardise. The scaler is fitted on the training split
# only, so the mean/variance of the held-out rows never influence training
# (fitting it on the full matrix before splitting leaks test-set statistics).
# The split depends only on the row count and random_state, so the same rows
# end up in each partition as when splitting the already-scaled matrix.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Keep `X` as a standardised array, now scaled with the train-only statistics.
X = scaler.transform(X)

print("数据预处理完成：X_train shape:", X_train.shape, "X_test shape:", X_test.shape)
print("训练集正负类分布:", pd.Series(y_train).value_counts())
print("测试集正负类分布:", pd.Series(y_test).value_counts())

In [None]:
from catboost import CatBoostClassifier
# Define the CatBoost model.
catboost_model = CatBoostClassifier(l2_leaf_reg=3, depth=5, learning_rate=0.01, iterations=200, verbose=0, random_state=42)

# Train the model. The eval_set is passed only so that the per-iteration
# log-loss is recorded for the loss-curve plot. Because that eval_set is the
# test split, `use_best_model` is switched off explicitly: when an eval_set is
# given CatBoost otherwise defaults to keeping only the iterations that score
# best on it, which would let the test data choose the final model size.
catboost_model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=False, verbose=False)

# Hard class predictions and the predicted probability of class 1 on the test split.
y_pred_catboost = catboost_model.predict(X_test)
y_prob_catboost = catboost_model.predict_proba(X_test)[:, 1]

# Test-set accuracy followed by the per-class classification report.
accuracy_catboost = accuracy_score(y_test, y_pred_catboost)
print(f"CatBoost 准确率: {accuracy_catboost:.4f}")
print("catboost 分类报告:")
print(classification_report(y_test, y_pred_catboost))

In [None]:
# `os` is not part of this section's import cell; import it here so the
# CatBoost section also runs when the AdaBoost cells have not been executed.
import os

# Both figures of this cell are written below this directory.
os.makedirs('picture/CatBoost', exist_ok=True)

# Confusion matrix of the test-set predictions, drawn as an annotated heat map.
cm_catboost = confusion_matrix(y_test, y_pred_catboost)
fig_cm, ax_cm = plt.subplots(figsize=(6, 5))
sns.heatmap(cm_catboost, annot=True, fmt='d', cmap='Blues', ax=ax_cm)
ax_cm.set_title('CatBoost 混淆矩阵-宋傲操')
ax_cm.set_xlabel('预测标签')
ax_cm.set_ylabel('真实标签')
fig_cm.savefig('picture/CatBoost/confusion_matrix.png')
plt.show()

# ROC curve built from the class-1 probabilities, with its area under the curve.
fpr_catboost, tpr_catboost, _ = roc_curve(y_test, y_prob_catboost)
roc_auc_catboost = auc(fpr_catboost, tpr_catboost)
fig_roc, ax_roc = plt.subplots(figsize=(6, 5))
ax_roc.plot(fpr_catboost, tpr_catboost, label=f'CatBoost (AUC = {roc_auc_catboost:.2f})')
ax_roc.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
ax_roc.set_xlabel('假阳性率')
ax_roc.set_ylabel('真阳性率')
ax_roc.set_title('CatBoost ROC 曲线-宋傲操')
ax_roc.legend(loc='lower right')
ax_roc.grid(True)
fig_roc.savefig('picture/CatBoost/roc_curve.png')
plt.show()

In [None]:
# Loss curve: per-iteration log-loss that CatBoost recorded on the eval_set
# passed to `fit` (in this notebook that eval_set is the test split).
import os  # not imported by this section's import cell

results_dict = catboost_model.get_evals_result()
fig_loss, ax_loss = plt.subplots(figsize=(8, 6))
ax_loss.plot(results_dict['validation']['Logloss'], label='Validation Loss')
ax_loss.set_title('CatBoost 损失曲线-宋傲操')
ax_loss.set_xlabel('Iteration')
ax_loss.set_ylabel('Log Loss')
ax_loss.legend()
ax_loss.grid(True)
# Create the output directory here as well, so this cell does not depend on
# the confusion-matrix cell having been run first.
os.makedirs('picture/CatBoost', exist_ok=True)
fig_loss.savefig('picture/CatBoost/loss_curve.png')
plt.show()

### Elastic Net Logistic Regression

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import learning_curve
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, precision_recall_curve, classification_report
# Load the data set, drop the 'id' and 'Unnamed: 32' columns, and encode the
# diagnosis label as 1 for 'M' and 0 for 'B' (presumably malignant / benign).
df = pd.read_csv('breast cancer.csv')
df = df.drop(['id', 'Unnamed: 32'], axis=1)
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})
X = df.drop('diagnosis', axis=1)
y = df['diagnosis']
# Use a font that can render the Chinese plot labels (SimHei).
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False    # keep the minus sign rendering correctly with this font
# Split first, then standardise. The scaler is fitted on the training split
# only, so the mean/variance of the held-out rows never influence training
# (fitting it on the full matrix before splitting leaks test-set statistics).
# The split depends only on the row count and random_state, so the same rows
# end up in each partition as when splitting the already-scaled matrix.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Keep `X` as a standardised array, now scaled with the train-only statistics.
X = scaler.transform(X)

print("数据预处理完成：X_train shape:", X_train.shape, "X_test shape:", X_test.shape)
print("训练集正负类分布:", pd.Series(y_train).value_counts())
print("测试集正负类分布:", pd.Series(y_test).value_counts())

In [None]:
# Train the Elastic Net logistic regression model.
# It is an SGD-trained linear classifier with log-loss and an elastic-net
# penalty (equal L1/L2 mix); `fit` returns the fitted estimator itself.
elasticnet_lr_model = SGDClassifier(
    loss='log_loss',
    penalty='elasticnet',
    alpha=0.0001,
    l1_ratio=0.5,
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

# Hard class predictions and the predicted probability of class 1 on the test split.
y_pred_elasticnet = elasticnet_lr_model.predict(X_test)
y_prob_elasticnet = elasticnet_lr_model.predict_proba(X_test)[:, 1]

# Test-set accuracy followed by the per-class classification report.
accuracy_elasticnet = accuracy_score(y_test, y_pred_elasticnet)
print(f"Elastic Net LR 准确率: {accuracy_elasticnet:.4f}")
print(" Elastic Net LR分类报告:", classification_report(y_test, y_pred_elasticnet), sep="\n")

In [None]:
# `os` is not part of this section's import cell; import it here so the
# Elastic Net section also runs when the AdaBoost cells have not been executed.
import os

# Both figures of this cell are written below this directory.
os.makedirs('picture/Elastic Net logistic Regression', exist_ok=True)

# Confusion matrix of the test-set predictions, drawn as an annotated heat map.
cm_elasticnet = confusion_matrix(y_test, y_pred_elasticnet)
fig_cm, ax_cm = plt.subplots(figsize=(6, 5))
sns.heatmap(cm_elasticnet, annot=True, fmt='d', cmap='Blues', ax=ax_cm)
ax_cm.set_title('Elastic Net LR 混淆矩阵-宋傲操')
ax_cm.set_xlabel('预测标签')
ax_cm.set_ylabel('真实标签')
fig_cm.savefig('picture/Elastic Net logistic Regression/confusion_matrix.png')
plt.show()

# ROC curve built from the class-1 probabilities, with its area under the curve.
fpr_elasticnet, tpr_elasticnet, _ = roc_curve(y_test, y_prob_elasticnet)
roc_auc_elasticnet = auc(fpr_elasticnet, tpr_elasticnet)
fig_roc, ax_roc = plt.subplots(figsize=(6, 5))
ax_roc.plot(fpr_elasticnet, tpr_elasticnet, label=f'Elastic Net LR (AUC = {roc_auc_elasticnet:.2f})')
ax_roc.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
ax_roc.set_xlabel('假阳性率')
ax_roc.set_ylabel('真阳性率')
ax_roc.set_title('Elastic Net LR ROC 曲线-宋傲操')
ax_roc.legend(loc='lower right')
ax_roc.grid(True)
fig_roc.savefig('picture/Elastic Net logistic Regression/roc_curve.png')
plt.show()

In [None]:
# Learning curve: 5-fold cross-validated scores of the Elastic Net model on
# growing subsets of the training split.
import os  # not imported by this section's import cell

train_sizes, train_scores, val_scores = learning_curve(elasticnet_lr_model, X_train, y_train, cv=5, n_jobs=-1)
# Average the per-fold scores for each training-set size.
train_mean = np.mean(train_scores, axis=1)
val_mean = np.mean(val_scores, axis=1)

fig_lc, ax_lc = plt.subplots(figsize=(8, 6))
ax_lc.plot(train_sizes, train_mean, label='Training Score')
ax_lc.plot(train_sizes, val_mean, label='Validation Score')
ax_lc.set_title('Elastic Net LR 学习曲线-宋傲操')
ax_lc.set_xlabel('Training Examples')
ax_lc.set_ylabel('Score')
ax_lc.legend(loc='best')
ax_lc.grid(True)
# Create the output directory here as well, so this cell does not depend on
# the confusion-matrix cell having been run first.
os.makedirs('picture/Elastic Net logistic Regression', exist_ok=True)
fig_lc.savefig('picture/Elastic Net logistic Regression/learning_curve.png')
plt.show()