In [21]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from ucimlrepo import fetch_ucirepo 

In [22]:
# Load the data: fetch dataset id=17 through ucimlrepo and combine its feature
# and target tables into one frame.
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)
X = breast_cancer_wisconsin_diagnostic.data.features
y = breast_cancer_wisconsin_diagnostic.data.targets
if isinstance(y, pd.Series):
    # A bare Series is attached under 'Diagnosis', the column name that the
    # later encoding and feature/label split cells look up. (Storing it as
    # 'target' would make those cells fail with a KeyError.)
    data = X.copy()
    data['Diagnosis'] = y
else:
    # A DataFrame of targets keeps its own column name(s).
    data = pd.concat([X, y], axis=1)

In [23]:
# Drop rows that contain missing values.
# The total count of null cells is positive exactly when at least one value is missing.
if data.isnull().sum().sum() > 0:
    print("There are missing values in the dataset.")
    data = data.dropna(axis=0, how='any')

In [24]:
# Preview the first rows of the combined frame.
print(data.head())

   radius1  texture1  perimeter1   area1  smoothness1  compactness1  \
0    17.99     10.38      122.80  1001.0      0.11840       0.27760   
1    20.57     17.77      132.90  1326.0      0.08474       0.07864   
2    19.69     21.25      130.00  1203.0      0.10960       0.15990   
3    11.42     20.38       77.58   386.1      0.14250       0.28390   
4    20.29     14.34      135.10  1297.0      0.10030       0.13280   

   concavity1  concave_points1  symmetry1  fractal_dimension1  ...  texture3  \
0      0.3001          0.14710     0.2419             0.07871  ...     17.33   
1      0.0869          0.07017     0.1812             0.05667  ...     23.41   
2      0.1974          0.12790     0.2069             0.05999  ...     25.53   
3      0.2414          0.10520     0.2597             0.09744  ...     26.50   
4      0.1980          0.10430     0.1809             0.05883  ...     16.67   

   perimeter3   area3  smoothness3  compactness3  concavity3  concave_points3  \
0      184.

In [25]:
# Show the column labels to confirm the name of the target column.
print(data.columns)

Index(['radius1', 'texture1', 'perimeter1', 'area1', 'smoothness1',
       'compactness1', 'concavity1', 'concave_points1', 'symmetry1',
       'fractal_dimension1', 'radius2', 'texture2', 'perimeter2', 'area2',
       'smoothness2', 'compactness2', 'concavity2', 'concave_points2',
       'symmetry2', 'fractal_dimension2', 'radius3', 'texture3', 'perimeter3',
       'area3', 'smoothness3', 'compactness3', 'concavity3', 'concave_points3',
       'symmetry3', 'fractal_dimension3', 'Diagnosis'],
      dtype='object')


In [26]:
# Data transformation: encode the label as integers ('M' -> 1, 'B' -> 0).
# The identity entries (1 -> 1, 0 -> 0) keep the cell idempotent: Series.map
# turns every value missing from the mapping into NaN, so without them a
# second run of this cell would wipe out the already-encoded labels.
data['Diagnosis'] = data['Diagnosis'].map({'M': 1, 'B': 0, 1: 1, 0: 0})

In [27]:
# Data preprocessing: separate the features from the label, hold out 30% of
# the rows as a test set, then standardise the features.
X = data.drop(columns='Diagnosis')
y = data['Diagnosis']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# The scaler is fitted on the training rows only and then applied to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [28]:
# PCA dimensionality reduction: one projection onto the first component and
# one onto the first two components.
pca_1, pca_2 = PCA(n_components=1), PCA(n_components=2)

# Each projection is fitted on the scaled training split ...
X_train_pca_1 = pca_1.fit_transform(X_train_scaled)
X_train_pca_2 = pca_2.fit_transform(X_train_scaled)

# ... and then applied unchanged to the scaled test split.
X_test_pca_1 = pca_1.transform(X_test_scaled)
X_test_pca_2 = pca_2.transform(X_test_scaled)

In [29]:
# Train the decision trees: three classifiers with identical hyper-parameters,
# one per feature representation (scaled original, 1 PCA component, 2 PCA components).
decision_tree_original = DecisionTreeClassifier(
    criterion='gini',
    max_depth=2,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
)
decision_tree_pca_1 = DecisionTreeClassifier(
    criterion='gini',
    max_depth=2,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
)
decision_tree_pca_2 = DecisionTreeClassifier(
    criterion='gini',
    max_depth=2,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
)

decision_tree_original.fit(X_train_scaled, y_train)
decision_tree_pca_1.fit(X_train_pca_1, y_train)
decision_tree_pca_2.fit(X_train_pca_2, y_train)

In [30]:
# 8. Model prediction and evaluation
y_pred_original = decision_tree_original.predict(X_test_scaled)
y_pred_pca_1 = decision_tree_pca_1.predict(X_test_pca_1)
y_pred_pca_2 = decision_tree_pca_2.predict(X_test_pca_2)

# For every model compute F1, precision and recall (in that order) against
# the test labels.
f1_original, precision_original, recall_original = [
    scorer(y_test, y_pred_original)
    for scorer in (f1_score, precision_score, recall_score)
]

f1_pca_1, precision_pca_1, recall_pca_1 = [
    scorer(y_test, y_pred_pca_1)
    for scorer in (f1_score, precision_score, recall_score)
]

f1_pca_2, precision_pca_2, recall_pca_2 = [
    scorer(y_test, y_pred_pca_2)
    for scorer in (f1_score, precision_score, recall_score)
]

In [31]:
# Collect the scores column-wise: a 'Metric' label column followed by one
# column per feature representation, each ordered F1, precision, recall.
res = {'Metric': ['F1 Score', 'Precision', 'Recall']}
res['Original Data'] = [f1_original, precision_original, recall_original]
res['PCA 1st Component'] = [f1_pca_1, precision_pca_1, recall_pca_1]
res['PCA 1st and 2nd Components'] = [f1_pca_2, precision_pca_2, recall_pca_2]

In [32]:
# Tabulate the collected scores (one row per metric) and print the table.
df_res=pd.DataFrame(res)
print(df_res)

      Metric  Original Data  PCA 1st Component  PCA 1st and 2nd Components
0   F1 Score       0.904762           0.899225                    0.885246
1  Precision       0.904762           0.878788                    0.915254
2     Recall       0.904762           0.920635                    0.857143


In [33]:
# Report every score on its own line, grouped by feature representation.
print(
    f"F1 Score (Original Data): {f1_original}",
    f"Precision (Original Data): {precision_original}",
    f"Recall (Original Data): {recall_original}",
    f"F1 Score (PCA 1st Component): {f1_pca_1}",
    f"Precision (PCA 1st Component): {precision_pca_1}",
    f"Recall (PCA 1st Component): {recall_pca_1}",
    f"F1 Score (PCA 1st and 2nd Components): {f1_pca_2}",
    f"Precision (PCA 1st and 2nd Components): {precision_pca_2}",
    f"Recall (PCA 1st and 2nd Components): {recall_pca_2}",
    sep="\n",
)

F1 Score (Original Data): 0.9047619047619048
Precision (Original Data): 0.9047619047619048
Recall (Original Data): 0.9047619047619048
F1 Score (PCA 1st Component): 0.8992248062015504
Precision (PCA 1st Component): 0.8787878787878788
Recall (PCA 1st Component): 0.9206349206349206
F1 Score (PCA 1st and 2nd Components): 0.8852459016393442
Precision (PCA 1st and 2nd Components): 0.9152542372881356
Recall (PCA 1st and 2nd Components): 0.8571428571428571


In [34]:
# 9. Confusion matrix, FPR and TPR
def extract_metrics(cm):
    """Derive FP, TP, FPR and TPR from a binary confusion matrix.

    The matrix is flattened row by row and read as TN, FP, FN, TP. That is the
    layout scikit-learn's ``confusion_matrix`` produces for labels ordered
    ``[0, 1]`` (rows are the true classes, columns the predicted classes),
    which is why a plain ``ravel()`` is enough to unpack the four counts.

    Parameters
    ----------
    cm : array exposing ``ravel()``
        Confusion matrix holding exactly four counts (2x2).

    Returns
    -------
    tuple
        ``(FP, TP, FPR, TPR)`` where ``FPR = FP / (FP + TN)`` and
        ``TPR = TP / (TP + FN)``. A rate whose denominator is zero is
        undefined and is returned as ``nan``.

    Raises
    ------
    ValueError
        If ``cm`` does not hold exactly four entries, e.g. when only one class
        occurs in the labels and the matrix collapses to 1x1. Passing
        ``labels=[0, 1]`` to ``confusion_matrix`` avoids that case.
    """
    counts = cm.ravel()
    if len(counts) != 4:
        raise ValueError(
            "extract_metrics expects a 2x2 confusion matrix, "
            f"got shape {getattr(cm, 'shape', None)}"
        )
    TN, FP, FN, TP = counts

    negatives = FP + TN  # number of samples whose true class is 0
    positives = TP + FN  # number of samples whose true class is 1

    # Guard the divisions explicitly so an empty class yields nan instead of a
    # runtime warning (NumPy scalars) or ZeroDivisionError (plain ints).
    FPR = FP / negatives if negatives else float("nan")
    TPR = TP / positives if positives else float("nan")
    return FP, TP, FPR, TPR

In [35]:
# Confusion matrix of each model's predictions against the test labels.
cm_original, cm_pca_1, cm_pca_2 = [
    confusion_matrix(y_test, predictions)
    for predictions in (y_pred_original, y_pred_pca_1, y_pred_pca_2)
]

# Unpack FP, TP, FPR and TPR for every model.
(FP_original, TP_original, FPR_original, TPR_original) = extract_metrics(cm_original)
(FP_pca_1, TP_pca_1, FPR_pca_1, TPR_pca_1) = extract_metrics(cm_pca_1)
(FP_pca_2, TP_pca_2, FPR_pca_2, TPR_pca_2) = extract_metrics(cm_pca_2)

In [36]:
# Collect the confusion-matrix results column-wise. Each matrix is stored as
# its string form so that it fits into a single table cell.
res_cm = {'Metric': ['Confusion Matrix', 'FP', 'TP', 'FPR', 'TPR']}
res_cm['Original Data'] = [
    str(cm_original), FP_original, TP_original, FPR_original, TPR_original,
]
res_cm['PCA 1st Component'] = [
    str(cm_pca_1), FP_pca_1, TP_pca_1, FPR_pca_1, TPR_pca_1,
]
res_cm['PCA 1st and 2nd Components'] = [
    str(cm_pca_2), FP_pca_2, TP_pca_2, FPR_pca_2, TPR_pca_2,
]

In [37]:
# Build the confusion-matrix summary table; as the cell's last expression it is displayed.
pd.DataFrame(res_cm)

Unnamed: 0,Metric,Original Data,PCA 1st Component,PCA 1st and 2nd Components
0,Confusion Matrix,[[102 6]\n [ 6 57]],[[100 8]\n [ 5 58]],[[103 5]\n [ 9 54]]
1,FP,6,8,5
2,TP,57,58,54
3,FPR,0.055556,0.074074,0.046296
4,TPR,0.904762,0.920635,0.857143


In [38]:
# Report each model's confusion matrix followed by its FP / TP counts and rates.
print(
    f"Confusion Matrix (Original Data):\n{cm_original}",
    f"FP: {FP_original}, TP: {TP_original}, FPR: {FPR_original}, TPR: {TPR_original}",
    f"Confusion Matrix (PCA 1st Component):\n{cm_pca_1}",
    f"FP: {FP_pca_1}, TP: {TP_pca_1}, FPR: {FPR_pca_1}, TPR: {TPR_pca_1}",
    f"Confusion Matrix (PCA 1st and 2nd Components):\n{cm_pca_2}",
    f"FP: {FP_pca_2}, TP: {TP_pca_2}, FPR: {FPR_pca_2}, TPR: {TPR_pca_2}",
    sep="\n",
)

Confusion Matrix (Original Data):
[[102   6]
 [  6  57]]
FP: 6, TP: 57, FPR: 0.05555555555555555, TPR: 0.9047619047619048
Confusion Matrix (PCA 1st Component):
[[100   8]
 [  5  58]]
FP: 8, TP: 58, FPR: 0.07407407407407407, TPR: 0.9206349206349206
Confusion Matrix (PCA 1st and 2nd Components):
[[103   5]
 [  9  54]]
FP: 5, TP: 54, FPR: 0.046296296296296294, TPR: 0.8571428571428571
