In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
import numpy as np
# ---------------------------------------------------------------------------
# Decision tree on the Wisconsin Diagnostic Breast Cancer (WDBC) data:
# compare the raw 30 features against the first one / first two principal
# components, using F1, precision and recall on a held-out test split.
# ---------------------------------------------------------------------------

# One seed for every stochastic step so that Restart & Run All reproduces the
# same numbers. The original seeded only the train/test split; the trees were
# left unseeded, and scikit-learn's tree builder breaks ties between equally
# good splits using its random state.
SEED = 42

# Hyper-parameters shared by all three trees, kept in one place so the three
# experiments cannot drift apart.
TREE_PARAMS = dict(
    min_samples_leaf=2,
    min_samples_split=5,
    max_depth=2,
    random_state=SEED,
)


def fit_and_score(train_features, test_features, train_labels, test_labels):
    """Fit a decision tree and score it on the test split.

    Parameters
    ----------
    train_features, test_features : array-like
        Feature matrices for the training and test rows.
    train_labels, test_labels : array-like
        Binary targets aligned with the feature matrices (1 = malignant,
        0 = benign, per the mapping applied to ``df['diagnosis']``).

    Returns
    -------
    tuple
        ``(model, predictions, (f1, precision, recall))`` where ``model`` is
        the fitted ``DecisionTreeClassifier`` and ``predictions`` are its
        outputs for ``test_features``.
    """
    model = DecisionTreeClassifier(**TREE_PARAMS)
    model.fit(train_features, train_labels)
    predictions = model.predict(test_features)
    scores = (
        f1_score(test_labels, predictions),
        precision_score(test_labels, predictions),
        recall_score(test_labels, predictions),
    )
    return model, predictions, scores


# --- Load data --------------------------------------------------------------
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'
# The column names are supplied here rather than read from the file:
# an id, the diagnosis label, then 30 numeric features.
column_names = ['id', 'diagnosis'] + [f'feature_{i}' for i in range(1, 31)]
df = pd.read_csv(url, names=column_names)
# Encode the target: malignant -> 1 (positive class), benign -> 0.
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})
X = df.drop(['id', 'diagnosis'], axis=1)
y = df['diagnosis']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

# --- Baseline: all original features ----------------------------------------
clf, y_pred, (f1_no_pca, precision_no_pca, recall_no_pca) = fit_and_score(
    X_train, X_test, y_train, y_test
)

# --- PCA --------------------------------------------------------------------
# Fit PCA on the training rows only and reuse that projection for the test
# rows, so no test information leaks into the components.
# NOTE(review): the features are not standardized before PCA, so components
# are driven by the largest-variance columns - confirm this is intended.
pca = PCA()
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# First principal component only.
X_train_pca_1 = X_train_pca[:, :1]
X_test_pca_1 = X_test_pca[:, :1]
clf_pca_1, y_pred_pca_1, (f1_pca_1, precision_pca_1, recall_pca_1) = fit_and_score(
    X_train_pca_1, X_test_pca_1, y_train, y_test
)

# First and second principal components.
X_train_pca_2 = X_train_pca[:, :2]
X_test_pca_2 = X_test_pca[:, :2]
clf_pca_2, y_pred_pca_2, (f1_pca_2, precision_pca_2, recall_pca_2) = fit_and_score(
    X_train_pca_2, X_test_pca_2, y_train, y_test
)

# --- Confusion-matrix rates for the two-component model ---------------------
# labels=[0, 1] pins the matrix to 2x2 in (negative, positive) order, so the
# four-way unpack below cannot fail if a class is absent from the test data.
cm = confusion_matrix(y_test, y_pred_pca_2, labels=[0, 1])
TN, FP, FN, TP = cm.ravel()
FPR = FP / (FP + TN)
TPR = TP / (TP + FN)

# --- Report -----------------------------------------------------------------
print(f"F1 score without PCA: {f1_no_pca}")
print(f"Precision without PCA: {precision_no_pca}")
print(f"Recall without PCA: {recall_no_pca}")
print(f"F1 score with PCA (first principal component): {f1_pca_1}")
print(f"Precision with PCA (first principal component): {precision_pca_1}")
print(f"Recall with PCA (first principal component): {recall_pca_1}")
print(f"F1 score with PCA (first and second principal components): {f1_pca_2}")
print(f"Precision with PCA (first and second principal components): {precision_pca_2}")
print(f"Recall with PCA (first and second principal components): {recall_pca_2}")
print(f"False Positives (FP): {FP}")
print(f"True Positives (TP): {TP}")
print(f"False Positive Rate (FPR): {FPR}")
print(f"True Positive Rate (TPR): {TPR}")
print("Using continuous data can be beneficial for the model as it contains more information. However, PCA can help reduce the dimensionality of the data, which may lead to faster training and potentially better generalization. In this case, we need to compare the scores to determine if PCA is actually beneficial.")

F1 score without PCA: 0.9047619047619048
Precision without PCA: 0.9047619047619048
Recall without PCA: 0.9047619047619048
F1 score with PCA (first principal component): 0.9243697478991596
Precision with PCA (first principal component): 0.9821428571428571
Recall with PCA (first principal component): 0.873015873015873
F1 score with PCA (first and second principal components): 0.9243697478991596
Precision with PCA (first and second principal components): 0.9821428571428571
Recall with PCA (first and second principal components): 0.873015873015873
False Positives (FP): 1
True Positives (TP): 55
False Positive Rate (FPR): 0.009259259259259259
True Positive Rate (TPR): 0.873015873015873
Using continuous data can be beneficial for the model as it contains more information. However, PCA can help reduce the dimensionality of the data, which may lead to faster training and potentially better generalization. In this case, we need to compare the scores to determine if PCA is actually beneficial.
