In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Column layout for the headerless WDBC file: a record ID, the diagnosis label,
# then thirty numeric columns given generic names feature_1 .. feature_30.
columns = ['ID', 'Diagnosis', *(f'feature_{i}' for i in range(1, 31))]

# Source file on the UCI archive; it is read with header=None, so the names above are applied.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
data = pd.read_csv(url, names=columns, header=None)


In [6]:
# Remove the record identifier column before modelling.
# errors='ignore' keeps this cell safe to re-run: once 'ID' is gone, a second
# execution is a no-op instead of raising KeyError.
data = data.drop(columns=['ID'], errors='ignore')

In [8]:
# Encode the label as integers: 'M' -> 1, 'B' -> 0.
# The identity entries (1 -> 1, 0 -> 0) make the cell idempotent: Series.map turns
# any value missing from the mapping into NaN, so without them a second run of this
# cell would silently wipe out the already-encoded labels.
data['Diagnosis'] = data['Diagnosis'].map({'M': 1, 'B': 0, 1: 1, 0: 0})

In [13]:
# Target is the Diagnosis column; the feature matrix is every other column.
y = data['Diagnosis']
X = data.drop(columns=['Diagnosis'])

# Hold out half of the rows for testing; the fixed seed makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42
)

# Fit the scaler on the training half only, then apply that same fitted
# transform to both halves so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [15]:
# Sanity check of the prepared frame (ID removed, Diagnosis encoded as 0/1).
# Plain-text print: pandas abbreviates the middle rows/columns in the output below.
print(data)

     Diagnosis  feature_1  feature_2  feature_3  feature_4  feature_5  \
0            1      17.99      10.38     122.80     1001.0    0.11840   
1            1      20.57      17.77     132.90     1326.0    0.08474   
2            1      19.69      21.25     130.00     1203.0    0.10960   
3            1      11.42      20.38      77.58      386.1    0.14250   
4            1      20.29      14.34     135.10     1297.0    0.10030   
..         ...        ...        ...        ...        ...        ...   
564          1      21.56      22.39     142.00     1479.0    0.11100   
565          1      20.13      28.25     131.20     1261.0    0.09780   
566          1      16.60      28.08     108.30      858.1    0.08455   
567          1      20.60      29.33     140.10     1265.0    0.11780   
568          0       7.76      24.54      47.92      181.0    0.05263   

     feature_6  feature_7  feature_8  feature_9  ...  feature_21  feature_22  \
0      0.27760    0.30010    0.14710     0.

In [17]:
# Baseline model: a 200-tree random forest with a fixed seed.
# fit() returns the estimator itself, so construction and training can be chained.
model_v1 = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)

# Evaluate on the held-out half.
y_pred = model_v1.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Report: each heading is followed by its body on the next line.
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:", conf_matrix, sep="\n")
print("Classification Report:", class_report, sep="\n")


Accuracy: 0.9578947368421052
Confusion Matrix:
[[181   6]
 [  6  92]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       187
           1       0.94      0.94      0.94        98

    accuracy                           0.96       285
   macro avg       0.95      0.95      0.95       285
weighted avg       0.96      0.96      0.96       285

