In [1]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
cancer_data = load_breast_cancer()
df_cancer = pd.DataFrame(cancer_data['data'], columns=cancer_data['feature_names'])
df_cancer['target'] = cancer_data['target']
print(df_cancer.head())

   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst texture  worst perimeter  worst area  \
0             

In [2]:
df_cancer.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [3]:
df_cancer.dtypes

mean radius                float64
mean texture               float64
mean perimeter             float64
mean area                  float64
mean smoothness            float64
mean compactness           float64
mean concavity             float64
mean concave points        float64
mean symmetry              float64
mean fractal dimension     float64
radius error               float64
texture error              float64
perimeter error            float64
area error                 float64
smoothness error           float64
compactness error          float64
concavity error            float64
concave points error       float64
symmetry error             float64
fractal dimension error    float64
worst radius               float64
worst texture              float64
worst perimeter            float64
worst area                 float64
worst smoothness           float64
worst compactness          float64
worst concavity            float64
worst concave points       float64
worst symmetry      

In [4]:
df_cancer.isnull().sum()

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64

In [5]:
df_cancer.shape

(569, 31)

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

X = df_cancer.drop(columns=['target'])
y = df_cancer['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

log_reg = LogisticRegression(max_iter=10000)
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)
log_reg_accuracy = accuracy_score(y_test, y_pred_log_reg)

tree_clf = DecisionTreeClassifier()
tree_clf.fit(X_train, y_train)
y_pred_tree = tree_clf.predict(X_test)
tree_clf_accuracy = accuracy_score(y_test, y_pred_tree)

rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)
rf_clf_accuracy = accuracy_score(y_test, y_pred_rf)

log_reg_accuracy, tree_clf_accuracy, rf_clf_accuracy

(0.9766081871345029, 0.9239766081871345, 0.9766081871345029)

In [26]:
from sklearn.model_selection import GridSearchCV

log_reg_params = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['saga', 'lbfgs']
}

tree_params = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

rf_params = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5]
}

log_reg_search = GridSearchCV(LogisticRegression(max_iter=20000), log_reg_params, cv=5)
log_reg_search.fit(X_train, y_train)
best_log_reg_params = log_reg_search.best_params_

tree_search = GridSearchCV(DecisionTreeClassifier(), tree_params, cv=5)
tree_search.fit(X_train, y_train)
best_tree_params = tree_search.best_params_

rf_search = GridSearchCV(RandomForestClassifier(), rf_params, cv=5)
rf_search.fit(X_train, y_train)
best_rf_params = rf_search.best_params_

best_log_reg_params, best_tree_params, best_rf_params

({'C': 100, 'solver': 'lbfgs'},
 {'max_depth': None, 'min_samples_split': 2},
 {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 50})

In [29]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
y_pred_log_reg = log_reg.predict(X_test)
log_reg_accuracy = accuracy_score(y_test, y_pred_log_reg)
log_reg_conf_matrix = confusion_matrix(y_test, y_pred_log_reg)
log_reg_class_report = classification_report(y_test, y_pred_log_reg, zero_division=1)

print("Logistic Regression:\n", log_reg_class_report)
print("Confusion Matrix:\n", log_reg_conf_matrix)


Logistic Regression:
               precision    recall  f1-score   support

           0       0.37      1.00      0.54        63
           1       1.00      0.00      0.00       108

    accuracy                           0.37       171
   macro avg       0.68      0.50      0.27       171
weighted avg       0.77      0.37      0.20       171

Confusion Matrix:
 [[ 63   0]
 [108   0]]




In [30]:
y_pred_tree = tree_clf.predict(X_test)
tree_accuracy = accuracy_score(y_test, y_pred_tree)
tree_conf_matrix = confusion_matrix(y_test, y_pred_tree)
tree_class_report = classification_report(y_test, y_pred_tree)

print("\nDecision Tree:\n", tree_class_report)
print("Confusion Matrix:\n", tree_conf_matrix)


Decision Tree:
               precision    recall  f1-score   support

           0       0.89      0.92      0.91        63
           1       0.95      0.94      0.94       108

    accuracy                           0.93       171
   macro avg       0.92      0.93      0.93       171
weighted avg       0.93      0.93      0.93       171

Confusion Matrix:
 [[ 58   5]
 [  7 101]]


In [31]:
y_pred_rf = rf_clf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_conf_matrix = confusion_matrix(y_test, y_pred_rf)
rf_class_report = classification_report(y_test, y_pred_rf)

print("\nRandom Forest:\n", rf_class_report)
print("Confusion Matrix:\n", rf_conf_matrix)


Random Forest:
               precision    recall  f1-score   support

           0       0.97      0.94      0.95        63
           1       0.96      0.98      0.97       108

    accuracy                           0.96       171
   macro avg       0.97      0.96      0.96       171
weighted avg       0.96      0.96      0.96       171

Confusion Matrix:
 [[ 59   4]
 [  2 106]]


In [32]:
log_reg_accuracy, tree_accuracy, rf_accuracy

(0.3684210526315789, 0.9298245614035088, 0.9649122807017544)

In [38]:
print(f"Logistic Regression Accuracy: {log_reg_accuracy}")
print(f"Decision Tree Accuracy: {tree_accuracy}")
print(f"Random Forest Accuracy: {rf_accuracy}")

if log_reg_accuracy >= tree_accuracy and log_reg_accuracy >= rf_accuracy:
    best_model = log_reg
    model_name = "Logistic Regression"
elif tree_accuracy >= log_reg_accuracy and tree_accuracy >= rf_accuracy:
    best_model = tree_clf
    model_name = "Decision Tree"
else:
    best_model = rf_clf
    model_name = "Random Forest"

print(f"The best model based on accuracy is: {model_name}")


Logistic Regression Accuracy: 0.3684210526315789
Decision Tree Accuracy: 0.9298245614035088
Random Forest Accuracy: 0.9649122807017544
The best model based on accuracy is: Random Forest


In [39]:
y_pred_best_model = best_model.predict(X_test)
print("Predictions using Random Forest):")
print(y_pred_best_model)

Predictions using Random Forest):
[1 0 0 1 1 0 0 0 0 1 1 0 1 0 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0
 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 0 0 1 1 0 0 1 1 1 0 0 1 1 0 0 1 0
 1 1 1 1 1 1 0 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 1 0 0 1 0 0 1 1 1 0 0 1 0
 1 1 0 1 0 1 1 1 0 1 1 1 0 1 0 0 1 1 0 0 0 1 1 1 0 1 1 1 0 1 0 1 1 0 1 0 0
 0 1 0 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1]


In [40]:
best_model_accuracy = accuracy_score(y_test, y_pred_best_model)
best_model_conf_matrix = confusion_matrix(y_test, y_pred_best_model)
best_model_class_report = classification_report(y_test, y_pred_best_model)

print(f"\nAccuracy of the best model ({model_name}): {best_model_accuracy}")
print("Confusion Matrix:")
print(best_model_conf_matrix)
print("Classification Report:")
print(best_model_class_report)


Accuracy of the best model (Random Forest): 0.9649122807017544
Confusion Matrix:
[[ 59   4]
 [  2 106]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.94      0.95        63
           1       0.96      0.98      0.97       108

    accuracy                           0.96       171
   macro avg       0.97      0.96      0.96       171
weighted avg       0.96      0.96      0.96       171

