In [1]:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


# Load the breast-cancer dataset that ships with scikit-learn.
cancer = load_breast_cancer()

# Features go into a DataFrame (one column per feature name); labels go into a Series.
X = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)
y = pd.Series(data=cancer.target, name='target')

# Peek at the first rows of the features and of the labels.
print("Here are the first 5 rows of X:")
print(X.head(5))

print("\nFirst 5 things in y (target):")
print(y.head(5))

# Dimensions of the feature matrix and of the label vector.
print("\nShape of X is:", X.shape)
print("Shape of y is:", y.shape)

# Class balance of the labels.
print("\nHow many 0s and 1s in y:")
print(y.value_counts())

# Pair every class name with its integer code (its position in target_names).
print("\nWhat the targets mean:")
print({name: code for code, name in enumerate(cancer.target_names)})

# Short textual summary of the dataset.
print("\nSome info about this data:")
print("- We have", len(X), "samples (rows)")
print("- There are", len(X.columns), "features (columns)")
print("- The types of features are:", X.dtypes.unique())
print("- The target classes are:", list(cancer.target_names))


Here are the first 5 rows of X:
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst radius  worst texture 

In [2]:

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# Report the size of each partition as a sanity check.
for _label, _split in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("Y_train shape:", Y_train),
    ("Y_test shape:", Y_test),
):
    print(_label, _split.shape)



X_train shape: (455, 30)
X_test shape: (114, 30)
Y_train shape: (455,)
Y_test shape: (114,)


In [3]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits so no test-set statistics are used.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

# First three features:
first_three_features = X_train_scaled[:, 0:3]

# After standardization the column means should be ~0 and the standard deviations 1.
print("Mean of first 3 features:")
print(first_three_features.mean(axis=0))

print("Standard deviation of first 3 features:")
print(first_three_features.std(axis=0))


Mean of first 3 features:
[-4.31742554e-15  2.24606658e-15 -7.38359313e-16]
Standard deviation of first 3 features:
[1. 1. 1.]


In [4]:
# Baseline model: logistic regression trained on the manually standardized
# training features.
logistic_regression = LogisticRegression(solver='liblinear', random_state=42)
logistic_regression.fit(X_train_scaled, Y_train)

# Predict on the held-out split (scaled with the scaler fitted on the training data).
y_predicted = logistic_regression.predict(X_test_scaled)

# Evaluation metrics on the test split.
accuracy = accuracy_score(Y_test, y_predicted)
classification_rp = classification_report(Y_test, y_predicted)
# confusion_matrix is called as (true labels, predicted labels).
confusion_mx = confusion_matrix(Y_test, y_predicted)

print("The accuracy is:", accuracy)
# The report and the matrix are multi-line strings/arrays: print each one on
# its own line after the label so the table header stays aligned with its
# columns instead of being pushed right by the label text.
print("The classification report is:")
print(classification_rp)
print("The confusion matrix is:")
print(confusion_mx)




The accuracy is: 0.9824561403508771
The classification report is:               precision    recall  f1-score   support

           0       0.98      0.98      0.98        42
           1       0.99      0.99      0.99        72

    accuracy                           0.98       114
   macro avg       0.98      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114

The confusion matric is: [[41  1]
 [ 1 71]]


Interpretation: The model performs very well, reaching 98.2% accuracy with high precision, recall, and F1-scores for both classes, which indicates that it correctly identifies both benign and malignant cases. The confusion matrix shows only 2 misclassifications out of 114 test samples, suggesting very good reliability and a good balance between the two classes.

Getting into the specifics: 

-  Precision: It tells how many of the model's positive predictions were actually right. Here the score is 0.98 for class 0 and 0.99 for class 1, which means there were very few false positive cases.

-  Recall: It shows what fraction of the actual cases of a class the model correctly identified. With a recall of 0.98 for class 0 and 0.99 for class 1, the model is able to identify nearly all of the true cases in each class.

-  F1-Score: It balances precision and recall. Since the score is around 0.98 to 0.99, we can conclude that the model maintains high-quality predictions for both classes.

- Confusion matrix: [[41 1], [1 71]] shows the model made just 2 errors: 1 false positive and 1 false negative (one class-0 sample predicted as class 1 and one class-1 sample predicted as class 0). This confirms that the classifier is very accurate across both classes.

In [5]:
# Pipeline: scaling is part of the estimator, so during cross-validation the
# scaler is re-fitted on the training portion of every fold rather than on
# the whole dataset.
pipelin = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(solver='liblinear', random_state=42))
])

# 5-fold cross-validated accuracy computed over the full dataset (X, y).
# cross_val_score is imported with the other scikit-learn names in the
# notebook's import cell.
cross_valuation_scores = cross_val_score(pipelin, X, y, cv=5, scoring='accuracy')

print("Accuracy scores:", cross_valuation_scores)

print("Mean accuracy:", cross_valuation_scores.mean())

print("Standard deviation:", cross_valuation_scores.std())




Accuracy scores: [0.98245614 0.97368421 0.97368421 0.97368421 0.99115044]
Mean accuracy: 0.9789318428815402
Standard deviation: 0.006990390328940835


Comparison with step 3: 

Accuracy scores: the single train-test split gave an accuracy of 0.98, while the five cross-validation folds gave [0.9825, 0.9737, 0.9737, 0.9737, 0.9912]. The mean cross-validation accuracy was 97.89 percent.

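Interpretation: Even though the single-split accuracy is higher than the cross-validation mean, it is not necessarily better, as the two serve different purposes: the single train-test split is a quick check on unseen data, while cross-validation gives a more reliable estimate because it averages over several different splits. The gap (98.25% vs. 97.89%) is also smaller than one standard deviation of the fold scores (about 0.7 percentage points), so the two results are consistent with each other.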

The low standard deviation means that the model's performance is very stable and it behaves similarly across all 5 folds, with very little variation in accuracy.



In [6]:
# Same scaler + logistic-regression recipe as before, built as a fresh
# pipeline and fitted on the training split only.
final_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(random_state=42, solver='liblinear')),
])

# fit() returns the fitted pipeline, so the prediction on the raw (unscaled)
# test features can be chained directly onto it.
final_predictions = final_pipeline.fit(X_train, Y_train).predict(X_test)

final_accuracy = accuracy_score(y_true=Y_test, y_pred=final_predictions)
print("Final pipeline accuracy is:", final_accuracy)

Final pipeline accuracy is: 0.9824561403508771


The pipeline accuracy on the 20% test set is 98.25%, which matches the accuracy from Part 3 when we manually scaled and trained a logistic regression model. 

This shows that the pipeline successfully automated the scaling and fitting steps and achieved equally good performance, while making the code cleaner and less error-prone.

In [7]:
# RandomForestClassifier, Pipeline, StandardScaler and the metric functions
# are already imported in the notebook's first cell, so they are not
# re-imported here.

# Random forest wrapped in the same scaler-first pipeline shape used for the
# logistic regression, so both models go through identical preprocessing.
random_forest_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Fit on the training split and predict the held-out split.
random_forest_pipeline.fit(X_train, Y_train)
random_forest_predictions = random_forest_pipeline.predict(X_test)

random_forest_accuracy = accuracy_score(Y_test, random_forest_predictions)
print("Random Forest Accuracy on Test Set:", random_forest_accuracy)

# The matrix and the report span several lines: print each one on its own
# line after the label so the columns stay aligned under their headers.
# confusion_matrix is called as (true labels, predicted labels).
random_forest_cm = confusion_matrix(Y_test, random_forest_predictions)
print("Confusion Matrix:")
print(random_forest_cm)

random_forest_cr = classification_report(Y_test, random_forest_predictions)
print("Classification Report:")
print(random_forest_cr)


Random Forest Accuracy on Test Set: 0.956140350877193
Confusion Matrix: [[39  3]
 [ 2 70]]
Classification Report:               precision    recall  f1-score   support

           0       0.95      0.93      0.94        42
           1       0.96      0.97      0.97        72

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



Logistic Regression had higher accuracy (0.982) than Random Forest (0.956).  

It also performed slightly better in precision and recall for both classes.  

Random Forest was close but slightly weaker, most noticeably in detecting malignant cases (class 0 recall of 0.93 versus 0.98 for Logistic Regression).

Overall, Logistic Regression was the better model on this dataset.

The difference is not massive, but it suggests that Logistic Regression generalized slightly better on this dataset.