In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the dataset
df = pd.read_csv('breast_cancer.csv')

# Display basic information about the dataset
print(df.info())
print(df.head())

# Handle missing values if any
df = df.dropna()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4024 entries, 0 to 4023
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Age                     4024 non-null   int64 
 1   Race                    4024 non-null   object
 2   Marital Status          4024 non-null   object
 3   T Stage                 4024 non-null   object
 4   N Stage                 4024 non-null   object
 5   6th Stage               4024 non-null   object
 6   differentiate           4024 non-null   object
 7   Grade                   4024 non-null   object
 8   A Stage                 4024 non-null   object
 9   Tumor Size              4024 non-null   int64 
 10  Estrogen Status         4024 non-null   object
 11  Progesterone Status     4024 non-null   object
 12  Regional Node Examined  4024 non-null   int64 
 13  Reginol Node Positive   4024 non-null   int64 
 14  Survival Months         4024 non-null   int64 
 15  Stat

In [3]:
# Encode categorical variables
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Define features and target variable
X = df.drop('Status', axis=1)
y = df['Status']

In [4]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [5]:
# Initialize the models
log_reg = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier()

# Train Logistic Regression model
log_reg.fit(X_train, y_train)
log_reg_preds = log_reg.predict(X_test)

# Train Random Forest model
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)

In [6]:
# Evaluate the models
print("Logistic Regression Accuracy:", accuracy_score(y_test, log_reg_preds))
print("Random Forest Accuracy:", accuracy_score(y_test, rf_preds))

print("\nLogistic Regression Classification Report:")
print(classification_report(y_test, log_reg_preds))

print("\nRandom Forest Classification Report:")
print(classification_report(y_test, rf_preds))

Logistic Regression Accuracy: 0.9043478260869565
Random Forest Accuracy: 0.9105590062111801

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.98      0.95       685
           1       0.83      0.45      0.58       120

    accuracy                           0.90       805
   macro avg       0.87      0.72      0.76       805
weighted avg       0.90      0.90      0.89       805


Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.98      0.95       685
           1       0.82      0.51      0.63       120

    accuracy                           0.91       805
   macro avg       0.87      0.74      0.79       805
weighted avg       0.91      0.91      0.90       805



In [7]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Define hyperparameters for GridSearch
log_reg_params = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2']
}

rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

In [8]:
# GridSearch for Logistic Regression
grid_log_reg = GridSearchCV(log_reg, log_reg_params, cv=5)
grid_log_reg.fit(X_train, y_train)
best_log_reg = grid_log_reg.best_estimator_

# GridSearch for Random Forest
grid_rf = GridSearchCV(rf, rf_params, cv=5)
grid_rf.fit(X_train, y_train)
best_rf = grid_rf.best_estimator_

25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\balas\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\balas\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\balas\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1172, in fit
    solver = _check_solver(sel

In [9]:
# Display best parameters and model accuracy
print("Best Logistic Regression Parameters:", grid_log_reg.best_params_)
print("Best Logistic Regression Accuracy:", grid_log_reg.best_score_)

print("Best Random Forest Parameters:", grid_rf.best_params_)
print("Best Random Forest Accuracy:", grid_rf.best_score_)

Best Logistic Regression Parameters: {'C': 1, 'penalty': 'l2'}
Best Logistic Regression Accuracy: 0.8912676410073125
Best Random Forest Parameters: {'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 100}
Best Random Forest Accuracy: 0.9033847550785815


In [10]:
# RandomizedSearch for Logistic Regression
random_log_reg = RandomizedSearchCV(log_reg, log_reg_params, n_iter=10, cv=5)
random_log_reg.fit(X_train, y_train)
best_random_log_reg = random_log_reg.best_estimator_

# RandomizedSearch for Random Forest
random_rf = RandomizedSearchCV(rf, rf_params, n_iter=10, cv=5)
random_rf.fit(X_train, y_train)
best_random_rf = random_rf.best_estimator_

25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\balas\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\balas\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\balas\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1172, in fit
    solver = _check_solver(sel

In [11]:
# Display best parameters and model accuracy for RandomizedSearch
print("Best Randomized Logistic Regression Parameters:", random_log_reg.best_params_)
print("Best Randomized Logistic Regression Accuracy:", random_log_reg.best_score_)

print("Best Randomized Random Forest Parameters:", random_rf.best_params_)
print("Best Randomized Random Forest Accuracy:", random_rf.best_score_)

Best Randomized Logistic Regression Parameters: {'penalty': 'l2', 'C': 1}
Best Randomized Logistic Regression Accuracy: 0.8912676410073125
Best Randomized Random Forest Parameters: {'n_estimators': 100, 'min_samples_split': 2, 'max_depth': 20}
Best Randomized Random Forest Accuracy: 0.9024540440288632
