In [1]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('data.csv')

# Display first few rows
print("🔹 First few rows:")
print(df.head())

# Display column names
print("\n🔹 Columns:")
print(df.columns.tolist())

# Display data types of each column
print("\n🔹 Data types:")
print(df.dtypes)

# Display summary info
print("\n🔹 Info:")
print(df.info())

# Display summary statistics (for num


🔹 First few rows:
         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  texture_worst  perimeter_worst  area

In [3]:
# 1. SETUP AND IMPORTS
# ==============================================================================
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Import the models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# For RandomizedSearchCV distributions
from scipy.stats import randint

# Ignore warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')


# 2. LOAD AND PREPARE THE DATA
# ==============================================================================
print("--- 1. Loading and Preparing Data ---")
# Load the dataset
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target

# Create a DataFrame for easier exploration (optional)
df = pd.DataFrame(X, columns=cancer.feature_names)
# print(df.head())
# print(f"\nTarget classes: {cancer.target_names}") # 0: malignant, 1: benign

# Split the data into training and testing sets
# We use stratify=y to maintain the same proportion of classes in train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Scale the features
# Scaling is important for models like Logistic Regression and SVM
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Data prepared: {X_train_scaled.shape[0]} training samples, {X_test_scaled.shape[0]} testing samples.\n")


# 3. TRAIN AND EVALUATE BASELINE MODELS
# ==============================================================================
print("--- 2. Training and Evaluating Baseline Models ---")
# Define the models to train
models = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42)
}

# Create a dictionary to store the results
results = {}

# Loop through the models, train them, and evaluate
for name, model in models.items():
    # Use scaled data for all models for consistency
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Store results
    results[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1
    }

# Convert results to a DataFrame for nice printing
results_df = pd.DataFrame(results).T
print("Baseline Model Performance:")
print(results_df)
print("\n")


# 4. HYPERPARAMETER TUNING WITH GridSearchCV (for SVM)
# ==============================================================================
print("--- 3. Hyperparameter Tuning with GridSearchCV (for SVM) ---")

# Define the parameter grid for SVM
# We'll tune 'C' (regularization), 'gamma' (kernel coefficient), and 'kernel' type
param_grid_svm = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'linear']
}

# Instantiate GridSearchCV
# cv=5 means 5-fold cross-validation
# n_jobs=-1 uses all available CPU cores
grid_search_svm = GridSearchCV(estimator=SVC(random_state=42), 
                               param_grid=param_grid_svm, 
                               cv=5, 
                               n_jobs=-1, 
                               verbose=1, 
                               scoring='f1') # We optimize for F1-score

# Fit the grid search to the data
grid_search_svm.fit(X_train_scaled, y_train)

# Get the best parameters and the best estimator
print(f"\nBest Parameters for SVM: {grid_search_svm.best_params_}")
best_svm = grid_search_svm.best_estimator_

# Evaluate the tuned SVM model
y_pred_svm_tuned = best_svm.predict(X_test_scaled)
results['SVM (Tuned)'] = {
    "Accuracy": accuracy_score(y_test, y_pred_svm_tuned),
    "Precision": precision_score(y_test, y_pred_svm_tuned),
    "Recall": recall_score(y_test, y_pred_svm_tuned),
    "F1-Score": f1_score(y_test, y_pred_svm_tuned)
}
print("\n")


# 5. HYPERPARAMETER TUNING WITH RandomizedSearchCV (for Random Forest)
# ==============================================================================
print("--- 4. Hyperparameter Tuning with RandomizedSearchCV (for Random Forest) ---")
# RandomizedSearch is great for large search spaces

# Define the parameter distribution for Random Forest
param_dist_rf = {
    'n_estimators': randint(50, 500), # Number of trees
    'max_depth': randint(5, 30),      # Max depth of the tree
    'min_samples_leaf': randint(1, 10), # Min samples at a leaf node
    'min_samples_split': randint(2, 20), # Min samples to split a node
    'criterion': ['gini', 'entropy']
}

# Instantiate RandomizedSearchCV
# n_iter=50 means it will try 50 different random combinations of parameters
random_search_rf = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=42),
                                      param_distributions=param_dist_rf,
                                      n_iter=50, # Number of parameter settings that are sampled
                                      cv=5,
                                      n_jobs=-1,
                                      random_state=42,
                                      verbose=1,
                                      scoring='f1')

# Fit the random search to the data
random_search_rf.fit(X_train_scaled, y_train)

# Get the best parameters and the best estimator
print(f"\nBest Parameters for Random Forest: {random_search_rf.best_params_}")
best_rf = random_search_rf.best_estimator_

# Evaluate the tuned Random Forest model
y_pred_rf_tuned = best_rf.predict(X_test_scaled)
results['Random Forest (Tuned)'] = {
    "Accuracy": accuracy_score(y_test, y_pred_rf_tuned),
    "Precision": precision_score(y_test, y_pred_rf_tuned),
    "Recall": recall_score(y_test, y_pred_rf_tuned),
    "F1-Score": f1_score(y_test, y_pred_rf_tuned)
}
print("\n")


# 6. FINAL ANALYSIS AND MODEL SELECTION
# ==============================================================================
print("--- 5. Final Analysis and Model Selection ---")

# Create a final DataFrame with all results
final_results_df = pd.DataFrame(results).T
final_results_df = final_results_df.sort_values(by='F1-Score', ascending=False)

print("Final Comparison of All Models:")
print(final_results_df)

print("\n--- Detailed Report for the Best Model ---")
best_model_name = final_results_df.index[0]
if "SVM" in best_model_name:
    best_model_preds = y_pred_svm_tuned
elif "Random Forest" in best_model_name:
    best_model_preds = y_pred_rf_tuned
else:
    # Fallback for other models if they happen to be best
    best_model_obj = models[best_model_name]
    best_model_obj.fit(X_train_scaled, y_train)
    best_model_preds = best_model_obj.predict(X_test_scaled)

print(f"The best performing model is: {best_model_name}")
print("\nClassification Report:")
print(classification_report(y_test, best_model_preds, target_names=cancer.target_names))

--- 1. Loading and Preparing Data ---
Data prepared: 398 training samples, 171 testing samples.

--- 2. Training and Evaluating Baseline Models ---
Baseline Model Performance:
                        Accuracy  Precision    Recall  F1-Score
Logistic Regression     0.988304   0.990654  0.990654  0.990654
K-Nearest Neighbors     0.959064   0.938596  1.000000  0.968326
Support Vector Machine  0.976608   0.981308  0.981308  0.981308
Random Forest           0.935673   0.944444  0.953271  0.948837


--- 3. Hyperparameter Tuning with GridSearchCV (for SVM) ---
Fitting 5 folds for each of 32 candidates, totalling 160 fits

Best Parameters for SVM: {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}


--- 4. Hyperparameter Tuning with RandomizedSearchCV (for Random Forest) ---
Fitting 5 folds for each of 50 candidates, totalling 250 fits

Best Parameters for Random Forest: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 363}


--- 5. Final Analysis

In [5]:
# 1. SETUP AND IMPORTS
# ==============================================================================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Import the models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# For RandomizedSearchCV distributions
from scipy.stats import randint

# Ignore warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')


# 2. LOAD AND PREPARE THE DATA (MODIFIED FOR CSV)
# ==============================================================================
print("--- 1. Loading and Preparing Data from CSV ---")
# Load the dataset from your CSV file
# Make sure 'data.csv' is the correct path to your file
df = pd.read_csv('data.csv')

# --- Data Cleaning and Preprocessing ---

# Drop the unnecessary columns
df.drop('id', axis=1, inplace=True)
df.drop('Unnamed: 32', axis=1, inplace=True)

# Encode the 'diagnosis' column (the target)
# We will map 'M' (Malignant) to 1 and 'B' (Benign) to 0
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})
target_names = ['Benign', 'Malignant'] # For final report

# Separate features (X) and target (y)
X = df.drop('diagnosis', axis=1)
y = df['diagnosis']

# Split the data into training and testing sets
# We use stratify=y to maintain the same proportion of classes in train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Scale the features
# Scaling is important for models like Logistic Regression and SVM
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Data prepared: {X_train_scaled.shape[0]} training samples, {X_test_scaled.shape[0]} testing samples.\n")


# 3. TRAIN AND EVALUATE BASELINE MODELS
# ==============================================================================
print("--- 2. Training and Evaluating Baseline Models ---")
# Define the models to train
models = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC(random_state=42, probability=True), # probability=True for some metrics
    "Random Forest": RandomForestClassifier(random_state=42)
}

# Create a dictionary to store the results
results = {}

# Loop through the models, train them, and evaluate
for name, model in models.items():
    # Use scaled data for all models for consistency
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    # Store results
    results[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1
    }

# Convert results to a DataFrame for nice printing
results_df = pd.DataFrame(results).T
print("Baseline Model Performance:")
print(results_df)
print("\n")


# 4. HYPERPARAMETER TUNING WITH GridSearchCV (for SVM)
# ==============================================================================
print("--- 3. Hyperparameter Tuning with GridSearchCV (for SVM) ---")

# Define the parameter grid for SVM
param_grid_svm = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'linear']
}

# Instantiate GridSearchCV
grid_search_svm = GridSearchCV(estimator=SVC(random_state=42), 
                               param_grid=param_grid_svm, 
                               cv=5, 
                               n_jobs=-1, 
                               verbose=1, 
                               scoring='f1')

grid_search_svm.fit(X_train_scaled, y_train)

# Get the best parameters and the best estimator
print(f"\nBest Parameters for SVM: {grid_search_svm.best_params_}")
best_svm = grid_search_svm.best_estimator_

# Evaluate the tuned SVM model
y_pred_svm_tuned = best_svm.predict(X_test_scaled)
results['SVM (Tuned)'] = {
    "Accuracy": accuracy_score(y_test, y_pred_svm_tuned),
    "Precision": precision_score(y_test, y_pred_svm_tuned),
    "Recall": recall_score(y_test, y_pred_svm_tuned),
    "F1-Score": f1_score(y_test, y_pred_svm_tuned)
}
print("\n")


# 5. HYPERPARAMETER TUNING WITH RandomizedSearchCV (for Random Forest)
# ==============================================================================
print("--- 4. Hyperparameter Tuning with RandomizedSearchCV (for Random Forest) ---")

# Define the parameter distribution for Random Forest
param_dist_rf = {
    'n_estimators': randint(50, 500),
    'max_depth': randint(5, 30),
    'min_samples_leaf': randint(1, 10),
    'min_samples_split': randint(2, 20),
    'criterion': ['gini', 'entropy']
}

# Instantiate RandomizedSearchCV
random_search_rf = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=42),
                                      param_distributions=param_dist_rf,
                                      n_iter=50, 
                                      cv=5,
                                      n_jobs=-1,
                                      random_state=42,
                                      verbose=1,
                                      scoring='f1')

random_search_rf.fit(X_train_scaled, y_train)

# Get the best parameters and the best estimator
print(f"\nBest Parameters for Random Forest: {random_search_rf.best_params_}")
best_rf = random_search_rf.best_estimator_

# Evaluate the tuned Random Forest model
y_pred_rf_tuned = best_rf.predict(X_test_scaled)
results['Random Forest (Tuned)'] = {
    "Accuracy": accuracy_score(y_test, y_pred_rf_tuned),
    "Precision": precision_score(y_test, y_pred_rf_tuned),
    "Recall": recall_score(y_test, y_pred_rf_tuned),
    "F1-Score": f1_score(y_test, y_pred_rf_tuned)
}
print("\n")


# 6. FINAL ANALYSIS AND MODEL SELECTION
# ==============================================================================
print("--- 5. Final Analysis and Model Selection ---")

# Create a final DataFrame with all results
final_results_df = pd.DataFrame(results).T
final_results_df = final_results_df.sort_values(by='F1-Score', ascending=False)

print("Final Comparison of All Models:")
print(final_results_df)

print("\n--- Detailed Report for the Best Model ---")
best_model_name = final_results_df.index[0]
if "SVM" in best_model_name:
    best_model_preds = y_pred_svm_tuned
elif "Random Forest" in best_model_name:
    best_model_preds = y_pred_rf_tuned
else:
    best_model_obj = models[best_model_name]
    best_model_obj.fit(X_train_scaled, y_train)
    best_model_preds = best_model_obj.predict(X_test_scaled)

print(f"The best performing model is: {best_model_name}")
print("\nClassification Report:")
# Use the target_names we defined during data prep
print(classification_report(y_test, best_model_preds, target_names=target_names))

--- 1. Loading and Preparing Data from CSV ---
Data prepared: 398 training samples, 171 testing samples.

--- 2. Training and Evaluating Baseline Models ---
Baseline Model Performance:
                        Accuracy  Precision    Recall  F1-Score
Logistic Regression     0.970760   0.983607  0.937500  0.960000
K-Nearest Neighbors     0.964912   1.000000  0.906250  0.950820
Support Vector Machine  0.959064   1.000000  0.890625  0.942149
Random Forest           0.964912   1.000000  0.906250  0.950820


--- 3. Hyperparameter Tuning with GridSearchCV (for SVM) ---
Fitting 5 folds for each of 32 candidates, totalling 160 fits

Best Parameters for SVM: {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}


--- 4. Hyperparameter Tuning with RandomizedSearchCV (for Random Forest) ---
Fitting 5 folds for each of 50 candidates, totalling 250 fits

Best Parameters for Random Forest: {'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 6, 'n_estimators': 162}


--- 5. Final An