In [9]:
#libraries for entire process

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier

In [15]:
# Load dataset function
def preprocess_data(data):
    # Handle missing values
    data['Date_of_Last_Visit'] = pd.to_datetime(data['Date_of_Last_Visit'], errors='coerce')
    data['Date_of_Surgery'] = pd.to_datetime(data['Date_of_Surgery'], errors='coerce')
    data['Time_Since_Surgery'] = (data['Date_of_Last_Visit'] - data['Date_of_Surgery']).dt.days.fillna(0)
    data['Patient_Status'] = data['Patient_Status'].fillna('Unknown')

    # Encode categorical variables
    categorical_columns = ['Gender', 'Tumour_Stage', 'Histology', 'ER status', 'PR status', 'HER2 status', 'Surgery_type', 'Patient_Status']
    for col in categorical_columns:
        data[col] = LabelEncoder().fit_transform(data[col])

    # Drop unnecessary columns
    data.drop(['Patient_ID', 'Date_of_Surgery', 'Date_of_Last_Visit'], axis=1, inplace=True)

    return data

In [17]:
from sklearn.model_selection import GridSearchCV

def perform_classification(X, y):
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define hyperparameter grid
    param_grid = {
        'n_estimators': [50, 100, 150],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Perform Grid Search for best hyperparameters
    clf = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_clf = grid_search.best_estimator_  # Retrieve the best model
    print(f"Best Parameters: {grid_search.best_params_}")

    # Train and predict using the fine-tuned model
    best_clf.fit(X_train, y_train)
    y_pred = best_clf.predict(X_test)
    y_proba = best_clf.predict_proba(X_test)

    # Evaluate Classification Model (same as before)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Compute ROC AUC score
    if len(set(y)) > 2:  # Multi-class case
        y_test_binarized = label_binarize(y_test, classes=np.unique(y))
        roc_auc = roc_auc_score(y_test_binarized, y_proba, multi_class='ovr', average='weighted')
    else:  # Binary case
        roc_auc = roc_auc_score(y_test, y_proba[:, 1])

    print("Classification Metrics:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    print(f"ROC-AUC: {roc_auc:.2f}")

    # Confusion Matrix and ROC Curve plotting remain the same as before


In [19]:
def perform_clustering(X):
    # Test different numbers of clusters to find the best one
    silhouette_scores = []
    cluster_range = range(2, 10)  # Evaluate from 2 to 9 clusters
    
    for n_clusters in cluster_range:
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        clusters = kmeans.fit_predict(X)
        sil_score = silhouette_score(X, clusters)
        silhouette_scores.append(sil_score)
        print(f"Silhouette Score for {n_clusters} clusters: {sil_score:.2f}")

    # Select the best number of clusters based on the silhouette score
    optimal_clusters = cluster_range[np.argmax(silhouette_scores)]
    print(f"Optimal number of clusters: {optimal_clusters}")
    
    # Apply KMeans with the optimal number of clusters
    kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
    clusters = kmeans.fit_predict(X)
    
    # Visualize Clusters using PCA
    from sklearn.decomposition import PCA
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap='viridis', alpha=0.6)
    plt.title(f'K-Means Clustering with {optimal_clusters} Clusters (PCA Reduced)')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.show()


In [21]:
# Advanced missing value handling
data['Time_Since_Surgery'] = data['Time_Since_Surgery'].fillna(data['Time_Since_Surgery'].median())

# Feature engineering
data['ER_PR_status_combined'] = data['ER status'].astype(str) + "_" + data['PR status'].astype(str)

# Experiment with OneHotEncoder for better interpretability of categorical features
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse=False)
encoded_columns = encoder.fit_transform(data[categorical_columns])
encoded_df = pd.DataFrame(encoded_columns, columns=encoder.get_feature_names_out(categorical_columns))
data = pd.concat([data.drop(categorical_columns, axis=1), encoded_df], axis=1)


KeyError: 'Time_Since_Surgery'

In [13]:
if __name__ == "__main__":
    # Load and preprocess the data
    data = pd.read_csv('breast_cancer_data.csv')
    data = preprocess_data(data)

    X = data.drop('Patient_Status', axis=1)
    y = data['Patient_Status']

    # Scale data for clustering
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Perform Classification with Fine-Tuning
    print("=== Classification Task (Fine-Tuned) ===")
    perform_classification(X_scaled, y)

    # Perform Clustering with Fine-Tuning
    print("\n=== Clustering Task (Fine-Tuned) ===")
    perform_clustering(X_scaled)


NameError: name 'preprocess_data' is not defined