In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import LeaveOneOut
from datetime import datetime
import os
import joblib




In [2]:
def load_and_combine_datasets(control_path, cancer_path):
    # Load datasets
    control_df = pd.read_csv(control_path)
    cancer_df = pd.read_csv(cancer_path)
    
    # Determine the size for undersampling
    sample_size = len(cancer_df)
    
    # Undersample the healthy dataset
    control_df_sample = control_df.sample(n=sample_size)
    
    # Combine datasets and shuffle
    combined_df = pd.concat([control_df_sample, cancer_df]).sample(frac=1, random_state=42).reset_index(drop=True)
    
    # Split into features and labels
    X = combined_df.drop(['cancer_type', 'type'], axis=1)
    y = combined_df['cancer_type'].apply(lambda x: 1 if x != 'normal' else 0)  # 1 = cancer, 0 = healthy
    
    return X, y


In [3]:
def pca_data(X, n_components):
    # Apply PCA (assumes X is already standardized)
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)
    return X_pca, pca

In [4]:
def save_model(model, model_name, cancer_type, folder="Models"):
    """Save the model to a file using joblib."""
    if not os.path.exists(folder):
        os.makedirs(folder)  # Create directory if it doesn't exist
    model_filename = f"{folder}/{model_name}_{cancer_type}.joblib"
    joblib.dump(model, model_filename)
    print(f"Model {model_name} saved to {model_filename}")

In [5]:
# Train and evaluate models using LOOCV
def train_and_evaluate_loocv(X, y, models, cancer_type):
    loo = LeaveOneOut()
    results = {}

    for model_name, (model, param_grid) in models.items():
        print(f"Training {model_name} with LOOCV...")
        fold_scores = []
        all_y_true = []
        all_y_pred = []

        # GridSearch for hyperparameter tuning
        grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy', verbose=0, n_jobs=-1)
        grid_search.fit(X, y)
        best_model = grid_search.best_estimator_
        # Save the best model
        save_model(best_model, model_name, cancer_type)

        # Perform LOOCV
        for train_index, test_index in loo.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # Fit and predict with the best model
            best_model.fit(X_train, y_train)
            y_pred = best_model.predict(X_test)

            # Track results
            fold_scores.append(best_model.score(X_test, y_test))
            all_y_true.extend(y_test)
            all_y_pred.extend(y_pred)

        # Compute overall statistics
        avg_score = np.mean(fold_scores)
        classification_report_dict = classification_report(all_y_true, all_y_pred, output_dict=True)

        # Store results
        results[model_name] = {
            'best_params': grid_search.best_params_,
            'avg_loocv_score': avg_score,
            'classification_report': classification_report_dict
        }

    return results

In [6]:
def save_results_to_csv(results, output_file):
    # Convert the results dictionary into a DataFrame
    print(results)
    rows = []
    for model_name, result in results.items():
        print(result)
        row = {
            'model': model_name,
            'best_params': result['best_params'],
            'avg loocv score': result['avg_loocv_score'],
            'classification_report': str(result['classification_report'])  # Serialize the report as a string
        }
        rows.append(row)
    
    df = pd.DataFrame(rows)
    df.to_csv(output_file, index=False)

In [7]:
# Define models and parameter grids
models_to_test = {
    'Random Forest': (RandomForestClassifier(random_state=42), {
        'n_estimators': [50, 100],
        'max_depth': [None, 10]
    }),
    # 'SVM': (SVC(kernel='linear', random_state=42), {
    #     'C': [0.1, 1, 10]
    # }),
    'Logistic Regression': (LogisticRegression(random_state=42, max_iter=500, penalty='l1', solver='liblinear'), {
        'C': [0.1, 1]
    }),
    # 'Naive Bayes': (GaussianNB(), {}),
    # 'KNN': (KNeighborsClassifier(), {
    #     'n_neighbors': [3, 5, 7, 10],
    #     'weights': ['uniform', 'distance']
    # }),
    # 'ZeroR': (DummyClassifier(strategy='most_frequent'), {}),  # ZeroR always predicts the most frequent class

}

all_results = {}

In [8]:

dataset_files = [f for f in os.listdir("Dataset") if f.endswith('.csv')]
for dataset_file in dataset_files:
    file_path = os.path.join("Dataset", dataset_file)

    cancer_type = os.path.splitext(file_path)[0].split('/')[-1]
    print(f"Processing datasets: {file_path}")

    X, y = load_and_combine_datasets("ControlDataset/normal.csv", file_path)

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    joblib.dump(scaler, f"Scalers/{cancer_type}_scaler.joblib")


    all_results = {}


    # Train and evaluate models using LOOCV
    model_results = train_and_evaluate_loocv(X_scaled, y, models_to_test, cancer_type)
    all_results.update({model_name: result for model_name, result in model_results.items()})

        
    # save total results
    csv_filename = f"Result/results_{cancer_type}_pca_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
    save_results_to_csv(results=all_results, output_file=csv_filename)

    print("Pipeline complete. Results saved to " + csv_filename)

Processing datasets: Dataset/throat.csv
Training Random Forest with LOOCV...
Model Random Forest saved to Models/Random Forest_throat.joblib
Training Logistic Regression with LOOCV...
Model Logistic Regression saved to Models/Logistic Regression_throat.joblib
{'Random Forest': {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.968421052631579, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9368421052631579, 'f1-score': 0.967391304347826, 'support': 95.0}, '1': {'precision': 0.9405940594059405, 'recall': 1.0, 'f1-score': 0.9693877551020408, 'support': 95.0}, 'accuracy': 0.968421052631579, 'macro avg': {'precision': 0.9702970297029703, 'recall': 0.968421052631579, 'f1-score': 0.9683895297249334, 'support': 190.0}, 'weighted avg': {'precision': 0.9702970297029702, 'recall': 0.968421052631579, 'f1-score': 0.9683895297249334, 'support': 190.0}}}, 'Logistic Regression': {'best_params': {'C': 0.1}, 'avg_loocv_score': 0.968421052631579, 'classification