In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import LeaveOneOut
from datetime import datetime


In [2]:
def load_and_combine_datasets(control_path, cancer_path):
    # Load datasets
    control_df = pd.read_csv(control_path)
    cancer_df = pd.read_csv(cancer_path)
    
    # Determine the size for undersampling
    sample_size = len(cancer_df)
    
    # Undersample the healthy dataset
    control_df_sample = control_df.sample(n=sample_size, random_state=42)
    
    # Combine datasets and shuffle
    combined_df = pd.concat([control_df_sample, cancer_df]).sample(frac=1, random_state=42).reset_index(drop=True)
    
    # Split into features and labels
    X = combined_df.drop(['cancer_type', 'type'], axis=1)
    y = combined_df['cancer_type'].apply(lambda x: 1 if x != 'normal' else 0)  # 1 = cancer, 0 = healthy
    
    return X, y


In [3]:
def pca_data(X, n_components):
    # Apply PCA (assumes X is already standardized)
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)
    return X_pca, pca

In [4]:
# Train and evaluate models using LOOCV
def train_and_evaluate_loocv(X, y, models):
    loo = LeaveOneOut()
    results = {}

    for model_name, (model, param_grid) in models.items():
        print(f"Training {model_name} with LOOCV...")
        fold_scores = []
        all_y_true = []
        all_y_pred = []

        # GridSearch for hyperparameter tuning
        grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy', verbose=0, n_jobs=-1)
        grid_search.fit(X, y)
        best_model = grid_search.best_estimator_

        # Perform LOOCV
        for train_index, test_index in loo.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # Fit and predict with the best model
            best_model.fit(X_train, y_train)
            y_pred = best_model.predict(X_test)

            # Track results
            fold_scores.append(best_model.score(X_test, y_test))
            all_y_true.extend(y_test)
            all_y_pred.extend(y_pred)

        # Compute overall statistics
        avg_score = np.mean(fold_scores)
        classification_report_dict = classification_report(all_y_true, all_y_pred, output_dict=True)

        # Store results
        results[model_name] = {
            'best_params': grid_search.best_params_,
            'avg_loocv_score': avg_score,
            'classification_report': classification_report_dict
        }

    return results

In [5]:
def save_results_to_csv(results, output_file):
    # Convert the results dictionary into a DataFrame
    print(results)
    rows = []
    for model_name, result in results.items():
        print(result)
        row = {
            'model': model_name[0],
            'pca_size': model_name[1],
            'best_params': result['best_params'],
            'avg loocv score': result['avg_loocv_score'],
            'classification_report': str(result['classification_report'])  # Serialize the report as a string
        }
        rows.append(row)
    
    df = pd.DataFrame(rows)
    df.to_csv(output_file, index=False)

In [6]:
# Define models and parameter grids
models_to_test = {
    'Random Forest': (RandomForestClassifier(random_state=42), {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20]
    }),
    'SVM': (SVC(kernel='linear', random_state=42), {
        'C': [0.1, 1, 10]
    }),
    'Logistic Regression': (LogisticRegression(random_state=42, max_iter=500, penalty='l1', solver='liblinear'), {
        'C': [0.1, 1, 10]
    }),
    'Naive Bayes': (GaussianNB(), {}),
    'KNN': (KNeighborsClassifier(), {
        'n_neighbors': [3, 5, 7, 10],
        'weights': ['uniform', 'distance']
    }),
    'ZeroR': (DummyClassifier(strategy='most_frequent'), {}),  # ZeroR always predicts the most frequent class

}

pca_to_test = [0, 10, 44]
all_results = {}

CHANGE WHAT DATA IS BEING LOADED HERE

In [None]:
# Load Data
X, y = load_and_combine_datasets("Dataset/normal.csv", "Dataset/lung.csv")
cancer_type = "lung"

In [8]:

# Preprocess Data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


all_results = {}

for num_pca in pca_to_test:
    print(f"Running LOOCV with PCA on {num_pca} features...")
    # Apply PCA if specified
    if num_pca == 0:
        X_pca = X_scaled
    else:
        X_pca, pca = pca_data(X_scaled, num_pca)

    # Train and evaluate models using LOOCV
    model_results = train_and_evaluate_loocv(X_pca, y, models_to_test)
    all_results.update({(model_name, num_pca): result for model_name, result in model_results.items()})

    
# save total results
csv_filename = f"Result/results_{cancer_type}_pca_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
save_results_to_csv(results=all_results, output_file=csv_filename)

print("Pipeline complete. Results saved to " + csv_filename)

Running LOOCV with PCA on 0 features...
Training Random Forest with LOOCV...
Training SVM with LOOCV...
Training Logistic Regression with LOOCV...
Training Naive Bayes with LOOCV...
Training KNN with LOOCV...
Training ZeroR with LOOCV...
Running LOOCV with PCA on 10 features...
Training Random Forest with LOOCV...
Training SVM with LOOCV...
Training Logistic Regression with LOOCV...
Training Naive Bayes with LOOCV...
Training KNN with LOOCV...
Training ZeroR with LOOCV...
Running LOOCV with PCA on 44 features...
Training Random Forest with LOOCV...
Training SVM with LOOCV...
Training Logistic Regression with LOOCV...
Training Naive Bayes with LOOCV...
Training KNN with LOOCV...
Training ZeroR with LOOCV...
{('Random Forest', 0): {'best_params': {'max_depth': None, 'n_estimators': 50}, 'avg_loocv_score': 0.9930555555555556, 'classification_report': {'0': {'precision': 1.0, 'recall': 0.9861111111111112, 'f1-score': 0.993006993006993, 'support': 72.0}, '1': {'precision': 0.986301369863013