In [8]:
import json
import os
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


In [9]:
def load_and_combine_datasets(control_path, cancer_path):
    """Load the control and cancer CSVs and build a class-balanced dataset.

    Both classes are undersampled to the size of the smaller one so the
    combined set is balanced. When the control set is at least as large as
    the cancer set (the common case) every cancer row is kept and only the
    control set is sampled.

    Parameters
    ----------
    control_path, cancer_path
        Anything ``pd.read_csv`` accepts (path or file-like object). Both
        files must contain the columns ``cancer_type`` and ``type``.

    Returns
    -------
    X : pandas.DataFrame
        All columns except ``cancer_type`` and ``type``, shuffled.
    y : pandas.Series
        Binary label aligned with ``X``: 0 where ``cancer_type`` equals
        ``'normal'``, 1 otherwise.
    """
    # Load datasets
    control_df = pd.read_csv(control_path)
    cancer_df = pd.read_csv(cancer_path)

    # Balance on the smaller class; sampling more rows than exist (without
    # replacement) would raise ValueError.
    sample_size = min(len(control_df), len(cancer_df))

    control_df_sample = control_df.sample(n=sample_size, random_state=42)
    # Only touch the cancer rows when they are the majority class, so the
    # common case (more controls than cancer samples) keeps every cancer row
    # in its original order before the shuffle below.
    if len(cancer_df) > sample_size:
        cancer_df = cancer_df.sample(n=sample_size, random_state=42)

    # Combine datasets and shuffle
    combined_df = (
        pd.concat([control_df_sample, cancer_df])
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )

    # Split into features and labels
    X = combined_df.drop(['cancer_type', 'type'], axis=1)
    y = combined_df['cancer_type'].ne('normal').astype(int)  # 1 = cancer, 0 = healthy

    return X, y


In [10]:
def pca_data(X, n_components, random_state=42):
    """Fit a PCA on ``X`` and return the projected data with the fitted model.

    Parameters
    ----------
    X
        Feature matrix to fit on (assumed to be already standardized).
    n_components
        Passed straight through to ``PCA(n_components=...)``.
    random_state
        Seed forwarded to ``PCA`` so that repeated runs give the same
        projection when scikit-learn selects a randomized SVD solver.
        Defaults to 42, the seed used elsewhere in this notebook.

    Returns
    -------
    (X_pca, pca)
        The transformed ``X`` and the fitted ``PCA`` instance, which callers
        reuse to ``transform`` other splits.
    """
    pca = PCA(n_components=n_components, random_state=random_state)
    X_pca = pca.fit_transform(X)
    return X_pca, pca

In [11]:
def train_and_evaluate(X_train, X_val, X_test, y_train, y_val, y_test, models):
    """Grid-search every model on the training split and score the winner.

    Parameters
    ----------
    X_train, y_train
        Data the 3-fold, accuracy-scored grid search is fitted on.
    X_val, y_val, X_test, y_test
        Held-out splits; the best estimator is scored on each.
    models : dict
        Maps a display name to an ``(estimator, param_grid)`` pair.

    Returns
    -------
    dict
        One entry per model name with the keys ``best_params``,
        ``train_score``, ``val_score``, ``test_score`` and
        ``classification_report`` (the report is computed on the test split).
        Note that ``train_score`` holds ``GridSearchCV.best_score_`` -- per
        scikit-learn, the mean cross-validated score of the best parameter
        setting -- not a score measured on the full training set.
    """
    summary = {}

    for label, (estimator, grid) in models.items():
        print(f"Training {label}...")
        search = GridSearchCV(estimator, grid, cv=3, scoring='accuracy', verbose=0, n_jobs=-1)
        search.fit(X_train, y_train)

        # Evaluate the refitted best estimator on the held-out splits.
        winner = search.best_estimator_
        cv_accuracy = search.best_score_
        val_accuracy = winner.score(X_val, y_val)
        test_accuracy = winner.score(X_test, y_test)
        test_predictions = winner.predict(X_test)

        summary[label] = dict(
            best_params=search.best_params_,
            train_score=cv_accuracy,
            val_score=val_accuracy,
            test_score=test_accuracy,
            classification_report=classification_report(
                y_test, test_predictions, output_dict=True
            ),
        )

    return summary

In [12]:
def _json_default(value):
    """Convert NumPy scalars and arrays to plain Python values for ``json``."""
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def save_results(results, output_file):
    """Write ``results`` to ``output_file`` as indented JSON.

    Parameters
    ----------
    results
        JSON-compatible structure; NumPy scalars and arrays found as values
        are converted to their plain Python equivalents.
    output_file
        Destination path. Missing parent directories are created so that a
        finished training run is not lost to a ``FileNotFoundError``.

    Raises
    ------
    TypeError
        If ``results`` contains a value that is neither JSON-serializable
        nor a NumPy scalar/array.
    """
    parent_dir = os.path.dirname(output_file)
    if parent_dir:  # empty when output_file is a bare file name
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=4, default=_json_default)

In [13]:
# Candidate classifiers, keyed by display name. Each value is an
# (estimator, param_grid) pair; train_and_evaluate hands both to GridSearchCV.
models_to_test = {
    'Random Forest': (
        RandomForestClassifier(random_state=42),
        dict(n_estimators=[50, 100], max_depth=[None, 10]),
    ),
    'SVM': (
        SVC(random_state=42),
        dict(C=[0.1, 1, 10], kernel=['linear', 'rbf']),
    ),
    'Logistic Regression': (
        LogisticRegression(random_state=42, max_iter=500),
        dict(C=[0.1, 1, 10]),
    ),
}

# PCA component counts to evaluate. The pipeline loop treats 0 as
# "no PCA": the scaled features are used as-is.
pca_to_test = [0, 10, 100, 160]

In [None]:
# Load Data
# The cancer type names both the input CSV and the result files written later,
# so it is defined once and the dataset path is derived from it.
cancer_type = "lung"
X, y = load_and_combine_datasets("Dataset/normal.csv", f"Dataset/{cancer_type}.csv")

In [None]:

# Split data: 70% train, then the remaining 30% halved into validation and
# test, stratified on the label at both steps.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Preprocess data: the scaler is fitted on the training split only and then
# applied unchanged to the validation and test splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Create the output directory before training so a finished run is not lost
# when the results are written.
os.makedirs("Result", exist_ok=True)
saved_files = []

for num_pca in pca_to_test:
    print(f"running pca on {num_pca} features")
    # Run PCA feature reduction
    if num_pca == 0:
        # 0 is the "no PCA" setting: use the scaled features directly.
        X_train_pca = X_train_scaled
        X_val_pca = X_val_scaled
        X_test_pca = X_test_scaled
    else:
        # Fit PCA on the training split only, then project the other splits.
        X_train_pca, pca = pca_data(X_train_scaled, num_pca)
        X_val_pca = pca.transform(X_val_scaled)
        X_test_pca = pca.transform(X_test_scaled)

    # Train and evaluate models
    results = train_and_evaluate(X_train_pca, X_val_pca, X_test_pca, y_train, y_val, y_test, models_to_test)

    # Save results (one timestamped file per PCA setting)
    filename = f"Result/results_{cancer_type}_pca_{num_pca}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    save_results(results, filename)
    saved_files.append(filename)

# Report the files actually written rather than a fixed, non-existent name.
print(f"Pipeline complete. Results saved to: {', '.join(saved_files)}")

running pca on 0 features
Training Random Forest...
Training SVM...
Training Logistic Regression...
running pca on 10 features
Training Random Forest...
Training SVM...
Training Logistic Regression...
running pca on 100 features
Training Random Forest...
Training SVM...
Training Logistic Regression...
running pca on 160 features
Training Random Forest...
Training SVM...
Training Logistic Regression...
Pipeline complete. Results saved to 'results.json'.
