In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, auc, roc_auc_score, average_precision_score, precision_recall_curve
import gc

In [None]:

import os
# Competition input directory and the writable output directory.
dirname = '/kaggle/input/isic-2024-challenge/'
working_dir = '/kaggle/working/'

# Force the mixed-type columns to be parsed as strings. They are keyed by the
# integers 51 and 52 for now.
# TODO: replace 51 and 52 with the actual column names.
dtype_dict = dict.fromkeys((51, 52), str)

# Read the train metadata first, then the test metadata, with the same dtype overrides.
train_meta, test_meta = (
    pd.read_csv(os.path.join(dirname, csv_name), dtype=dtype_dict)
    for csv_name in ('train-metadata.csv', 'test-metadata.csv')
)


In [None]:
# Columns present in both frames, in train order. 'target' is kept
# unconditionally so the label is not filtered out of the training frame.
common_columns = [
    col
    for col in train_meta.columns
    if col == 'target' or col in test_meta.columns
]


In [None]:
# Restrict both frames to the shared columns; 'target' is left out of the test frame.
test_meta = test_meta[[col for col in common_columns if col != 'target']]
train_meta = train_meta[common_columns]

In [None]:
# Columns left out of the feature matrices: they are dropped from X and X_test
# and are skipped when the numeric / categorical column lists are built.
columns_to_exclude = [
    'isic_id',
    'patient_id',
    'attribution',
    'copyright_license',
    'image_type',
    'tbp_tile_type',
    'tbp_lv_location',
    'tbp_lv_location_simple',
]


In [None]:
# 1. Candidate features: every column name of the (filtered) training frame.
all_columns = list(train_meta.columns)

In [None]:
# 2. Numerical features: every numeric column except 'target' and the excluded ones.
# The dtype is tested with pandas' own predicate instead of comparing it to
# `np.number`: a concrete dtype is never equal to that abstract type (float64
# only matched through a deprecated coercion), so plain integer columns were
# silently left out of the model. Booleans are kept out, as they were before.
num_meta_cols = [
    col
    for col in all_columns
    if col != 'target'
    and col not in columns_to_exclude
    and pd.api.types.is_numeric_dtype(train_meta[col])
    and not pd.api.types.is_bool_dtype(train_meta[col])
]


In [None]:
# 3. Categorical features: object- or category-typed columns that are not excluded.
cat_meta_cols = [
    col
    for col, col_dtype in train_meta.dtypes.items()
    if col not in columns_to_exclude
    and (col_dtype == 'object' or col_dtype.name == 'category')
]


In [None]:
# Label vector, then the feature frame without the label and the excluded columns.
y = train_meta['target']
X = train_meta.drop(columns=['target', *columns_to_exclude])


In [None]:
# Test features: same exclusions as for X (the test frame has no 'target' to drop).
X_test = test_meta.drop(labels=columns_to_exclude, axis='columns')


In [None]:
# Numeric branch: fill missing values with the column median.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])


In [None]:
# Categorical branch: replace missing values with the literal 'missing', then
# one-hot encode; categories unseen at fit time are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])


In [None]:
# Combine both branches; each one is applied only to its own list of columns.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, num_meta_cols),
    ('cat', categorical_transformer, cat_meta_cols),
])


In [None]:
# Fit the preprocessor on the training features and transform them in one pass.
# NOTE(review): the fit sees every training row, including the rows that are
# later held out for validation by train_test_split.
X_processed = preprocessor.fit_transform(X)

In [None]:
# Transform the test features with the already-fitted preprocessor (no refit).
X_test_processed = preprocessor.transform(X_test)

In [None]:
# Hold out 20% of the processed rows for validation, stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(
    X_processed,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)


In [None]:

class ParticipantVisibleError(Exception):
    '''Error raised by `score` when the submission itself is invalid.'''


def score(solution: pd.DataFrame, submission: pd.DataFrame, min_tpr: float=0.80) -> float:
    '''
    2024 ISIC Challenge metric: pAUC

    Given a solution file and submission file, this function returns the
    partial area under the receiver operating characteristic (pAUC)
    above a given true positive rate (TPR) = 0.80.
    https://en.wikipedia.org/wiki/Partial_Area_Under_the_ROC_Curve.

    (c) 2024 Nicholas R Kurtansky, MSKCC

    Args:
        solution: ground truth pd.DataFrame of 1s and 0s
        submission: solution dataframe of predictions of scores ranging [0, 1]
        min_tpr: lower TPR bound of the region that is integrated

    Returns:
        Float value range [0, max_fpr], where max_fpr = abs(1 - min_tpr)

    Raises:
        ParticipantVisibleError: if the submission values are not numeric
        ValueError: if min_tpr yields a max_fpr outside (0, 1]
    '''

    # check submission is numeric
    if not pd.api.types.is_numeric_dtype(submission.values):
        raise ParticipantVisibleError('Submission target column must be numeric')

    # Validate the requested region before doing any ROC work.
    max_fpr = abs(1-min_tpr)
    if max_fpr <= 0 or max_fpr > 1:
        raise ValueError("Expected min_tpr in range [0, 1), got: %r" % min_tpr)

    # rescale the target. set 0s to 1s and 1s to 0s (since sklearn only has max_fpr)
    v_gt = abs(np.asarray(solution.values)-1)

    # flip the submissions to their complements
    v_pred = -1.0*np.asarray(submission.values)

    # using sklearn.metric functions: (1) roc_curve and (2) auc
    fpr, tpr, _ = roc_curve(v_gt, v_pred, sample_weight=None)
    if max_fpr == 1:
        # The whole curve was requested, so no truncation is needed.
        return auc(fpr, tpr)

    # Add a single point at max_fpr by linear interpolation
    stop = np.searchsorted(fpr, max_fpr, "right")
    x_interp = [fpr[stop - 1], fpr[stop]]
    y_interp = [tpr[stop - 1], tpr[stop]]
    tpr = np.append(tpr[:stop], np.interp(max_fpr, x_interp, y_interp))
    fpr = np.append(fpr[:stop], max_fpr)
    partial_auc = auc(fpr, tpr)

    return partial_auc

In [None]:
def p_auc(y_true, y_pred, min_tpr = 0.80):
    '''
    Compute the pAUC for array-like labels and predictions via `score`.

    Args:
        y_true: ground-truth labels (array-like)
        y_pred: predicted scores (array-like), same length as y_true
        min_tpr: lower TPR bound, passed through to `score`

    Returns:
        The value returned by `score` for the given inputs.
    '''
    # Pair the two inputs by position: plain arrays carry no index, so a
    # shuffled Series index cannot misalign labels and predictions.
    df = pd.DataFrame({
        'labels': np.asarray(y_true).ravel(),
        'predictions': np.asarray(y_pred).ravel(),
    })
    # Pass min_tpr on so that a non-default value is actually honoured.
    return score(df[['labels']], df[['predictions']], min_tpr=min_tpr)


In [None]:
# Small grid search: keep the random forest with the highest validation pAUC.
best_val_pauc, best_model = -1, None
for n_estimators in [300]:
    for max_depth in [9]:
        rf_model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            class_weight='balanced',
            random_state=42,
            n_jobs=-1,
        )
        rf_model.fit(X_train, y_train)

        # Column 1 of predict_proba is used as the validation score.
        y_val_pred_proba = rf_model.predict_proba(X_val)[:, 1]
        pauc = p_auc(y_val, y_val_pred_proba)
        print(f"Validation pAUC (n_estimators={n_estimators}, max_depth={max_depth}): {pauc:.4f}")

        # Only a strictly better score replaces the current best model.
        if pauc > best_val_pauc:
            best_val_pauc, best_model = pauc, rf_model


In [None]:
# Make predictions on the test set using the best model.
# Column 1 of predict_proba is kept as the score
# (presumably the probability of target == 1 — confirm against best_model.classes_).
y_test_pred_proba = best_model.predict_proba(X_test_processed)[:, 1]


In [None]:
# 3. Submission frame: the original 'isic_id' values paired with the predicted scores.
submission_df = test_meta[['isic_id']].assign(target=y_test_pred_proba)


In [None]:
# 4. Write the submission into the working directory, without the index column.
submission_path = os.path.join(working_dir, "submission.csv")
submission_df.to_csv(submission_path, index=False)
