# In [1]:
import pandas as pd

# =======================
# Load the raw CSV files
# =======================

# 2020: training table, test table, and the list of duplicated images.
df_2020_train, df_2020_test, df_2020_duplicates = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/rawcsvs/Raw CSV's/2020/train.csv",
        "/kaggle/input/rawcsvs/Raw CSV's/2020/test.csv",
        "/kaggle/input/rawcsvs/Raw CSV's/2020/Duplicates/2020_Challenge_duplicates.csv",
    )
)

# 2019: training ground truth (GT) plus train/test patient metadata (MD).
df_2019_train_GT, df_2019_train_MD, df_2019_test_MD = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/rawcsvs/Raw CSV's/2019/ISIC_2019_Training_GroundTruth.csv",
        "/kaggle/input/rawcsvs/Raw CSV's/2019/Patient MetaData/ISIC_2019_Training_Metadata.csv",
        "/kaggle/input/rawcsvs/Raw CSV's/2019/Patient MetaData/ISIC_2019_Test_Metadata.csv",
    )
)

# =======================
# 2020 preprocessing
# =======================

# Discard the two label columns that are not used further down.
df_2020_train = df_2020_train.drop(columns=['benign_malignant', 'target'])

# Map the verbose 2020 diagnosis names onto short label codes.
# NOTE(review): presumably these codes match the 2019 ground-truth column
# names so both years can share one label set — confirm against that file.
diagnosis_codes = {
    'unknown': 'UNK',
    'nevus': 'NV',
    'melanoma': 'MEL',
    'seborrheic keratosis': 'BKL',
    'lentigo NOS': 'BKL',
    'lichenoid keratosis': 'BKL',
    'solar lentigo': 'BKL',
    'cafe-au-lait macule': 'UNK',
    'atypical melanocytic proliferation': 'UNK',
}
df_2020_train = df_2020_train.replace({'diagnosis': diagnosis_codes})

# Give the 2020 train and test frames the same column names.
column_aliases = {
    'anatom_site_general_challenge': 'anatom_site_general',
    'image_name': 'image',
}
df_2020_train = df_2020_train.rename(columns=column_aliases)
df_2020_test = df_2020_test.rename(columns=column_aliases)

# Split the duplicate list by partition, then drop the listed images from the
# matching frame.
in_train_partition = df_2020_duplicates['partition'] == 'train'
in_test_partition = df_2020_duplicates['partition'] == 'test'
df_2020_duplicates_train = df_2020_duplicates[in_train_partition]
df_2020_duplicates_test = df_2020_duplicates[in_test_partition]

remove_train = df_2020_duplicates_train['ISIC_id'].tolist()
remove_test = df_2020_duplicates_test['ISIC_id'].tolist()

keep_train_rows = ~df_2020_train['image'].isin(remove_train)
keep_test_rows = ~df_2020_test['image'].isin(remove_test)
df_2020_train = df_2020_train[keep_train_rows]
df_2020_test = df_2020_test[keep_test_rows]

# =======================
# 2019 preprocessing
# =======================

# Convert wide -> long: every non-'image' column becomes a
# (diagnosis, value) row for its image.
df_2019_train_GT_transformed = pd.melt(
    df_2019_train_GT,
    id_vars='image',
    var_name='diagnosis',
    value_name='value'
)

# Keep only the rows whose value == 1, then discard the helper column.
# Written as one non-inplace chain: calling drop(inplace=True) on a
# boolean-mask selection makes pandas emit SettingWithCopyWarning.
df_2019_train_GT_transformed = (
    df_2019_train_GT_transformed
    .loc[df_2019_train_GT_transformed["value"] == 1]
    .drop(columns="value")
    .reset_index(drop=True)
)

# Merge metadata. merge() defaults to an inner join, so images missing from
# the metadata table are dropped here.
df_2019_train_GT_combined = (
    df_2019_train_GT_transformed
    .merge(df_2019_train_MD, on="image")
    .drop(columns="lesion_id")
)

# Fill missing values: text columns get "unknown", age gets a fixed 60.
# NOTE(review): the reason for choosing 60 is not stated in this file.
df_2019_train_GT_combined[['anatom_site_general', 'sex']] = (
    df_2019_train_GT_combined[['anatom_site_general', 'sex']].fillna("unknown")
)
df_2019_train_GT_combined['age_approx'] = (
    df_2019_train_GT_combined['age_approx'].fillna(60)
)

# =======================
# Construct final DataFrames
# =======================

# 2020 test set reduced to the image identifier column only.
df_2020_test_no_PD = df_2020_test[['image']]

# Take the same two columns from each year and stack 2020 on top of 2019.
label_columns = ['image', 'diagnosis']
df_2020_select_2 = df_2020_train[label_columns]
df_2019_select_2 = df_2019_train_GT_combined[label_columns]

df_2020_2019_9_label = pd.concat(
    [df_2020_select_2, df_2019_select_2],
    ignore_index=True,
)

# Exclude the image flagged as corrupted and renumber the rows from 0.
is_usable_image = df_2020_2019_9_label['image'] != 'ISIC_0066580'
df_2020_2019_9_label = df_2020_2019_9_label[is_usable_image].reset_index(drop=True)

# =======================
# Save output
# =======================

save_path = "/kaggle/working/"

# Write each frame as a CSV with a header row and without the index column.
# NOTE(review): "PateintDetail" is misspelled, but the file name is kept
# unchanged because other code may read the file by this exact name.
for frame, file_name in (
    (df_2020_test_no_PD, "test_2020_no_PateintDetail.csv"),
    (df_2020_2019_9_label, "train_2020_and_2019_with_9_Labels.csv"),
):
    frame.to_csv(save_path + file_name, header=True, index=False)
