## 1- Importing Libraries and Configuration

In [None]:
import pandas as pd
import numpy as np
from  lightgbm import LGBMClassifier,log_evaluation
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedGroupKFold

# Notebook-wide settings, read as class attributes (the class is never instantiated here).
class Config:
    """Central place for the constants shared by the cells below."""

    # Seed handed to seed_everything() for the NumPy and stdlib `random` generators.
    seed = 2024
    # Number of cross-validation folds passed to fit_and_predict().
    num_folds = 10
    # Name of the label column in the training metadata.
    TARGET_NAME = "target"
import random
def seed_everything(seed):
    """Seed the global NumPy generator and the stdlib `random` generator with `seed`."""
    # NumPy first, then `random` -- the same order as before.
    for seeder in (np.random.seed, random.seed):
        seeder(seed)
# Seed the global RNGs once, up front, with the value stored in Config.
seed_everything(Config.seed)

##  2- Read Dataset
The classes are heavily imbalanced: the ratio of negative (target = 0) to positive (target = 1) samples is roughly 1000:1.


In [None]:
# Load the train and test metadata tables from the Kaggle input directory
# and report how many rows each one has.
train=pd.read_csv("/kaggle/input/isic-2024-challenge/train-metadata.csv")
print(f"len(train):{len(train)}")
test=pd.read_csv("/kaggle/input/isic-2024-challenge/test-metadata.csv")
print(f"len(test):{len(test)}")
# Preview the first rows of the raw training table.
train.head()


In [None]:
# Count the rows per target value to show the class balance.
train[Config.TARGET_NAME].value_counts()

## 3-Feature Engineering
I created some of the features myself; the rest came from multimodel-isic.

In [None]:
def FE(df):
    """Add engineered features to `df` in place and return the same frame.

    Steps, in order:
      * binarise `sex` (1 for 'male') and add a `child` flag (`age_approx` <= 18);
      * drop a fixed list of columns if they are present;
      * one-hot encode the listed levels of `anatom_site_general`, `attribution`,
        `tbp_lv_location`, `copyright_license` and `tbp_lv_location_simple`
        as int8 columns named `<source column>_<level>`;
      * binarise `tbp_tile_type` (1 for '3D: XP');
      * add `+`, `-`, `*`, `/` combinations of `tbp_lv_<c>` with `tbp_lv_<c>ext`
        for c in A, B, C, H, L;
      * map `age_approx` and `copyright_license` through hard-coded lookup tables;
      * add hand-crafted size / shape / colour features;
      * drop the raw categorical columns consumed above.

    Fix over the previous version: the institution one-hot loop compared
    `anatom_site_general` against institution names and stored the result as
    `anatom_site_general_<institution>`, so all of those columns were constant 0.
    It now reads `attribution` and writes `attribution_<institution>`.

    Parameters
    ----------
    df : pandas.DataFrame
        Metadata table containing the raw columns read below.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, mutated.

    Notes
    -----
    The function is not re-runnable on its own output: the final drop removes
    columns (e.g. `anatom_site_general`) that the earlier steps read, so a
    second call raises KeyError.
    """
    df['sex'] = (df['sex'] == 'male').astype(np.int8)
    df['child'] = (df['age_approx'] <= 18).astype(np.int8)

    # Best-effort drop: errors='ignore' skips any of these columns that is absent.
    df.drop(
        ['iddx_2', 'iddx_3', 'iddx_4', 'iddx_5', 'mel_mitotic_index', 'mel_thick_mm',
         'isic_id', 'lesion_id',
         'iddx_full', 'iddx_1', 'tbp_lv_dnn_lesion_confidence'],
        axis=1, inplace=True, errors='ignore',
    )

    # Source column -> levels that each get their own 0/1 indicator column.
    # Values outside the listed levels (including NaN) yield 0 in every indicator.
    one_hot_levels = {
        'anatom_site_general': [
            'lower extremity', 'head/neck', 'posterior torso', 'anterior torso',
            'upper extremity',
        ],
        'attribution': [
            'Memorial Sloan Kettering Cancer Center', 'ACEMID MIA',
            'Department of Dermatology, Hospital Clínic de Barcelona',
            'University Hospital of Basel',
            'Frazer Institute, The University of Queensland, Dermatology Research Centre',
            'Department of Dermatology, University of Athens, Andreas Syggros Hospital of Skin and Venereal Diseases, Alexander Stratigos, Konstantinos Liopyris',
            'ViDIR Group, Department of Dermatology, Medical University of Vienna',
        ],
        'tbp_lv_location': [
            'Right Leg - Upper', 'Head & Neck', 'Torso Back Top Third',
            'Torso Front Top Half', 'Right Arm - Upper', 'Left Leg - Upper',
            'Torso Front Bottom Half', 'Left Arm - Upper', 'Right Leg',
            'Torso Back Middle Third', 'Right Arm - Lower',
            'Right Leg - Lower', 'Left Leg - Lower', 'Left Arm - Lower',
            'Unknown', 'Left Leg', 'Torso Back Bottom Third', 'Left Arm',
            'Right Arm', 'Torso Front', 'Torso Back',
        ],
        'copyright_license': ['CC-BY', 'CC-0', 'CC-BY-NC'],
        'tbp_lv_location_simple': [
            'Right Leg', 'Head & Neck', 'Torso Back', 'Torso Front',
            'Right Arm', 'Left Leg', 'Left Arm', 'Unknown',
        ],
    }
    for source_col, levels in one_hot_levels.items():
        for level in levels:
            df[f'{source_col}_{level}'] = (df[source_col] == level).astype(np.int8)

    df['tbp_tile_type'] = (df['tbp_tile_type'] == '3D: XP').astype(np.int8)

    # Pairwise arithmetic between each tbp_lv_<c> column and its tbp_lv_<c>ext partner.
    for c in ['A', 'B', 'C', 'H', 'L']:
        col1, col2 = f'tbp_lv_{c}', f'tbp_lv_{c}ext'
        df[f'{col1}+{col2}'] = df[col1] + df[col2]
        df[f'{col1}-{col2}'] = df[col1] - df[col2]
        df[f'{col1}*{col2}'] = df[col1] * df[col2]
        # The tiny epsilon guards the ratio against an exactly-zero denominator.
        df[f'{col1}/{col2}'] = df[col1] / (df[col2] + 1e-20)

    # Hard-coded lookup tables; keys that are missing (or NaN) map to NaN.
    # NOTE(review): the values look like per-group target rates computed offline
    # on the training data -- confirm their provenance.
    age2target = {
        5.0: 0.0,
        15.0: 0.0,
        20.0: 0.000574052812858783,
        25.0: 0.0005825808330905913,
        30.0: 0.0002884615384615385,
        35.0: 0.0003465303647232089,
        40.0: 0.00047927916413713774,
        45.0: 0.0011450381679389313,
        50.0: 0.0005633920373925382,
        55.0: 0.0007914250812931198,
        60.0: 0.0015893843907667856,
        65.0: 0.001219379026680741,
        70.0: 0.0012319296040226274,
        75.0: 0.0007791954806662121,
        80.0: 0.0011376564277588168,
        85.0: 0.00169548999660902,
    }
    df['age2target'] = df['age_approx'].map(age2target)
    copyright2target = {
        'CC-0': 0.001151229722658294,
        'CC-BY': 0.0013823273944452684,
        'CC-BY-NC': 0.0005392685557407589,
    }
    df['copyright2target'] = df['copyright_license'].map(copyright2target)

    # Hand-crafted size / shape / colour features.
    df["lesion_size_ratio"] = df["tbp_lv_minorAxisMM"] / df["clin_size_long_diam_mm"]
    df["lesion_shape_index"] = df["tbp_lv_areaMM2"] / (df["tbp_lv_perimeterMM"] ** 2)
    df["hue_contrast"] = (df["tbp_lv_H"] - df["tbp_lv_Hext"]).abs()
    df["luminance_contrast"] = (df["tbp_lv_L"] - df["tbp_lv_Lext"]).abs()
    df["lesion_color_difference"] = np.sqrt(df["tbp_lv_deltaA"] ** 2 + df["tbp_lv_deltaB"] ** 2 + df["tbp_lv_deltaL"] ** 2)
    df["border_complexity"] = df["tbp_lv_norm_border"] + df["tbp_lv_symm_2axis"]
    df["3d_position_distance"] = np.sqrt(df["tbp_lv_x"] ** 2 + df["tbp_lv_y"] ** 2 + df["tbp_lv_z"] ** 2)
    df["perimeter_to_area_ratio"] = df["tbp_lv_perimeterMM"] / df["tbp_lv_areaMM2"]
    df["lesion_visibility_score"] = df["tbp_lv_deltaLBnorm"] + df["tbp_lv_norm_color"]
    # String feature; it is removed again by the final drop below.
    df["combined_anatomical_site"] = df["anatom_site_general"] + "_" + df["tbp_lv_location"]
    df["symmetry_border_consistency"] = df["tbp_lv_symm_2axis"] * df["tbp_lv_norm_border"]
    df["color_consistency"] = df["tbp_lv_stdL"] / df["tbp_lv_Lext"]
    df["size_age_interaction"] = df["clin_size_long_diam_mm"] * df["age_approx"]
    df["hue_color_std_interaction"] = df["tbp_lv_H"] * df["tbp_lv_color_std_mean"]
    df["lesion_severity_index"] = (df["tbp_lv_norm_border"] + df["tbp_lv_norm_color"] + df["tbp_lv_eccentricity"]) / 3
    df["shape_complexity_index"] = df["border_complexity"] + df["lesion_shape_index"]
    df["color_contrast_index"] = df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"] + df["tbp_lv_deltaLBnorm"]
    df["log_lesion_area"] = np.log(df["tbp_lv_areaMM2"] + 1)
    df["normalized_lesion_size"] = df["clin_size_long_diam_mm"] / df["age_approx"]
    # Despite the column name, this is the mean of the two hue columns.
    df["mean_hue_difference"] = (df["tbp_lv_H"] + df["tbp_lv_Hext"]) / 2
    df["std_dev_contrast"] = np.sqrt((df["tbp_lv_deltaA"] ** 2 + df["tbp_lv_deltaB"] ** 2 + df["tbp_lv_deltaL"] ** 2) / 3)
    df["color_shape_composite_index"] = (df["tbp_lv_color_std_mean"] + df["tbp_lv_area_perim_ratio"] + df["tbp_lv_symm_2axis"]) / 3
    df["3d_lesion_orientation"] = np.arctan2(df["tbp_lv_y"], df["tbp_lv_x"])
    df["overall_color_difference"] = (df["tbp_lv_deltaA"] + df["tbp_lv_deltaB"] + df["tbp_lv_deltaL"]) / 3
    df["symmetry_perimeter_interaction"] = df["tbp_lv_symm_2axis"] * df["tbp_lv_perimeterMM"]
    df["comprehensive_lesion_index"] = (df["tbp_lv_area_perim_ratio"] + df["tbp_lv_eccentricity"] + df["tbp_lv_norm_color"] + df["tbp_lv_symm_2axis"]) / 4

    # Remove the raw categorical/string columns now that they have been encoded.
    # No errors='ignore' here: a missing column raises KeyError.
    df.drop(['anatom_site_general', 'image_type', 'tbp_lv_location', 'tbp_lv_location_simple', 'attribution', 'copyright_license', 'combined_anatomical_site'], axis=1, inplace=True)

    return df
# Apply the feature engineering to both tables. FE mutates its argument and drops
# raw columns that it reads, so this cell cannot be re-run without reloading the CSVs.
train=FE(train)
test=FE(test)

# Inspect the engineered training table.
train.head()

## 4-Metric:pauc

In [None]:
def pauc_above_tpr(y_true, y_pred):
    """Partial AUC above a true-positive rate of 0.8, as a LightGBM eval metric.

    Labels and scores are both flipped (label -> |label - 1|, score -> 1 - score),
    which turns "area above TPR = min_tpr" into "area below FPR = 1 - min_tpr".
    scikit-learn's `roc_auc_score(..., max_fpr=...)` returns the McClish
    standardised partial AUC; the last formula undoes that standardisation so the
    returned value is the raw area, which lies in [0, 1 - min_tpr].

    The signature must stay at exactly two parameters: LightGBM's sklearn wrapper
    decides whether to also pass weights/groups from the number of parameters the
    metric accepts.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    y_pred : array-like of scores, higher meaning more likely positive.

    Returns
    -------
    tuple
        ('pauc', value, True) -- metric name, metric value, higher-is-better flag.
    """
    min_tpr = 0.8
    flipped_labels = np.abs(np.asarray(y_true) - 1)
    # Vectorised complement; this metric is passed as LightGBM's eval_metric,
    # so a per-element Python loop here is avoidable overhead.
    flipped_scores = 1.0 - np.asarray(y_pred, dtype=np.float64)
    max_fpr = abs(1 - min_tpr)
    partial_auc_scaled = roc_auc_score(flipped_labels, flipped_scores, max_fpr=max_fpr)
    # Invert the standardisation: scaled = 0.5 * (1 + (pauc - min_area) / (max_area - min_area))
    # with min_area = 0.5 * max_fpr**2 and max_area = max_fpr.
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)
    return 'pauc', partial_auc, True

## 5-Model  training

In [None]:
# Candidate features: every column of the engineered `test` frame except `patient_id`,
# which fit_and_predict uses as the grouping key for the folds.
choose_cols = list(test.drop(['patient_id'], axis=1).columns)

# Prune near-duplicate features: for each pair (i, j) with i < j whose absolute
# correlation on `train` exceeds 0.99, the later column j is marked for removal.
# NaN correlations never satisfy the comparison, so such columns are kept.
metric = train[choose_cols].corr().values
later_idx = np.where(np.triu(np.abs(metric) > 0.99, k=1))[1]
drop_cols = [choose_cols[j] for j in later_idx]
choose_cols = [col for col in choose_cols if col not in drop_cols]
print(f"len(choose_cols):{len(choose_cols)}")

# Map every kept feature to a neutral name "cols_<position>"; fit_and_predict
# renames the model inputs with this mapping.
cols_name = {col: f"cols_{pos}" for pos, col in enumerate(choose_cols)}
def fit_and_predict(train_feats=train,test_feats=test,model=None,num_folds=10,name='lgb',max_negatives=100000):
    """Cross-validate `model` with patient-grouped stratified folds and predict the test set.

    For every fold the training part is down-sampled to at most `max_negatives`
    randomly chosen negatives plus all positives, the model is refit on it, and
    probabilities are predicted for the held-out part (out-of-fold) and for the
    whole test set. Test predictions are averaged over the folds.

    Parameters
    ----------
    train_feats, test_feats : pandas.DataFrame
        Engineered tables. The defaults are the module-level `train` / `test`
        objects as they were when this function was defined.
    model : estimator with `fit` / `predict_proba`
        The same object is refit on every fold. Required.
    num_folds : int
        Number of StratifiedGroupKFold splits.
    name : str
        Must contain 'lgb', 'xgb' or 'cat' (checked in that order); selects the
        library-specific `fit` arguments and is used in the log lines.
    max_negatives : int
        Cap on the number of target == 0 rows kept per training fold.

    Returns
    -------
    (oof_pred, test_pred) : tuple of 1-D numpy arrays
        Out-of-fold probabilities for `train_feats` and fold-averaged
        probabilities for `test_feats`.

    Raises
    ------
    ValueError
        If `model` is None or `name` matches none of the supported libraries.
        Previously such a name skipped fitting silently and predicted with an
        unfitted (or stale) model.

    Notes
    -----
    Neither the fold shuffle (no `random_state` is passed) nor the negative
    subsampling (`np.random.shuffle`) takes an explicit seed, so both draw from
    the global NumPy RNG. Results therefore depend on the RNG state when the
    function is called.
    """
    if model is None:
        raise ValueError("fit_and_predict requires a model instance, got model=None")
    # Resolve the library once, up front, so an unsupported name fails loudly.
    family = next((tag for tag in ('lgb', 'xgb', 'cat') if tag in name), None)
    if family is None:
        raise ValueError(f"name must contain 'lgb', 'xgb' or 'cat', got {name!r}")

    X = train_feats[choose_cols].rename(columns=cols_name)
    y = train_feats[Config.TARGET_NAME]
    patient_id = train_feats['patient_id']
    oof_pred = np.zeros(len(X))
    test_X = test_feats[choose_cols].rename(columns=cols_name)
    test_pred_pro = np.zeros((num_folds, len(test_X)))

    # Grouping by patient keeps all rows of one patient in the same fold.
    sgkf = StratifiedGroupKFold(n_splits=num_folds, shuffle=True)
    for fold, (train_index, valid_index) in enumerate(sgkf.split(X, y, patient_id)):
        print(f"name {name},fold:{fold}")

        X_train = X.iloc[train_index].reset_index(drop=True)
        X_valid = X.iloc[valid_index].reset_index(drop=True)
        y_train = y.iloc[train_index].reset_index(drop=True)
        y_valid = y.iloc[valid_index].reset_index(drop=True)

        # Down-sample the majority class: a random subset of negatives plus every positive.
        zero_index = np.where(y_train == 0)[0]
        one_index = np.where(y_train == 1)[0]
        np.random.shuffle(zero_index)
        total_index = np.concatenate([zero_index[:max_negatives], one_index])
        X_train = X_train.iloc[total_index]
        y_train = y_train.iloc[total_index]

        # The validation fold is passed only for periodic logging; no early stopping is configured.
        if family == 'lgb':
            model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],
                      callbacks=[log_evaluation(100)],
                      eval_metric=pauc_above_tpr)
        else:
            # XGBoost and CatBoost take the same arguments here.
            model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],
                      verbose=100)

        oof_pred[valid_index] = model.predict_proba(X_valid)[:, 1]
        test_pred_pro[fold] = model.predict_proba(test_X)[:, 1]

    test_pred_pro = test_pred_pro.mean(axis=0)

    print(f"name:{name},pauc:{pauc_above_tpr(y.values.astype(np.int8),oof_pred)}")

    return oof_pred, test_pred_pro

In [None]:
# LightGBM member of the ensemble.
# The dict previously listed "verbose" twice; the duplicate key is removed (same value, same behaviour).
lgb_params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 6,
    "learning_rate": 0.05,
    "n_estimators": 2050,
    "colsample_bytree": 0.2,
    "colsample_bynode": 0.2,
    "verbose": -1,
    "random_state": 2024,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees": True,
    'num_leaves': 127,
    "max_bin": 225,
}
# Cross-validate and keep both the out-of-fold and the fold-averaged test probabilities.
lgb_oof_pred_pro,lgb_test_pro=fit_and_predict(model=LGBMClassifier(**lgb_params),num_folds=Config.num_folds,name='lgb_final')
print(f"lgb_test_pro[:10]:{lgb_test_pro[:10]}")

# CatBoost member of the ensemble (keyword arguments only, so their order is irrelevant).
cat_model = CatBoostClassifier(
    objective='Logloss',
    boosting_type='Plain',
    bootstrap_type='Bernoulli',
    iterations=2050,
    learning_rate=0.01,
    depth=9,
    subsample=0.46623542352578917,
    colsample_bylevel=0.08656159895289164,
    random_state=2024,
    verbose=0,
)
# Cross-validate and keep both the out-of-fold and the fold-averaged test probabilities.
cat_oof_pred_pro, cat_test_pro = fit_and_predict(
    model=cat_model,
    num_folds=Config.num_folds,
    name='cat_final',
)
print(f"cat_test_pro[:10]:{cat_test_pro[:10]}")

# XGBoost member of the ensemble.
xgb_params = {
    'objective': 'binary:logistic',
    'colsample_bytree': 0.11756728710020253,
    'max_depth': 4,
    'learning_rate': 0.009393224320850784,
    'n_estimators': 2050,
    'subsample': 0.9589462514195692,
    'lambda': 0.34216652262461505,
    'alpha': 1.150597512455824e-07,
    'random_state': 2024,
}
xgb_model = XGBClassifier(**xgb_params)

# Cross-validate and keep both the out-of-fold and the fold-averaged test probabilities.
xgb_oof_pred_pro, xgb_test_pro = fit_and_predict(
    model=xgb_model,
    num_folds=Config.num_folds,
    name='xgb_final',
)
print(f"xgb_test_pro[:10]:{xgb_test_pro[:10]}")

# Blend: unweighted mean of the three models' fold-averaged test probabilities.
test_pros=(lgb_test_pro+xgb_test_pro+cat_test_pro)/3
print(test_pros)

## 6-Submission

In [None]:
# Fill the sample submission with the blended probabilities and write it out.
# NOTE(review): the assignment is positional, so it assumes sample_submission.csv lists
# its rows in the same order as test-metadata.csv -- confirm.
submission = pd.read_csv("/kaggle/input/isic-2024-challenge/sample_submission.csv")
submission['target'] = test_pros
# index=False leaves the row index out of the file, as the previous index=None did.
submission.to_csv("submission.csv", index=False)
submission.head()