In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score
from joblib import dump

In [2]:
df = pd.read_csv("brfss_skin_lifestyle.csv")

df = df[df["CNCRTYP2"].notna()]
df = df[~df["CNCRTYP2"].isin([77, 99])]

skin_codes = [16, 22, 23]
df["target_skin_cancer"] = df["CNCRTYP2"].apply(
    lambda x: 1 if x in skin_codes else 0
)

In [3]:
FEATURES = [
    '_SEX',     # Sex 
                # 1 = Male, 2 = Female
    '_AGE80',   # Age 
                # 18-99
    'MARITAL',  # Are you ...
                # 1 = Married, 2 = Divorced, 3 = Widowed, 4 = Separated, 5 = Never married, 6 = Dating
    'EMPLOY1',  # Are you currently ...
                # 1 = Employed for wages, 2 = Self-employed, 3 = Out of work for 1 years or more,
                # 4 = Out of work for less than 1 year, 5 = A homemaker, 6 = A student, 7 = Retired, 8 = Unable to work
    '_BMI5',    # Body Mass Index (weight (kg) / (height (m)^2) )
                # 1-99 (decimal)
    'GENHLTH',  # Would you say that in general your health is ...
                # 1 = Excellent, 2 = Very Good, 3 = Good, 4 = Fair, 5 = Poor
    '_PHYS14D', # How many days during the past 30 days was your physical health not good?
                # 1 = Zero days when physical health not good, 2 = 1 - 13 days, 3 = 14+ days
    '_MENT14D', # How many days during the past 30 days was your mental health not good?
                # 1 = Zero days when mental health not good, 2 = 1 - 13 days, 3 = 14+ days
    'ADDEPEV3', # Have you ever had a depressive disorder?
                # 1 = Yes, 2 = No
    'POORHLTH', # How many days did poor physical or mental health keep you from doing your usual activities?
                # 0 = None, 1-30 (number of days)
    'EXERANY2', # During the past month, did you participate in any physical activities or exercises?
                # 1 = Yes, 2 = No
    'SMOKE100', # Have you smoked at least 100 cigarettes in your entire life? 
                # 1 = Yes, 2 = No
    'CVDCRHD4', # Have you ever had angina or coronary heart disease?
                # 1 = Yes, 2 = No
    '_ASTHMS1', # Asthma status
                # 1 = Current, 2 = Former, 3 = Never 
    'DIABETE4', # Have you ever had diabetes?
                # 1 = Yes, 2 = Yes, but during pregnancy, 3 = No, 4 = Pre-diabetes or borderline diabetes
    'DIFFWALK', # Do you have serious difficulty walking or climbing stairs?
                # 1 = Yes, 2 = No
    'HAVARTH4', # Have you ever had some form of arthritis, rheumatoid arthritis, gout, lupus, or fibromyalgia?
                # 1 = Yes, 2 = No
    'CHCKDNY2', # Have you ever had kidney disease (not including kidney stones, bladder infection, or incontinence)
                # 1 = Yes, 2 = No
    'CHCSCNC1', # Have you ever had skin cancer that is not melanoma?
                # 1 = Yes, 2 = No
    'CHCOCNC1', # Have you ever had melanoma or any types of cancer?
                # 1 = Yes, 2 = No
]

df = df[FEATURES + ["target_skin_cancer"]]

In [4]:
class DomainCleaner(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        def clean(x):
            if pd.isna(x):
                return np.nan
            if 1 <= x <= 30:
                return x
            elif x == 88:
                return 0
            else:
                return np.nan

        X["_BMI5"] = X["_BMI5"] / 100
        X["_BMI5"] = X["_BMI5"].replace({777: np.nan, 999: np.nan})

        clean_1_cols = [
            'GENHLTH','SMOKE100', 'EXERANY2', 'CVDCRHD4',
            'DIABETE4','HAVARTH4','ADDEPEV3','CHCKDNY2','_ASTHMS1',
            'DIFFWALK','MARITAL', 'CHCSCNC1','CHCOCNC1','_MENT14D','_PHYS14D'
        ]
        X[clean_1_cols] = X[clean_1_cols].replace({7: np.nan, 9: np.nan})

        X['POORHLTH'] = X['POORHLTH'].apply(clean)
        X['EMPLOY1'] = X['EMPLOY1'].replace({9: np.nan})
        
        return X

In [5]:
X = df.drop(columns=["target_skin_cancer"])
y = df["target_skin_cancer"]

x_train, x_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [6]:
cb = CatBoostClassifier(
    depth=5,
    learning_rate=0.01,
    iterations=1000,
    early_stopping_rounds=100,
    loss_function="Logloss",
    l2_leaf_reg=5,
    verbose=False
)

pipeline = Pipeline([
    ("domain_clean", DomainCleaner()),
    ("model", cb)
])

In [7]:
pipeline.fit(x_train, y_train)

y_pred = pipeline.predict(x_test)
y_prob = pipeline.predict_proba(x_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))

              precision    recall  f1-score   support

           0       0.77      0.80      0.78      1363
           1       0.80      0.78      0.79      1460

    accuracy                           0.79      2823
   macro avg       0.79      0.79      0.79      2823
weighted avg       0.79      0.79      0.79      2823

ROC-AUC: 0.8550412566960471


In [8]:
feature_importance = pipeline.named_steps["model"].get_feature_importance()

importance_df = pd.DataFrame({
    "feature": x_train.columns,
    "importance": feature_importance
}).sort_values("importance", ascending=False)

importance_df

Unnamed: 0,feature,importance
18,CHCSCNC1,41.307679
19,CHCOCNC1,21.925329
5,GENHLTH,6.020407
1,_AGE80,4.155026
0,_SEX,3.759847
2,MARITAL,2.861645
17,CHCKDNY2,2.328822
4,_BMI5,2.122116
6,_PHYS14D,1.696801
14,DIABETE4,1.655816


In [9]:
dump(pipeline, "skin_cancer_model.joblib")

['skin_cancer_model.joblib']