## Importing dataset:

In [8]:
import pandas as pd

# Location of the competition's training CSV inside the Kaggle input directory.
train_path = '/kaggle/input/competitions/playground-series-s6e2/train.csv'
train_df = pd.read_csv(train_path)
train_df.head()  # quick look to confirm the data loaded correctly

Unnamed: 0,id,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,0,58,1,4,152,239,0,0,158,1,3.6,2,2,7,Presence
1,1,52,1,1,125,325,0,2,171,0,0.0,1,0,3,Absence
2,2,56,0,2,160,188,0,2,151,0,0.0,1,0,3,Absence
3,3,44,0,3,134,229,0,2,150,0,1.0,2,0,3,Absence
4,4,58,1,4,140,234,0,2,125,1,3.8,2,3,3,Presence


## Cleaning data:

In [9]:
target = 'Heart Disease'

# Encode the target into its own Series instead of overwriting the column in
# train_df. Mapping in place makes this cell non-idempotent: on a second run
# the already-encoded 0/1 values are not keys of the mapping and become NaN.
y = train_df[target].map({'Absence': 0, 'Presence': 1})

# .map() turns any label outside the mapping into NaN without complaint;
# surface that here rather than as an obscure error during model fitting.
if y.isna().any():
    raise ValueError(
        f"Unexpected or missing values in '{target}': expected only 'Absence'/'Presence'."
    )

# Separate features from the target, dropping the row identifier as well.
X = train_df.drop(columns=['id', target])

## Feature engineering:

In [10]:
def create_features(df):
    """Return a copy of ``df`` extended with pairwise interaction features.

    The input frame is left untouched; the six product columns ``Y1``..``Y6``
    are appended to a copy, so calling this on a frame that other cells still
    use cannot silently change it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'BP', 'Cholesterol',
        'Number of vessels fluro' and 'Slope of ST'.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the additional columns ``Y1``..``Y6``.
    """
    # Feature crosses / interaction features (optional: some models are able
    # to learn these relationships on their own).
    interaction_pairs = {
        'Y1': ('BP', 'Cholesterol'),
        'Y2': ('Number of vessels fluro', 'Slope of ST'),
        'Y3': ('Cholesterol', 'Slope of ST'),
        'Y4': ('Cholesterol', 'Number of vessels fluro'),
        'Y5': ('BP', 'Slope of ST'),
        'Y6': ('BP', 'Number of vessels fluro'),
    }

    out = df.copy()
    for new_col, (left, right) in interaction_pairs.items():
        out[new_col] = df[left] * df[right]

    return out

In [11]:
# Add the interaction columns produced by create_features to the training features.
X = create_features(X)

In [12]:
#Custom transformers

from sklearn.base import BaseEstimator, TransformerMixin

#Binning Transformer
class Binning(BaseEstimator, TransformerMixin):
    """Equal-width binning of one numeric column, with edges learned in ``fit``.

    Parameters
    ----------
    col_to_bin : str
        Name of the column to discretise.
    num_bins : int
        Number of equal-width bins (forwarded to ``pd.cut`` as ``bins``).
    new_col_name : str
        Name of the column that receives the integer bin index.
    labels : optional
        Stored on the instance but not used: the transform always emits
        integer bin indices (``labels=False``).

    Attributes
    ----------
    bin_edges : numpy.ndarray
        Edges learned in ``fit``. The outermost edges are -inf / +inf so that
        values outside the range seen during fitting still land in the first
        or last bin.
    """

    def __init__(self, col_to_bin, num_bins, new_col_name, labels=None):
        self.col_to_bin = col_to_bin
        self.num_bins = num_bins
        self.labels = labels
        self.new_col_name = new_col_name

    def fit(self, X, y=None):
        """Learn the bin edges from ``X[col_to_bin]``; returns ``self``."""
        # Only the edges are needed, so X is read but never copied or modified.
        _, edges = pd.cut(X[self.col_to_bin], bins=self.num_bins, labels=False, retbins=True)

        # The learned edges span only the fitted min/max. Open the outer bins
        # so unseen smaller/larger values get a bin instead of NaN; assignment
        # of the fitted data itself is unchanged.
        edges = edges.astype(float)
        edges[0] = -float('inf')
        edges[-1] = float('inf')
        self.bin_edges = edges
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the bin index stored in ``new_col_name``."""
        X = X.copy()
        X[self.new_col_name] = pd.cut(X[self.col_to_bin], bins=self.bin_edges, labels=False)
        return X

#GroupMean Transformer
class GroupMeanEncoder(BaseEstimator, TransformerMixin):
    """Add a column holding the per-group mean of ``agg_col`` learned in ``fit``.

    Parameters
    ----------
    groupby_col : str
        Column whose values define the groups.
    agg_col : str
        Column that is averaged within each group.
    new_col_name : str
        Name of the column that receives the looked-up group mean.
    """

    def __init__(self, groupby_col, agg_col, new_col_name):
        self.groupby_col = groupby_col
        self.agg_col = agg_col
        self.new_col_name = new_col_name

    def fit(self, X, y=None):
        """Store the mean of ``agg_col`` for every observed group; returns ``self``."""
        grouped = X.groupby(self.groupby_col, observed=True)
        self.means = grouped[self.agg_col].mean()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with each row's group mean in ``new_col_name``."""
        encoded = X.copy()
        group_keys = encoded[self.groupby_col]
        # Group values absent from the fitted means come out of .map() as NaN.
        encoded[self.new_col_name] = group_keys.map(self.means)
        return encoded

#FrequencyEncoder Transformer
class FreqEncoder(BaseEstimator, TransformerMixin):
    """Frequency-encode categorical columns using frequencies learned in ``fit``.

    For every column ``c`` in ``cat_cols`` the transform adds ``c + '_freq'``
    holding how often the row's value occurred in the fitted data.

    Parameters
    ----------
    cat_cols : list of str
        Columns to encode.
    normalize : bool, default True
        Forwarded to ``Series.value_counts``: relative frequencies when True,
        raw counts when False.

    Attributes
    ----------
    freq_maps : dict
        Maps each encoded column name to its ``value_counts`` Series.
        Empty until ``fit`` is called.
    """

    def __init__(self, cat_cols, normalize=True):
        self.cat_cols = cat_cols
        self.normalize = normalize
        self.freq_maps = {}

    def fit(self, X, y=None):
        """Learn the value frequencies of every column in ``cat_cols``; returns ``self``."""
        # Build a fresh dict on every fit rather than writing into the one
        # created in __init__: a refit then cannot keep entries for columns
        # that are no longer in cat_cols, and shallow copies of the encoder
        # do not share (and overwrite) each other's fitted state.
        self.freq_maps = {
            col: X[col].value_counts(normalize=self.normalize)
            for col in self.cat_cols
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with one ``<col>_freq`` column per encoded column."""
        X = X.copy()

        for col in self.cat_cols:
            # Values not present in the fitted frequencies get a frequency of 0.
            X[col + '_freq'] = X[col].map(self.freq_maps[col]).fillna(0)
        return X

## Building data preprocessing pipelines:

In [13]:
# Columns handed to FreqEncoder in the preprocessing pipeline.
cat_cols = ['Sex','Chest pain type','FBS over 120','Exercise angina','EKG results']

In [14]:
from sklearn.pipeline import Pipeline 

# Preprocessing pipeline: bin Age into 3 bins, add the per-age-bin mean of
# BP / Cholesterol / Max HR as X1 / X2 / X3, then frequency-encode cat_cols.
preprocessor = Pipeline(steps=[
    ('Binning', Binning(col_to_bin='Age', num_bins=3, new_col_name='Age_bins')),
    *[
        (step_name, GroupMeanEncoder(groupby_col='Age_bins', agg_col=source_col, new_col_name=output_col))
        for step_name, source_col, output_col in [
            ('GroupMeanEncoder_BP', 'BP', 'X1'),
            ('GroupMeanEncoder_Cholesterol', 'Cholesterol', 'X2'),
            ('GroupMeanEncoder_HR', 'Max HR', 'X3'),
        ]
    ],
    ('FreqEncoding', FreqEncoder(cat_cols=cat_cols)),
])

## Optuna Tuning:

### XGBoost:

In [15]:
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

def objective(trial):
    """Optuna objective: mean 5-fold ROC AUC of the preprocessing + XGBoost pipeline.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the XGBoost hyperparameters.

    Returns
    -------
    float
        Mean of the ``roc_auc`` scores returned by ``cross_val_score`` with
        ``cv=5`` on the module-level ``X`` and ``y``.
    """
    # Hyperparameters sampled by the trial.
    search_space = {
        # Core learning parameters
        "n_estimators": trial.suggest_int("n_estimators", 300, 3000),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        # Tree complexity control
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0, 5),
        # Sampling
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 1.0),
        # Regularization
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
    }

    # Settings held constant across all trials.
    fixed_settings = dict(
        random_state=42,
        eval_metric="logloss",
        tree_method="hist",
        verbosity=0,
        device='cuda',
    )

    candidate = Pipeline(steps=[
        ('prep', preprocessor),
        ('XGB', XGBClassifier(**search_space, **fixed_settings)),
    ])

    fold_scores = cross_val_score(candidate, X, y, cv=5, scoring="roc_auc")
    return fold_scores.mean()


In [16]:
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=50)

In [17]:
# print(study.best_params)
# print(study.best_value)

The best hyperparameter set and score found were:
* Hyperparameter set: {'n_estimators': 1975, 'learning_rate': 0.04831569888473999, 'max_depth': 3, 'min_child_weight': 9, 'gamma': 2.784643331448016, 'subsample': 0.663818482689914, 'colsample_bytree': 0.8620627964975561, 'reg_alpha': 7.483905997549215e-05, 'reg_lambda': 0.04488123713071685}
* Score: 0.9554285869965723

## Retraining the model:
Retraining a fresh model on the complete training data using the best hyperparameters found during tuning.

In [18]:
# Final pipeline: the shared preprocessor followed by XGBoost configured with
# the tuned hyperparameter values plus the same fixed settings used in tuning.
best_xgb_model = Pipeline(steps=[
    ('prep', preprocessor),
    ('XGB', XGBClassifier(**{
        # Tuned hyperparameters
        'n_estimators': 1975,
        'learning_rate': 0.04831569888473999,
        'max_depth': 3,
        'min_child_weight': 9,
        'gamma': 2.784643331448016,
        'subsample': 0.663818482689914,
        'colsample_bytree': 0.8620627964975561,
        'reg_alpha': 7.483905997549215e-05,
        'reg_lambda': 0.04488123713071685,
        # Fixed settings
        'random_state': 42,
        'eval_metric': "logloss",
        'tree_method': "hist",
        'device': "cuda",
        'verbosity': 0,
    })),
])

In [19]:
# Fit the preprocessing steps and the classifier on the full training set.
best_xgb_model.fit(X,y)

## Preparing test data:

In [20]:
# Location of the competition's test CSV inside the Kaggle input directory.
test_path = '/kaggle/input/competitions/playground-series-s6e2/test.csv'
test_df = pd.read_csv(test_path)
test_df.head()  # preview the first rows of the test set

Unnamed: 0,id,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium
0,630000,58,1,3,120,288,0,2,145,1,0.8,2,3,3
1,630001,55,0,2,120,209,0,0,172,0,0.0,1,0,3
2,630002,54,1,4,120,268,0,0,150,1,0.0,2,3,7
3,630003,44,0,3,112,177,0,0,168,0,0.9,1,0,3
4,630004,43,1,1,138,267,0,0,163,0,1.8,2,0,7


In [21]:
# Removing redundant features
X_test = test_df.drop('id', axis=1)

#applying the create_features on test set
X_test = create_features(X_test)

## Predicting on the test set:

In [22]:
# Keep column 1 of predict_proba: the probability of the class encoded as 1 ('Presence').
y_pred_xgb = best_xgb_model.predict_proba(X_test)[:,1]



## Preparing submission csv:

In [23]:
# Two-column submission frame: the test ids plus the predicted probability,
# stored under the target column name.
submission = test_df[['id']].assign(**{target: y_pred_xgb})

In [24]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)

This model outperforms the Optuna-tuned CatBoost and LGBM models from the last iteration of this project. (Check those out on my Kaggle profile or on my GitHub page.)

Thanks for reading this all the way!!!