## Importing data

In [44]:
import pandas as pd

# Location of the training split under the Kaggle input directory.
train_path = '/kaggle/input/competitions/playground-series-s6e2/train.csv'

# Read the CSV, then show the first five rows as the cell output.
train_df = pd.read_csv(filepath_or_buffer=train_path)
train_df.head(5)

Unnamed: 0,id,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,0,58,1,4,152,239,0,0,158,1,3.6,2,2,7,Presence
1,1,52,1,1,125,325,0,2,171,0,0.0,1,0,3,Absence
2,2,56,0,2,160,188,0,2,151,0,0.0,1,0,3,Absence
3,3,44,0,3,134,229,0,2,150,0,1.0,2,0,3,Absence
4,4,58,1,4,140,234,0,2,125,1,3.8,2,3,3,Presence


## Cleaning data:

In [45]:
target = 'Heart Disease'

# Encode the target as 0/1. Values that are not keys of the mapping pass
# through unchanged, so re-running this cell leaves an already-encoded column
# intact (a plain dict .map() would turn the 0/1 values into NaN on a re-run).
label_codes = {'Absence': 0, 'Presence': 1}
train_df[target] = train_df[target].map(lambda label: label_codes.get(label, label))

# Fail fast on anything other than the two expected classes (including
# missing values) instead of silently carrying it into training.
unexpected_labels = train_df.loc[~train_df[target].isin([0, 1]), target].unique()
assert len(unexpected_labels) == 0, f"Unexpected target labels: {list(unexpected_labels)}"

# Separate features and target; 'id' is dropped together with the target column.
X = train_df.drop(columns=['id', target])
y = train_df[target]

## Feature Engineering:

In [46]:
def create_features(df):
    """Return a copy of ``df`` with six pairwise interaction columns appended.

    The added columns are element-wise products of existing columns:

    * ``Y1`` = BP * Cholesterol
    * ``Y2`` = Number of vessels fluro * Slope of ST
    * ``Y3`` = Cholesterol * Slope of ST
    * ``Y4`` = Cholesterol * Number of vessels fluro
    * ``Y5`` = BP * Slope of ST
    * ``Y6`` = BP * Number of vessels fluro

    These crosses are optional: some models can learn such relationships on
    their own.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'BP', 'Cholesterol', 'Slope of ST' and
        'Number of vessels fluro'.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left untouched, so calling this
        function never changes a frame that another cell still refers to.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    bp = df['BP']
    cholesterol = df['Cholesterol']
    slope = df['Slope of ST']
    vessels = df['Number of vessels fluro']

    # assign() builds a new DataFrame instead of writing into the caller's object.
    return df.assign(
        Y1=bp * cholesterol,
        Y2=vessels * slope,
        Y3=cholesterol * slope,
        Y4=cholesterol * vessels,
        Y5=bp * slope,
        Y6=bp * vessels,
    )

In [47]:
# Append the Y1-Y6 interaction columns to the training features.
X = create_features(X)

In [48]:
#Custom transformers

from sklearn.base import BaseEstimator, TransformerMixin

#Binning Transformer
class Binning(BaseEstimator, TransformerMixin):
    """Bin one numeric column into equal-width bins learned at fit time.

    Parameters
    ----------
    col_to_bin : str
        Name of the column to discretise.
    num_bins : int
        Passed to ``pd.cut`` as ``bins`` during ``fit``.
    new_col_name : str
        Name of the column that receives the integer bin index.
    labels : optional
        Stored on the instance but not read by ``fit`` or ``transform``
        (both call ``pd.cut`` with ``labels=False``).

    Attributes
    ----------
    bin_edges : numpy.ndarray
        Bin edges learned by ``fit``. When ``num_bins`` is an int, the first
        and last edge are replaced by -inf / +inf so that values outside the
        training range fall into the outermost bins instead of becoming NaN.
    """

    def __init__(self, col_to_bin, num_bins, new_col_name ,labels=None):
        self.col_to_bin = col_to_bin
        self.num_bins = num_bins
        self.labels = labels
        self.new_col_name = new_col_name

    def fit(self, X, y=None):
        """Learn the bin edges from ``X[col_to_bin]``; ``y`` is ignored."""
        # pd.cut does not modify its input, so no defensive copy is needed here.
        _, edges = pd.cut(X[self.col_to_bin], bins=self.num_bins, labels=False, retbins=True)

        if isinstance(self.num_bins, int):
            # Edges derived from the training min/max would leave any later
            # value outside that range unbinned (NaN). Opening the outer bins
            # keeps the training assignment identical and covers unseen extremes.
            edges = edges.astype(float)  # astype returns a new array
            edges[0] = float('-inf')
            edges[-1] = float('inf')

        self.bin_edges = edges
        return self

    def transform(self,X):
        """Return a copy of ``X`` with the bin index stored in ``new_col_name``."""
        X = X.copy()
        X[self.new_col_name] = pd.cut(X[self.col_to_bin], bins=self.bin_edges, labels=False)
        return X

#GroupMean Transformer
class GroupMeanEncoder(BaseEstimator, TransformerMixin):
    """Add a column holding the per-group mean of another column.

    Parameters
    ----------
    groupby_col : str
        Column whose values define the groups.
    agg_col : str
        Column that is averaged within each group.
    new_col_name : str
        Name of the column that receives the group mean.

    Attributes
    ----------
    feature_names_ : pandas.Index
        Column names of the last DataFrame passed to ``fit``.
    means : pandas.Series
        Mean of ``agg_col`` per value of ``groupby_col``, learned by ``fit``.
    global_mean : float
        Mean of ``agg_col`` over all rows passed to ``fit``; used for groups
        that were not present at fit time.
    """

    def __init__(self, groupby_col, agg_col, new_col_name):
        self.groupby_col = groupby_col
        self.agg_col = agg_col
        self.new_col_name = new_col_name

    def fit(self,X,y=None):
        """Learn the group means from ``X``; ``y`` is ignored.

        Raises
        ------
        AttributeError
            If ``X`` has no column names and none were recorded by an earlier
            ``fit`` on a DataFrame.
        """
        if hasattr(X, "columns"):
            self.feature_names_ = X.columns
        elif hasattr(self, "feature_names_"):
            # Array input: reuse the column names remembered from an earlier fit.
            X = pd.DataFrame(X, columns=self.feature_names_)
        else:
            # Without names the groupby/agg columns cannot be located; say so
            # explicitly instead of failing on the missing attribute.
            raise AttributeError(
                "GroupMeanEncoder.fit needs a pandas DataFrame (or a previous fit "
                "on one) so that column names are available."
            )

        self.means = X.groupby(self.groupby_col,observed=True)[self.agg_col].mean()
        self.global_mean = X[self.agg_col].mean()
        return self

    def transform(self,X):
        """Return a copy of ``X`` with the learned group mean in ``new_col_name``."""
        X = X.copy()
        group_means = X[self.groupby_col].map(self.means)
        # Groups not seen during fit (or a missing group key) have no learned
        # mean; fall back to the overall mean rather than emitting NaN.
        X[self.new_col_name] = group_means.fillna(self.global_mean)
        return X

#FrequencyEncoder Transformer
class FreqEncoder(BaseEstimator, TransformerMixin):
    """Append a ``<col>_freq`` column for each categorical column.

    Parameters
    ----------
    cat_cols : list of str
        Columns to encode.
    normalize : bool, default True
        Forwarded to ``value_counts``: relative frequencies when True,
        raw counts when False.
    """

    def __init__(self, cat_cols, normalize=True):
        self.cat_cols = cat_cols
        self.normalize = normalize
        self.freq_maps = {}

    def fit(self, X, y=None):
        """Record the value frequencies of every column in ``cat_cols``; ``y`` is ignored."""
        self.freq_maps.update(
            (name, X[name].value_counts(normalize=self.normalize))
            for name in self.cat_cols
        )
        return self

    def transform(self, X):
        """Return a copy of ``X`` with one ``_freq`` column per encoded column."""
        encoded = X.copy()
        for name in self.cat_cols:
            # Values that were not seen during fit get a frequency of 0.
            encoded[name + '_freq'] = encoded[name].map(self.freq_maps[name]).fillna(0)
        return encoded

#TargetEncoder Transformer:
class TargetEncoder(BaseEstimator, TransformerMixin):
    """Append a ``<col>_TE`` column holding the mean target per category.

    No smoothing or out-of-fold scheme is applied: each category maps to the
    raw mean of the target over the rows passed to ``fit``.

    Parameters
    ----------
    cat_cols : list
        Columns to encode.

    Attributes
    ----------
    mean_maps : dict
        Maps each encoded column to a Series of category -> mean target.
    global_mean : float or None
        Mean of the target over all fitted rows (None before ``fit``); used
        for categories that were not seen during ``fit``.
    """

    def __init__(self, cat_cols):
        self.cat_cols = cat_cols
        self.mean_maps = {}
        self.global_mean = None

    def fit(self, X, y):
        """Learn the per-category target means.

        ``X`` and ``y`` are paired row by row, by position. ``y`` may be a
        pandas Series, a NumPy array or any other sequence of the same length
        as ``X``.
        """
        if not hasattr(X, "groupby"):
            # Array input: use recorded column names if some were set on the
            # instance, otherwise fall back to positional column labels.
            X = pd.DataFrame(X, columns=getattr(self, "feature_names_", None))

        # Re-index the target onto X's index. groupby aligns a Series key on
        # index labels, so a target with a fresh RangeIndex (e.g. an ndarray
        # wrapped in a Series) combined with an X that keeps the labels of a
        # row subset (e.g. a CV fold) would pair targets with the wrong rows.
        target_values = pd.Series(y).to_numpy()
        y = pd.Series(target_values, index=X.index)

        self.global_mean = y.mean()
        # Rebuilt on every fit so that a refit never keeps stale columns.
        self.mean_maps = {col: y.groupby(X[col]).mean() for col in self.cat_cols}
        return self

    def transform(self, X):
        """Return a copy of ``X`` with one ``_TE`` column per encoded column."""
        X = X.copy()

        for col in self.cat_cols:
            encoded = X[col].map(self.mean_maps[col])
            # Categories not seen during fit fall back to the global target mean.
            X[col + "_TE"] = encoded.fillna(self.global_mean)
        return X

## Preprocessing pipeline:

In [49]:
# Columns treated as categorical; passed to the frequency and target encoders of the preprocessing pipeline.
cat_cols = ['Sex','Chest pain type','FBS over 120','Exercise angina','EKG results']

In [50]:
from sklearn.pipeline import Pipeline 

# Feature pipeline: bin Age, add the per-age-bin means of three columns,
# then frequency- and target-encode the categorical columns.
preprocessor = Pipeline(steps=[
    ('Binning', Binning(col_to_bin='Age', num_bins=3, new_col_name='Age_bins')),
    # One group-mean step per (step suffix, averaged column, output column).
    *[
        (
            f'GroupMeanEncoder_{suffix}',
            GroupMeanEncoder(groupby_col='Age_bins', agg_col=averaged_col, new_col_name=output_col),
        )
        for suffix, averaged_col, output_col in (
            ('BP', 'BP', 'X1'),
            ('Cholesterol', 'Cholesterol', 'X2'),
            ('HR', 'Max HR', 'X3'),
        )
    ],
    ('FreqEncoding', FreqEncoder(cat_cols=cat_cols)),
    ('TargetEncoding', TargetEncoder(cat_cols=cat_cols)),
])

## Defining all best models:

In [51]:
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

#XGBoost:
best_xgb_model = Pipeline(steps=[
    # Feature engineering runs inside the pipeline, ahead of the classifier.
    ('prep', preprocessor),
    ('XGB', XGBClassifier(
        # Hyperparameter values (presumably from a tuning run; the search is not shown here).
        n_estimators=2997,
        learning_rate=0.02897722764540242,
        max_depth=3,
        min_child_weight=2,
        gamma=2.214493292110104,
        subsample=0.7125778371354599,
        colsample_bytree=0.934351105375458,
        reg_alpha=2.709037256626792,
        reg_lambda=0.0019825317162724676,
        # Fixed run settings: seed, metric, tree builder, device, logging.
        random_state=42,
        eval_metric="logloss",
        tree_method="hist",
        device="cuda",
        verbosity=0,
    )),
])

#CatBoost:
final_cb_model = Pipeline(steps=[
    # Feature engineering runs inside the pipeline, ahead of the classifier.
    ('prep', preprocessor),
    ('catboost', CatBoostClassifier(
        # Hyperparameter values (presumably from a tuning run; the search is not shown here).
        iterations=1254,
        depth=4,
        learning_rate=0.09787901496322517,
        l2_leaf_reg=48.73782544764864,
        # Fixed run settings: GPU device '0', silent training, fixed seed.
        task_type='GPU',
        devices='0',
        verbose=False,
        random_seed=42,
    )),
])

#LGBM:
final_lgbm_model = Pipeline(steps=[
    # Feature engineering runs inside the pipeline, ahead of the classifier.
    ('prep', preprocessor),
    ('LGBM', LGBMClassifier(
        # Hyperparameter values (presumably from a tuning run; the search is not shown here).
        n_estimators=1712,
        learning_rate=0.02743719738580626,
        num_leaves=24,
        max_depth=4,
        min_child_samples=29,
        # NOTE(review): subsample is set without subsample_freq; LightGBM's docs
        # tie row bagging to a positive bagging frequency. Confirm this value
        # actually takes effect.
        subsample=0.7009221068214425,
        colsample_bytree=0.6046253918162702,
        reg_alpha=0.028014049796877397,
        reg_lambda=0.00813793499748922,
        # Fixed run settings: seed, parallelism, logging.
        random_state=42,
        n_jobs=-1,
        verbose=-1,
    )),
])

## OOF_Ensembling/Stacking

In [52]:
# Keyword arguments unpacked into the XGBClassifier used as the stacking final estimator.
best_params = dict(
    enable_categorical=True,
    objective='binary:logistic',
    eval_metric='auc',
    device='cuda',
    random_state=43,
)

In [53]:
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold

# Stacking ensemble: the three base pipelines feed their predict_proba outputs
# to an XGBoost final estimator, using a shuffled, seeded 5-fold stratified split.
stack = StackingClassifier(
    estimators=[
        ("xgb", best_xgb_model),
        ("cb", final_cb_model),
        ("lgbm", final_lgbm_model),
    ],
    final_estimator=XGBClassifier(**best_params),
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    stack_method='predict_proba',
)

# Fit on the full training set; the fitted estimator is the cell's output.
stack.fit(X, y)



## Preparing test data:

In [54]:
# Location of the test split under the Kaggle input directory.
test_path = '/kaggle/input/competitions/playground-series-s6e2/test.csv'

# Read the CSV, then show the first five rows as the cell output.
test_df = pd.read_csv(filepath_or_buffer=test_path)
test_df.head(5)

Unnamed: 0,id,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium
0,630000,58,1,3,120,288,0,2,145,1,0.8,2,3,3
1,630001,55,0,2,120,209,0,0,172,0,0.0,1,0,3
2,630002,54,1,4,120,268,0,0,150,1,0.0,2,3,7
3,630003,44,0,3,112,177,0,0,168,0,0.9,1,0,3
4,630004,43,1,1,138,267,0,0,163,0,1.8,2,0,7


In [55]:
# Drop the identifier column (test_df keeps it for the submission file), then
# add the same interaction features that were added to the training set.
X_test = test_df.drop(columns=['id']).pipe(create_features)

## Predicting on the test set:

In [56]:
# Keep column 1 of predict_proba as the prediction (presumably the probability of the class encoded as 1, 'Presence'; confirm via stack.classes_).
y_pred = stack.predict_proba(X_test)[:,1]



## Submission csv:

In [57]:
# Two-column submission: the test ids plus the predicted probability stored under the target's column name.
submission_stacking_ensemble2 = test_df[['id']].assign(**{target: y_pred})

# Write without the DataFrame index so the file holds only the two columns.
submission_stacking_ensemble2.to_csv('submission_stacking_ensemble2.csv', index=False)