In [24]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
import xgboost as xgb
import lightgbm as lgb

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.pipeline import FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import TargetEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.discriminant_analysis import StandardScaler


### Read Test+train Datasets

In [25]:
X_train_path = 'X_train.csv'
y_train_path = 'y_train.csv'
X_test_path = 'X_test.csv'

# Read the csv files
X_train = pd.read_csv(X_train_path, index_col=0)
y_train = pd.read_csv(y_train_path, index_col=0)
X_test = pd.read_csv(X_test_path, index_col=0)

# Join the train datasets
full_train_ds = X_train.join(y_train)
full_train_ds.head()


Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,id,Race,LeaveOrNot
1847,Bachelors,2016,Bangalore,3,27.0,Male,Yes,4,1847,white,0
2905,Bachelors,2012,Bangalore,3,28.0,Male,No,4,2905,white,0
4610,Bachelors,2015,Pune,3,26.0,Female,No,2,4610,white,1
2228,Bachelors,2012,Bangalore,3,29.0,Male,No,1,2228,black,0
5,Bachelors,2016,Bangalore,3,24.0,Male,No,0,5,white,0


In [32]:
from typing import Literal
from sklearn.base import BaseEstimator
from sklearn.compose import make_column_transformer


class DummyEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return pd.get_dummies(X, drop_first=True)

class CustomLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, mapping):
        self.mapping = mapping
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.map(self.mapping)

def drop_bad_columns(X: pd.DataFrame):
    return X.dropna().copy()

class DropDuplicates(BaseEstimator, TransformerMixin):
    def __init__(self, subset):
        self.subset = subset
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.drop_duplicates(subset=self.subset)

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]


In [71]:
rnd_state = 10

# Define the pipeline
num_attrs = ['JoiningYear', 'PaymentTier', 'Age', 'ExperienceInCurrentDomain']
cat_attrs = ['Education', 'City', 'Gender', 'EverBenched', 'Race']
education_order = ["Bachelors", "Masters", "PHD"]

# Adding a scaler to the pipeline
ct = ColumnTransformer([
        ('TargetEncoder', TargetEncoder(), ['City', 'Race']),
        ('EducationLevelEncoder', OrdinalEncoder(categories=[education_order]), ['Education']),
    ], remainder='passthrough', verbose_feature_names_out=False)
ct.set_output(transform='pandas')

dummy_encoder = make_column_transformer((DummyEncoder(), ['EverBenched', 'Gender']))
#dummy_encoder.set_output(transform='pandas')

numeric_transformer = make_column_transformer(
    (SimpleImputer(strategy="median"), num_attrs),
    (StandardScaler(), num_attrs),
)
numeric_transformer.set_output(transform='pandas')

full_pipeline = Pipeline([
    ('ColumnTransformers', ct),
    #('DummyEncoder',dummy_encoder),
    ('NumericTransformer', numeric_transformer),
])

# # Fit and transform the data
X_train_prepared = full_pipeline.fit_transform(X_train, y_train)
X_train_prepared


  y = column_or_1d(y, warn=True)


Unnamed: 0,simpleimputer__JoiningYear,simpleimputer__PaymentTier,simpleimputer__Age,simpleimputer__ExperienceInCurrentDomain,standardscaler__JoiningYear,standardscaler__PaymentTier,standardscaler__Age,standardscaler__ExperienceInCurrentDomain
1847,2016.0,3.0,27.0,4.0,0.495792,0.543609,-0.808336,0.705485
2905,2012.0,3.0,28.0,4.0,-1.649326,0.543609,-0.602588,0.705485
4610,2015.0,3.0,26.0,2.0,-0.040488,0.543609,-1.014084,-0.574776
2228,2012.0,3.0,29.0,1.0,-1.649326,0.543609,-0.396840,-1.214907
5,2016.0,3.0,24.0,0.0,0.495792,0.543609,-1.425581,-1.855038
...,...,...,...,...,...,...,...,...
3335,2016.0,3.0,38.0,0.0,0.495792,0.543609,1.454894,-1.855038
1099,2017.0,2.0,29.0,5.0,1.032071,-1.246934,-0.396840,1.345616
2514,2016.0,3.0,31.0,5.0,0.495792,0.543609,0.014657,1.345616
3606,2013.0,2.0,36.0,3.0,-1.113046,-1.246934,1.043398,0.065355


In [31]:
# Defining the classifiers and their hyperparameters
classifiers = {
    'Logistic Regression': {
        'model': LogisticRegression(random_state=rnd_state),
        'params': {
            'C': [0.3, 0.5, 1],
            'penalty': ['l1', 'l2']
        }
    },
    'Decision Tree': {
        'model': DecisionTreeClassifier(random_state=rnd_state),
        'params': {
            'max_depth': [7, 8, 9],
            'min_samples_split': [14, 15, 16]
        }
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3]
        }
    },
    'Gaussian Naive Bayes': {
        'model': GaussianNB(),
        'params': {}
    },
    'Random Forest': {
        'model': RandomForestClassifier(random_state=rnd_state),
        'params': {
            'n_estimators': [100, 110],
            'max_depth': [15],
            'min_samples_split': [11, 12],
            'min_samples_leaf': [3, 4],
            'max_features': ['auto'],
            'bootstrap': [False]
        }
    },
    'Gradient Boosting': {
        'model': GradientBoostingClassifier(random_state=rnd_state),
        'params': {
            'n_estimators': [120, 130],
            'learning_rate': [0.03],
            'max_depth': [5],
            'subsample': [0.9],
            'min_samples_split': [3, 4, 5]
        }
    },
    'AdaBoost': {
        'model': AdaBoostClassifier(random_state=rnd_state),
        'params': {
            'n_estimators': [50],
            'learning_rate': [1.2, 1.3],
            'algorithm' : ['SAMME', 'SAMME.R']
        }
    },
    'XGBoost': {
        'model': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
        'params': {
            'n_estimators': [110, 120],
            'learning_rate': [0.04, 0.05],
            'max_depth': [4, 5],
            'gamma': [0.1],
            'subsample': [0.9, 1.0],
            'colsample_bytree': [0.8, 0.9]
        }
    },
    'LightGBM': {
        'model': lgb.LGBMClassifier(force_row_wise=True, random_state=rnd_state),
        'params': {
            'n_estimators': [70, 80, 90],
            'learning_rate': [0.3, 0.4],
            'max_depth': [2,3, 4]
        }
    }
}
