In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.offline import iplot
import plotly.offline as py
py.init_notebook_mode()

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold, RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier, VotingClassifier

In [None]:
import pandas as pd
import os
import opendatasets as od

In [None]:
# Kaggle competition page (playground-series-s4e1); fetched with opendatasets.
dataset = 'https://www.kaggle.com/competitions/playground-series-s4e1/data'
od.download(dataset)

In [None]:
# Read the test and train CSVs from the playground-series-s4e1/ directory
# (presumably where od.download placed the files -- confirm on a fresh run).
df_test = pd.read_csv('playground-series-s4e1/test.csv')
df = pd.read_csv('playground-series-s4e1/train.csv')

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Missing values per column.
df.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# (rows, columns) of the training frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the training frame.
df.describe()

In [None]:
# Number of distinct values in the 'Surname' column.
len(df['Surname'].unique())

In [None]:
def FeatureEngineering(df):
    """Return a copy of ``df`` with the engineered churn features added.

    New columns:
      * ``Senior``               -- 1 when ``Age`` > 60, else 0 (category).
      * ``Active_By_CreditCard`` -- ``HasCrCard * IsActiveMember`` (category).
      * ``Products_By_Tenure``   -- ``Tenure / NumOfProducts`` (note: despite
        the name, tenure is the numerator).
      * ``AgeCat``               -- ``round(Age / 20)`` as an integer bucket
        (category).
      * ``Zero_Balance``         -- 1 when ``Balance`` equals 0, else 0
        (category).

    ``HasCrCard`` and ``IsActiveMember`` are cast to ``category``; ``Exited``
    is cast as well when it is present, so the function also works on frames
    that carry no target column (e.g. a test set).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Age``, ``HasCrCard``, ``IsActiveMember``, ``Tenure``,
        ``NumOfProducts`` and ``Balance``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns plus the engineered ones.
    """
    # Work on a copy so re-running the cell (or calling this on a frame that
    # is used elsewhere) never changes the caller's data.
    df = df.copy()

    df['Senior'] = (df['Age'] > 60).astype('int64').astype('category')

    # Must be computed while both flags are still numeric (before the casts
    # to category below).
    df['Active_By_CreditCard'] = (df['HasCrCard'] * df['IsActiveMember']).astype('category')

    df['Products_By_Tenure'] = df['Tenure'] / df['NumOfProducts']
    df['AgeCat'] = np.round(df['Age'] / 20).astype('int').astype('category')
    df['Zero_Balance'] = (df['Balance'] == 0).astype('int64').astype('category')

    flag_columns = ['HasCrCard', 'IsActiveMember']
    if 'Exited' in df.columns:
        # The target only exists on labelled data.
        flag_columns.append('Exited')
    for column in flag_columns:
        df[column] = df[column].astype('category')

    return df

In [None]:
# Add the engineered features to the training frame.
df = FeatureEngineering(df)

In [None]:
# Drop the identifier columns and the surname for the exploratory plots below.
df = df.drop(columns=['id', 'CustomerId', 'Surname'])

In [None]:
# Column groups by dtype; the plotting cells below iterate over these.
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = df.select_dtypes(include=['object', 'category']).columns

In [None]:
import matplotlib.pyplot as plt

import matplotlib.pyplot as plt

def FindOutliers(df, name_of_feature):
    """Show two box plots of one column side by side on a shared y axis.

    The left panel draws the box and whiskers only; the right panel also
    draws the fliers (points beyond the whiskers).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to plot.
    name_of_feature : str
        Name of the column; also used for axis labels and the figure title.
    """
    figure, axes = plt.subplots(1, 2, figsize=(14, 6), sharey=True)
    values = df[name_of_feature]

    # One row per panel, left to right:
    # (title, box/whisker/cap colour, median colour, draw fliers?)
    panels = (
        ('Only Whiskers', 'skyblue', 'blue', False),
        ('Whiskers and Outliers', 'lightgreen', 'green', True),
    )

    for axis, (title, body_colour, median_colour, with_fliers) in zip(axes, panels):
        # Flier styling is only passed to the panel that actually draws fliers.
        flier_kwargs = {}
        if with_fliers:
            flier_kwargs['flierprops'] = dict(marker='o', color='white', markersize=5)

        axis.boxplot(
            values, vert=True, patch_artist=True, showfliers=with_fliers,
            boxprops=dict(facecolor=body_colour, color=body_colour),
            whiskerprops=dict(color=body_colour),
            capprops=dict(color=body_colour),
            medianprops=dict(color=median_colour),
            **flier_kwargs
        )
        axis.set_title(title, fontsize=14, fontweight='bold')
        axis.set_xlabel(name_of_feature, fontsize=12)

        # Uniform black labels, ticks and frame on both panels.
        axis.yaxis.label.set_color('black')
        for which in ('x', 'y'):
            axis.tick_params(axis=which, colors='black')
        for side in ('top', 'bottom', 'left', 'right'):
            axis.spines[side].set_color('black')

    # A single y label for the whole figure, placed left of the first panel.
    figure.text(0.04, 0.5, name_of_feature, va='center', ha='center',
                rotation='vertical', color='black', fontsize=12)

    figure.suptitle(f'Outliers Of {name_of_feature}', fontsize=16, color='black', fontweight='bold')

    plt.show()

In [None]:
# One pair of box plots (without / with fliers) per numeric column.
for column in numeric_columns:
    FindOutliers(df, column)

In [None]:
# Histogram of every numeric column, one figure per column.
plt.style.use('ggplot')
for feature in numeric_columns:
    fig, ax = plt.subplots()
    ax.hist(df[feature], color='teal', edgecolor='black', linewidth=1.2)
    ax.set_title("Distribution of " + feature)
    ax.set_xlabel("Values of " + feature)
    ax.set_ylabel("Frequency")
    ax.grid(True, linestyle='--', linewidth=0.5, color='gray')
    plt.show()

In [None]:
# Bar chart of category frequencies, one figure per categorical column.
sns.set_style("dark")
for feature in categorical_columns:
    # Count once and reuse for both the bar positions and the bar heights.
    counts = df[feature].value_counts()
    plt.bar(counts.index, counts.values, color='#b5651d')
    plt.title("Bar Chart of " + feature)
    plt.xlabel("Categories")
    plt.ylabel("Frequency")
    plt.xticks(rotation=45, ha='right')
    plt.grid(True, linestyle='--', linewidth=0.5, color='gray')
    plt.show()


In [None]:
sns.set_style("darkgrid")

# Pairwise correlations of the numeric columns, annotated to two decimals
# and drawn with the 'coolwarm' colormap.
plt.figure(figsize=(10, 8))  # Optional: Adjust the figure size
sns.heatmap(df[numeric_columns].corr(), cmap='coolwarm', annot=True, fmt=".2f", linewidths=0.5)
plt.title("Correlation Heatmap of Numeric Columns")
plt.show()

In [None]:
# Reload both CSVs from disk: the exploratory section dropped 'Surname' from
# df, and the modelling pipeline below uses that column again.
df_test = pd.read_csv('playground-series-s4e1/test.csv')
df = pd.read_csv('playground-series-s4e1/train.csv')

In [None]:
# Re-apply the feature engineering; this time only the id columns are dropped
# and 'Surname' is kept.
df = FeatureEngineering(df)
df = df.drop(columns=['id', 'CustomerId'])

In [None]:
# Column groups that the ColumnTransformer routes to different transformers.
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = df.select_dtypes(include=['object', 'category']).columns
# The target and the surname text are excluded from one-hot encoding;
# the surname gets its own text transformer.
categorical_columns = categorical_columns.drop(['Exited', 'Surname'])
surname_column = 'Surname'

In [None]:
numerical_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler())
])

categorical_transformer = Pipeline(steps=[
    ('one_hot_encoder', OneHotEncoder())
])

surname_transformer = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_features=3000)),
    ('svd', TruncatedSVD(n_components=3))
])

In [None]:
preprocessing = ColumnTransformer(transformers=[
    ('categorical', categorical_transformer, categorical_columns),
    ('numerical', numerical_transformer, numeric_columns),
    ('surname', surname_transformer, surname_column)
], remainder = 'passthrough')

preprocessing_pipeline = Pipeline([
    ('preprocessing', preprocessing)
])

In [None]:
X = df.drop('Exited', axis=1)
y = df['Exited']

In [None]:
X = preprocessing_pipeline.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42, stratify = df['Exited'])

In [None]:
X.shape

In [None]:
def GetBasedModelList(class_weights=None):
    """Return the baseline classifiers as a list of ``(short_name, estimator)`` pairs.

    ``class_weights`` is forwarded as ``class_weight`` to the estimators that
    are given it here (LR, CART, RF, ET); the remaining models are built with
    their default arguments.
    """
    return [
        ('LR', LogisticRegression(max_iter=1000, class_weight=class_weights)),
        ('KNN', KNeighborsClassifier()),
        ('CART', DecisionTreeClassifier(class_weight=class_weights)),
        ('AB', AdaBoostClassifier()),
        ('GBM', GradientBoostingClassifier()),
        ('RF', RandomForestClassifier(class_weight=class_weights)),
        ('ET', ExtraTreesClassifier(class_weight=class_weights)),
        ('XGB', XGBClassifier()),
    ]

In [None]:
def TrainSetPerformance(models, X_train, y_train, scoring='roc_auc', n_splits=3, random_state=42):
    """Cross-validate every model on the training data and report its mean score.

    Parameters
    ----------
    models : iterable of (str, estimator)
        Pairs of short name and unfitted estimator.
    X_train, y_train
        Training features and labels handed to ``cross_val_score``.
    scoring : str, default 'roc_auc'
        Scorer name passed to ``cross_val_score``.
    n_splits : int, default 3
        Number of stratified folds.
    random_state : int, default 42
        Seed for the shuffled ``StratifiedKFold`` splitter.

    Returns
    -------
    names : list of str
        One label per model, formatted as ``'based<name>train'``.
    roc_auc : list of numpy.ndarray
        The per-fold scores of each model, in the same order as ``names``.
    """
    names = []
    roc_auc = []
    # The same seeded splitter is used for every model so they are compared
    # on identical folds.
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for name, model in models:
        fold_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1)

        print(f'The score of {name} (ROC AUC): {fold_scores.mean():.4f}')
        print('-------------------------------------------------------------')

        names.append('based' + str(name) + 'train')
        roc_auc.append(fold_scores)

    return names, roc_auc

In [None]:
class PlotResults():
    """Plotly box-plot comparison of per-model score distributions."""

    def __Trace(self, model_name, values):
        """Build one box trace (no individual points) for a single model."""
        return go.Box(y=values, name=model_name, boxpoints=False)

    def PlotBox(self, names, results):
        """Render one box per model; ``results[i]`` holds the scores for ``names[i]``."""
        traces = [
            self.__Trace(label, results[position])
            for position, label in enumerate(names)
        ]
        figure = go.Figure(data=traces, layout=go.Layout(title='Comparing Models'))
        iplot(figure)

In [None]:
def ScoreDataframe(names, roc_auc):
    """Build a two-column summary frame (``Name``, ``ROC_AUC``) from raw scores.

    Parameters
    ----------
    names : sequence of str
        One label per entry of ``roc_auc``.
    roc_auc : sequence
        Each entry is either a ``numpy.ndarray`` of fold scores (summarised
        by its mean) or a single value convertible with ``float()``.
        Entries that cannot be converted (e.g. ``None`` or a non-numeric
        string) are passed through unchanged instead of raising.

    Returns
    -------
    pandas.DataFrame
        Columns ``Name`` and ``ROC_AUC``; numeric scores are rounded to four
        decimals.
    """
    def _summarise(score):
        # Arrays of fold scores collapse to their mean.
        if isinstance(score, np.ndarray):
            return np.round(score.mean(), 4)
        try:
            return np.round(float(score), 4)
        except (TypeError, ValueError):
            # float() raises TypeError for None/containers and ValueError for
            # non-numeric strings; keep such placeholders as they are.
            return score

    scores = [_summarise(score) for score in roc_auc]

    return pd.DataFrame({'Name': names, 'ROC_AUC': scores})

In [None]:
# Baseline run: default hyper-parameters, with class_weight='balanced' on the
# models that receive it, cross-validated on the training split.
models = GetBasedModelList(class_weights = 'balanced')
names, roc_auc = TrainSetPerformance(models, X_train, y_train)
PlotResults().PlotBox(names, roc_auc)
# Tabulate the baseline scores; kept for the comparison table further down.
score_based_models = ScoreDataframe(names, roc_auc)
score_based_models

In [None]:
def GetModelDict(class_weights=None):
    """Return the untuned classifiers keyed by short name.

    ``class_weights`` is forwarded as ``class_weight`` to LR, CART, RF and ET.
    KNN is built with ``algorithm='brute'``.
    """
    return dict(
        LR=LogisticRegression(max_iter=1000, class_weight=class_weights),
        KNN=KNeighborsClassifier(algorithm='brute'),
        CART=DecisionTreeClassifier(class_weight=class_weights),
        AB=AdaBoostClassifier(),
        GBM=GradientBoostingClassifier(),
        RF=RandomForestClassifier(class_weight=class_weights),
        ET=ExtraTreesClassifier(class_weight=class_weights),
        XGB=XGBClassifier(),
    )

In [None]:
# Hyper-parameter search space per model. The outer keys are the short model
# names; the search loop looks up the space for a model with param_grids[name].
# Each inner dict maps an estimator parameter to the list of candidate values.
param_grids = {
    # Logistic regression
    'LR': {
        'penalty': ['l1', 'l2'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear', 'saga'],
        'max_iter': [100, 500, 1000]
    },
    # k-nearest neighbours
    'KNN': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    },
    # Single decision tree
    'CART': {
        'criterion': ['gini', 'entropy'],
        'splitter': ['best', 'random'],
        'max_depth': [None, 10, 20, 30, 50],
        'min_samples_split': [2, 10, 20],
        'min_samples_leaf': [1, 5, 10]
    },
    # AdaBoost
    'AB': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.5, 1.0]
    },
    # Gradient boosting (scikit-learn)
    'GBM': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.5],
        'max_depth': [3, 5, 7],
        'subsample': [0.6, 0.8, 1.0],
        'min_samples_split': [2, 10],
        'min_samples_leaf': [1, 5]
    },
    # Random forest
    'RF': {
        'n_estimators': [50, 100, 200],
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 10],
        'min_samples_leaf': [1, 5],
        'bootstrap': [True, False]
    },
    # Extra trees (same space as the random forest)
    'ET': {
        'n_estimators': [50, 100, 200],
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 10],
        'min_samples_leaf': [1, 5],
        'bootstrap': [True, False]
    },
    # XGBoost
    'XGB': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.5],
        'max_depth': [3, 5, 7],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'gamma': [0, 0.1, 0.5]
    }
}

In [None]:
# Randomised hyper-parameter search for every model, scored by ROC AUC on
# seeded stratified folds of the training split.
models = GetModelDict()

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
scoring = 'roc_auc'

roc_auc = []
names = []

for name, model in models.items():
    # Same label on success and on failure, so every row of the summary
    # table follows the 'tuned<name>train' convention.
    label = 'tuned' + str(name) + 'train'
    names.append(label)

    try:
        # error_score=np.nan lets individual invalid parameter combinations
        # fail without aborting the whole search for this model.
        grid = RandomizedSearchCV(model, param_grids[name], cv=cv, scoring=scoring, n_jobs=-1, verbose=0, error_score=np.nan, n_iter=100, random_state=42)
        grid.fit(X_train, y_train)
    except Exception as e:
        # Best effort: report the failure, record NaN and move on to the
        # next model rather than stopping the notebook.
        print(f'Error occurred for {name}: {e}')
        roc_auc.append(np.nan)
        continue

    best_score = grid.best_score_
    best_params = grid.best_params_
    roc_auc.append(best_score)
    print(f'Best roc_auc of {name} is: {best_score:.4f}')
    print(f'Best params of {name} are: {best_params}')
    print('-------------------------------------------------------------------------------------------')

score_tuned_models = ScoreDataframe(names, roc_auc)
score_tuned_models

In [None]:
def PredictOnTestSet(models, X_train, X_test, y_train, y_test):
    """Fit each model on the training split and score it on the hold-out split.

    For every model the ROC AUC is computed from ``predict_proba`` (positive
    class column) when available, otherwise from ``decision_function``. A
    model that offers neither is scored on its hard predictions, and a note
    is printed, so a score from a previous model can never be reused.

    Parameters
    ----------
    models : dict of str -> estimator
        Unfitted estimators keyed by short name; each one is fitted in place.
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Hold-out features and labels.

    Returns
    -------
    names : list of str
        One label per model, formatted as ``'tuned<name>test'``.
    roc_auc : list of float
        Hold-out ROC AUC per model, in the same order as ``names``.
    """
    names = []
    roc_auc = []

    for name, model in models.items():
        model.fit(X_train, y_train)
        pred = model.predict(X_test)

        if hasattr(model, 'predict_proba'):
            scores = model.predict_proba(X_test)[:, 1]
        elif hasattr(model, 'decision_function'):
            scores = model.decision_function(X_test)
        else:
            # No continuous score available: fall back to the hard labels
            # instead of leaving the variable unbound or stale.
            print(f'{name} has no predict_proba/decision_function; ROC_AUC uses hard predictions')
            scores = pred

        roc_auc_result = roc_auc_score(y_test, scores)
        roc_auc.append(roc_auc_result)
        names.append('tuned' + str(name) + 'test')

        print(f'ROC_AUC on test set of {name} is: {roc_auc_result:.4f}')
        print(classification_report(y_test, pred))
        print('------------------------------------------------------------------------')

    return names, roc_auc
        

In [None]:
def GetTunedModelDict(class_weight=None):
    """Return the classifiers with fixed, hand-entered hyper-parameters, keyed by short name.

    ``class_weight`` is forwarded to LR, CART, RF and ET only.
    """
    logistic = LogisticRegression(
        max_iter=1000, C=100, penalty='l1', solver='saga', class_weight=class_weight,
    )
    neighbours = KNeighborsClassifier(algorithm='kd_tree', n_neighbors=9, weights='distance')
    tree = DecisionTreeClassifier(
        splitter='random', max_depth=10, criterion='gini',
        min_samples_split=2, min_samples_leaf=5, class_weight=class_weight,
    )
    ada = AdaBoostClassifier(learning_rate=1.0, n_estimators=200)
    gbm = GradientBoostingClassifier(
        learning_rate=0.1, max_depth=5, n_estimators=200,
        subsample=1.0, min_samples_split=10, min_samples_leaf=5,
    )
    forest = RandomForestClassifier(
        n_estimators=100, max_depth=10, criterion='entropy',
        min_samples_split=10, min_samples_leaf=5, bootstrap=False, class_weight=class_weight,
    )
    extra_trees = ExtraTreesClassifier(
        n_estimators=200, criterion='gini', max_depth=None,
        min_samples_split=2, min_samples_leaf=5, bootstrap=False, class_weight=class_weight,
    )
    xgb = XGBClassifier(
        learning_rate=0.1, max_depth=5, n_estimators=200, subsample=0.8,
        colsample_bytree=0.8, gamma=0.5,
    )

    return {
        'LR': logistic,
        'KNN': neighbours,
        'CART': tree,
        'AB': ada,
        'GBM': gbm,
        'RF': forest,
        'ET': extra_trees,
        'XGB': xgb,
    }

In [None]:
# Fit the fixed-parameter models on the training split and score them on the
# hold-out split (class_weight is left at its default of None here).
models = GetTunedModelDict()
names, roc_auc = PredictOnTestSet(models, X_train, X_test, y_train, y_test)

In [None]:
# Tabulate the hold-out scores.
score_tuned_models_test = ScoreDataframe(names, roc_auc)
score_tuned_models_test

In [None]:
# Place the three score tables side by side (rows are matched by index).
# NOTE(review): every input frame has the columns 'Name' and 'ROC_AUC', so the
# result carries each label three times -- consider passing keys= to pd.concat
# if the table is selected by column name later.
comparingModels = pd.concat([
    score_based_models,
    score_tuned_models,
    score_tuned_models_test
], axis = 1)
comparingModels

In [None]:
# Soft-voting ensemble over all fixed-parameter models, scored on the hold-out split.
models = list(GetTunedModelDict().items())
voting = VotingClassifier(estimators=models, voting='soft', n_jobs=-1, verbose=0)
voting.fit(X_train, y_train)

# Positive-class probabilities drive the ROC AUC; hard labels drive the report.
pred_proba = voting.predict_proba(X_test)[:, 1]
pred = voting.predict(X_test)

roc_auc = roc_auc_score(y_test, pred_proba)
print(f'ROC_AUC of VotingClassifier are: {roc_auc:.4f}')
print(classification_report(y_test, pred))

In [None]:
def SelectedModelDict(class_weight=None):
    """Return the four boosting / forest models used for the reduced voting ensemble.

    ``class_weight`` is forwarded to the random forest only.
    """
    boosted_stumps = AdaBoostClassifier(learning_rate=0.5, n_estimators=100)
    gradient_boosting = GradientBoostingClassifier(learning_rate=0.5, max_depth=3, n_estimators=100)
    forest = RandomForestClassifier(
        n_estimators=100, max_depth=20, criterion='entropy', class_weight=class_weight,
    )
    xgb = XGBClassifier(learning_rate=0.5, max_depth=3, n_estimators=100)

    return {'AB': boosted_stumps, 'GBM': gradient_boosting, 'RF': forest, 'XGB': xgb}


In [None]:
# Soft-voting ensemble restricted to the selected models (balanced class
# weights on the forest), scored on the hold-out split.
models = list(SelectedModelDict(class_weight = 'balanced').items())
voting = VotingClassifier(estimators=models, voting='soft', n_jobs=-1, verbose=0)
voting.fit(X_train, y_train)

# Positive-class probabilities drive the ROC AUC; hard labels drive the report.
pred_proba = voting.predict_proba(X_test)[:, 1]
pred = voting.predict(X_test)

roc_auc = roc_auc_score(y_test, pred_proba)
print(f'ROC_AUC of VotingClassifier are: {roc_auc:.4f}')
print(classification_report(y_test, pred))

In [None]:
# Stacking ensemble: the fixed-parameter models feed a logistic-regression
# meta-learner whose regularisation strength is tuned by cross-validation.
models = GetTunedModelDict()
models = [(name, model) for name, model in models.items()]

meta_model = LogisticRegression(max_iter=1000)

# Parameters of the meta-learner are addressed through the
# 'final_estimator__' prefix of the StackingClassifier.
param_grid_meta = {
    'final_estimator__penalty': ['l2'],
    'final_estimator__C': [0.1, 1.0, 10.0],
}

stacking = StackingClassifier(estimators=models, final_estimator=meta_model, n_jobs=-1, verbose=0, cv=3)

scoring = 'roc_auc'

# The space holds only three candidates, so search it exhaustively. A
# randomised search with n_iter=100 would evaluate the same three points
# after warning that the space is smaller than n_iter.
grid = GridSearchCV(stacking, param_grid_meta, cv=3, scoring=scoring, n_jobs=-1, verbose=0)
grid.fit(X_train, y_train)

# The fitted search delegates to the best estimator refitted on X_train.
pred_proba = grid.predict_proba(X_test)[:,1]
pred = grid.predict(X_test)
roc_auc = roc_auc_score(y_test, pred_proba)
print(f'ROC_AUC of Stacking is: {roc_auc:.4f}')
print(f'Best params of Stacking are: {grid.best_params_}')
print(classification_report(y_test, pred))