# <p style="font-family:JetBrains Mono; font-weight:bold; letter-spacing: 2px; color:#005B46; font-size:140%; text-align:left;padding: 0px; border-bottom: 3px solid #003300"> Introduction</p>

In [None]:
# Misc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os
from copy import deepcopy
from functools import partial
import gc
import warnings

# Import sklearn classes for model selection, cross validation, and performance evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss, f1_score
from sklearn.metrics import precision_score, recall_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from category_encoders import OneHotEncoder, OrdinalEncoder, CountEncoder
from imblearn.under_sampling import RandomUnderSampler

# Import libraries for Hypertuning
import optuna

# Import libraries for gradient boosting
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.svm import NuSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from catboost import CatBoost, CatBoostRegressor, CatBoostClassifier
from catboost import Pool

In [None]:
# Seaborn theme: one background colour and one ink colour shared by every figure
background_colour = "#FAEEE9"
ink_colour = "#000000"
rc = {
    "axes.facecolor": background_colour,
    "figure.facecolor": background_colour,
    "axes.edgecolor": ink_colour,
    "grid.color": "#EBEBE7",
    "font.family": "arial",
    "axes.labelcolor": ink_colour,
    "xtick.color": ink_colour,
    "ytick.color": ink_colour,
    "grid.alpha": 0.4,
}
sns.set(rc=rc)

# Do not truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Silence UserWarning and FutureWarning messages for the rest of the notebook
for ignored_category in (UserWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=ignored_category)

# Functions
def print_sl():
    """Print a separator made of 50 ``=`` characters, followed by an empty line."""
    print("=" * 50, end="\n\n")

def show_na(df, column):
    """Show a count plot of the `outcome` column for the rows where `column` is null.

    df: DataFrame that contains both `column` and an 'outcome' column.
    column: name of the column whose missing rows are inspected.
    """
    rows_missing_value = df.loc[df[column].isnull()]
    sns.countplot(data=rows_missing_value, x='outcome')
    plt.show()

# <p style="font-family:JetBrains Mono; font-weight:bold; letter-spacing: 2px; color:#005B46; font-size:140%; text-align:left;padding: 0px; border-bottom: 3px solid #003300"> Data Preprocessing</p>

In [None]:
# Competition files (train / test / sample submission) plus the separate horse.csv dataset
train = pd.read_csv('/kaggle/input/playground-series-s3e22/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s3e22/test.csv')
sample_submission = pd.read_csv('/kaggle/input/playground-series-s3e22/sample_submission.csv')

train_orig = pd.read_csv('/kaggle/input/horse-survival-dataset/horse.csv')

# The row id is not a feature; the submission takes its ids from sample_submission instead
train = train.drop(columns='id')
test = test.drop(columns='id')

print('Data Loaded Succesfully!')
print_sl()

# Report shape and presence of nulls for each frame.
# Each line is labelled with the frame it actually describes (the train_orig
# null check used to be printed under the label "test").
for frame_name, frame in (('train', train), ('test', test), ('train_orig', train_orig)):
    print(f'{frame_name} shape: {frame.shape}')
    print(f'are there any null values in {frame_name}: {frame.isnull().any().any()}\n')

# Columns treated as categorical in the EDA and encoding cells
categorical_cols = ['surgery', 'age', 'temp_of_extremities', 'peripheral_pulse', 'mucous_membrane', 'capillary_refill_time',
                   'pain', 'peristalsis', 'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux', 'rectal_exam_feces',
                   'abdomen', 'abdomo_appearance', 'surgical_lesion', 'cp_data']

# Columns treated as numerical in the EDA and cleaning cells
num_cols = ['hospital_number', 'rectal_temp', 'pulse', 'respiratory_rate', 'nasogastric_reflux_ph', 'packed_cell_volume', 'total_protein',
           'abdomo_protein', 'lesion_1', 'lesion_2', 'lesion_3']

target = 'outcome'

train.head()

<div class="alert alert-block alert-warning">  
    <b>💡 Info:</b> The dataset contains a significant number of NA values.
</div>

# <p style="font-family:JetBrains Mono; font-weight:bold; letter-spacing: 2px; color:#005B46; font-size:140%; text-align:left;padding: 0px; border-bottom: 3px solid #003300">EDA</p>

### Target Distribution

In [None]:
# https://www.kaggle.com/code/kimtaehun/eda-and-baseline-with-multiple-models
def plot_count(df: pd.core.frame.DataFrame, col: str, title_name: str='Train') -> None:
    """Draw the value counts of `df[col]` as a two-ring pie (left) and a horizontal bar chart (right).

    Parameters
    ----------
    df : DataFrame holding the column to summarise.
    col : name of the column whose distinct values are counted.
    title_name : text used as the figure's super-title.

    The figure is shown with `plt.show()`; nothing is returned.
    """
    f, ax = plt.subplots(1, 2, figsize=(14, 7))
    plt.subplots_adjust(wspace=0.2)

    s1 = df[col].value_counts()
    N = len(s1)

    outer_sizes = s1
    inner_sizes = s1/N

    outer_colors = ['#9E3F00', '#eb5e00', '#ff781f']
    inner_colors = ['#ff6905', '#ff8838', '#ffa66b']

    # Outer ring: category labels; the last wedge is pulled out further than the others
    ax[0].pie(
        outer_sizes,colors=outer_colors, 
        labels=s1.index.tolist(), 
        startangle=90, frame=True, radius=1.3, 
        explode=([0.05]*(N-1) + [.3]),
        wedgeprops={'linewidth' : 1, 'edgecolor' : 'white'}, 
        textprops={'fontsize': 12, 'weight': 'bold'}
    )

    textprops = {
        'size': 13, 
        'weight': 'bold', 
        'color': 'white'
    }

    # Inner ring: percentage annotations
    ax[0].pie(
        inner_sizes, colors=inner_colors,
        radius=1, startangle=90,
        autopct='%1.f%%', explode=([.1]*(N-1) + [.3]),
        pctdistance=0.8, textprops=textprops
    )

    # White disc in the middle turns the pies into rings
    center_circle = plt.Circle((0,0), .68, color='black', fc='white', linewidth=0)
    ax[0].add_artist(center_circle)

    x = s1
    y = s1.index.tolist()
    sns.barplot(
        x=x, y=y, ax=ax[1],
        palette='YlOrBr_r', orient='horizontal'
    )

    ax[1].spines['top'].set_visible(False)
    ax[1].spines['right'].set_visible(False)
    ax[1].tick_params(
        axis='x',         
        which='both',      
        bottom=False,      
        labelbottom=False
    )

    # Write each count next to the end of its bar
    for i, v in enumerate(s1):
        ax[1].text(v, i+0.1, str(v), color='black', fontweight='bold', fontsize=12)

    plt.setp(ax[1].get_yticklabels(), fontweight="bold")
    plt.setp(ax[1].get_xticklabels(), fontweight="bold")
    # Horizontal bars: counts run along x and the categories of `col` along y
    # (the two labels were previously swapped).
    ax[1].set_xlabel('count', fontweight="bold", color='black')
    ax[1].set_ylabel(col, fontweight="bold", color='black')

    f.suptitle(f'{title_name}', fontsize=18, fontweight='bold')
    plt.tight_layout()
    plt.show()

plot_count(train, 'outcome', 'Target Variable(Outcome) Distribution')

### Categorical Variables

In [None]:
# One count plot per categorical column, split by outcome, laid out two per row
n_categorical = len(categorical_cols)
n_grid_rows = (n_categorical + 1) // 2  # two plots per row, rounded up
plt.figure(figsize=(14, n_categorical * 3))

for slot, col in enumerate(categorical_cols, start=1):
    plt.subplot(n_grid_rows, 2, slot)
    sns.countplot(data=train, x=col, hue="outcome", palette='YlOrRd')
    plt.title(f"{col} countplot by outcome", fontweight='bold')
    # Upper limit: size of the most frequent level plus a margin of 10
    tallest_level = train[col].value_counts().max()
    plt.ylim(0, tallest_level + 10)

plt.tight_layout()
plt.show()

### Numerical Variables

In [None]:
# One histogram (with KDE) per numerical column, split by outcome, two per row.
# The grid has only as many rows as needed: the previous len(num_cols) x 2 grid
# left the lower half of the figure empty.
n_hist_rows = (len(num_cols) + 1) // 2
plt.figure(figsize=(14, n_hist_rows * 3))

for i, col in enumerate(num_cols):
    plt.subplot(n_hist_rows, 2, i+1)
    sns.histplot(x=col, hue="outcome", data=train, bins=30, kde=True, palette='YlOrRd')
    plt.title(f"{col} distribution for outcome", fontweight="bold")
    # The y-axis is left to autoscale. A limit derived from value_counts() reflects
    # the frequency of the single most common value, not the height of a 30-bin
    # histogram bar, so it cut off bars for columns with many distinct values.

plt.tight_layout()
plt.show()

### Scatter Matrix

In [None]:
# https://www.kaggle.com/code/yaaangzhou/playground-s3-e22-eda-modeling/notebook
def plot_pair(df_train, num_var, target, plotname):
    '''
    Function to make a corner pairplot coloured by the target:
    df_train: total data
    num_var: a list of numeric variables used for both axes
    target: target variable (used as hue and as the legend title)
    plotname: text of the figure's super-title
    '''
    grid = sns.pairplot(data=df_train, x_vars=num_var, y_vars=num_var, hue=target, corner=True,  palette='YlOrRd')

    # Reposition and restyle the legend created by pairplot
    legend = grid._legend
    legend.set_bbox_to_anchor((0.8, 0.7))
    legend.set_title(target)
    legend.loc = 'upper left'
    for text in [legend.get_title(), *legend.get_texts()]:
        text.set_fontsize(14)

    plt.suptitle(plotname, ha='center', fontweight='bold', fontsize=25, y=0.98)
    plt.show()

plot_pair(train, num_cols, target, plotname = 'Scatter Matrix with Target')

### Correlation Heatmap

In [None]:
# Work on a copy so the raw training frame keeps its string labels
df_encoded = train.copy()

# Categorical variables to encode for the correlation matrix, including 'outcome'
categorical_vars = ['surgery', 'age', 'temp_of_extremities', 'peripheral_pulse', 
                    'mucous_membrane', 'capillary_refill_time', 'pain', 'peristalsis', 
                    'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux', 
                    'rectal_exam_feces', 'abdomen', 'abdomo_appearance', 'surgical_lesion', 
                    'cp_data', 'outcome']

# One LabelEncoder per column, kept in a dict keyed by column name
label_encoders = {column: LabelEncoder() for column in categorical_vars}
for column, le in label_encoders.items():
    df_encoded[column] = le.fit_transform(train[column])

def plot_correlation_heatmap(df: pd.core.frame.DataFrame, title_name: str = 'Train correlation') -> None:
    """Plot the lower triangle of the correlation matrix of `df` as an annotated heatmap.

    df: DataFrame whose columns are correlated; a column named 'id' is left out if present.
    title_name: title shown above the heatmap.
    """
    excluded = ('id',)
    kept_columns = [name for name in df.columns if name not in excluded]
    corr = df[kept_columns].corr()

    # Mask the upper triangle (diagonal included) so each pair is shown once
    mask = np.triu(np.ones_like(corr))

    plt.subplots(figsize=(14, 10))
    sns.heatmap(corr, mask=mask, linewidths=.5, cmap='YlOrBr_r', annot=True, annot_kws={"size": 6})
    plt.title(title_name)
    plt.show()

# Plot correlation heatmap for encoded dataframe
plot_correlation_heatmap(df_encoded, 'Encoded Dataset Correlation')

# <p style="font-family:JetBrains Mono; font-weight:bold; letter-spacing: 2px; color:#005B46; font-size:140%; text-align:left;padding: 0px; border-bottom: 3px solid #003300">Data Cleaning</p>

In [None]:
def data_cleaning(df, num_cols=num_cols, training=True):
    """Impute missing values and turn the graded clinical columns into integer codes.

    The frame is modified in place and is also returned.

    Steps, in order:
      1. Missing values in every column listed in `num_cols` are replaced by 0.
      2. Four stray labels are merged into an existing level of their column.
      3. Each graded categorical column has its missing values filled with a
         fixed level and is then mapped to integer codes. A label that is not a
         key of the column's mapping becomes NaN (pandas `Series.map` semantics).

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the columns named in `num_cols` and the clinical
        columns handled below.
    num_cols : list of str
        Numerical columns whose missing values are set to 0.
    training : bool
        Accepted for call compatibility; the current implementation does not use it.

    Returns
    -------
    pd.DataFrame
        The same `df` object, after modification.
    """
    # Assign the filled column back. The chained `df[col].fillna(0, inplace=True)`
    # operates on a temporary under pandas Copy-on-Write and would leave `df` unchanged.
    for col in num_cols:
        df[col] = df[col].fillna(0)

    # column -> (label to replace, label it is merged into)
    label_merges = {
        "pain": ('slight', 'moderate'),
        "peristalsis": ('distend_small', 'normal'),
        "rectal_exam_feces": ('serosanguious', 'absent'),
        "nasogastric_reflux": ('slight', 'none'),
    }
    for col, (stray_label, merged_label) in label_merges.items():
        df[col] = df[col].replace(stray_label, merged_label)

    # column -> (level used for missing values, label -> integer code)
    ordinal_specs = {
        "temp_of_extremities": ("normal", {'cold': 0, 'cool': 1, 'normal': 2, 'warm': 3}),
        "peripheral_pulse": ("normal", {'absent': 0, 'reduced': 1, 'normal': 2, 'increased': 3}),
        "capillary_refill_time": ("3", {'less_3_sec': 0, '3': 1, 'more_3_sec': 2}),
        "pain": ("depressed", {'alert': 0, 'depressed': 1, 'moderate': 2, 'mild_pain': 3, 'severe_pain': 4, 'extreme_pain': 5}),
        "peristalsis": ("hypomotile", {'hypermotile': 0, 'normal': 1, 'hypomotile': 2, 'absent': 3}),
        "abdominal_distention": ("none", {'none': 0, 'slight': 1, 'moderate': 2, 'severe': 3}),
        "nasogastric_tube": ("none", {'none': 0, 'slight': 1, 'significant': 2}),
        "nasogastric_reflux": ("none", {'less_1_liter': 0, 'none': 1, 'more_1_liter': 2}),
        "rectal_exam_feces": ("absent", {'absent': 0, 'decreased': 1, 'normal': 2, 'increased': 3}),
        "abdomen": ("distend_small", {'normal': 0, 'other': 1, 'firm': 2,'distend_small': 3, 'distend_large': 4}),
        "abdomo_appearance": ("serosanguious", {'clear': 0, 'cloudy': 1, 'serosanguious': 2}),
    }
    for col, (fill_level, codes) in ordinal_specs.items():
        df[col] = df[col].fillna(fill_level).map(codes)

    return df

def encode(df, categorical_vars):
    """Label-encode each column named in `categorical_vars`, in place, and return `df`.

    A fresh LabelEncoder is fitted on every column of this particular frame;
    the fitted encoders are not returned.
    """
    for column_name in categorical_vars:
        df[column_name] = LabelEncoder().fit_transform(df[column_name])
    return df

def features_engineering(df):
    """Return a new frame with the engineered features; the input frame is left untouched.

    - `lesion_2` becomes a 0/1 flag: 1 where the original value is greater than 0.
    - `abs_rectal_temp` is the absolute difference between `rectal_temp` and 37.8.
    - `rectal_temp` is removed once `abs_rectal_temp` has been derived from it.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the `lesion_2` and `rectal_temp` columns.

    Returns
    -------
    pd.DataFrame
        A copy of `df` with the changes above.
    """
    # Copy first so the caller's frame is not modified as a side effect
    data_preprocessed = df.copy()

    data_preprocessed['lesion_2'] = (data_preprocessed['lesion_2'] > 0).astype(int)
    data_preprocessed["abs_rectal_temp"] = (data_preprocessed["rectal_temp"] - 37.8).abs()

    # DataFrame.drop returns a new frame; the result has to be kept, otherwise
    # rectal_temp silently stays in the feature set.
    data_preprocessed = data_preprocessed.drop(columns=["rectal_temp"])

    return data_preprocessed

In [None]:
train = data_cleaning(train)
test = data_cleaning(test, training=False)
train_orig = data_cleaning(train_orig)

# Label-encode the three frames together. `encode` fits a fresh LabelEncoder on
# whatever frame it receives, so encoding each frame on its own can give the same
# category different integer codes in train and test when their sets of levels differ.
combined = pd.concat({'train': train, 'train_orig': train_orig, 'test': test})
combined = encode(combined, categorical_cols)

train = combined.loc['train'].copy()
train_orig = combined.loc['train_orig'].copy()
# The concat gave the test rows an all-NaN target column; remove it again
test = combined.loc['test'].drop(columns=[target])
del combined

# Merge the competition and original training data, then derive the model features
total = pd.concat([train, train_orig], ignore_index=True)
total = total.drop_duplicates().drop(columns='lesion_3')
total = features_engineering(total)
test = features_engineering(test).drop(columns='lesion_3')

print(f'train shape: {train.shape}')
print(f'are there any null values in train: {train.isnull().any().any()}\n')

print(f'test shape: {test.shape}')
print(f'are there any null values in test: {test.isnull().any().any()}\n')

print(f'total shape: {total.shape}')
# This line reports on `total`; it was previously printed under the label "test"
print(f'are there any null values in total: {total.isnull().any().any()}\n')

total.head()

# <p style="font-family:JetBrains Mono; font-weight:bold; letter-spacing: 2px; color:#005B46; font-size:140%; text-align:left;padding: 0px; border-bottom: 3px solid #003300">Model Building</p>

In [None]:
# Separate features from the target and turn the outcome labels into class ids
outcome_codes = {'died':0,'euthanized':1,'lived':2}
X_train = total.drop(columns=[target])
y_train = total[target].map(outcome_codes)
X_test = test

print(
    f'X_train shape: {X_train.shape}',
    f'X_test shape: {X_test.shape}',
    f'y_train shape: {y_train.shape}',
    sep='\n',
)

# Drop the names of the intermediate frames (X_test still refers to the test frame)
del train, test, total
gc.collect();

X_train.head()

In [None]:
class Splitter:
    """Generate shuffled K-fold train/validation splits, repeated once per random seed."""

    def __init__(self, n_splits=5, test_size=0.2):
        # test_size is stored but not read by split_data
        self.n_splits, self.test_size = n_splits, test_size

    def split_data(self, X, y, random_state_list):
        """Yield (X_train, X_val, y_train, y_val, val_index) for every fold of every seed.

        X, y: objects supporting positional `.iloc` indexing.
        random_state_list: seeds; each one produces `n_splits` folds.
        """
        for seed in random_state_list:
            folds = KFold(n_splits=self.n_splits, shuffle=True, random_state=seed)
            for fit_rows, val_rows in folds.split(X, y):
                yield (
                    X.iloc[fit_rows],
                    X.iloc[val_rows],
                    y.iloc[fit_rows],
                    y.iloc[val_rows],
                    val_rows,
                )

In [None]:
class Classifier:
    """Container that builds the base models (XGBoost, LightGBM, CatBoost) with fixed hyperparameters.

    Attributes
    ----------
    n_estimators : value used as `max_iter` of the (currently disabled) HistGradientBoosting model.
    device : 'cpu' or 'gpu'; 'gpu' adds the GPU tree method / predictor to the XGBoost parameters.
    random_state : seed forwarded to every model.
    models : dict mapping model name -> unfitted estimator.
    models_name : list of the keys of `models`.
    len_models : number of entries in `models`.
    """

    def __init__(self, n_estimators=100, device="cpu", random_state=0):
        self.n_estimators = n_estimators
        self.device = device
        self.random_state = random_state
        self.models = self._define_model()
        # Derive the names from the dict built above instead of constructing every model a second time
        self.models_name = list(self.models)
        self.len_models = len(self.models)

    def _define_model(self):
        """Build and return the dict of unfitted estimators keyed by model name."""

        # NOTE(review): both 'learning_rate' and 'eta' are set; the XGBoost parameter docs
        # list them as aliases of one another - confirm which value is meant to apply.
        xgb_optuna1 = {
            'n_estimators': 500,
            'learning_rate': 0.14825592807938784,
            'booster': 'gbtree',
            'lambda': 8.286104243394034,
            'alpha': 3.218706261523848,
            'subsample': 0.9641392997798903,
            'colsample_bytree': 0.6489144243365093,
            'max_depth': 4,
            'min_child_weight': 3,
            'eta': 1.230361841253566,
            'gamma': 0.007588382469327802,
            'grow_policy': 'depthwise',
            'random_state': self.random_state,
        }

        if self.device == 'gpu':
            # These settings belong to the XGBoost parameter dict defined above; the previous
            # code wrote to an undefined name `xgb_params` and raised NameError on this path.
            xgb_optuna1['tree_method'] = 'gpu_hist'
            xgb_optuna1['predictor'] = 'gpu_predictor'

        lgb_optuna1 = {
            'num_iterations': 200,
            'learning_rate': 0.05087818591635374,
            'max_depth': 10,
            'lambda': 4.428505451747609,
            'alpha': 4.34921696876783,
            'subsample': 0.512929283477029,
            'colsample_bytree': 0.5421760951211009,
            'min_child_weight': 4,
            'random_state': self.random_state,
            'verbose': -1,
        }

        cat_optuna1 = {
            'iterations': 700,
            'learning_rate': 0.06806932341035855,
            'depth': 3,
            'l2_leaf_reg': 4.246994639881441,
            'bagging_temperature': 0.08262764367292164,
            'random_strength': 6.922710769000274,
            'border_count': 88,
            'random_state': self.random_state,
            'verbose': False,
        }

        # Parameters for the HistGradientBoosting model, which is disabled in `models` below
        hist_params = {
            'l2_regularization': 0.01,
            'early_stopping': True,
            'learning_rate': 0.01,
            'max_iter': self.n_estimators,
            'max_depth': 4,
            'max_bins': 255,
            'min_samples_leaf': 10,
            'max_leaf_nodes':10,
            'class_weight':'balanced',
            'random_state': self.random_state
        }
        models = {
            'xgb01': xgb.XGBClassifier(**xgb_optuna1),
            'lgb01': lgb.LGBMClassifier(**lgb_optuna1),
            #'hgb': HistGradientBoostingClassifier(**hist_params),
            'cat01': CatBoostClassifier(**cat_optuna1),
        }

        return models

In [None]:
%%time

# Config
random_state = 42
random_state_list =[42]  # one full K-fold pass is run per seed in this list
n_estimators = 100  # forwarded to Classifier
device = 'cpu'
n_splits=5
early_stopping_rounds = 500  # passed to fit() of the xgb/lgb/cat models below
verbose = False


# Split Data (split_data is a generator: folds are produced lazily while the loop runs)
splitter = Splitter(n_splits=n_splits)
splits = splitter.split_data(X_train, y_train, random_state_list=random_state_list)

# This instance is only used to read the model names; a new Classifier is built inside every fold
classifier = Classifier(n_estimators=n_estimators, device=device, random_state=random_state)

# NOTE(review): test_predss, oof_predss and weights are initialised here but not read
# anywhere in this cell (test_predss is reassigned in the prediction cell).
test_predss = np.zeros((X_test.shape[0]))
oof_predss = np.zeros((X_train.shape[0]))
ensemble_score = []
weights = []
# Names of the boosting models; their fitted copies are collected per fold in trained_models
models_name = [_ for _ in classifier.models_name if ('xgb' in _) or ('lgb' in _) or ('cat' in _)]
trained_models = dict(zip(models_name, [[] for _ in range(classifier.len_models)]))
# Per-model list of validation scores, one entry per fold
score_dict = dict(zip(classifier.models_name, [[] for _ in range(len(classifier.models_name))]))

for i, (X_train_, X_val, y_train_, y_val, val_index) in enumerate(splits):

    # n: fold number within the current seed, m: position of the seed in random_state_list
    n = i % n_splits
    m = i // n_splits


    # Classifier models (fresh, unfitted estimators for this fold)
    classifier = Classifier(n_estimators, device, random_state)
    models = classifier.models

    # Store oof and test predictions for each base model
    # NOTE(review): oof_preds is appended to but never read in this cell; test_preds stays empty.
    oof_preds = []
    test_preds = []

    # Loop over each base model and fit it
    for name, model in models.items():
        if ('xgb' in name) or ('lgb' in name) or ('cat' in name):
            # The validation fold doubles as the early-stopping evaluation set
            model.fit(X_train_, y_train_, eval_set=[(X_val, y_val)], early_stopping_rounds=early_stopping_rounds, verbose=verbose)

        else:
            model.fit(X_train_, y_train_)

        if name in trained_models.keys():
            trained_models[f'{name}'].append(deepcopy(model))

        y_val_pred = model.predict(X_val)

        # Micro-averaged F1 on the validation fold
        score = f1_score(y_val, y_val_pred, average='micro')
        score_dict[name].append(score)
        print(f'{name} [FOLD-{n} SEED-{random_state_list[m]}] F1 score: {score:.5f}')

        oof_preds.append(y_val_pred)

    ## Ensemble Model
    # Soft-voting ensemble over the same base models, fitted on the training part of the fold.
    # NOTE(review): this fit() call has no eval_set / early stopping, and it trains the base
    # models again in addition to the individual fits above - confirm this is intended.
    model_tuples = [(key, value) for key, value in models.items()]
    voting_classifier = VotingClassifier(estimators=model_tuples
                                                    , voting='soft')

    voting_classifier.fit(X_train_, y_train_)
    ensemble_pred = voting_classifier.predict(X_val)
    ens_f1 = f1_score(y_val, ensemble_pred, average='micro')

    print('Fold', i+1, '==> Ensemble Model oof F1 score is ==>', ens_f1)
    ensemble_score.append(ens_f1)

    # Running mean of the ensemble F1 over the folds processed so far (no test prediction happens here)
    print('Average F1 of Ensemble Model is:', np.mean(ensemble_score))
    print_sl()

    gc.collect()

# <p style="font-family:JetBrains Mono; font-weight:bold; letter-spacing: 2px; color:#005B46; font-size:140%; text-align:left;padding: 0px; border-bottom: 3px solid #003300">Predict and Submit</p>

In [None]:
# Refit the voting ensemble left over from the last CV fold on all training rows,
# then predict the class ids of the test set (fit returns the fitted estimator).
test_predss = voting_classifier.fit(X_train, y_train).predict(X_test)

# Single-model alternative kept for reference:
# xgboost = models['xgb01'].fit(X_train, y_train)
# test_predss = xgboost.predict(X_test)

In [None]:
# Turn the predicted class ids back into outcome labels and write the submission file
outcome_labels = {0:'died',1:'euthanized',2:'lived'}
submission = pd.DataFrame({'id': sample_submission['id'], 'outcome': test_predss})
submission['outcome'] = submission['outcome'].map(outcome_labels)
submission.to_csv('submission.csv',index=False)
submission