# <p style="padding:15px; background-color:#E55807; font-family:JetBrains Mono; font-weight:bold; color:#000000; font-size:100%; letter-spacing: 2px; text-align:center; border-radius: 15px 15px; border: 5px solid #000000">Import Libraries</p>

In [None]:
!pip install sklego

import numpy as np # linear algebra
import pandas as pd # data processing
from pandas.api.types import is_numeric_dtype
import seaborn as sns
import matplotlib.pyplot as plt
import optuna

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, train_test_split, GridSearchCV


# Models
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor, ExtraTreesRegressor, BaggingRegressor, StackingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklego.linear_model import LADRegression
from catboost import CatBoostRegressor


# Ignore warnings ;)
import warnings
warnings.simplefilter("ignore")

# <p style="padding:15px; background-color:#E55807; font-family:JetBrains Mono; font-weight:bold; color:#000000; font-size:100%; letter-spacing: 2px; text-align:center; border-radius: 15px 15px; border: 5px solid #000000">Import the data</p> 

In [None]:
# files path
train_path     = "/kaggle/input/playground-series-s3e16/train.csv"
test_path      = "/kaggle/input/playground-series-s3e16/test.csv"
original_path  = "/kaggle/input/crab-age-prediction/CrabAgePrediction.csv"
synthetic_path = "/kaggle/input/ps-s3-e16-synthetic-train-data/train_synthetic.csv"

# Load the train / test / original / synthetic CSV files
def import_data(train_path, test_path, original_path, synthetic_path):
    """Read the four CSV files and return them as DataFrames.

    Parameters
    ----------
    train_path, test_path, original_path, synthetic_path : str or path-like
        Locations of the CSV files, passed unchanged to ``pd.read_csv``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, original, synthetic)`` in that order.
    """
    csv_paths = (train_path, test_path, original_path, synthetic_path)
    return tuple(pd.read_csv(csv_path) for csv_path in csv_paths)

train, test, original, synthetic = import_data(train_path, test_path, original_path, synthetic_path)

The train dataset is a synthetic dataset generated from the original [Crab Age Prediction](https://www.kaggle.com/datasets/sidhus/crab-age-prediction) dataset. The variables in this dataset are described below:

<ul>
<li> Sex: Gender of the Crab - Male, Female and Indeterminate </li>
<li> Length: Length of the Crab in feet </li>
<li> Diameter: Diameter of the Crab in feet </li>
<li> Height: Height of the Crab in feet </li>
<li> Weight: Weight of the Crab in ounces </li>
<li> Shucked Weight: Weight without the shell in ounces </li>
<li> Viscera Weight: Weight that wraps around the crab's abdominal organs in ounces </li>
<li> Shell Weight: Weight of the Shell in ounces </li>
<li> Age: Age of the Crab in months</li>
</ul>


# <p style="padding:15px; background-color:#E55807; font-family:JetBrains Mono; font-weight:bold; color:#000000; font-size:100%; letter-spacing: 2px; text-align:center; border-radius: 15px 15px; border: 5px solid #000000">Exploratory Data Analysis - EDA</p> 

In [None]:
train.head(3)

In [None]:
original.head(3)

In [None]:
synthetic.head(3)

In [None]:
test.head(3)

Now, we'll try to use some descriptive statistics

# <p style="padding:15px; background-color:#E55807; font-family:JetBrains Mono; font-weight:bold; color:#000000; font-size:100%; letter-spacing: 2px; text-align:center; border-radius: 15px 15px; border: 5px solid #000000">1. Univariate Statistics</p> 

We can use the .describe() method from pandas to see basic stats like count, mean, standard deviation, minimum, maximum, quantiles...

In [None]:
train.describe().T

In [None]:
# Function to calculate univariate stats, similar to the pandas describe method
def univariate_stats(df):
    """Build a one-row-per-column table of univariate statistics.

    Every column gets Count, Missing, Unique, Dtype, IsNumeric and Mode.
    Numeric columns also get Mean, Min, 25%, Median, 75%, Max, Std, Skew and
    Kurt; non-numeric columns show '-' in those cells.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, sorted with numeric columns first and then by
        descending number of unique values.
    """
    #df.drop('id', axis=1, inplace=True)
    output_df = pd.DataFrame(columns=['Count', 'Missing', 'Unique', 'Dtype', 'IsNumeric', 'Mode', 'Mean', 'Min', '25%', 'Median', '75%', 'Max', 'Std', 'Skew', 'Kurt'])

    for col in df:
        series = df[col]
        numeric = is_numeric_dtype(series)

        # mode() is empty when every value is missing; report NaN instead of
        # failing with an IndexError on `.values[0]`.
        modes = series.mode()
        most_common = modes.values[0] if len(modes) else np.nan

        common_stats = [series.count(), series.isnull().sum(), series.nunique(), series.dtype, numeric, most_common]
        if numeric:
            numeric_stats = [series.mean(), series.min(), series.quantile(.25), series.median(), series.quantile(.75), series.max(), series.std(), series.skew(), series.kurt()]
        else:
            numeric_stats = ['-'] * 9

        output_df.loc[col] = common_stats + numeric_stats

    return output_df.sort_values(by=['IsNumeric', 'Unique'], ascending=False)


pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)


In [None]:
# Call the function on train
univariate_stats(train)

In [None]:
# Call the function to check univariate stats on the original dataset
univariate_stats(original)

In [None]:
# Call the function to check univariate stats on test dataset
univariate_stats(test)

In [None]:
# Call the function to check univariate stats on synthetic dataset
univariate_stats(synthetic)

In [None]:
# List of numerical columns and categorical columns

numeric_cols = train.select_dtypes(include=['float64']).columns.tolist()
categ_cols   = train.select_dtypes(include=['object']).columns.tolist()
target       = 'Age'
numeric_cols
categ_cols

In [None]:
def plot_histograms(df_train, df_test, original, synthetic, target_col, n_cols=3):
    """Overlay the distribution of each column across the four datasets.

    One panel is drawn per column of ``df_train``. The test distribution is
    skipped for ``target_col``.

    Parameters
    ----------
    df_train, df_test, original, synthetic : pandas.DataFrame
        Frames that contain every column of ``df_train`` (``df_test`` may lack
        ``target_col``).
    target_col : str
        Name of the target column.
    n_cols : int, default 3
        Number of panels per row.
    """
    features = df_train.columns.tolist()
    n_rows = max(1, (len(features) - 1) // n_cols + 1)

    # squeeze=False keeps `axes` a 2-D array even for a 1x1 grid, so flatten() always works
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(18, 4*n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, var_name in zip(axes, features):
        sns.distplot(df_train[var_name], kde=True, ax=ax, label='Train')       # plot train data
        sns.distplot(original[var_name], kde=True, ax=ax, label='Original')    # plot original data
        sns.distplot(synthetic[var_name], kde=True, ax=ax, label='Synthetic')  # plot synthetic data
        if var_name != target_col:
            sns.distplot(df_test[var_name], kde=True, ax=ax, label='Test')     # plot test data

        ax.set_title(f'{var_name} Distribution (Train / Original / Synthetic / Test)')
        ax.legend()

    # Hide panels left over when the grid has more cells than features
    for ax in axes[len(features):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

# Keyword arguments make sure each frame lands on the matching parameter
# (the positional call previously passed synthetic and original swapped).
plot_histograms(train[numeric_cols], test[numeric_cols],
                original=original[numeric_cols], synthetic=synthetic[numeric_cols],
                target_col=target, n_cols=4)

In [None]:
def plot_distribution(df, hue, title='', drop_cols=[]):
    """Plot a histogram with KDE of every column of ``df``, split by ``hue``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    hue : str
        Column used to colour the histograms; it is not plotted itself.
    title : str, default ''
        Prefix of the figure title.
    drop_cols : list of str, default []
        Additional columns to leave out. The list is only read, never modified.
    """
    sns.set_style('whitegrid')

    cols = df.columns.drop([hue] + list(drop_cols))
    n_cols = 2
    n_rows = max(1, (len(cols) - 1) // n_cols + 1)

    # squeeze=False keeps `axes` 2-D even with a single row; flattening then
    # gives the same row-major panel order as indexing with (i // n_cols, i % n_cols).
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(14, 4*n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, var_name in zip(axes, cols):
        sns.histplot(data=df, x=var_name, kde=True, ax=ax, hue=hue)
        ax.set_title(f'{var_name} Distribution')

    # Hide panels left over when the number of columns does not fill the grid
    for ax in axes[len(cols):]:
        ax.set_visible(False)

    fig.suptitle(f'{title} Distribution Plot by {hue}', fontweight='bold', fontsize=16)
    plt.tight_layout()
    plt.show()
    
plot_distribution(train, hue='Sex', title='Train data')
plot_distribution(test, hue='Sex', title='Test data')
plot_distribution(original, hue='Sex', title='Original data')
plot_distribution(synthetic, hue='Sex', title= 'Synthetic data')

In [None]:
def plot_boxplot(df, hue, title='', drop_cols=[], n_cols=3):
    """Draw one boxplot per column of ``df``, grouped by ``hue`` on the x axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    hue : str
        Grouping column; it is not plotted itself.
    title : str, default ''
        Prefix of the figure title.
    drop_cols : list of str, default []
        Additional columns to leave out. The list is only read, never modified.
    n_cols : int, default 3
        Number of panels per row.
    """
    sns.set_style('whitegrid')

    cols = df.columns.drop([hue] + list(drop_cols))
    n_rows = max(1, (len(cols) - 1) // n_cols + 1)

    # squeeze=False keeps `axes` 2-D even with a single row or column; flattening
    # preserves the row-major order of (i // n_cols, i % n_cols).
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(14, 4*n_rows), squeeze=False)
    axes = axes.flatten()

    # Marker style for the group means
    mean_marker = {"marker":"s","markerfacecolor":"white", "markeredgecolor":"blue", "markersize":"5"}

    for ax, var_name in zip(axes, cols):
        sns.boxplot(data=df, x=hue, y=var_name, ax=ax, showmeans=True, meanprops=mean_marker)
        ax.set_title(f'{var_name} by {hue}')
        ax.set_xlabel('')

    # Hide panels left over when the number of columns does not fill the grid
    for ax in axes[len(cols):]:
        ax.set_visible(False)

    fig.suptitle(f'{title} Boxplot by {hue}', fontweight='bold', fontsize=16)
    plt.tight_layout()
    plt.show()
    
plot_boxplot(train, hue='Sex', title='Train data', n_cols=2)
plot_boxplot(original, hue='Sex', title='Original data', n_cols=2)
plot_boxplot(test, hue='Sex', title='Test data', n_cols=2)

In [None]:
def plot_violinplot(df, hue, title='', drop_cols=[], n_cols=2):
    """Draw one violin plot per column of ``df``, grouped by ``hue`` on the x axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    hue : str
        Grouping column; it is not plotted itself.
    title : str, default ''
        Prefix of the figure title.
    drop_cols : list of str, default []
        Additional columns to leave out. The list is only read, never modified.
    n_cols : int, default 2
        Number of panels per row.
    """
    sns.set_style('whitegrid')

    cols = df.columns.drop([hue] + list(drop_cols))
    n_rows = max(1, (len(cols) - 1) // n_cols + 1)

    # squeeze=False keeps `axes` 2-D even with a single row or column; flattening
    # preserves the row-major order of (i // n_cols, i % n_cols).
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(18, 4*n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, var_name in zip(axes, cols):
        sns.violinplot(data=df, x=hue, y=var_name, ax=ax, inner='quartile')
        ax.set_title(f'{var_name} Distribution')

    # Hide panels left over when the number of columns does not fill the grid
    for ax in axes[len(cols):]:
        ax.set_visible(False)

    fig.suptitle(f'{title} Violin Plot by {hue}', fontweight='bold', fontsize=16)
    plt.tight_layout()
    plt.show()
    
plot_violinplot(train, hue='Sex', title='Train data', n_cols=2)

# <p style="padding:15px; background-color:#E55807; font-family:JetBrains Mono; font-weight:bold; color:#000000; font-size:100%; letter-spacing: 2px; text-align:center; border-radius: 15px 15px; border: 5px solid #000000">2. Bivariate Statistics</p> 

<p style="padding:15px; background-color:#E55807; font-weight:bold; color:#FFD700; font-size:100%; border-radius: 15px 15px;">
    <b>💡 Recall: There are three types of bivariate analysis.</b><br>       
<ul> 
<li> <b>Numerical - Numerical: Pearson's Correlation</b> </li> <br>
The correlation represents the strength of a linear relationship between two numerical variables. If the two variables are uncorrelated, the values of one show no tendency to change along with the values of the other.  <br>  <br>
<li> <b>Categorical - Numerical: one-way ANOVA(3 + groups) or t-test (exactly 2 groups)</b>       </li> <br>
The ANOVA test is used to determine whether the means of three or more groups differ significantly from one another. <br><br>
<li> <b>Categorical - Categorical: Chi-square Test</b>       </li> <br>
It is calculated based on the difference between expected frequencies and the observed frequencies in one or more categories of the frequency table.
</ul>
</p> 

In [None]:
# Intermediate helper used by bivstats for the one-way ANOVA
def anova(df, feature, label):
    """Run a one-way ANOVA of ``label`` across the groups of ``feature``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    feature : str
        Categorical column that defines the groups.
    label : str
        Numeric column whose group means are compared.

    Returns
    -------
    The result of ``scipy.stats.f_oneway``, which unpacks as ``(F, p)``.

    Notes
    -----
    Rows whose ``feature`` value is missing belong to no group and are ignored
    (looking them up with ``get_group`` used to raise a KeyError).
    """
    from scipy import stats

    # sort=False yields the groups in order of first appearance
    samples = [group[label] for _, group in df.groupby(feature, sort=False)]
    return stats.f_oneway(*samples)

# Bivariate stats against the label: Pearson's r for numeric columns, one-way ANOVA F otherwise
def bivstats(df, label):
    """Measure the association between every column of ``df`` and ``label``.

    Numeric columns get Pearson's r (sign, absolute value rounded to 3
    decimals, p-value rounded to 6). Non-numeric columns get the one-way ANOVA
    F statistic computed by ``anova``. Columns containing missing values get a
    row of NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the features and the label.
    label : str
        Name of the target column; it is skipped.

    Returns
    -------
    pandas.DataFrame
        Columns 'Stat', '+/-', 'Effect size', 'p-value', indexed by feature
        name and sorted by descending effect size.
    """
    from scipy import stats

    # Output table, filled one feature at a time
    output_df = pd.DataFrame(columns=['Stat', '+/-', 'Effect size', 'p-value'])

    for col in df.columns:
        if col == label:
            continue

        feature = df[col]
        if feature.isnull().sum() != 0:
            # No statistic is computed for columns with missing values
            output_df.loc[col] = [np.nan] * 4
            continue

        if is_numeric_dtype(feature):
            r, p = stats.pearsonr(df[label], feature)
            row = ['r', np.sign(r), abs(round(r, 3)), round(p, 6)]
        else:
            F, p = anova(df[[col, label]], col, label)
            row = ['F', '', round(F, 3), round(p, 6)]
        output_df.loc[col] = row

    return output_df.sort_values(by=['Effect size', 'Stat'], ascending=[False, False])
 
pd.options.display.float_format = '{:.5f}'.format
bivstats(train, target)

In [None]:
def plot_heatmap(df, title):
    """Draw the lower triangle of the correlation matrix of ``df`` as an annotated heatmap.

    Parameters
    ----------
    df : pandas.DataFrame
        Columns to correlate; they are cast to float first.
    title : str
        Prefix of the plot title.
    """
    corr = df.astype(float).corr()

    # Hide the upper triangle (diagonal included) so each pair is shown once
    upper_triangle = np.triu(np.ones_like(corr, dtype=bool))

    colormap = plt.cm.RdBu_r
    plt.figure(figsize=(8, 8))
    plt.title(f'{title} correlation of features', fontweight='bold', y=1.02, size=8)

    sns.heatmap(
        corr,
        mask=upper_triangle,
        cmap=colormap,
        vmin=-1.0,
        vmax=1.0,
        square=True,
        linewidths=0.1,
        linecolor='white',
        annot=True,
        annot_kws={"size": 14, "weight": "bold"},
    )

plot_heatmap(train[numeric_cols + [target]], title='Train data')
plot_heatmap(test[numeric_cols], title='Test data')
plot_heatmap(original[numeric_cols + [target]], title='Original')
plot_heatmap(synthetic[numeric_cols + [target]], title='Synthetic')

<p style="padding:15px; background-color:#E55807; font-weight:bold; color:#FFD700; font-size:100%; border-radius: 15px 15px;">
    <b>💡 
Since the features in our datasets are strongly correlated with each other, applying PCA could be a good idea. <br>
        Let's plot a few more charts to gain further insights.
</b>
</p> 



In [None]:
def plot_scatter_with_fixed_col(df, fixed_col, hue=False, drop_cols=[], size=5, title=''):
    """Scatter every column of ``df`` (x axis) against ``fixed_col`` (y axis).

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    fixed_col : str
        Column shown on the y axis of every panel.
    hue : str or False, default False
        Optional column used to colour the points; it is not plotted itself.
    drop_cols : list of str, default []
        Additional columns to leave out. The list is only read, never modified.
    size : float, default 5
        Figure width; the height is ``size / 2`` per row of panels.
    title : str, default ''
        Prefix of the figure title.
    """
    sns.set_style('whitegrid')

    excluded = [hue, fixed_col] if hue else [fixed_col]
    cols = df.columns.drop(excluded + list(drop_cols))
    n_cols = 2
    n_rows = max(1, (len(cols) - 1) // n_cols + 1)

    # squeeze=False keeps `axes` 2-D even with a single row; flattening preserves
    # the row-major order of (i // n_cols, i % n_cols).
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(size, size/n_cols*n_rows),
                             sharex=False, sharey=False, squeeze=False)
    axes = axes.flatten()

    # Only mention the hue in the title when one was given (avoids "... by False")
    suptitle = f'{title} Set Scatter Plot with Target Column'
    if hue:
        suptitle += f' by {hue}'
    fig.suptitle(suptitle, fontsize=20, fontweight='bold', y=1)

    for ax, col in zip(axes, cols):
        ax.set_xlabel(f'{col}', fontsize=12)
        ax.set_ylabel(f'{fixed_col}', fontsize=12)

        # Plot the scatterplot
        if hue:
            sns.scatterplot(data=df, x=col, y=fixed_col, hue=hue, ax=ax,
                            s=40, edgecolor='gray', alpha=0.3, palette='bright')
            ax.legend(title=hue, title_fontsize=12, fontsize=12) # loc='upper right'
        else:
            sns.scatterplot(data=df, x=col, y=fixed_col, ax=ax,
                            s=40, edgecolor='gray', alpha=0.3)

        ax.tick_params(axis='both', which='major', labelsize=8)
        ax.set_title(f'{col}', fontsize=16)

    # Hide panels left over when the number of columns does not fill the grid
    for ax in axes[len(cols):]:
        ax.set_visible(False)

    plt.tight_layout(pad=0.5, h_pad=0.5, w_pad=0.5)
    plt.show()
    
plot_scatter_with_fixed_col(train, fixed_col=target, hue='Sex', size=10, title='Train data')
plot_scatter_with_fixed_col(original, fixed_col=target, hue='Sex', size=10, title='Original data')
plot_scatter_with_fixed_col(synthetic, fixed_col=target, hue='Sex', size=10, title='Synthetic data')

In [None]:
# sns.pairplot(data=train, vars=['Age', 'Length', 'Diameter', 'Height', 'Weight', 'Shucked Weight', 'Viscera Weight', 'Shell Weight'], hue='Sex')
# plt.show()

# <p style="padding:15px; background-color:#E55807; font-family:JetBrains Mono; font-weight:bold; color:#000000; font-size:100%; letter-spacing: 2px; text-align:center; border-radius: 15px 15px; border: 5px solid #000000">Feature Engineering</p> 

In [None]:
# Tag every row with its source (0 = train, 1 = test, 2 = original, 3 = synthetic)
# so the frames can be separated again after concatenation.
for source_code, frame in enumerate((train, test, original, synthetic)):
    frame["Data Type"] = source_code

# The original data gets consecutive ids starting at 123419 and the synthetic ids
# are shifted by 127312 (presumably to keep ids unique across sources - TODO confirm).
ids = [123419 + offset for offset in range(len(original))]
original["id"] = ids
synthetic["id"] += 127312

# concatenate datasets: labelled sources first (de-duplicated), test rows appended last
df_concat = pd.concat([train, original, synthetic], ignore_index=True).drop_duplicates()
df_all = pd.concat([df_concat, test], ignore_index=True)
df_all

In [None]:
df_all = pd.get_dummies(df_all)
df_all

In [None]:
df_all[df_all['Height'] == 0]['Height']

In [None]:
# Split the rows on whether Height is recorded as 0
is_zero_height = df_all["Height"] == 0
h1 = df_all[~is_zero_height]
h0 = df_all[is_zero_height]
print(h1.shape, h0.shape)

# prediction of Height by Random Forest Regressor
# NOTE(review): "id" stays among the predictors - confirm this is intended.
non_predictors = ["Height", "Age", "Data Type"]
x_h1 = h1.drop(columns=non_predictors)
y_h1 = h1["Height"]
x_h0 = h0.drop(columns=non_predictors)

rfr = RandomForestRegressor(n_jobs=-1, random_state=42)
rfr.fit(x_h1, y_h1)
preds_height = rfr.predict(x_h0)

In [None]:
len(preds_height)

In [None]:
# Replace the zero heights with the predicted values in a single vectorized assignment.
# The boolean mask selects rows in index order, which is the order in which
# `preds_height` was produced (predictions for df_all[df_all["Height"] == 0]).
zero_height_mask = df_all["Height"] == 0
cnt = int(zero_height_mask.sum())

# Nothing is left to fill when the cell is re-run, so the assignment is skipped
if cnt:
    if cnt != len(preds_height):
        raise ValueError(
            f"{cnt} rows have Height == 0 but {len(preds_height)} predictions are available"
        )
    df_all.loc[zero_height_mask, "Height"] = preds_height

df_all["Height"].describe()

In [None]:
df_all[df_all['Height'] == 0]['Height']

In [None]:
# Prepare our final dataset for train: every row that is not test data (Data Type != 1),
# ordered by id. A method chain is used instead of inplace operations on a filtered
# slice, which avoids SettingWithCopy warnings.
train = (
    df_all[df_all["Data Type"] != 1]
    .sort_values("id")
    .reset_index(drop=True)
    .drop(columns=["id", "Data Type"])
)
train

In [None]:
# dataset for test: the rows tagged Data Type == 1, ordered by id, without the
# bookkeeping columns and the (empty) target. A method chain is used instead of
# inplace operations on a filtered slice, which avoids SettingWithCopy warnings.
test = (
    df_all[df_all["Data Type"] == 1]
    .sort_values("id")
    .reset_index(drop=True)
    .drop(columns=["id", "Age", "Data Type"])
)
test

In [None]:
# function for PCA features but I'll keep this as a last resort
def add_pca_features(X_train, X_test, n_components=4):
    """Append PCA components of the float64 columns to both frames.

    The columns are standardised and the PCA is fitted on ``X_train`` only;
    the same fitted pipeline then transforms ``X_test``.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames. Both are modified in place: columns ``PCA_0`` ...
        ``PCA_{n_components-1}`` are added.
    n_components : int, default 4
        Number of principal components to keep.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_test)``, the same objects that were passed in.
    """
    # Imported locally: the notebook's import cell does not bring in these two
    # names, so calling the function used to fail with a NameError.
    from sklearn.decomposition import PCA
    from sklearn.pipeline import make_pipeline

    # Select the columns for PCA
    pca_features = X_train.select_dtypes(include=['float64']).columns.tolist()

    # Scale, then project
    pipeline = make_pipeline(StandardScaler(), PCA(n_components=n_components))
    pipeline.fit(X_train[pca_features])

    # Column names for the PCA features
    pca_columns = [f'PCA_{i}' for i in range(n_components)]

    # Add PCA features to the dataframes
    X_train[pca_columns] = pipeline.transform(X_train[pca_features])
    X_test[pca_columns] = pipeline.transform(X_test[pca_features])

    return X_train, X_test

<p style="padding:15px; background-color:#E55807; font-weight:bold; color:#FFD700; font-size:100%; border-radius: 15px 15px;">
    <b>💡 We present here many techniques and features that could be applied to our dataset, but we'll use only a few of them.
    </b>
</p> 

In [None]:
# function to add more features to the dataset
def feature_engineering(df):
    """Add the engineered columns to ``df`` in place.

    New columns, in this order: 'Shucked Weight ratio', 'Shell Weight ratio',
    'Volume', 'bmi', 'water_loss', 'density', 'BSA'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Length', 'Diameter', 'Height', 'Weight',
        'Shucked Weight', 'Viscera Weight' and 'Shell Weight'.

    Returns
    -------
    tuple
        ``(df, news_cols)``: the same frame that was passed in and the list of
        added column names.
    """
    weight = df['Weight']
    height = df['Height']
    shucked = df['Shucked Weight']
    viscera = df['Viscera Weight']
    shell = df['Shell Weight']

    # Disabled: clean the weights by capping the over weights with total body weights
    # df['Shell Weight']=np.where(df['Shell Weight']>df['Weight'],df['Weight'],df['Shell Weight'])
    # df['Viscera Weight']=np.where(df['Viscera Weight']>df['Weight'],df['Weight'],df['Viscera Weight'])
    # df['Shucked Weight']=np.where(df['Shucked Weight']>df['Weight'],df['Weight'],df['Shucked Weight'])

    # Weight ratios
    df['Shucked Weight ratio'] = shucked / weight
    #df['Viscera Weight ratio'] = df['Viscera Weight'] / df['Weight']    # dropped due to low correlation with our target
    df['Shell Weight ratio'] = shell / weight

    df['Volume'] = df['Length'] * df['Diameter'] * height

    #df['Meat Yield'] = df['Shucked Weight'] / (df['Weight'] + df['Shell Weight'])
    #df['Weight_to_Shucked_Weight'] = df['Weight'] / df['Shucked Weight']

    # Disabled pairwise dimension products:
    # df['dim1']   = df['Length'] * df['Diameter']
    # df['dim2']   = df['Length'] * df['Height']
    # df['dim3']   = df['Height'] * df['Diameter']

    # Crab BMI
    df['bmi'] = weight / (height ** 2)

    # Water loss during experiment; negative values are replaced by the smallest
    # value found among the three component weights
    raw_loss = weight - shucked - viscera - shell
    replacement = min(shucked.min(), viscera.min(), shell.min())
    df["water_loss"] = np.where(raw_loss < 0, replacement, raw_loss)

    # Crab density approx
    df['density'] = weight / df['Volume']
    df['BSA'] = np.sqrt((weight * 0.0283) * (height * 30.48) / 3600)

    news_cols = ['Shucked Weight ratio', 'Shell Weight ratio', 'Volume', 'bmi', "water_loss", 'density', 'BSA']

    return df, news_cols

In [None]:
train_eng,  news_cols = feature_engineering(train)
test_eng, news_cols   = feature_engineering(test)
train_eng

In [None]:
#plot_heatmap(train_eng[numeric_cols + news_cols + [target]], title='Train_eng data')

# Correlation matrix of the engineered training frame
corr_mat_data = train_eng.corr()

# Mask the upper triangle (diagonal included) so each pair appears once
data_mask = np.triu(np.ones(corr_mat_data.shape, dtype=bool))
cmap = sns.diverging_palette(100, 7, s=75, l=40, n=20, center='light', as_cmap=True)

#fig, axes = plt.subplots(1, 1, figsize = (25, 10))
plt.figure(figsize=(12, 12))
heatmap_ax = sns.heatmap(
    corr_mat_data,
    mask=data_mask,
    cmap=cmap,
    center=0,
    annot=True,
    fmt='.2f',
    annot_kws={'size': 12},
)
heatmap_ax.set_title('Correlations train features');

In [None]:
# Build the feature matrix X (without the 'Sex_I' dummy and the target) and the target Y
columns_to_drop = ['Sex_I', 'Age']
X = train_eng.drop(columns=columns_to_drop, axis=1)
# NOTE: test_eng is reassigned here, so re-running this cell raises a KeyError
# because 'Sex_I' has already been dropped.
test_eng = test_eng.drop('Sex_I', axis=1)
Y = train_eng['Age']
Y

In [None]:
X

In [None]:
"""#scaling the data
scaler = StandardScaler()

X = pd.DataFrame(scaler.fit_transform(X))
test_eng = pd.DataFrame(scaler.transform(test_eng))
test_eng"""

<p style="padding:15px; background-color:#E55807; font-weight:bold; color:#FFD700; font-size:100%; border-radius: 15px 15px;">
    <b>💡 Scaling : <br>
         Since we'll use Gradient Boosting Decision Tree (GBDT) models, scaling the data might not be necessary.

</b>
</p> 

In [None]:
X, X_val, Y, Y_val = train_test_split(X, Y, test_size=0.1, random_state=42)
X_val

# <p style="padding:15px; background-color:#E55807; font-family:JetBrains Mono; font-weight:bold; color:#000000; font-size:100%; letter-spacing: 2px; text-align:center; border-radius: 15px 15px; border: 5px solid #000000">Models building</p>  

In [None]:
def _fit_and_predict_fold(model, X_fit, Y_fit, X_holdout, Y_holdout, X_submit, **fit_kwargs):
    """Fit `model` on one fold; return (fitted model, hold-out preds, hold-out MAE, test preds)."""
    fitted = model.fit(X_fit, Y_fit, **fit_kwargs)
    holdout_pred = fitted.predict(X_holdout)
    holdout_mae = mean_absolute_error(Y_holdout, holdout_pred)
    submit_pred = fitted.predict(X_submit)
    return fitted, holdout_pred, holdout_mae, submit_pred


# Per-fold hold-out MAE and per-fold predictions on the competition test set
hist_cv_scores, hist_preds = list(), list()
lgb_cv_scores, lgb_preds   = list(), list()
xgb_cv_scores, xgb_preds   = list(), list()
cat_cv_scores, cat_preds   = list(), list()

ens_cv_scores, ens_preds = list(), list()

# Not filled by this cell; left defined in case other cells expect these names
imp_hist = pd.DataFrame()
#imp_cat = pd.DataFrame()
imp_xgb = pd.DataFrame()
imp_ens = pd.DataFrame()

# Per-fold LightGBM importance frames, concatenated once after the loop
imp_folds = []

# ---- Hyper-parameters (loop-invariant, so defined once) ----

# NOTE(review): LightGBM documents that bagging only takes effect when the bagging
# frequency (`subsample_freq`) is non-zero, so "subsample" likely has no effect
# here - confirm before tuning it.
lgb_params = {
                "objective": "regression_l1", # ="mae"
                "metric": "mae",
                "learning_rate": 0.03, # 0.01
                "n_estimators": 5000,
                "max_depth": 10,
                "num_leaves": 255,
                "reg_alpha": 0.1,
                "reg_lambda": 0.1,
                "subsample": 0.4
            }

hist_params = dict(loss = 'absolute_error',
                   l2_regularization = 0.01,
                   early_stopping = False,
                   learning_rate = 0.01,
                   max_iter = 1000,
                   max_depth = 15,
                   max_bins = 255,
                   min_samples_leaf = 30,
                   max_leaf_nodes = 30)

xgb_params = dict(objective = 'reg:pseudohubererror',
                  tree_method = 'hist',
                  colsample_bytree = 0.9,
                  gamma = 0.65,
                  learning_rate = 0.01,
                  max_depth = 7,
                  min_child_weight = 20,
                  n_estimators = 5000,
                  subsample = 0.7,
                  random_state = 42)

cat_params = dict(loss_function = 'MAE',
                  iterations = 1000,
                  learning_rate = 0.03,
                  depth = 10,
                  random_strength = 0.2,
                  bagging_temperature = 0.7,
                  border_count = 254,
                  l2_leaf_reg = 0.001,
                  verbose = False,
                  grow_policy = 'Lossguide',
                  task_type = 'CPU',
                  random_state = 42)

skf = KFold(n_splits = 10, random_state = 42, shuffle = True)

for i, (train_ix, test_ix) in enumerate(skf.split(X, Y)):

    X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
    Y_train, Y_test = Y.iloc[train_ix], Y.iloc[test_ix]

    print('=====================================================================')

    # Fit data, hold-out data and the competition test set, shared by every model of the fold
    fold_data = (X_train, Y_train, X_test, Y_test, test_eng)

    # ---- LightGBM ----
    lgb_md, lgb_pred_1, lgb_score_fold, lgb_pred_2 = _fit_and_predict_fold(
        LGBMRegressor(**lgb_params), *fold_data)
    lgb_cv_scores.append(lgb_score_fold)
    lgb_preds.append(lgb_pred_2)

    # Importance
    imp_folds.append(pd.DataFrame({"features": X.columns, "importance": lgb_md.feature_importances_}))

    print('Fold N°', i, '==> LightGBM -          MAE: ====>', lgb_score_fold)

    # ---- HistGradientBoosting ----
    hist_md, hist_pred_1, hist_score_fold, hist_pred_2 = _fit_and_predict_fold(
        HistGradientBoostingRegressor(**hist_params), *fold_data)
    hist_cv_scores.append(hist_score_fold)
    hist_preds.append(hist_pred_2)

    print('Fold N°', i, '==> HistGradient -       MAE: ====>', hist_score_fold)

    # ---- XGBoost ----
    xgb_md, xgb_pred_1, xgb_score_fold, xgb_pred_2 = _fit_and_predict_fold(
        XGBRegressor(**xgb_params), *fold_data,
        #eval_set = [(X_train, Y_train), (X_test, Y_test)],
        verbose=0)
    xgb_cv_scores.append(xgb_score_fold)
    xgb_preds.append(xgb_pred_2)

    print('Fold N°', i, '==> XGBoost -        MAE: ====>', xgb_score_fold)

    # ---- CatBoost ----
    cat_md, cat_pred_1, cat_score_fold, cat_pred_2 = _fit_and_predict_fold(
        CatBoostRegressor(**cat_params), *fold_data)
    cat_cv_scores.append(cat_score_fold)
    cat_preds.append(cat_pred_2)

    print('Fold N°', i, '==> CatBoost       - MAE: ====>', cat_score_fold)

    # ---- LAD Ensemble ----
    # The blender is trained on the rounded hold-out predictions of the four models
    x = pd.DataFrame({'hist': np.round(hist_pred_1.tolist()),
                      'lgb': np.round(lgb_pred_1.tolist()),
                      'xgb': np.round(xgb_pred_1.tolist()),
                      'cat': np.round(cat_pred_1.tolist())}
                    )
    y = Y_test

    x_test = pd.DataFrame({'hist': np.round(hist_pred_2.tolist()),
                           'lgb': np.round(lgb_pred_2.tolist()),
                           'xgb': np.round(xgb_pred_2.tolist()),
                           'cat': np.round(cat_pred_2.tolist())}
                         )

    lad_md = LADRegression().fit(x, y)

    # Validation
    # NOTE(review): the blender is scored on the same rows it was fitted on, so
    # this MAE is in-sample and not directly comparable with the base-model scores.
    lad_pred = lad_md.predict(x)
    ens_score = mean_absolute_error(y, lad_pred)
    ens_cv_scores.append(ens_score)

    #Predictions
    lad_pred_test = lad_md.predict(x_test)
    ens_preds.append(lad_pred_test)

    print('Fold N°', i, '==> LAD Model 1 ensemble - MAE: ====>', ens_score)

# LightGBM importances of all folds stacked in one frame (one row per feature per fold)
imp = pd.concat(imp_folds, axis=0, ignore_index=True)

In [None]:
# Display important features for LGBMRegressor:
# mean and standard deviation of the importance per feature, plus their ratio
imp = (
    imp.groupby("features")["importance"]
    .agg(["mean", "std"])
    .rename(columns={"mean": "importance", "std": "importance_std"})
    .assign(importance_cov=lambda stats_df: stats_df["importance_std"] / stats_df["importance"])
    .reset_index(drop=False)
)
display(imp.sort_values("importance", ascending=False, ignore_index=True))

In [None]:
# LightGBM scores: mean hold-out MAE over the CV folds
lgb_cv_score = np.mean(lgb_cv_scores)
print(f"Score on CV test data             ======> {lgb_cv_score}")
# lgb_md is the model fitted in the last CV fold only
preds_val = lgb_md.predict(X_val)
preds_val_score = mean_absolute_error(Y_val, preds_val)
print(f"Score on Valid data (unseen data) ======> {preds_val_score}")

In [None]:
# HistGB scores: mean hold-out MAE over the CV folds

hist_cv_score = np.mean(hist_cv_scores)
print(f"Score on CV test data             ======> {hist_cv_score}")

# hist_md is the model fitted in the last CV fold only
hist_preds_val = hist_md.predict(X_val)
hist_preds_val_score = mean_absolute_error(Y_val, hist_preds_val)
print(f"Score on Valid data (unseen data) ======> {hist_preds_val_score}")

In [None]:
# XGBoost scores: mean hold-out MAE over the CV folds

xgb_cv_score = np.mean(xgb_cv_scores)
print(f"Score on CV test data             ======> {xgb_cv_score}")

# xgb_md is the model fitted in the last CV fold only
xgb_preds_val = xgb_md.predict(X_val)
xgb_preds_val_score = mean_absolute_error(Y_val, xgb_preds_val)
print(f"Score on Valid data (unseen data) ======> {xgb_preds_val_score}")

<p style="padding:15px; background-color:#E55807; font-weight:bold; color:#FFD700; font-size:100%; border-radius: 15px 15px;">
    <b>💡 Saving the models : <br>
        We can use joblib to save the models we trained so we can use them later if needed.

</b>
</p> 

In [None]:
"""from joblib import dump, load
dump(lgb_md, 'lgb_md.joblib') 
dump(hist_md, 'hist_md.joblib') 
dump(xgb_md, 'xgb_md.joblib') 
dump(lad_md, 'lad_md.joblib') """

# <p style="padding:15px; background-color:#E55807; font-family:JetBrains Mono; font-weight:bold; color:#000000; font-size:100%; letter-spacing: 2px; text-align:center; border-radius: 15px 15px; border: 5px solid #000000">Submissions</p>  

In [None]:
submission = pd.read_csv("/kaggle/input/playground-series-s3e16/sample_submission.csv")

In [None]:
# Visualize models performances
# Each score is the mean of the per-fold MAE values collected in the CV loop.
# NOTE(review): the 'LAD Model' score is computed on the same rows the blender
# was fitted on, so it is not directly comparable with the other four.
hist_cv_score = np.mean(hist_cv_scores)
lgb_cv_score = np.mean(lgb_cv_scores)
xgb_cv_score = np.mean(xgb_cv_scores)
cat_cv_score = np.mean(cat_cv_scores)
ens_cv_score = np.mean(ens_cv_scores)


model_perf = pd.DataFrame({'Models': [ 'HistGradient' ,'LightGBM', 'XGBoost', 'CatBoost', 'LAD Model'],
                           'CV-scores': [ hist_cv_score, lgb_cv_score, xgb_cv_score, cat_cv_score, ens_cv_score]
                          })

# Horizontal bar chart with the score printed next to each bar
plt.figure(figsize = (8, 8))
ax = sns.barplot(y = 'Models', x = 'CV-scores', data = model_perf)
ax.bar_label(ax.containers[0]);

In [None]:
unique_targets = np.unique(train['Age'])
def mattop_post_process(preds, targets=None):
    """Snap every prediction to the nearest allowed target value.

    Parameters
    ----------
    preds : array-like of float
        Raw predictions.
    targets : array-like, optional
        Allowed values. Defaults to the module-level ``unique_targets``.

    Returns
    -------
    numpy.ndarray
        For each prediction, the candidate with the smallest absolute
        difference; on a tie the candidate that comes first in ``targets`` wins.
    """
    candidates = np.asarray(unique_targets if targets is None else targets)
    values = np.asarray(preds, dtype=float)

    # |prediction - candidate| for every pair, then the first minimum per prediction
    distances = np.abs(values[:, None] - candidates[None, :])
    return candidates[distances.argmin(axis=1)]

In [None]:
# Average the LAD ensemble test predictions over the folds (column-wise mean),
# then snap each value to the nearest target value with mattop_post_process
ens_preds_test = mattop_post_process(pd.DataFrame(ens_preds).apply(np.mean, axis = 0))

# Write the integer ages to the submission file
submission['Age'] = ens_preds_test.astype(int)
submission.to_csv('LAD_model.csv', index = False)