<a href="https://colab.research.google.com/github/Cralsic123/Model-selection-for-abalone-ring/blob/main/Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.3-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.8/78.8 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
from xgboost import XGBRegressor


In [None]:
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler

In [None]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle competitions download -c playground-series-s4e4

In [None]:
import zipfile
with zipfile.ZipFile('/content/playground-series-s4e4.zip', 'r') as zip_ref:
    zip_ref.extractall('./')



In [None]:
train_df = pd.read_csv("/content/train.csv")
test_df = pd.read_csv("/content/test.csv")

In [None]:
train_df.head()

In [None]:
train_df.info()


In [None]:
def transform_weight(df) :
    df['Weight_diff_1'] = df['Whole weight'] - df['Whole weight.1']
    df['Weight_diff_2'] = df['Whole weight'] - df['Whole weight.2'] - df['Shell weight']
    return df

train_df = transform_weight(train_df)
test_df = transform_weight(test_df)


In [None]:
pd.DataFrame(train_df.Rings.value_counts())

In [None]:
test_df.shape

In [None]:
numeric_cols = ['Length', 'Diameter', 'Height', 'Whole weight',
       'Whole weight.1', 'Whole weight.2', 'Shell weight',
       'Weight_diff_1', 'Weight_diff_2']
categorical_cols = ['Sex']
train_to_scale = train_df[numeric_cols]
test_to_scale = test_df[numeric_cols]

In [None]:
test_to_scale.shape

In [None]:
sns.set_style("whitegrid")
palette = sns.color_palette("hls", len(numeric_cols))
fig, axs = plt.subplots(len(numeric_cols), figsize=(20, len(numeric_cols)*6))

for i, col in enumerate(numeric_cols):
    sns.violinplot(x=train_to_scale[col], ax=axs[i], inner="quartile", palette=[palette[i]])
    axs[i].set_title('Feature Distribution for ' + col, fontsize=30)
    axs[i].set_xlabel(col, fontsize=26)
    axs[i].set_ylabel('Density', fontsize=26)
plt.tight_layout()
plt.show()

Feature distribution of Length and Diameter look same showing that the Longest shell measurement and measurement Perpendicular to length for an abalone are proportional

In [None]:
correlation_matrix = train_to_scale.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='Greens', fmt='.2f', linewidths=.5)
plt.title('Correlation Matrix (Lower Triangle)')
plt.show()

In [None]:
sc = StandardScaler()
def scale_num_cols(df) :
    scaled_df = pd.DataFrame(sc.fit_transform(df), columns=df.columns)
    return scaled_df

scaled_train = scale_num_cols(train_to_scale)
scaled_test = scale_num_cols(test_to_scale)

In [None]:
scaled_train.head()

ONE HOT ENCODING

In [None]:
train_to_ohe = train_df[categorical_cols]
test_to_ohe = test_df[categorical_cols]
train_to_ohe.head()

In [None]:
ohe_train = pd.get_dummies(train_to_ohe, columns =train_to_ohe.columns )
ohe_test  = pd.get_dummies(test_to_ohe, columns = train_to_ohe.columns)
ohe_train.head()

In [None]:
train_df_1 = pd.concat([ohe_train, scaled_train], axis =1)
test_df_1 = pd.concat([ohe_test, scaled_test], axis =1)
train_df_1.head()

In [None]:
test_df_1.shape

In [None]:
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
y = train_df['Rings']
X= train_df_1

In [None]:
X.columns.shape[0]

In [None]:
!pip install lightgbm

In [None]:
!pip install catboost

In [None]:

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import KFold

def train_model(model_type='xgboost', n_splits=10, X=X, y=y, x_test=test_df_1, model_params=None, use_gpu=False):
    n_splits = n_splits

    # KFold
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # cross-validation results
    cv_results = []
    y_test_preds = []

    y_val_preds = np.zeros((X.shape[0], 1))
    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):

        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        # Model
        if model_type == 'xgboost':
            if model_params is None:
                model_params = {}
            if use_gpu:
                model_params['device'] = 'cuda'
            model = XGBRegressor(**model_params)
        elif model_type == 'xgboostRMSLE':
            if model_params is None:
                model_params = {}
            if use_gpu:
                model_params['device'] = 'cuda'
            model = XGBRegressor(**model_params)
        elif model_type == 'lightgbm':
            if model_params is None:
                model_params = {}
            model = LGBMRegressor(**model_params)
        elif model_type == 'catboost':
            if model_params is None:
                model_params = {}
            if use_gpu:
                model_params['task_type'] = 'GPU'
            model = CatBoostRegressor(**model_params, verbose = 0)
        elif model_type == 'randomforest':
            if model_params is None:
                model_params = {}
            model = RandomForestRegressor(**model_params)
        elif model_type == 'histgradientboosting':
            if model_params is None:
                model_params = {}
            model = HistGradientBoostingRegressor(**model_params)
        elif model_type == 'neuralnetwork':
            if model_params is None:
                model_params = {}
            model = MLPRegressor(**model_params)

        model.fit(X_train, y_train)
        y_val_pred = model.predict(X_val)

        y_val_preds[val_idx] = y_val_pred.reshape(-1,1)

        y_test_pred = model.predict(x_test)
        y_test_preds.append(y_test_pred)

        # Evaluating the model
        mse = mean_squared_log_error(y_val, y_val_pred)
        rmse = np.sqrt(mse)

        print(f'Fold {fold + 1}, Root Mean Squared Logarithmic Error on Validation Set: {rmse}')
        print('-'*70)

        # results
        cv_results.append(rmse)

    # average cross-validation result
    average_cv_result = sum(cv_results) / n_splits
    print(f'\nAverage Root Mean Squared Logarithmic Error across {n_splits} folds: {average_cv_result}')

    return cv_results, model, y_val_preds, y_test_preds

FOR XG BOOST

In [None]:
best_params_xgb =  {'max_depth': 10,'random_state':42,'device': 'cuda','booster': 'gbtree','n_estimators': 1137, 'tree_method': 'hist','min_child_weight': 7,  'grow_policy': 'lossguide', 'gamma': 0.03816426816838989, 'subsample': 0.486382907668344, 'objective': 'reg:squarederror','reg_lambda': 1.7487237399420372, 'reg_alpha': 0.013043045359306716,'learning_rate': 0.011733966748427322, 'colsample_bytree': 0.5748511749872887,
}

In [None]:
cv_results_xgb, xgb_model, y_val_pred_xgb, y_test_pred_xgb = train_model(model_type='xgboost', n_splits=10, X=X, y=y, use_gpu=True, model_params = best_params_xgb)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Fold 1, Root Mean Squared Logarithmic Error on Validation Set: 0.14925396892770834
----------------------------------------------------------------------
Fold 2, Root Mean Squared Logarithmic Error on Validation Set: 0.15224837073064945
----------------------------------------------------------------------
Fold 3, Root Mean Squared Logarithmic Error on Validation Set: 0.14789578017335248
----------------------------------------------------------------------
Fold 4, Root Mean Squared Logarithmic Error on Validation Set: 0.1529757102016355
----------------------------------------------------------------------
Fold 5, Root Mean Squared Logarithmic Error on Validation Set: 0.1495646934787606
----------------------------------------------------------------------
Fold 6, Root Mean Squared Logarithmic Error on Validation Set: 0.14844357801223915
----------------------------------------------------------------------
Fold 7, Root Mean Squared Logarithmic Error on Validation Set: 0.1468901598394

In [None]:
predictions = xgb_model.predict(test_df_1)

In [None]:
predictions

array([ 9.896539,  9.473007, 10.117287, ..., 11.843922, 12.766056,
        9.069344], dtype=float32)

In [None]:
results_df_DL = pd.DataFrame({'id': test_df['id'], 'Rings': predictions.flatten()})
results_df_DL.to_csv("submission.csv", index=False)

Trying a different weights

In [None]:
best_params_xgb =  {'max_depth': 11,'random_state':42,'device': 'cuda','booster': 'gbtree','n_estimators': 1237, 'tree_method': 'hist','min_child_weight': 8,  'grow_policy': 'lossguide', 'gamma': 0.03999426816838989, 'subsample': 0.486382907668344, 'objective': 'reg:squarederror','reg_lambda': 1.9487237399420372, 'reg_alpha': 0.003043045359306716,'learning_rate': 0.010033966748427322, 'colsample_bytree': 0.5748511749872887,
}

In [None]:
cv_results_xgb, xgb_model, y_val_pred_xgb, y_test_pred_xgb = train_model(model_type='xgboost', n_splits=10, X=X, y=y, use_gpu=True, model_params = best_params_xgb)

Fold 1, Root Mean Squared Logarithmic Error on Validation Set: 0.14944908674040538
----------------------------------------------------------------------
Fold 2, Root Mean Squared Logarithmic Error on Validation Set: 0.15206277166938378
----------------------------------------------------------------------
Fold 3, Root Mean Squared Logarithmic Error on Validation Set: 0.14791356737825906
----------------------------------------------------------------------
Fold 4, Root Mean Squared Logarithmic Error on Validation Set: 0.15294533260992255
----------------------------------------------------------------------
Fold 5, Root Mean Squared Logarithmic Error on Validation Set: 0.1496616205546562
----------------------------------------------------------------------
Fold 6, Root Mean Squared Logarithmic Error on Validation Set: 0.14845323115583575
----------------------------------------------------------------------
Fold 7, Root Mean Squared Logarithmic Error on Validation Set: 0.146978245232

In [None]:
predictions1 = xgb_model.predict(test_df_1)

In [None]:
results_df_DL = pd.DataFrame({'id': test_df['id'], 'Rings': predictions1.flatten()})
results_df_DL.to_csv("submission_2.csv", index=False)

NOW LGBM


In [None]:
!pip install OpenCL

[31mERROR: Could not find a version that satisfies the requirement OpenCL (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for OpenCL[0m[31m
[0m

In [None]:
import lightgbm as lgb

model = lgb.LGBMRegressor(device='cpu')

In [None]:

best_params_lgbm = {'metric':'rmse', 'device':'gpu', 'verbosity': -1,'max_depth': 15,'random_state':42,'num_leaves': 138, 'n_estimators': 913, 'boosting_type': 'gbdt', 'min_child_samples': 34,
     'objective':'regression',
     'subsample_for_bin': 185680,
     'subsample': 0.799314727120346,
     'reg_alpha': 5.916235901972299e-09,
     'reg_lambda': 6.943912907338958e-08,
     'learning_rate': 0.01851440025520457,
     'colsample_bytree': 0.4339090795122026,
}

In [None]:
train_data = lgb.Dataset(X, label=y)

In [None]:
lgbm_model = model.train(best_params_lgbm, train_data , num_boost_round=100)

AttributeError: 'LGBMRegressor' object has no attribute 'train'

In [None]:
best_params_cb = {'depth': 15, 'max_bin': 464,'random_state':42,'task_type': 'CPU', 'eval_metric': 'RMSE', 'min_data_in_leaf': 78, 'loss_function': 'RMSE', 'grow_policy': 'Lossguide', 'bootstrap_type': 'Bernoulli', 'subsample': 0.83862137638162, 'l2_leaf_reg': 8.365422739510098, 'random_strength': 3.296124856352495, 'learning_rate': 0.09992185242598203,
}

In [None]:
cv_results_cb, catboost_model, y_val_pred_cb, y_test_pred_cb = train_model(model_type='catboost', n_splits=10, X=X, y=y, use_gpu=True, model_params = best_params_cb)

Fold 1, Root Mean Squared Logarithmic Error on Validation Set: 0.1487939046473861
----------------------------------------------------------------------
Fold 2, Root Mean Squared Logarithmic Error on Validation Set: 0.15185873891050983
----------------------------------------------------------------------
Fold 3, Root Mean Squared Logarithmic Error on Validation Set: 0.14804516477154814
----------------------------------------------------------------------
Fold 4, Root Mean Squared Logarithmic Error on Validation Set: 0.15303377543552646
----------------------------------------------------------------------
Fold 5, Root Mean Squared Logarithmic Error on Validation Set: 0.14943827091109965
----------------------------------------------------------------------
Fold 6, Root Mean Squared Logarithmic Error on Validation Set: 0.14863405079182676
----------------------------------------------------------------------
Fold 7, Root Mean Squared Logarithmic Error on Validation Set: 0.146732306642

In [None]:
predictions = catboost_model.predict(test_df_1)

In [None]:
predictions_net = (predictions + predictions1)/2

In [None]:
results_df_DL = pd.DataFrame({'id': test_df['id'], 'Rings': predictions_net.flatten()})
results_df_DL.to_csv("submission_3.csv", index=False)

In [None]:
best_params_rf  ={'n_estimators': 112,
 'max_depth': 10,
 'min_samples_split': 3,
 'min_samples_leaf': 3}

In [None]:
cv_results_rf, rfmodel, y_val_pred_rf, y_test_pred_rf = train_model(model_type='randomforest', n_splits=10, X=X, y=y, use_gpu=False, model_params = best_params_rf)

Fold 1, Root Mean Squared Logarithmic Error on Validation Set: 0.1518702421983524
----------------------------------------------------------------------
Fold 2, Root Mean Squared Logarithmic Error on Validation Set: 0.15455017496622162
----------------------------------------------------------------------
Fold 3, Root Mean Squared Logarithmic Error on Validation Set: 0.14981095241790263
----------------------------------------------------------------------
Fold 4, Root Mean Squared Logarithmic Error on Validation Set: 0.15443494380121128
----------------------------------------------------------------------
Fold 5, Root Mean Squared Logarithmic Error on Validation Set: 0.15126177840807856
----------------------------------------------------------------------
Fold 6, Root Mean Squared Logarithmic Error on Validation Set: 0.15050388892070407
----------------------------------------------------------------------
Fold 7, Root Mean Squared Logarithmic Error on Validation Set: 0.148486909166

MLP Regressor

In [None]:
best_params_mlpr ={'hidden_layer_sizes': (100,),
 'activation': 'logistic',
 'learning_rate_init': 0.03416272700029748,
 'max_iter': 906}

In [None]:
cv_results_nn, mlprmodel, y_val_pred_nn, y_test_pred_nn = train_model(model_type='neuralnetwork', n_splits=10, X=X, y=y, use_gpu=False, model_params = best_params_mlpr)

Fold 1, Root Mean Squared Logarithmic Error on Validation Set: 0.15300324627553388
----------------------------------------------------------------------
Fold 2, Root Mean Squared Logarithmic Error on Validation Set: 0.15617644818486193
----------------------------------------------------------------------
Fold 3, Root Mean Squared Logarithmic Error on Validation Set: 0.15246401984882385
----------------------------------------------------------------------
Fold 4, Root Mean Squared Logarithmic Error on Validation Set: 0.16308565070568956
----------------------------------------------------------------------
Fold 5, Root Mean Squared Logarithmic Error on Validation Set: 0.15511173430441982
----------------------------------------------------------------------
Fold 6, Root Mean Squared Logarithmic Error on Validation Set: 0.1518240659747843
----------------------------------------------------------------------
Fold 7, Root Mean Squared Logarithmic Error on Validation Set: 0.149150781507

In [None]:
best_params_xgbrsmle = {'max_depth': 10, 'random_state':42,
    'device': 'cuda',
    'booster': 'gbtree',
    'n_estimators': 1137,
    'tree_method': 'hist',
    'min_child_weight': 7,
    'grow_policy': 'lossguide',
    'gamma': 0.03816426816838989,
    'subsample': 0.486382907668344,
    'objective': 'reg:squaredlogerror',
    'reg_lambda': 1.7487237399420372,
    'reg_alpha': 0.013043045359306716,
    'learning_rate': 0.011733966748427322,
    'colsample_bytree': 0.5748511749872887,
}

In [None]:
cv_results_xgbrmsle, xgbrmslemodel, y_val_pred_xgbrmsle, y_test_pred_xgbrmsle = train_model(model_type='xgboostRMSLE', n_splits=10, X=X, y=y, use_gpu=True, model_params = best_params_xgbrsmle)

Fold 1, Root Mean Squared Logarithmic Error on Validation Set: 0.152719045512609
----------------------------------------------------------------------
Fold 2, Root Mean Squared Logarithmic Error on Validation Set: 0.15610986097639581
----------------------------------------------------------------------
Fold 3, Root Mean Squared Logarithmic Error on Validation Set: 0.15113234239084575
----------------------------------------------------------------------
Fold 4, Root Mean Squared Logarithmic Error on Validation Set: 0.1554569618326883
----------------------------------------------------------------------
Fold 5, Root Mean Squared Logarithmic Error on Validation Set: 0.15331953200554752
----------------------------------------------------------------------
Fold 6, Root Mean Squared Logarithmic Error on Validation Set: 0.15198692731935162
----------------------------------------------------------------------
Fold 7, Root Mean Squared Logarithmic Error on Validation Set: 0.14965581905307

HIST GRADIENT BOOSTING

In [None]:
best_params_hgb ={'max_iter': 720,
                  'max_depth': 10,
                  'learning_rate': 0.06454729355575252,
                  'min_samples_leaf': 14}

In [None]:
cv_results_hgb, hgbmodel, y_val_pred_hgb, y_test_pred_hgb = train_model(model_type='histgradientboosting', n_splits=10, X=X, y=y, use_gpu=False, model_params = best_params_hgb)

Fold 1, Root Mean Squared Logarithmic Error on Validation Set: 0.15044881912396812
----------------------------------------------------------------------
Fold 2, Root Mean Squared Logarithmic Error on Validation Set: 0.15448539281802626
----------------------------------------------------------------------
Fold 3, Root Mean Squared Logarithmic Error on Validation Set: 0.14922556477893117
----------------------------------------------------------------------
Fold 4, Root Mean Squared Logarithmic Error on Validation Set: 0.15433188287466718
----------------------------------------------------------------------
Fold 5, Root Mean Squared Logarithmic Error on Validation Set: 0.15105690460629617
----------------------------------------------------------------------
Fold 6, Root Mean Squared Logarithmic Error on Validation Set: 0.14941532194994855
----------------------------------------------------------------------
Fold 7, Root Mean Squared Logarithmic Error on Validation Set: 0.14734176611

Now applying optuna weights

In [None]:
from functools import partial

class OptunaWeights:
    def __init__(self, random_state, n_trials=5000):
        # Optuna study object which will hold the optimization results.
        self.study = None
        # Optimal weights for the predictions determined by Optuna.
        self.weights = None
        # Random state for reproducibility. This is used for the CMA-ES sampler.
        self.random_state = random_state
        # Number of trials for hyperparameter optimization. This is the number of different sets of weights that Optuna will try.
        self.n_trials = n_trials

    def _objective(self, trial, y_true, y_preds):
        # Defining the weights for the predictions from each model.
        # Optuna will suggest different weights to try in each trial.
        weights = [trial.suggest_float(f"weight{n}", 0, 1) for n in range(len(y_preds) - 1)]
        # The last weight is determined so that the sum of all weights is 1.
        weights.append(1 - sum(weights))

        # Calculating the weighted prediction. This is the average of the predictions from each model,
        # with each prediction being weighted by the weights determined by Optuna.
        weighted_pred = np.average(np.array(y_preds), axis=0, weights=weights)

        y_true, weighted_pred = np.abs(y_true), np.abs(weighted_pred)
        # Calculating the root mean squared logarithmic error (RMSLE) of the weighted prediction.
        # This is the metric that Optuna is trying to minimize for our task
        rmsle = np.sqrt(mean_squared_log_error(y_true, weighted_pred))

        return rmsle

    def fit(self, y_true, y_preds):
        # Setting Optuna logging verbosity to ERROR to avoid cluttering the output with logs.
        optuna.logging.set_verbosity(optuna.logging.ERROR)

        # Creating a CMA-ES sampler for hyperparameter optimization.
        # This is a type of sampler that uses the Covariance Matrix Adaptation Evolution Strategy,
        # which is good for optimizing continuous variables, like our weights.
        sampler = optuna.samplers.CmaEsSampler(seed=self.random_state)

        # Creating a Hyperband pruner for early stopping.
        # This will stop trials early if they are not promising, which can save time.
        pruner = optuna.pruners.HyperbandPruner()

        # Creating an Optuna study with the specified sampler, pruner, and objective direction.
        # The objective direction is 'minimize' because we want to minimize the RMSLE.
        self.study = optuna.create_study(sampler=sampler, pruner=pruner, study_name="OptunaWeights",
                                         direction='minimize')

        # Creating a partial function for the objective with y_true and y_preds as fixed arguments.
        # This is necessary because Optuna's optimize method only accepts a function with one argument, the trial.
        objective_partial = partial(self._objective, y_true=y_true, y_preds=y_preds)

        # Optimizing the study with the specified number of trials.
        # The progress bar will show the progress of the optimization.
        self.study.optimize(objective_partial, n_trials=self.n_trials, show_progress_bar=True)

        # Extracting the best weights from the study's best parameters.
        # These are the weights that gave the lowest RMSLE in the trials.
        weights = [self.study.best_params[f"weight{n}"] for n in range(len(y_preds) - 1)]
        # The last weight is determined so that the sum of all weights is 1.
        weights.append(1 - sum(weights))

        # Store the optimal weights in the weights attribute.
        self.weights = weights


# Creating an instance of OptunaWeights with random state 42.
# This instance can be used to find the optimal weights for combining the predictions from different models.
ow = OptunaWeights(1)

In [None]:
!pip install cmaes -q

In [None]:
# Fitting the OptunaWeights object with the true target values and out-of-fold predictions from multiple models
ow.fit(train_df['Rings'], y_preds=[y_val_pred_xgb, y_val_pred_cb,y_val_pred_rf,y_val_pred_nn,y_val_pred_xgbrmsle ])

  0%|          | 0/5000 [00:00<?, ?it/s]

In [None]:
# Getting the optimal weights
weights = ow.weights
lst_optuna_weights = np.asarray(weights)
sum_lst_optuna_weights = np.sum(lst_optuna_weights)
print(sum_lst_optuna_weights)
lst_optuna_weights, sum(lst_optuna_weights)

1.0


(array([0.15100683, 0.55096443, 0.03348257, 0.03249783, 0.23204834]), 1.0)

In [None]:
sorted_weights = sorted(list(lst_optuna_weights))
sorted_weights

[0.032497833534150096,
 0.03348256638068388,
 0.15100683490564482,
 0.2320483379522802,
 0.5509644272272409]

In [None]:
# Fitting the OptunaWeights object with the true target values and out-of-fold predictions from multiple models
ow.fit(train_df['Rings'], y_preds=[y_val_pred_xgb,y_val_pred_xgbrmsle])

  0%|          | 0/5000 [00:00<?, ?it/s]

In [None]:
weights = ow.weights
lst_optuna_weights = np.asarray(weights)
sum_lst_optuna_weights = np.sum(lst_optuna_weights)
print(sum_lst_optuna_weights)
lst_optuna_weights, sum(lst_optuna_weights)


1.0


(array([0.74852032, 0.25147968]), 1.0)

Using optimized weights to average the prediction

In [None]:
predictions_1 = np.mean(y_test_pred_xgb, axis=0)
predictions_2 = np.mean(y_test_pred_cb, axis=0)
predictions_3 = np.mean(y_test_pred_rf, axis=0)
predictions_4 = np.mean(y_test_pred_hgb, axis=0)
predictions_5 = np.mean(y_test_pred_nn, axis=0)
# predictions_6 = np.mean(y_test_pred_xgbrmsle, axis=0)

In [None]:
predictions_1 = ((lst_optuna_weights[0]) * predictions_1 +
        (lst_optuna_weights[1]) * predictions_2 +
        (lst_optuna_weights[2]) * predictions_3
                 )

IndexError: index 2 is out of bounds for axis 0 with size 2

In [None]:
ow_3 = OptunaWeights(42)
ow_3.fit(train_df['Rings'], y_preds=[y_val_pred_rf, y_val_pred_hgb,y_val_pred_nn])

weights = ow_3.weights
lst_optuna_weights = np.asarray(weights)
sum_lst_optuna_weights = np.sum(lst_optuna_weights)

  0%|          | 0/5000 [00:00<?, ?it/s]

In [None]:
lst_optuna_weights

array([0.3279636 , 0.57673797, 0.09529843])

In [None]:
predictions_3 =      ((lst_optuna_weights[0]) * predictions_3+
               (lst_optuna_weights[1]) * predictions_4+
               (lst_optuna_weights[2]) * predictions_5  )

In [None]:
predictions_3

array([ 9.86464229,  9.66938267, 10.08108997, ..., 12.40344676,
       13.00467386,  8.74953755])

VOTING REGRESSOR

In [None]:
from sklearn.ensemble import VotingRegressor

In [None]:
test_df_1.shape

(60411, 12)

In [None]:
voting = VotingRegressor([('xgb', xgb_model), ('hgb', hgbmodel), ('rmsle', xgbrmslemodel)], weights=list(lst_optuna_weights))

voting.fit(train_df_1, y)
predictions_2 = voting.predict(test_df_1)

In [None]:
predictions_2

array([ 9.69437118,  9.58302179, 10.13693052, ..., 12.23437374,
       13.28801781,  8.69648518])

In [None]:
predictions_3.shape

(60411,)

In [None]:
predictions = (predictions_1 + predictions_2 + predictions_3)/3
predictions

array([ 9.83082616,  9.56882199, 10.09203149, ..., 12.21043849,
       13.18202478,  8.77045496])

In [None]:
y_pred = pd.DataFrame(predictions)
y_pred.columns = ['Rings']
y_pred.head()

Unnamed: 0,Rings
0,9.830826
1,9.568822
2,10.092031
3,10.27974
4,7.505324


In [None]:
ids = test_df['id']

In [None]:
submission_df = pd.DataFrame()
submission_df = y_pred
submission_df['id'] = ids
submission_df['id'] = submission_df['id'].apply(lambda x : int(x))
submission_df.head()

Unnamed: 0,Rings,id
0,9.830826,90615
1,9.568822,90616
2,10.092031,90617
3,10.27974,90618
4,7.505324,90619


In [None]:
submission_df.to_csv('submission.csv', index= False)

In [None]:
df1 = pd.read_csv('submission.csv')

In [None]:
df2 = pd.read_csv('test.csv')

In [None]:
len(df1)

60411

In [None]:
len(df2)

60411

In [None]:
df2.tail()

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight
60406,151021,I,0.345,0.26,0.085,0.1775,0.0735,0.0265,0.05
60407,151022,F,0.525,0.41,0.145,0.8445,0.3885,0.167,0.205
60408,151023,I,0.59,0.44,0.155,1.122,0.393,0.2,0.265
60409,151024,F,0.66,0.525,0.19,1.4935,0.5885,0.3575,0.435
60410,151025,F,0.43,0.34,0.12,0.415,0.1525,0.091,0.0905
