In [None]:
import os
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.multioutput import MultiOutputRegressor
from tqdm.notebook import tqdm

from google.colab import drive


# Make the Google Drive contents readable under /content/drive.
drive.mount('/content/drive')

# Unpack the zipped dataset into the current working directory.
path_to_zip = '/content/drive/My Drive/data/database.csv.zip'
archive = zipfile.ZipFile(path_to_zip, 'r')
try:
    archive.extractall()
finally:
    archive.close()

Mounted at /content/drive


# Load data

In [None]:
# Read the extracted CSV into a DataFrame and preview its first rows.
data = pd.read_csv('database.csv')
data.head()

Unnamed: 0,Date,Time,Latitude,Longitude,Type,Depth,Depth Error,Depth Seismic Stations,Magnitude,Magnitude Type,...,Magnitude Seismic Stations,Azimuthal Gap,Horizontal Distance,Horizontal Error,Root Mean Square,ID,Source,Location Source,Magnitude Source,Status
0,01/02/1965,13:44:18,19.246,145.616,Earthquake,131.6,,,6.0,MW,...,,,,,,ISCGEM860706,ISCGEM,ISCGEM,ISCGEM,Automatic
1,01/04/1965,11:29:49,1.863,127.352,Earthquake,80.0,,,5.8,MW,...,,,,,,ISCGEM860737,ISCGEM,ISCGEM,ISCGEM,Automatic
2,01/05/1965,18:05:58,-20.579,-173.972,Earthquake,20.0,,,6.2,MW,...,,,,,,ISCGEM860762,ISCGEM,ISCGEM,ISCGEM,Automatic
3,01/08/1965,18:49:43,-59.076,-23.557,Earthquake,15.0,,,5.8,MW,...,,,,,,ISCGEM860856,ISCGEM,ISCGEM,ISCGEM,Automatic
4,01/09/1965,13:32:50,11.938,126.427,Earthquake,15.0,,,5.8,MW,...,,,,,,ISCGEM860890,ISCGEM,ISCGEM,ISCGEM,Automatic


In [None]:
# Column dtypes and non-null counts of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23412 entries, 0 to 23411
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Date                        23412 non-null  object 
 1   Time                        23412 non-null  object 
 2   Latitude                    23412 non-null  float64
 3   Longitude                   23412 non-null  float64
 4   Type                        23412 non-null  object 
 5   Depth                       23412 non-null  float64
 6   Depth Error                 4461 non-null   float64
 7   Depth Seismic Stations      7097 non-null   float64
 8   Magnitude                   23412 non-null  float64
 9   Magnitude Type              23409 non-null  object 
 10  Magnitude Error             327 non-null    float64
 11  Magnitude Seismic Stations  2564 non-null   float64
 12  Azimuthal Gap               7299 non-null   float64
 13  Horizontal Distance         160

In [None]:
# Join the 'Date' and 'Time' strings and parse them in one vectorised call.
# Rows that do not match the expected layout become NaT rather than a string
# sentinel, so the resulting column stays numeric.
parsed = pd.to_datetime(
    data['Date'] + ' ' + data['Time'],
    format='%m/%d/%Y %H:%M:%S',
    errors='coerce')

# Seconds since the Unix epoch. Computed by plain datetime arithmetic so the
# value does not depend on the machine's local timezone (time.mktime does);
# rows that failed to parse are NaN.
data['Timestamp'] = (parsed - pd.Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)

# Drop the raw string columns and keep only rows with a usable timestamp.
# .copy() detaches the result so later column assignments do not trigger
# SettingWithCopyWarning.
final_data = data.drop(['Date', 'Time'], axis=1)
final_data = final_data[final_data['Timestamp'].notna()].copy()
final_data.head()

Unnamed: 0,Latitude,Longitude,Type,Depth,Depth Error,Depth Seismic Stations,Magnitude,Magnitude Type,Magnitude Error,Magnitude Seismic Stations,Azimuthal Gap,Horizontal Distance,Horizontal Error,Root Mean Square,ID,Source,Location Source,Magnitude Source,Status,Timestamp
0,19.246,145.616,Earthquake,131.6,,,6.0,MW,,,,,,,ISCGEM860706,ISCGEM,ISCGEM,ISCGEM,Automatic,-157630542.0
1,1.863,127.352,Earthquake,80.0,,,5.8,MW,,,,,,,ISCGEM860737,ISCGEM,ISCGEM,ISCGEM,Automatic,-157465811.0
2,-20.579,-173.972,Earthquake,20.0,,,6.2,MW,,,,,,,ISCGEM860762,ISCGEM,ISCGEM,ISCGEM,Automatic,-157355642.0
3,-59.076,-23.557,Earthquake,15.0,,,5.8,MW,,,,,,,ISCGEM860856,ISCGEM,ISCGEM,ISCGEM,Automatic,-157093817.0
4,11.938,126.427,Earthquake,15.0,,,5.8,MW,,,,,,,ISCGEM860890,ISCGEM,ISCGEM,ISCGEM,Automatic,-157026430.0


In [None]:
# 'Timestamp' holds epoch *seconds*. Without unit='s' pandas reads plain
# numbers as nanoseconds, which collapsed every row onto
# 1969-12-31 23:59:59.xxx and made the derived year/month/day features
# constant.
timestamp_col = final_data['Timestamp']
if not pd.api.types.is_datetime64_any_dtype(timestamp_col):
    # to_numeric also covers an object-dtype column holding Python floats.
    timestamp_col = pd.to_datetime(pd.to_numeric(timestamp_col), unit='s')

# assign() returns a new frame, so re-running this cell is harmless.
final_data = final_data.assign(Timestamp=timestamp_col)
final_data.sort_values('Timestamp').head()

Unnamed: 0,Latitude,Longitude,Type,Depth,Depth Error,Depth Seismic Stations,Magnitude,Magnitude Type,Magnitude Error,Magnitude Seismic Stations,Azimuthal Gap,Horizontal Distance,Horizontal Error,Root Mean Square,ID,Source,Location Source,Magnitude Source,Status,Timestamp
0,19.246,145.616,Earthquake,131.6,,,6.0,MW,,,,,,,ISCGEM860706,ISCGEM,ISCGEM,ISCGEM,Automatic,1969-12-31 23:59:59.842369458
1,1.863,127.352,Earthquake,80.0,,,5.8,MW,,,,,,,ISCGEM860737,ISCGEM,ISCGEM,ISCGEM,Automatic,1969-12-31 23:59:59.842534189
2,-20.579,-173.972,Earthquake,20.0,,,6.2,MW,,,,,,,ISCGEM860762,ISCGEM,ISCGEM,ISCGEM,Automatic,1969-12-31 23:59:59.842644358
3,-59.076,-23.557,Earthquake,15.0,,,5.8,MW,,,,,,,ISCGEM860856,ISCGEM,ISCGEM,ISCGEM,Automatic,1969-12-31 23:59:59.842906183
4,11.938,126.427,Earthquake,15.0,,,5.8,MW,,,,,,,ISCGEM860890,ISCGEM,ISCGEM,ISCGEM,Automatic,1969-12-31 23:59:59.842973570


In [None]:
# Split the timestamp into one column per calendar/clock component.
# The DatetimeIndex is built once and each component is read off it.
stamp_index = pd.DatetimeIndex(final_data['Timestamp'])
for part in ('year', 'month', 'day', 'hour', 'minute', 'second', 'microsecond'):
    final_data[part] = getattr(stamp_index, part)

final_data.head()

Unnamed: 0,Latitude,Longitude,Type,Depth,Depth Error,Depth Seismic Stations,Magnitude,Magnitude Type,Magnitude Error,Magnitude Seismic Stations,...,Magnitude Source,Status,Timestamp,year,month,day,hour,minute,second,microsecond
0,19.246,145.616,Earthquake,131.6,,,6.0,MW,,,...,ISCGEM,Automatic,1969-12-31 23:59:59.842369458,1969,12,31,23,59,59,842369
1,1.863,127.352,Earthquake,80.0,,,5.8,MW,,,...,ISCGEM,Automatic,1969-12-31 23:59:59.842534189,1969,12,31,23,59,59,842534
2,-20.579,-173.972,Earthquake,20.0,,,6.2,MW,,,...,ISCGEM,Automatic,1969-12-31 23:59:59.842644358,1969,12,31,23,59,59,842644
3,-59.076,-23.557,Earthquake,15.0,,,5.8,MW,,,...,ISCGEM,Automatic,1969-12-31 23:59:59.842906183,1969,12,31,23,59,59,842906
4,11.938,126.427,Earthquake,15.0,,,5.8,MW,,,...,ISCGEM,Automatic,1969-12-31 23:59:59.842973570,1969,12,31,23,59,59,842973


In [None]:
# Column dtypes and non-null counts after adding the time components.
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23409 entries, 0 to 23411
Data columns (total 27 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   Latitude                    23409 non-null  float64       
 1   Longitude                   23409 non-null  float64       
 2   Type                        23409 non-null  object        
 3   Depth                       23409 non-null  float64       
 4   Depth Error                 4460 non-null   float64       
 5   Depth Seismic Stations      7096 non-null   float64       
 6   Magnitude                   23409 non-null  float64       
 7   Magnitude Type              23406 non-null  object        
 8   Magnitude Error             327 non-null    float64       
 9   Magnitude Seismic Stations  2564 non-null   float64       
 10  Azimuthal Gap               7298 non-null   float64       
 11  Horizontal Distance         1604 non-null   float64   

In [None]:
# Columns carried forward: coordinates, the two targets (Depth, Magnitude),
# the categorical metadata, Root Mean Square and the time components.
kept_columns = [
    'Latitude', 'Longitude', 'Type', 'Depth', 'Magnitude',
    'Magnitude Type', 'Root Mean Square', 'Source',
    'Location Source', 'Magnitude Source', 'Status',
    'year', 'month', 'day', 'hour', 'minute', 'second', 'microsecond',
]
final_data = final_data.loc[:, kept_columns]

In [None]:
# Preview the reduced frame.
final_data.head()

Unnamed: 0,Latitude,Longitude,Type,Depth,Magnitude,Magnitude Type,Root Mean Square,Source,Location Source,Magnitude Source,Status,year,month,day,hour,minute,second,microsecond
0,19.246,145.616,Earthquake,131.6,6.0,MW,,ISCGEM,ISCGEM,ISCGEM,Automatic,1969,12,31,23,59,59,842369
1,1.863,127.352,Earthquake,80.0,5.8,MW,,ISCGEM,ISCGEM,ISCGEM,Automatic,1969,12,31,23,59,59,842534
2,-20.579,-173.972,Earthquake,20.0,6.2,MW,,ISCGEM,ISCGEM,ISCGEM,Automatic,1969,12,31,23,59,59,842644
3,-59.076,-23.557,Earthquake,15.0,5.8,MW,,ISCGEM,ISCGEM,ISCGEM,Automatic,1969,12,31,23,59,59,842906
4,11.938,126.427,Earthquake,15.0,5.8,MW,,ISCGEM,ISCGEM,ISCGEM,Automatic,1969,12,31,23,59,59,842973


In [None]:
class TargetEncoder(BaseEstimator, TransformerMixin):
    """Mean-target encoder for categorical columns.

    Every selected column is replaced by the mean of the target over the
    rows that share the same category value. Values that were not seen
    during ``fit`` (including missing values) are encoded as NaN.

    Attributes set by ``fit``
    -------------------------
    cols_ : list of str
        Columns that were actually encoded.
    maps : dict
        ``{column: {category: mean target}}``.
    """

    def __init__(self, cols=None):
        """Create the encoder.

        Parameters
        ----------
        cols : str or list of str, optional
            Column(s) to encode. ``None`` (default) encodes every column of
            the fitted DataFrame whose dtype is ``object``.
        """
        if isinstance(cols, str):
            self.cols = [cols]
        else:
            self.cols = cols

    def fit(self, X, y):
        """Learn the per-category target means.

        Parameters
        ----------
        X : pandas DataFrame, shape [n_samples, n_columns]
            DataFrame containing the columns to encode.
        y : pandas Series or array-like, shape [n_samples]
            Target values, row-aligned with ``X``.

        Returns
        -------
        self : TargetEncoder

        Raises
        ------
        ValueError
            If ``y`` is missing or a requested column is not in ``X``.
        """
        if y is None:
            raise ValueError('TargetEncoder needs target values y to fit')

        # Resolve the columns locally so the constructor parameter is not
        # overwritten; a later fit on another frame re-detects its columns.
        if self.cols is None:
            cols = [col for col in X if str(X[col].dtype) == 'object']
        else:
            cols = list(self.cols)

        for col in cols:
            if col not in X:
                raise ValueError('Column \''+col+'\' not in X')

        # Plain arrays/lists carry no index; line them up with X's rows.
        if not isinstance(y, pd.Series):
            y = pd.Series(np.asarray(y), index=X.index)

        # One grouped pass per column instead of one boolean mask per
        # category value.
        self.cols_ = cols
        self.maps = {col: y.groupby(X[col]).mean().to_dict() for col in cols}
        return self

    def transform(self, X, y=None):
        """Replace each fitted column by its learned target means.

        Parameters
        ----------
        X : pandas DataFrame, shape [n_samples, n_columns]
            DataFrame containing the columns to encode.
        y : ignored

        Returns
        -------
        pandas DataFrame
            Copy of ``X`` with the encoded columns as floats; categories
            absent from the fitted maps become NaN.
        """
        Xo = X.copy()
        for col, tmap in self.maps.items():
            Xo[col] = X[col].map(tmap).astype(float)
        return Xo

    def fit_transform(self, X, y=None):
        """Fit on ``(X, y)`` and return the encoded ``X``.

        Parameters
        ----------
        X : pandas DataFrame, shape [n_samples, n_columns]
            DataFrame containing the columns to encode.
        y : pandas Series or array-like, shape [n_samples]
            Target values (required despite the ``None`` default).

        Returns
        -------
        pandas DataFrame
            Copy of ``X`` with the encoded columns.
        """
        return self.fit(X, y).transform(X, y)

In [None]:
# Target-encode every categorical column twice: once against Magnitude and
# once against Depth. The encoded columns are appended and the raw
# categorical columns removed.
# NOTE(review): both encoders are fitted on all rows of final_data, before
# any train/test split, so held-out rows contribute to their own encoded
# features. Confirm whether this leakage is acceptable.
cat_cols = ['Type', 'Magnitude Type', 'Source', 'Location Source',
            'Magnitude Source', 'Status']

enc_cols_magnitude = [f'{name}_enc_magnitude' for name in cat_cols]
enc_cols_depth = [f'{name}_enc_depth' for name in cat_cols]

te = TargetEncoder()
magnitude_encoded = te.fit_transform(final_data[cat_cols].copy(),
                                     final_data['Magnitude'].copy())
te = TargetEncoder()
depth_encoded = te.fit_transform(final_data[cat_cols].copy(),
                                 final_data['Depth'].copy())

final_data[enc_cols_magnitude] = magnitude_encoded
final_data[enc_cols_depth] = depth_encoded

final_data = final_data.drop(columns=cat_cols)

In [None]:
# Preview the frame with the encoded columns in place.
final_data.head()

Unnamed: 0,Latitude,Longitude,Depth,Magnitude,Root Mean Square,year,month,day,hour,minute,...,Source_enc_magnitude,Location Source_enc_magnitude,Magnitude Source_enc_magnitude,Status_enc_magnitude,Type_enc_depth,Magnitude Type_enc_depth,Source_enc_depth,Location Source_enc_depth,Magnitude Source_enc_depth,Status_enc_depth
0,19.246,145.616,131.6,6.0,,1969,12,31,23,59,...,6.007805,6.008524,6.007481,6.005616,71.294448,77.03974,68.54187,70.251918,70.267403,69.443653
1,1.863,127.352,80.0,5.8,,1969,12,31,23,59,...,6.007805,6.008524,6.007481,6.005616,71.294448,77.03974,68.54187,70.251918,70.267403,69.443653
2,-20.579,-173.972,20.0,6.2,,1969,12,31,23,59,...,6.007805,6.008524,6.007481,6.005616,71.294448,77.03974,68.54187,70.251918,70.267403,69.443653
3,-59.076,-23.557,15.0,5.8,,1969,12,31,23,59,...,6.007805,6.008524,6.007481,6.005616,71.294448,77.03974,68.54187,70.251918,70.267403,69.443653
4,11.938,126.427,15.0,5.8,,1969,12,31,23,59,...,6.007805,6.008524,6.007481,6.005616,71.294448,77.03974,68.54187,70.251918,70.267403,69.443653


In [None]:
# Column dtypes and non-null counts after encoding.
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23409 entries, 0 to 23411
Data columns (total 24 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Latitude                        23409 non-null  float64
 1   Longitude                       23409 non-null  float64
 2   Depth                           23409 non-null  float64
 3   Magnitude                       23409 non-null  float64
 4   Root Mean Square                17350 non-null  float64
 5   year                            23409 non-null  int64  
 6   month                           23409 non-null  int64  
 7   day                             23409 non-null  int64  
 8   hour                            23409 non-null  int64  
 9   minute                          23409 non-null  int64  
 10  second                          23409 non-null  int64  
 11  microsecond                     23409 non-null  int64  
 12  Type_enc_magnitude              

In [None]:
# Full list of column names in the prepared frame.
final_data.columns.tolist()

['Latitude',
 'Longitude',
 'Depth',
 'Magnitude',
 'Root Mean Square',
 'year',
 'month',
 'day',
 'hour',
 'minute',
 'second',
 'microsecond',
 'Type_enc_magnitude',
 'Magnitude Type_enc_magnitude',
 'Source_enc_magnitude',
 'Location Source_enc_magnitude',
 'Magnitude Source_enc_magnitude',
 'Status_enc_magnitude',
 'Type_enc_depth',
 'Magnitude Type_enc_depth',
 'Source_enc_depth',
 'Location Source_enc_depth',
 'Magnitude Source_enc_depth',
 'Status_enc_depth']

In [None]:
# Persist the prepared frame. index=False keeps the (gapped) row index out of
# the file, so reading it back does not produce a spurious 'Unnamed: 0'
# column.
final_data.to_csv('/content/final_data.csv', index=False)

In [None]:
# Reload the prepared frame from disk; read_csv assigns a fresh 0..n-1 index.
final_data = pd.read_csv('/content/final_data.csv')

In [None]:
# Feature matrix: coordinates, Root Mean Square, the time components and both
# sets of target-encoded categoricals. Targets: Magnitude and Depth.
encoded_sources = ['Type', 'Magnitude Type', 'Source', 'Location Source',
                   'Magnitude Source', 'Status']
feature_columns = (
    ['Latitude', 'Longitude', 'Root Mean Square']
    + ['year', 'month', 'day', 'hour', 'minute', 'second', 'microsecond']
    + [f'{name}_enc_magnitude' for name in encoded_sources]
    + [f'{name}_enc_depth' for name in encoded_sources]
)
target_columns = ['Magnitude', 'Depth']

X = final_data[feature_columns]
y = final_data[target_columns]

In [None]:
# Hold-out split by row position: the first TRAIN_ROWS rows train, the rest
# test. iloc slices are half-open, so no row lands in both sets; the previous
# label slices .loc[0:19000] / .loc[19000:] are inclusive on both ends and
# put row 19000 into train *and* test.
TRAIN_ROWS = 19000

X_train, X_test = X.iloc[:TRAIN_ROWS], X.iloc[TRAIN_ROWS:]
y_train, y_test = y.iloc[:TRAIN_ROWS], y.iloc[TRAIN_ROWS:]

assert len(X_train) + len(X_test) == len(X), 'train/test must partition X'
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(19001, 22) (19001, 2) (4409, 22) (4409, 2)


# XGBoost model class

In [None]:
class GBModel:
  """Seed ensemble of XGBoost regressors whose predictions are averaged.

  ``n_models`` boosters are trained on the same data, one per entry of
  ``params_list``. The number of boosting rounds comes from early stopping
  on a validation set (``train`` with ``x_valid``) or from ``cv``.
  ``predict`` returns the mean of the members' predictions.
  """

  def __init__(
      self,
      n_models=1,
      early_stopping_rounds=40,
      use_cv_train=True,
      cv_folds=5,
      params_list=None,
      random_seed=42,
      model_name="default_gb_model",
      save_models_path=None):
    """Configure the ensemble.

    Parameters
    ----------
    n_models : int
        Number of boosters in the ensemble.
    early_stopping_rounds : int
        Patience passed to ``xgb.train`` / ``xgb.cv``.
    use_cv_train : bool
        If True, ``fit`` runs ``cv`` first to choose the number of trees.
    cv_folds : int
        Number of folds used by ``cv``.
    params_list : list of dict, optional
        One XGBoost parameter dict per model. Each must carry the model's
        seed under ``'random_seed'`` (or ``'seed'``). If omitted, default
        parameters with seeds derived from ``random_seed`` are generated.
    random_seed : int
        Seed for NumPy's global RNG in ``train`` and for generating the
        default per-model seeds.
    model_name : str
        Prefix of the file names used when saving/loading boosters.
    save_models_path : str, optional
        Path prefix for saved boosters; nothing is saved when falsy.

    Raises
    ------
    ValueError
        If ``params_list`` has fewer than ``n_models`` entries.
    """
    self.random_seed = random_seed
    self.n_models = n_models

    self.boosting_regressor = xgb
    self.boosters = [None for _ in range(self.n_models)]
    self.customize_data = xgb.DMatrix
    self.customize_data_predict = xgb.DMatrix
    self.train_rmse_mean = 'train-rmse-mean'
    self.eval_params = lambda dtrain, dvalid: {
        "evals": [(dtrain, 'train'), (dvalid, 'valid')]
    }

    if params_list is None:
      # Draw the per-model seeds from a private, seeded generator so two
      # instances built with the same random_seed get the same seeds.
      rng = np.random.RandomState(self.random_seed)
      self.seeds = [
          int(s)
          for s in rng.choice(range(1000), self.n_models, replace=False)
      ]
      self.params_list = [
          self._with_xgb_seed({
              'eta': 0.1,
              'eval_metric': 'rmse',
              'max_depth': 4,
              'nthread': 8,
              'objective': 'reg:squarederror',
              'tree_method': 'hist',
              'random_seed': seed,
          })
          for seed in self.seeds
      ]
    else:
      if len(params_list) < self.n_models:
        raise ValueError(
            f'params_list has {len(params_list)} entries but '
            f'n_models={self.n_models}')
      self.params_list = [self._with_xgb_seed(p) for p in params_list]
      self.seeds = [
          p['random_seed'] if 'random_seed' in p else p['seed']
          for p in self.params_list[:self.n_models]
      ]

    self.boost_type = 'xgb'
    self.early_stopping_rounds = early_stopping_rounds
    self.use_cv_train = use_cv_train
    self.cv_folds = cv_folds
    self.model_name = model_name
    self.save_models_path = save_models_path
    # False until cv() (or train() without validation data) fills it in.
    self.num_trees_list = False
    # One evaluation-history dict per model trained with a validation set.
    self.evals = []

  @staticmethod
  def _with_xgb_seed(params):
    """Return a copy of ``params`` with XGBoost's own ``'seed'`` key set.

    XGBoost names its RNG seed ``'seed'``; a value given only under
    ``'random_seed'`` is not picked up by the booster. The caller's dict is
    left untouched and an explicit ``'seed'`` always wins.
    """
    params = dict(params)
    if 'seed' not in params and 'random_seed' in params:
      params['seed'] = params['random_seed']
    return params

  def train(
      self,
      x_train,
      y_train,
      x_valid=None,
      y_valid=None,):
    """Train every booster of the ensemble.

    With ``x_valid`` each booster is trained with early stopping and its
    evaluation history is appended to ``self.evals``. Without it each
    booster runs for ``self.num_trees_list[n]`` rounds (10000 if ``cv`` has
    not been run). Boosters are saved when ``save_models_path`` is set.
    """
    np.random.seed(self.random_seed)
    dtrain = self.customize_data(x_train, label=y_train)

    if x_valid is not None:
      dvalid = self.customize_data(x_valid,
                                   label=y_valid)
      print(f"Training {self.n_models} models:")
      for n in tqdm(range(self.n_models)):
        _evals = {}
        self.boosters[n] = self.boosting_regressor.train(
            self.params_list[n],
            dtrain,
            **self.eval_params(dtrain, dvalid),
            verbose_eval=False,
            evals_result=_evals,
            num_boost_round=10000,
            early_stopping_rounds=self.early_stopping_rounds)
        # Record the history inside the loop: one entry per model, not just
        # the last one.
        self.evals.append(_evals)

        self._save_bst_model(
            bst=self.boosters[n],
            bst_model_name=f'{self.model_name}_seed_{self.seeds[n]}',
            save_path=self.save_models_path)

    else:
      if self.num_trees_list is False:
        self.num_trees_list = [10000 for _ in range(self.n_models)]

      print(f"Training {self.n_models} models:")
      for n in tqdm(range(self.n_models)):
        self.boosters[n] = self.boosting_regressor.train(
            self.params_list[n],
            dtrain,
            num_boost_round=self.num_trees_list[n])
        self._save_bst_model(
            bst=self.boosters[n],
            bst_model_name=f'{self.model_name}_seed_{self.seeds[n]}',
            save_path=self.save_models_path)

  def cv(
      self,
      x_train,
      y_train):
    """Choose the number of boosting rounds per model by cross-validation.

    Stores the chosen round counts in ``self.num_trees_list`` and returns
    the list of per-model results produced by the regressor's ``cv`` call.
    """
    dtrain = self.customize_data(x_train, label=y_train)
    cv_results_list = []
    self.num_trees_list = []
    print(f'Calculating number of trees for {self.n_models} models')

    for n in tqdm(range(self.n_models)):
      cv_res = self.boosting_regressor.cv(
          self.params_list[n],
          dtrain,
          nfold=self.cv_folds,
          num_boost_round=10000,
          verbose_eval=False,
          early_stopping_rounds=self.early_stopping_rounds,
          seed=self.seeds[n],
          metrics='rmse')
      cv_results_list.append(cv_res)
      # The history length is the number of rounds kept after early stopping.
      self.num_trees_list.append(len(cv_res[self.train_rmse_mean]))

    return cv_results_list

  def fit(
      self,
      x_train,
      y_train,
      x_valid=None,
      y_valid=None):
    """Run ``cv`` (when ``use_cv_train`` is set) and then ``train``."""
    if self.use_cv_train:
      self.cv(x_train, y_train)
    self.train(x_train, y_train, x_valid, y_valid)

  def load_fitted_model(self):
    """Load one saved booster per seed from ``save_models_path``.

    Raises
    ------
    ValueError
        If no ``save_models_path`` was configured.
    """
    if not self.save_models_path:
      raise ValueError('save_models_path is not set; nothing to load from')
    for n, seed in enumerate(self.seeds):
      bst = xgb.Booster({'nthread': 4})
      bst.load_model(
          f'{self.save_models_path}{self.model_name}_seed_{seed}'
      )
      self.boosters[n] = bst

  def predict(self, X):
    """Return the mean prediction of all boosters as a Series.

    ``X`` must expose ``.shape`` and ``.index`` (e.g. a DataFrame); the
    result is indexed like ``X``. The individual predictions are kept in
    ``self.single_bst_prediction``.
    """
    data_to_predict = self.customize_data_predict(X)
    self.single_bst_prediction = []
    score = pd.Series(np.zeros((X.shape[0], )), index=X.index)
    for n in range(self.n_models):
      prediction = self.boosters[n].predict(data_to_predict)
      self.single_bst_prediction.append(prediction)
      score += prediction

    score /= self.n_models
    return score

  def _save_bst_model(
      self,
      bst,
      bst_model_name,
      save_path):
    """Save ``bst`` as ``save_path + bst_model_name``; no-op if ``save_path`` is falsy."""
    if save_path:
      bst.save_model(f'{save_path}{bst_model_name}')


In [None]:
# Fixed per-model seeds, hard-coded so runs are repeatable (presumably drawn
# once with np.random.randint(1, high=1000, size=10) — TODO confirm).
seeds = [687, 749, 577, 158, 755, 904,  15, 187, 940, 606]
seeds

[687, 749, 577, 158, 755, 904, 15, 187, 940, 606]

In [None]:
n_models = 10
model_name = 'final_model_Magnitude'
save_path = '/content/'

n_thread = 12

# Hyper-parameters shared by every ensemble member; only the seed differs.
xgb_base_params = {
    'eta': 0.1,
    'max_depth': 4,
    'max_bin': 100,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    'reg_lambda': 0.5,
    'min_child_weight': 0.01,
    'booster': 'gbtree',
    'tree_method': 'hist',
    'eval_metric': 'rmse',
    'objective': 'reg:squarederror',
    'verbosity': 0,
    'nthread': n_thread,
    'gamma': 0.5
}

# One parameter dict per seed. 'random_seed' is kept because GBModel reads it
# to name the saved model files; 'seed' is the key XGBoost itself recognises,
# so it is set too and the subsampled members are actually seeded differently.
xgb_params_list = [
    {**xgb_base_params, 'random_seed': seed, 'seed': seed}
    for seed in seeds[:n_models]
]
assert len(xgb_params_list) == n_models, 'need one seed per model'

In [None]:
# Build the ensemble for the currently configured target. The member count
# reuses n_models instead of repeating the literal 10, so the two cannot
# silently disagree.
xgb_model = GBModel(
    n_models=n_models,
    early_stopping_rounds=40,
    use_cv_train=True,
    cv_folds=5,
    params_list=xgb_params_list,
    random_seed=42,
    model_name=model_name,
    save_models_path=save_path)

In [None]:
# Load the saved boosters when every expected model file exists, otherwise
# train. A hand-set flag breaks Restart & Run All on a fresh runtime where
# no model files are present yet. Set force_retrain = True to ignore saved
# files.
force_retrain = False

# Same file-name scheme GBModel uses when saving and loading boosters.
expected_model_files = [
    f'{xgb_model.save_models_path}{xgb_model.model_name}_seed_{seed}'
    for seed in xgb_model.seeds
]
is_model_fitted = (not force_retrain
                   and all(os.path.isfile(p) for p in expected_model_files))

if is_model_fitted:
  xgb_model.load_fitted_model()
else:
  xgb_model.fit(X_train, y_train['Magnitude'])

In [None]:
# Ensemble-averaged Magnitude prediction for the held-out rows.
prediction_Magnitude = xgb_model.predict(X_test)


In [None]:
# RMSE of the Magnitude prediction on the held-out rows. The square root is
# taken explicitly because mean_squared_error's `squared=` argument is
# deprecated and absent from newer scikit-learn releases.
y_true = y_test['Magnitude']
y_pred = prediction_Magnitude
np.sqrt(mean_squared_error(y_true, y_pred))

0.413541073841804

In [None]:
n_models = 10
model_name = 'final_model_Depth'
save_path = '/content/'

n_thread = 12

# Hyper-parameters shared by every ensemble member; only the seed differs.
xgb_base_params = {
    'eta': 0.1,
    'max_depth': 4,
    'max_bin': 100,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    'reg_lambda': 0.5,
    'min_child_weight': 0.01,
    'booster': 'gbtree',
    'tree_method': 'hist',
    'eval_metric': 'rmse',
    'objective': 'reg:squarederror',
    'verbosity': 0,
    'nthread': n_thread,
    'gamma': 0.5
}

# One parameter dict per seed. 'random_seed' is kept because GBModel reads it
# to name the saved model files; 'seed' is the key XGBoost itself recognises,
# so it is set too and the subsampled members are actually seeded differently.
xgb_params_list = [
    {**xgb_base_params, 'random_seed': seed, 'seed': seed}
    for seed in seeds[:n_models]
]
assert len(xgb_params_list) == n_models, 'need one seed per model'

In [None]:
# Build the ensemble for the currently configured target. The member count
# reuses n_models instead of repeating the literal 10, so the two cannot
# silently disagree.
xgb_model = GBModel(
    n_models=n_models,
    early_stopping_rounds=40,
    use_cv_train=True,
    cv_folds=5,
    params_list=xgb_params_list,
    random_seed=42,
    model_name=model_name,
    save_models_path=save_path)

In [None]:
# Load the saved boosters when every expected model file exists, otherwise
# train. Deriving the flag from the files keeps the cell runnable on a fresh
# runtime and avoids retraining when models are already on disk. Set
# force_retrain = True to ignore saved files.
force_retrain = False

# Same file-name scheme GBModel uses when saving and loading boosters.
expected_model_files = [
    f'{xgb_model.save_models_path}{xgb_model.model_name}_seed_{seed}'
    for seed in xgb_model.seeds
]
is_model_fitted = (not force_retrain
                   and all(os.path.isfile(p) for p in expected_model_files))

if is_model_fitted:
  xgb_model.load_fitted_model()
else:
  xgb_model.fit(X_train, y_train['Depth'])

Calculating number of trees for 10 models


  0%|          | 0/10 [00:00<?, ?it/s]

Training 10 models:


  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# Ensemble-averaged Depth prediction for the held-out rows.
prediction_Depth = xgb_model.predict(X_test)


In [None]:
# RMSE of the Depth prediction on the held-out rows. The square root is
# taken explicitly because mean_squared_error's `squared=` argument is
# deprecated and absent from newer scikit-learn releases.
y_true = y_test['Depth']
y_pred = prediction_Depth
np.sqrt(mean_squared_error(y_true, y_pred))

65.49114406857844