In [1]:
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from xgboost import plot_importance
from lightgbm import plot_importance as plot_important_lgbm
from xgboost import cv
import lightgbm as lgb
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score

import pandas as pd
from tqdm import tqdm
from sklearn.metrics import mean_squared_error

from bayes_opt import BayesianOptimization
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval

from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor

import matplotlib.pyplot as plt
from xgboost import plot_importance
import seaborn as sns
import warnings

import gc
from mlxtend.regressor import StackingCVRegressor

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

In [2]:
# Load the preprocessed train/test feature tables.
train = pd.read_csv('/kaggle/input/saved-data/preprocess_train.csv')
test = pd.read_csv('/kaggle/input/saved-data/preprocess_test.csv')

# Only the ID column of the raw test file is needed (it labels the submission
# rows), so parse just that column instead of the whole CSV.
test_ID = pd.read_csv(
    '/kaggle/input/umojahack-2023-africa-carbon-dioxide/Test(3).csv',
    usecols=['ID_LAT_LON_YEAR_WEEK'],
)['ID_LAT_LON_YEAR_WEEK']

# Features: everything except the target and the identifier/label columns.
X = train.drop(columns=['emission', 'ID_LAT_LON_YEAR_WEEK', 'Place', 'date'])

# Target is modelled in log1p space; predictions are mapped back with expm1
# when the submission is built.
y = np.log1p(train['emission'])

# The full training frame is no longer needed once X and y exist.
del train
gc.collect()

0

# **CatBoost**

In [3]:
# CatBoost base learner, configured to train on the GPU with a fixed seed.
cb_model = CatBoostRegressor(
    # Boosting schedule and tree size.
    iterations=5000,
    learning_rate=0.045,
    depth=8,
    # Sampling and reproducibility.
    bagging_temperature=0.2,
    random_seed=42,
    # Reported metric and its reporting period.
    eval_metric='RMSE',
    metric_period=50,
    # Overfitting-detector settings.
    # NOTE(review): the fit call in this notebook passes no eval_set, so the
    # detector presumably has nothing to monitor -- confirm this is intended.
    od_type='Iter',
    od_wait=300,
    # Hardware.
    task_type="GPU",
)

# **XGBoost**

In [4]:
# XGBoost base learner: the hyperparameters live in `params` and are unpacked
# into the estimator together with the GPU tree method.
params = dict(
    eta=0.19,
    eval_metric='rmse',
    gamma=5.420256552806366,
    n_estimators=970,
    subsample=0.78,
)
# NOTE(review): 'gpu_hist' is deprecated from XGBoost 2.0 onwards in favour of
# tree_method='hist' with device='cuda'; kept as-is for the version this
# notebook was run with.
xgb_model = XGBRegressor(tree_method='gpu_hist', **params)

# **LightGBM**

In [5]:
# LightGBM base learner, configured to train on the GPU.
#
# Parameters deliberately NOT set here:
#   * `subsample` -- it is an alias of `bagging_fraction`; when both are given
#     LightGBM keeps `bagging_fraction` and ignores the alias, so only the
#     main name is specified.
#   * `eval_metric` -- that is an argument of `fit()`, not a constructor /
#     booster parameter, so passing it here is reported as unknown and ignored.
#
# NOTE(review): `bagging_fraction` only takes effect when bagging is enabled
# via `subsample_freq` (alias `bagging_freq`) > 0, which is not set here --
# confirm whether bagging was actually intended.
lgbm_param = {
    'bagging_fraction': 0.6214882520102827,
    'learning_rate': 0.14,
    'max_depth': 11,
    'n_estimators': 890,
}
lgb_model = LGBMRegressor(**lgbm_param, device="gpu")

# **Stacking**

In [6]:
# Stacked ensemble definition: the three boosted models are the base learners
# and the XGBoost estimator is also passed as the meta-regressor.
# use_features_in_secondary=True feeds the original features to the
# meta-regressor alongside the base-model predictions.
stack_gen = StackingCVRegressor(
    regressors=(cb_model, xgb_model, lgb_model),
    use_features_in_secondary=True,
    meta_regressor=xgb_model,
)

In [15]:
# Stacking is disabled: the stacked ensemble is never fitted in this notebook.
# The final submission uses a hand-weighted blend of the three base models
# instead. The original call is kept below for reference.
# stack_gen_model = stack_gen.fit(np.array(X), np.array(y))

In [16]:
# Fit each base model on the full training set (no hold-out / eval_set is used).

# CatBoost: print the training metric every 1000 iterations.
cb_model = cb_model.fit(X, y, verbose=1000)

# XGBoost: `verbose` only controls evaluation logging, so with no eval_set it
# prints nothing.
xgb_model = xgb_model.fit(X, y, verbose=100)

# LightGBM: no `verbose` argument. It was removed from the scikit-learn `fit`
# signature in LightGBM 4.0 (passing it raises TypeError there), and without
# an eval_set there is nothing for it to log in older versions either.
lgb_model = lgb_model.fit(X, y)

0:	learn: 2.3924142	total: 60.3ms	remaining: 5m 1s
1000:	learn: 0.0377967	total: 51.6s	remaining: 3m 26s
2000:	learn: 0.0213233	total: 1m 44s	remaining: 2m 36s
3000:	learn: 0.0151090	total: 2m 37s	remaining: 1m 44s
4000:	learn: 0.0115127	total: 3m 29s	remaining: 52.4s
4999:	learn: 0.0091065	total: 4m 23s	remaining: 0us


In [24]:
# Score the test features with every fitted base model, in the order
# CatBoost -> XGBoost -> LightGBM. Predictions are still in log1p space.
cb_predict, xgb_predict, lgb_predict = (
    fitted.predict(test) for fitted in (cb_model, xgb_model, lgb_model)
)

In [45]:
# Weighted blend of the log-space predictions (CatBoost 0.1, XGBoost 0.4,
# LightGBM 0.5), mapped back to the original emission scale with expm1.
preds = np.expm1(0.1 * cb_predict + 0.4 * xgb_predict + 0.5 * lgb_predict)

# Submission file: one row per test ID with its blended emission prediction.
sub = pd.DataFrame({
    'ID_LAT_LON_YEAR_WEEK': test_ID,
    'emission': preds,
})
sub.to_csv('ensemble_cb1_lgb5_xgb4.csv', index=False)