## Deliverables:
- Data Preprocessing Pipeline (pandas and sklearn)
- Feature Transformation Pipeline (sklearn and feature-engine)
- Model Training Pipeline (sklearn, xgboost, lgbm, catboost)
- Model Evaluation Pipeline (sklearn)
- HParam Search Pipeline (optuna)
- Export the final trained model and write the code for inference via the CLI

Range for accepted RMSE: 0-1

In [78]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import optuna
from catboost import CatBoostRegressor
import lightgbm as lgb
import xgboost as xgb

In [51]:
train_data =pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')

In [52]:
train_data.head()

Unnamed: 0,index,lat,lon,startdate,contest-pevpr-sfc-gauss-14d__pevpr,nmme0-tmp2m-34w__cancm30,nmme0-tmp2m-34w__cancm40,nmme0-tmp2m-34w__ccsm30,nmme0-tmp2m-34w__ccsm40,nmme0-tmp2m-34w__cfsv20,...,wind-vwnd-925-2010-11,wind-vwnd-925-2010-12,wind-vwnd-925-2010-13,wind-vwnd-925-2010-14,wind-vwnd-925-2010-15,wind-vwnd-925-2010-16,wind-vwnd-925-2010-17,wind-vwnd-925-2010-18,wind-vwnd-925-2010-19,wind-vwnd-925-2010-20
0,0,0.0,0.833333,9/1/14,237.0,29.02,31.64,29.57,30.73,29.71,...,-27.68,-37.21,8.32,9.56,-2.03,48.13,28.09,-13.5,11.9,4.58
1,1,0.0,0.833333,9/2/14,228.9,29.02,31.64,29.57,30.73,29.71,...,-21.13,-36.57,8.77,21.17,4.44,48.6,27.41,-23.77,15.44,3.42
2,2,0.0,0.833333,9/3/14,220.69,29.02,31.64,29.57,30.73,29.71,...,-10.72,-34.16,6.99,32.16,5.01,48.53,19.21,-33.16,15.11,4.82
3,3,0.0,0.833333,9/4/14,225.28,29.02,31.64,29.57,30.73,29.71,...,0.33,-31.04,6.17,39.66,-1.41,50.59,8.29,-37.22,18.24,9.74
4,4,0.0,0.833333,9/5/14,237.24,29.02,31.64,29.57,30.73,29.71,...,9.83,-31.8,7.47,38.62,-5.21,54.73,-2.58,-42.3,21.91,10.95


In [53]:
test_data.head()

Unnamed: 0,index,lat,lon,startdate,contest-pevpr-sfc-gauss-14d__pevpr,nmme0-tmp2m-34w__cancm30,nmme0-tmp2m-34w__cancm40,nmme0-tmp2m-34w__ccsm30,nmme0-tmp2m-34w__ccsm40,nmme0-tmp2m-34w__cfsv20,...,wind-vwnd-925-2010-11,wind-vwnd-925-2010-12,wind-vwnd-925-2010-13,wind-vwnd-925-2010-14,wind-vwnd-925-2010-15,wind-vwnd-925-2010-16,wind-vwnd-925-2010-17,wind-vwnd-925-2010-18,wind-vwnd-925-2010-19,wind-vwnd-925-2010-20
0,375734,0.0,0.833333,11/1/22,339.88,30.88,30.92,29.17,31.02,29.47,...,-19.28,-39.77,-29.25,40.88,-8.31,14.91,-24.62,31.05,-23.69,6.27
1,375735,0.0,0.833333,11/2/22,334.63,30.88,30.92,29.17,31.02,29.47,...,-19.58,-43.14,-28.62,45.37,-5.42,16.97,-23.94,28.84,-20.61,14.16
2,375736,0.0,0.833333,11/3/22,337.83,30.88,30.92,29.17,31.02,29.47,...,-13.73,-44.22,-27.67,49.76,-1.31,21.44,-19.06,26.85,-16.78,13.42
3,375737,0.0,0.833333,11/4/22,345.81,30.88,30.92,29.17,31.02,29.47,...,-7.97,-49.47,-19.32,52.62,-0.44,21.65,-23.12,23.7,-18.62,10.69
4,375738,0.0,0.833333,11/5/22,357.39,30.88,30.92,29.17,31.02,29.47,...,-0.8,-56.07,-9.89,51.23,-7.57,19.86,-30.56,20.66,-25.08,19.64


## Feature Engineering

In [54]:
# converting startdate to datetiime 
train_data['startdate'] = pd.to_datetime(train_data['startdate'])
train_data['month'] = train_data['startdate'].dt.month
train_data['season'] = (train_data['startdate'].dt.month % 12) // 3 + 1 

test_data['startdate'] = pd.to_datetime(test_data['startdate'])
test_data['month'] = test_data['startdate'].dt.month
test_data['season'] = (test_data['startdate'].dt.month % 12) // 3 + 1
test_data['year'] = test_data['startdate'].dt.year

  train_data['startdate'] = pd.to_datetime(train_data['startdate'])
  test_data['startdate'] = pd.to_datetime(test_data['startdate'])


In [55]:
# one hot encoding
train_data = pd.get_dummies(train_data, columns=['climateregions__climateregion'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['climateregions__climateregion'], drop_first=True)

## Splitting the data for validation

In [56]:
# Splitting into train and test data
X = train_data.drop(['contest-tmp2m-14d__tmp2m', 'startdate'], axis=1)
y = train_data['contest-tmp2m-14d__tmp2m']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## Training

In [None]:
# lgbm
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val)

model_lgbm = lgb.train(params, train_data, num_boost_round=1000, valid_sets=[train_data, val_data],
                       callbacks=[lgb.early_stopping(stopping_rounds=100), lgb.log_evaluation(period=100)])

Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 1.13617	valid_1's rmse: 1.14511
[100]	training's rmse: 1.13617	valid_1's rmse: 1.14511
[200]	training's rmse: 0.853082	valid_1's rmse: 0.865648
[200]	training's rmse: 0.853082	valid_1's rmse: 0.865648
[300]	training's rmse: 0.736806	valid_1's rmse: 0.751817
[300]	training's rmse: 0.736806	valid_1's rmse: 0.751817
[400]	training's rmse: 0.667249	valid_1's rmse: 0.684271
[400]	training's rmse: 0.667249	valid_1's rmse: 0.684271
[500]	training's rmse: 0.616443	valid_1's rmse: 0.63483
[500]	training's rmse: 0.616443	valid_1's rmse: 0.63483
[600]	training's rmse: 0.578587	valid_1's rmse: 0.598576
[600]	training's rmse: 0.578587	valid_1's rmse: 0.598576
[700]	training's rmse: 0.5473	valid_1's rmse: 0.568935
[700]	training's rmse: 0.5473	valid_1's rmse: 0.568935
[800]	training's rmse: 0.520874	valid_1's rmse: 0.543885
[800]	training's rmse: 0.520874	valid_1's rmse: 0.543885
[900]	training's rmse: 0.49732	valid

In [111]:
y_pred_lgbm = model_lgbm.predict(X_val, num_iteration=model_lgbm.best_iteration)

In [121]:
mse_lgbm = mean_squared_error(y_val, y_pred_lgbm)
rmse_lgbm = np.sqrt(mse_lgbm)
print(f'Validation RMSE (LGBM): {rmse_lgbm}')

Validation RMSE (LGBM): 0.502834088081686


In [None]:
# catboost model
params_catboost = {
    'iterations': 1000,
    'learning_rate': 0.05,
    'eval_metric': 'RMSE',
    'random_seed': 42,
    'verbose': 100
}
model_catboost = CatBoostRegressor(**params_catboost)
model_catboost.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=100)


0:	learn: 9.4327313	test: 9.4431053	best: 9.4431053 (0)	total: 153ms	remaining: 2m 32s
100:	learn: 1.5170088	test: 1.5161310	best: 1.5161310 (100)	total: 14.8s	remaining: 2m 11s
100:	learn: 1.5170088	test: 1.5161310	best: 1.5161310 (100)	total: 14.8s	remaining: 2m 11s
200:	learn: 1.2142675	test: 1.2161385	best: 1.2161385 (200)	total: 33s	remaining: 2m 11s
200:	learn: 1.2142675	test: 1.2161385	best: 1.2161385 (200)	total: 33s	remaining: 2m 11s
300:	learn: 1.0543913	test: 1.0585269	best: 1.0585269 (300)	total: 50.1s	remaining: 1m 56s
300:	learn: 1.0543913	test: 1.0585269	best: 1.0585269 (300)	total: 50.1s	remaining: 1m 56s
400:	learn: 0.9490040	test: 0.9546945	best: 0.9546945 (400)	total: 1m 8s	remaining: 1m 41s
400:	learn: 0.9490040	test: 0.9546945	best: 0.9546945 (400)	total: 1m 8s	remaining: 1m 41s
500:	learn: 0.8698424	test: 0.8763802	best: 0.8763802 (500)	total: 1m 27s	remaining: 1m 27s
500:	learn: 0.8698424	test: 0.8763802	best: 0.8763802 (500)	total: 1m 27s	remaining: 1m 27s
600:	

<catboost.core.CatBoostRegressor at 0x1d22c7a6e90>

In [107]:
y_pred_catboost = model_catboost.predict(X_val)

In [120]:
# evaluation
rmse_catboost = np.sqrt(mean_squared_error(y_val, y_pred_catboost))
print(f'Validation RMSE (CatBoost): {rmse_catboost}')

Validation RMSE (CatBoost): 0.6786803550645675


In [None]:
# xgb model
params = {'base_score': 0.5, 
          'booster': 'gbtree',
          'tree_method': 'hist',
          'n_estimators': 1000,
          'objective': 'reg:squarederror',
          'max_depth': 6,
          'subsample': 0.5,
          'colsample_bytree': 0.5,
          'gamma': 1.4,
          'min_child_weight': 7,
          'learning_rate': 0.01}

reg_xgb = xgb.XGBRegressor(**params)
reg_xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=100)

[0]	validation_0-rmse:14.86983
[100]	validation_0-rmse:5.72927
[100]	validation_0-rmse:5.72927
[200]	validation_0-rmse:2.51720
[200]	validation_0-rmse:2.51720
[300]	validation_0-rmse:1.48955
[300]	validation_0-rmse:1.48955
[400]	validation_0-rmse:1.17270
[400]	validation_0-rmse:1.17270
[500]	validation_0-rmse:1.04515
[500]	validation_0-rmse:1.04515
[600]	validation_0-rmse:0.97163
[600]	validation_0-rmse:0.97163
[700]	validation_0-rmse:0.92033
[700]	validation_0-rmse:0.92033
[800]	validation_0-rmse:0.87961
[800]	validation_0-rmse:0.87961
[900]	validation_0-rmse:0.84502
[900]	validation_0-rmse:0.84502
[999]	validation_0-rmse:0.81427
[999]	validation_0-rmse:0.81427


0,1,2
,objective,'reg:squarederror'
,base_score,0.5
,booster,'gbtree'
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.5
,device,
,early_stopping_rounds,
,enable_categorical,False


## HParam Search Pipeline (optuna)

In [None]:
def objective_lgbm(trial):
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'verbose': -1
    }
    
    train_set = lgb.Dataset(X_train, label=y_train)
    val_set = lgb.Dataset(X_val, label=y_val)
    
    model = lgb.train(params, 
                     train_set,
                     valid_sets=[val_set],
                     num_boost_round=1000,
                     callbacks=[
                         lgb.early_stopping(stopping_rounds=50),
                         lgb.log_evaluation(period=0) 
                     ])
    
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    return rmse
study_lgbm = optuna.create_study(direction='minimize', study_name='lightgbm_optimization')
study_lgbm.optimize(objective_lgbm, n_trials=50)

print("Best LightGBM parameters:", study_lgbm.best_params)
print(f"Best RMSE: {study_lgbm.best_value:.4f}")

In [None]:
def objective_catboost(trial):
    params = {
        'iterations': 1000,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'random_strength': trial.suggest_float('random_strength', 1e-8, 10.0, log=True),
        'eval_metric': 'RMSE',
        'early_stopping_rounds': 50,
        'verbose': False
    }
    
    model = CatBoostRegressor(**params)
    model.fit(X_train, y_train,
             eval_set=[(X_val, y_val)],
             verbose=False)
    
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    return rmse

study_catboost = optuna.create_study(direction='minimize', study_name='catboost_optimization')
study_catboost.optimize(objective_catboost, n_trials=50)

print("Best CatBoost parameters:", study_catboost.best_params)
print(f"Best RMSE: {study_catboost.best_value:.4f}")

In [None]:
def objective_xgb(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'n_estimators': 1000,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        'tree_method': 'hist'
    }
    model = xgb.XGBRegressor(**params)
    model.fit(X_train, y_train,
             eval_set=[(X_val, y_val)],
             
             verbose=False)
    
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    return rmse
study_xgb = optuna.create_study(direction='minimize', study_name='xgboost_optimization')
study_xgb.optimize(objective_xgb, n_trials=50)

print("Best XGBoost parameters:", study_xgb.best_params)
print(f"Best RMSE: {study_xgb.best_value:.4f}")

## Model comparison

In [None]:
final_lgbm = lgb.train(study_lgbm.best_params, train_data, valid_sets=[val_data],num_boost_round=1000, callbacks=[lgb.early_stopping(stopping_rounds=50)])
final_catboost = CatBoostRegressor(**study_catboost.best_params)
final_catboost.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50, verbose=False)

final_xgb = xgb.XGBRegressor(**study_xgb.best_params)
final_xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=50, verbose=False)

y_pred_lgbm = final_lgbm.predict(X_test)
y_pred_catboost = final_catboost.predict(X_test)
y_pred_xgb = final_xgb.predict(X_test)
mse_lgbm = mean_squared_error(y_test, y_pred_lgbm)
mse_catboost = mean_squared_error(y_test, y_pred_catboost)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)

rmse_lgbm = np.sqrt(mse_lgbm)
rmse_catboost = np.sqrt(mse_catboost)
rmse_xgb = np.sqrt(mse_xgb)

results = pd.DataFrame({
    'Model': ['LightGBM', 'CatBoost', 'XGBoost'],
    'RMSE': [rmse_lgbm, rmse_catboost, rmse_xgb]
})
print(results.to_string(index=False))