# Predictive Modelling - Ladder Score Regression

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV, train_test_split,cross_val_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

### Preparing the data
Split the data into train, test, validation, and cross-validation datasets

In [33]:
df = pd.read_csv("data/imputed-data.csv")
df.drop(['kmean group', 'Status'], axis=1, inplace=True)
df['Year'] = df['Year'].astype('int')

model_df = df.copy(deep=True)
model_df['Country name'] = pd.factorize(model_df['Country name'])[0]
model_df['Regional indicator'] = pd.factorize(model_df['Regional indicator'])[0]

X_train, test, y_train, test_y = train_test_split(model_df.drop(['Ladder score'], axis=1), model_df['Ladder score'], test_size=0.2, random_state=1234)
train, val, train_y, val_y = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

cv_train = model_df.copy(deep=True)
cv_train_y = model_df['Ladder score']
cv_train.drop(['Ladder score'], axis=1, inplace=True)

## Hyperparameter Tuning
* Parameters are not optimal values, but rather included in this section as proof-of-concept
* Given a need for a more accurate predictive regression model, optimal parameters would have been selected

### XGBoost Tuning

In [34]:
def XGBoost_Tuning(train, train_y):
    param_tuning = {
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5, 7, 10],
        'min_child_weight': [1, 3, 5],
        'subsample': [0.5, 0.7],
        'colsample_bytree': [0.5, 0.7],
        'n_estimators' : [100, 200, 500],
        'objective': ['reg:squarederror']
    }

    xgb_model = XGBRegressor()

    gsearch = GridSearchCV(estimator = xgb_model,
                           param_grid = param_tuning,                        
                           scoring = 'neg_mean_absolute_error',
                           cv = 5,
                           n_jobs = -1,
                           verbose = False)
    
    gsearch.fit(train,train_y)

    return gsearch.best_params_


xgb_tuned_params = XGBoost_Tuning(train, train_y)
tuned_xgb = XGBRegressor(**xgb_tuned_params)
tuned_xgb.fit(train, train_y)
tuned_xgb_preds = tuned_xgb.predict(test)
print(f'Tuned XGBoost Regressor MAE: {mean_absolute_error(test_y, tuned_xgb_preds):.4f}')
display(f'Best XGBoost Parameters from Grid Search CV: {xgb_tuned_params}')

Tuned XGBoost Regressor MAE: 0.1991


"Best XGBoost Parameters from Grid Search CV: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 500, 'objective': 'reg:squarederror', 'subsample': 0.7}"

### LightGBM

In [41]:
def LightGBM_Tuning(train, train_y):
    param_tuning = {
        'learning_rate': [0.01, 0.1],
        'num_leaves': [31, 60, 100],
        'reg_alpha': [0, 0.1, 0.5],
        'reg_lambda': [0, 0.1, 0.5],
        'subsample': [0.5, 0.7],
        'colsample_bytree': [0.5, 0.7],
        'n_estimators' : [100, 200, 500]
    }

    lgb_model = LGBMRegressor()

    gsearch = GridSearchCV(estimator = lgb_model,
                           param_grid = param_tuning,                        
                           scoring = 'neg_mean_absolute_error',
                           cv = 5,
                           n_jobs = -1,
                           verbose = False)
    
    gsearch.fit(train,train_y)

    return gsearch.best_params_

lgb_tuned_params = LightGBM_Tuning(train, train_y)
tuned_lgb = LGBMRegressor(**lgb_tuned_params)
tuned_lgb.fit(train, train_y)
tuned_lgb_preds = tuned_lgb.predict(test)
print(f'Tuned XGBoost Regressor MAE: {mean_absolute_error(test_y, tuned_lgb_preds):.4f}')
display(f'Best LightGBM Parameters from Grid Search CV: {lgb_tuned_params}')

Tuned XGBoost Regressor MAE: 0.2040


"Best LightGBM Parameters from Grid Search CV: {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'n_estimators': 500, 'num_leaves': 60, 'reg_alpha': 0.1, 'reg_lambda': 0.1, 'subsample': 0.5}"

## Regression Model Summary

#### CV & Test Set Scores

In [42]:
models = [LinearRegression(), Ridge(), SVR(), XGBRegressor(), XGBRegressor(**xgb_tuned_params), LGBMRegressor(), LGBMRegressor(**lgb_tuned_params)]
model_name = ['LR', 'Ridge', 'SVM', 'XGBoost', 'Tuned XGBoost','LightGBM',  'Tuned LightGBM']
test_model_summary = []

for i in range(len(models)):
    model = models[i]
    cv_scores = cross_val_score(model, cv_train, cv_train_y, cv=10, scoring = 'neg_mean_absolute_error')
    cv_mae = abs(cv_scores).mean()

    model.fit(train, train_y)
    preds = model.predict(test)
    tts_mae = mean_absolute_error(test_y, preds)
    
    model_score = pd.DataFrame([model_name[i], cv_mae, tts_mae]).transpose()
    test_model_summary.append(model_score)
    
test_model_summary = pd.concat(test_model_summary)
test_model_summary.reset_index(inplace=True, drop=True)
test_model_summary.columns = ['Model', 'CV Score', 'Test Set Score']

#### Validation Set Scores

In [43]:
final_train = train.append(test)
final_train_y = train_y.append(test_y)

val_model_summary = []
for i in range(len(models)):
    model = models[i]
    model.fit(final_train, final_train_y)
    preds = model.predict(val)
    tts_mae = mean_absolute_error(val_y, preds)
    
    model_score = pd.DataFrame([model_name[i], tts_mae]).transpose()
    val_model_summary.append(model_score)
    
val_model_summary = pd.concat(val_model_summary)
val_model_summary.reset_index(inplace=True, drop=True)
val_model_summary.columns = ['Model', 'Validation Set Score']

model_table = pd.merge(test_model_summary, val_model_summary, on='Model')

#### Ensemble Model Scores
* Note: Ensemble model used a weighted average of untuned LightGBM and XGBoost models, producing a more accurate result than either individually

In [50]:
lgb_model = LGBMRegressor()
xgb_model = XGBRegressor()

lgb_model.fit(train, train_y)
lgb_preds = lgb_model.predict(test)

xgb_model.fit(train, train_y)
xgb_preds = xgb_model.predict(test)

ensemble_preds = (lgb_preds*0.5) + (xgb_preds*0.5)
test_ensemble_score = mean_absolute_error(test_y, ensemble_preds)

lgb_model.fit(final_train, final_train_y)
lgb_preds = lgb_model.predict(val)

xgb_model.fit(final_train, final_train_y)
xgb_preds = xgb_model.predict(val)

ensemble_preds = (lgb_preds*0.5) + (xgb_preds*0.5)
val_ensemble_score = mean_absolute_error(val_y, ensemble_preds)

ensemble_summary = pd.DataFrame(['Ensemble (Untuned XGBoost + LightGBM)', 'NA', test_ensemble_score, val_ensemble_score]).transpose()
ensemble_summary.columns = ['Model', 'CV Score', 'Test Set Score', 'Validation Set Score']

In [51]:
final_model_table = pd.concat([model_table, ensemble_summary], axis=0, ignore_index=True)
final_model_table

Unnamed: 0,Model,CV Score,Test Set Score,Validation Set Score
0,LR,0.337919,0.293462,0.344255
1,Ridge,0.337984,0.293436,0.343823
2,SVM,0.437695,0.431099,0.440702
3,XGBoost,0.359812,0.207848,0.228388
4,Tuned XGBoost,0.350024,0.199123,0.220393
5,LightGBM,0.350588,0.201355,0.220094
6,Tuned LightGBM,0.374846,0.20397,0.229669
7,Ensemble (Untuned XGBoost + LightGBM),,0.198037,0.218025
