In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/datasetlowestaic/dataset_lowestAIC.csv
/kaggle/input/dataset-lowest-aic452022/dataset_lowestAIC_longlat.csv
/kaggle/input/iterated-12preds/iterated_12vars.csv


In [2]:
df = pd.read_csv(r'../input/dataset-lowest-aic452022/dataset_lowestAIC_longlat.csv', sep=';')

In [3]:
#remove unique idenifier & geodata
df = df.drop(['Unnamed: 0', 'Longitude', 'Latitude'], axis=1)

In [4]:
# import re
# df = df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

# ##SOURCE:https://github.com/awslabs/autogluon/issues/399

In [5]:
#initialize response- and predictor variables
y = df['mean_value_NO2']
x = df.drop(['mean_value_NO2'], axis=1)
feature_list = list(x.columns)

In [6]:
                       ## CONSTRUCTING LightGBM
                       ## Import the model we are using
        
import lightgbm as lgb
from sklearn.model_selection import train_test_split


##STEP GENERATING CROSS VALIDATION (80% TRAINING OF MODEL; 
        ##25% TESTING OF MODEL;EVALUATING MODEL PERFORMANCE)

X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.25, random_state=20)
#random state is needed to ensure that same results are generated each time.

In [7]:
## ENSURING THAT X- AND Y-TRAINING SET CONTAIN SAME AMOUNT OF ROWS. SAME FOR TESTING.
print('Training Features Shape:',X_train.shape)
print('Training Labels Shape:', Y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', Y_test.shape)

Training Features Shape: (361, 12)
Training Labels Shape: (361,)
Testing Features Shape: (121, 12)
Testing Labels Shape: (121,)


In [8]:
## PARAMETERS FOR MODEL
model = lgb.LGBMRegressor(random_state=42)
model.fit(X_train, Y_train)


LGBMRegressor(random_state=42)

In [9]:
# ## PARAMETERS FOR MODEL
# model = lgb.LGBMRegressor(reg_alpha =2, reg_lambda = 0, max_depth = 5, learning_rate = 0.002, n_estimators =2000, random_state=42)
# model.fit(X_train, Y_train)

In [10]:
##DEFINING ORIGINAL Y-VALUES AND PREDICTED Y-VALUES
expected_y  = Y_test
predicted_y = model.predict(X_test)

In [11]:
expected_y_tr  = Y_train
predicted_y_tr = model.predict(X_train)

In [12]:
##EVALUATING PERFORMANCE OF LIGHTGBM MODEL

In [13]:
##RMSE

def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

print('RMSE training: ',rmse(predicted_y_tr, expected_y_tr))
print('RMSE testing: ', rmse(predicted_y, expected_y))

RMSE training:  2.777639899213866
RMSE testing:  8.051676840680084


In [14]:
## R2

from sklearn.metrics import r2_score
print('R2 score training: ', r2_score(expected_y_tr, predicted_y_tr))
print('R2 score testing: ', r2_score(expected_y, predicted_y))

R2 score training:  0.9513308407980061
R2 score testing:  0.6457102680338802


In [15]:
##MEAN ABSOLUTE ERROR 
from sklearn.metrics import mean_absolute_error
print('MAE training: ', mean_absolute_error(expected_y_tr, predicted_y_tr))
print('MAE testing: ', mean_absolute_error(expected_y, predicted_y))

MAE training:  1.8380576543860079
MAE testing:  5.530682448789121


In [16]:
#HYPERPARAMETER TUNING
#VERIFYING CURRENT RANDOM FOREST HYPERPARAMETERS 

from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(model.get_params())

Parameters currently in use:

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': None,
 'random_state': 42,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}


In [17]:
#CREATE PARAMETER GRID

from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)
{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

In [18]:
##RANDOM SEARCH TRAINING (https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74)

# Use the random grid to search for best hyperparameters
# First create the base model to tune

model = lgb.LGBMRegressor(random_state=42)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
model_random = RandomizedSearchCV(estimator = model, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
model_random.fit(X_train, Y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   25.6s




[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   46.3s finished


RandomizedSearchCV(cv=3, estimator=LGBMRegressor(random_state=42), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [19]:
model_random.best_params_

{'n_estimators': 200,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 10,
 'bootstrap': True}

In [20]:
model_hyper = lgb.LGBMRegressor(n_estimators=200, min_samples_split=5, 
                                 min_samples_leaf=4, max_features='auto',
                                max_depth=10, bootstrap=True)

# Train the model on training data
model_hyper.fit(X_train, Y_train)

LGBMRegressor(bootstrap=True, max_depth=10, max_features='auto',
              min_samples_leaf=4, min_samples_split=5, n_estimators=200)

In [21]:
##DEFINING ORIGINAL Y-VALUES AND PREDICTED Y-VALUES
expected_y_H  = Y_test
predicted_y_H = model_hyper.predict(X_test)

expected_y_tr_H = Y_train
predicted_y_tr_H = model_hyper.predict(X_train)

In [22]:
##RMSE

def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

print('RMSE training dataset with hypertuning: ', rmse(predicted_y_tr_H, expected_y_tr_H))
print('RMSE testing dataset with hypertuning: ', rmse(predicted_y_H, expected_y_H))

RMSE training dataset with hypertuning:  1.5951686867543777
RMSE testing dataset with hypertuning:  8.341923649555483


In [23]:
##R2

from sklearn.metrics import r2_score
print('R2 score training dataset with hypertuning: ', r2_score(expected_y_tr_H, predicted_y_tr_H))
print('R2 score testing dataset with hypertuning: ', r2_score(expected_y_H, predicted_y_H))

R2 score training dataset with hypertuning:  0.9839485159591732
R2 score testing dataset with hypertuning:  0.6197070142373511


In [24]:
##MEAN ABSOLUTE ERROR 
from sklearn.metrics import mean_absolute_error
print('MAE training dataset with hypertuning: ', mean_absolute_error(expected_y_tr_H, predicted_y_tr_H))
print('MAE testing dataset with hypertuning: ', mean_absolute_error(expected_y_H, predicted_y_H))

MAE training dataset with hypertuning:  1.0449864526733343
MAE testing dataset with hypertuning:  5.7611211659137185


In [25]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
model = lgb.LGBMRegressor(random_state=42)
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = model, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [26]:
# Fit the grid search to the data
grid_search.fit(X_train, Y_train)
grid_search.best_params_

Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 300 tasks      | elapsed:   14.1s
[Parallel(n_jobs=-1)]: Done 706 tasks      | elapsed:   32.8s




[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:   40.3s finished


{'bootstrap': True,
 'max_depth': 80,
 'max_features': 2,
 'min_samples_leaf': 3,
 'min_samples_split': 8,
 'n_estimators': 100}

In [27]:
from sklearn.model_selection import GridSearchCV


# Create best grid model
model_grid = lgb.LGBMRegressor(random_state=42, bootstrap=True, max_depth=80, max_features=2, 
                         min_samples_leaf=3, min_samples_split=8, n_estimators=100)
model_grid.fit(X_train, Y_train)

LGBMRegressor(bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3,
              min_samples_split=8, random_state=42)

In [28]:
##DEFINING ORIGINAL Y-VALUES AND PREDICTED Y-VALUES (FOR TESTING DATASET)
expected_ygrid  = Y_test
predicted_ygrid = model_grid.predict(X_test) #Predicting the test set results using the random forest regressor model

In [29]:
##DEFINING ORIGINAL Y-VALUES AND PREDICTED Y-VALUES (FOR TRAINING DATASET)

expected_y_tr_grid = Y_train
predicted_y_tr_grid = model_grid.predict(X_train)
# print(expected_y_tr, predicted_y_tr)

In [30]:
##RMSE - TRAIN

def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

print("training rmse: ", rmse(predicted_y_tr_grid, expected_y_tr_grid))
print("testing rmse: ", rmse(predicted_ygrid, Y_test))

training rmse:  2.777639899213866
testing rmse:  8.051676840680084


In [31]:
##R2

from sklearn.metrics import r2_score
print("R2 score training data: ", r2_score(expected_y_tr_grid, predicted_y_tr_grid))
print("R2 score testing data: ", r2_score(Y_test, predicted_ygrid))

R2 score training data:  0.9513308407980061
R2 score testing data:  0.6457102680338802


In [32]:
##MEAN ABSOLUTE ERROR 
from sklearn.metrics import mean_absolute_error
print("MAE training data: ", mean_absolute_error(expected_y_tr_grid, predicted_y_tr_grid))
print("MAE testing data: ", mean_absolute_error(expected_ygrid, predicted_ygrid))

MAE training data:  1.8380576543860079
MAE testing data:  5.530682448789121
