## Load libraries¶

In [94]:
import numpy as np 

import pandas as pd 

import os

import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, train_test_split

import joblib

import warnings
warnings.filterwarnings('ignore')

In [89]:
precomputed = ['Ecosystem Vitality', 'Environmental Health', 'Air Quality', 'Sanitation & Drinking Water',
              'Biodiversity & Habitat']

In [90]:
drop_list = precomputed
drop_list.append('Environmental Performance Index') # target variable
drop_list.append('country_name')

In [91]:
df = pd.read_csv('../data/dataframes/epi_cleaned.csv')

In [63]:
df.columns
df.shape   
df.dtypes  
df.select_dtypes(include = ["float", "int"]).agg(["max", "min"])

Unnamed: 0.1,Unnamed: 0,year,Sanitation & Drinking Water,Unsafe drinking water,PM2.5 Exposure,Air Quality,Marine Protected Areas,Biodiversity & Habitat,Ecosystem Vitality,Wastewater Treatment,...,Agriculture,Fisheries,PM2.5 Exceedance,Household Air Quality,Access to Electricity,Health Impacts,Agricultural Subsidies,Child Mortality,Pesticide Regulation,GDP
max,3424,2020,100.0,100.0,100.0,100.0,100.0,100.0,90.09,100.0,...,100.0,94.09,100.0,100.0,100.0,100.0,100.0,100.0,96.0,23487800000000.0
min,0,2002,0.0,0.0,0.0,3.94,0.0,0.0,5.82,0.0,...,0.0,0.0,0.0,2.0,1.6,0.0,0.0,2.34,0.0,0.0


In [71]:
df.shape

(3362, 28)

## Read Data


In [86]:
(df.head(10))


Unnamed: 0.1,Unnamed: 0,year,country_name,Sanitation & Drinking Water,Unsafe drinking water,PM2.5 Exposure,Air Quality,Marine Protected Areas,Biodiversity & Habitat,Ecosystem Vitality,...,Agriculture,Fisheries,PM2.5 Exceedance,Household Air Quality,Access to Electricity,Health Impacts,Agricultural Subsidies,Child Mortality,Pesticide Regulation,GDP
0,0,2002,Mexico,35.44,41.86,91.43,81.34,80.19,56.46,43.26,...,46.5,26.34,69.78,82.8,98.2,70.92,21.0,70.92,72.0,742000000000.0
1,1,2003,Mexico,36.47,43.07,90.67,80.98,80.19,58.2,43.85,...,60.54,20.82,69.06,83.2,98.3,71.5,41.07,71.5,80.0,713000000000.0
2,2,2004,Mexico,37.53,44.34,92.13,82.32,80.19,59.97,45.85,...,90.0,21.67,71.22,83.6,98.4,72.04,100.0,72.04,80.0,770000000000.0
3,3,2005,Mexico,38.63,45.65,91.92,82.26,81.34,59.87,44.0,...,54.62,21.12,70.86,84.0,98.5,72.56,29.25,72.56,80.0,866000000000.0
4,4,2006,Mexico,39.76,47.02,94.25,84.01,81.34,60.26,44.02,...,53.29,20.99,73.38,84.4,98.6,73.1,26.58,73.1,80.0,965000000000.0
5,5,2007,Mexico,40.94,48.44,93.63,83.82,82.15,61.79,44.43,...,54.8,20.56,73.02,84.8,98.7,73.65,29.61,73.65,80.0,1040000000000.0
6,6,2008,Mexico,42.17,49.94,96.56,85.89,82.31,61.87,44.61,...,54.98,22.11,75.9,85.2,98.8,74.21,29.95,74.21,80.0,1100000000000.0
7,7,2009,Mexico,43.45,51.51,97.62,86.73,82.89,62.32,44.73,...,54.42,22.4,76.98,85.6,98.9,74.8,28.85,74.8,80.0,895000000000.0
8,8,2010,Mexico,44.79,53.16,99.81,88.2,82.89,62.32,44.8,...,54.91,22.84,78.78,86.0,99.0,75.41,29.82,75.41,80.0,1050000000000.0
9,9,2011,Mexico,46.2,54.9,98.95,87.43,82.89,62.32,45.06,...,55.21,25.34,77.34,86.0,99.0,76.03,30.41,76.03,80.0,1170000000000.0


In [66]:
print("Missing Values",df.isnull().sum().sum())
print("Column names",df.columns)

Missing Values 0
Column names Index(['Unnamed: 0', 'year', 'country_name', 'Sanitation & Drinking Water',
       'Unsafe drinking water', 'PM2.5 Exposure', 'Air Quality',
       'Marine Protected Areas', 'Biodiversity & Habitat',
       'Ecosystem Vitality', 'Wastewater Treatment', 'Water Resources',
       'Fish Stock Status', 'Environmental Performance Index',
       'Terrestrial biome protection (national weights)',
       'Environmental Health', 'Unsafe sanitation',
       'Terrestrial biome protection (global weights)', 'Agriculture',
       'Fisheries', 'PM2.5 Exceedance', 'Household Air Quality',
       'Access to Electricity', 'Health Impacts', 'Agricultural Subsidies',
       'Child Mortality', 'Pesticide Regulation', 'GDP'],
      dtype='object')


## 


In [81]:
import numpy as np
np.random.seed(42)

In [92]:
#set up X and y

X = df.drop(columns=drop_list)
y = df['Environmental Performance Index']

#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)

In [93]:
#check baseline:

y.mean()

52.091386079714525

In [96]:
#instantiate RandomForest Classifier

rfr = RandomForestRegressor()

In [97]:
params = {'n_estimators': list(range(5,301,2)),
          'max_depth': list(range(1,16,1)),
         'min_samples_split': list(range(1,16,1)),
         'min_samples_leaf': list(range(1,16,1)),
         'max_features': ['auto', 'sqrt', 'log2']}

In [99]:
rfs = RandomizedSearchCV(rfr, params, cv=5, n_iter = 500, verbose = 1, random_state=42)

In [101]:
rfs.fit(X_train, y_train)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=500,
                   param_distributions={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                      10, 11, 12, 13, 14, 15],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 3, 4, 5, 6,
                                                             7, 8, 9, 10, 11,
                                                             12, 13, 14, 15],
                                        'min_samples_split': [1, 2, 3, 4, 5, 6,
                                                              7, 8, 9, 10, 11,
                                                              12, 13, 14, 15],
                                        'n_estimators': [5, 7, 9, 11, 13, 15,
                                                         17, 19, 21, 23, 25, 2

In [107]:
rfs.best_score_

0.9745969066506348

In [110]:
rfs.score(X_train, y_train), rfs.score(X_test, y_test)

(0.9962999791597907, 0.9828941671164061)

In [103]:
rfs.best_params_

{'n_estimators': 129,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 13}

In [104]:
rfs.best_estimator_.feature_importances_

array([0.00835771, 0.0068308 , 0.18083547, 0.00886033, 0.011852  ,
       0.16591419, 0.13136049, 0.00452589, 0.03014305, 0.09238971,
       0.03282423, 0.01145629, 0.00611865, 0.00543817, 0.06334546,
       0.05196279, 0.08370225, 0.01735017, 0.05707268, 0.01603469,
       0.013625  ])

In [115]:
f = {'Column Name':X_train.columns,'Feature Importance':rfs.best_estimator_.feature_importances_}
rfs_df = pd.DataFrame(f)

rfs_df.sort_values(by=['Feature Importance'], ascending = False).head(10)

Unnamed: 0,Column Name,Feature Importance
2,Unsafe drinking water,0.180835
5,Wastewater Treatment,0.165914
6,Water Resources,0.13136
9,Unsafe sanitation,0.09239
16,Health Impacts,0.083702
14,Household Air Quality,0.063345
18,Child Mortality,0.057073
15,Access to Electricity,0.051963
10,Terrestrial biome protection (global weights),0.032824
8,Terrestrial biome protection (national weights),0.030143
