In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

from sklearn.preprocessing import StandardScaler

from scipy.stats import uniform, loguniform

import pickle

In [2]:
precomputed = ['Ecosystem Vitality', 'Environmental Health', 'Air Quality', 'Sanitation & Drinking Water']

In [3]:
df = pd.read_csv('../data/dataframes/epi_cleaned.csv')

In [4]:
df.drop(columns='Unnamed: 0', inplace=True)

In [5]:
epi_2020 = df[df['year'] == 2020]

In [6]:
epi_2020.reset_index(inplace=True)

In [7]:
index_list_high = epi_2020['Environmental Performance Index'].sort_values(ascending=False).head(10).index

In [8]:
epi_2020.loc[158, 'country_name']

'Denmark'

In [83]:
def country_list(num_countries, top=True):
    if top:
        index_list_high = epi_2020['Environmental Performance Index'].sort_values(ascending=False).head(num_countries).index
        top_country_list = []
        for index in index_list_high:
            top_country_list.append(epi_2020.loc[index, 'country_name'])
        return top_country_list
    else:
        index_list_low = epi_2020['Environmental Performance Index'].sort_values(ascending=False).tail(num_countries).index
        bot_country_list = []
        for index in index_list_low:
            bot_country_list.append(epi_2020.loc[index, 'country_name'])
        return bot_country_list

In [84]:
top_function = country_list(10, top=False)
top_function

['Burundi',
 'Chad',
 'Solomon Islands',
 'Madagascar',
 'Guinea',
 "Cote d'Ivoire",
 'Sierra Leone',
 'Afghanistan',
 'Myanmar',
 'Liberia']

In [11]:
top_country_list = []
for index in index_list_high:
    top_country_list.append(epi_2020.loc[index, 'country_name'])

In [12]:
top_country_list

['Denmark',
 'Luxembourg',
 'Switzerland',
 'United Kingdom',
 'France',
 'Austria',
 'Finland',
 'Sweden',
 'Norway',
 'Germany']

In [13]:
index_list_low = epi_2020['Environmental Performance Index'].sort_values(ascending=False).tail(10).index

In [14]:
index_list_low

Int64Index([141, 44, 16, 19, 170, 118, 24, 96, 52, 103], dtype='int64')

In [15]:
bot_country_list = []
for index in index_list_low:
    bot_country_list.append(epi_2020.loc[index, 'country_name'])

In [16]:
bot_country_list

['Burundi',
 'Chad',
 'Solomon Islands',
 'Madagascar',
 'Guinea',
 "Cote d'Ivoire",
 'Sierra Leone',
 'Afghanistan',
 'Myanmar',
 'Liberia']

In [17]:
epi_2020[epi_2020['country_name'].isin(top_country_list)]


Unnamed: 0,index,year,country_name,Sanitation & Drinking Water,Unsafe drinking water,PM2.5 Exposure,Air Quality,Marine Protected Areas,Biodiversity & Habitat,Ecosystem Vitality,...,Agriculture,Fisheries,PM2.5 Exceedance,Household Air Quality,Access to Electricity,Health Impacts,Agricultural Subsidies,Child Mortality,Pesticide Regulation,GDP
39,759,2020,Luxembourg,98.6,97.7,81.4,87.2,0.0,85.5,75.4,...,42.2,0.0,89.68,95.0,100.0,85.52,38.67,94.79,92.0,77233180000.0
42,816,2020,Germany,99.0,98.4,70.4,81.1,100.0,88.8,68.9,...,61.9,14.0,85.12,95.0,100.0,84.6,38.62,100.0,92.0,4782660000000.0
56,1082,2020,France,96.2,93.6,82.2,88.1,100.0,88.3,72.3,...,65.2,12.1,92.1,95.0,100.0,88.25,39.09,100.0,92.0,3419580000000.0
64,1262,2020,United Kingdom,100.0,100.0,75.4,84.7,100.0,88.0,74.3,...,54.3,8.8,97.57,95.0,100.0,95.26,36.07,100.0,96.0,3337150000000.0
107,2096,2020,Austria,94.7,100.0,73.9,81.3,0.0,85.5,74.0,...,68.0,0.0,81.13,95.0,100.0,87.04,38.98,100.0,92.0,536335000000.0
131,2552,2020,Norway,100.0,100.0,100.0,97.9,43.8,71.5,63.8,...,39.3,9.8,100.0,95.0,100.0,100.0,1.19,100.0,92.0,374384000000.0
150,2928,2020,Switzerland,100.0,100.0,87.8,90.6,0.0,63.0,72.5,...,47.6,0.0,92.1,95.0,100.0,79.31,6.48,100.0,92.0,620612000000.0
151,2962,2020,Finland,100.0,100.0,100.0,98.8,100.0,75.5,65.3,...,52.4,12.8,100.0,95.0,100.0,99.35,37.98,100.0,96.0,293524000000.0
158,3095,2020,Denmark,97.4,95.7,78.8,85.5,100.0,81.7,76.4,...,73.0,13.2,97.53,95.0,100.0,96.19,40.11,100.0,92.0,361273000000.0
168,3285,2020,Sweden,98.5,97.7,100.0,98.2,100.0,72.5,65.6,...,63.6,11.6,99.57,95.0,100.0,99.03,38.35,100.0,92.0,582487000000.0


# top 10 models:

In [18]:
top_10_df = df[df['country_name'].isin(top_country_list)]
bot_10_df = df[df['country_name'].isin(bot_country_list)]

# line reg:

In [19]:
df['Environmental Performance Index']

0       50.980
1       51.500
2       53.090
3       52.190
4       52.660
         ...  
3357    63.730
3358    56.625
3359    49.520
3360    40.110
3361    30.700
Name: Environmental Performance Index, Length: 3362, dtype: float64

In [20]:
precomputed = ['Ecosystem Vitality', 'Environmental Health', 'Air Quality', 'Sanitation & Drinking Water',
              'Biodiversity & Habitat']
drop_list = precomputed
drop_list.append('Environmental Performance Index')
drop_list.append('country_name')

In [75]:
len(df['country_name'].value_counts())

178

In [21]:
#  top_10_df = pd.get_dummies(top_10_df,columns=['country_name'], drop_first=False)

In [97]:
def quick_regressions(size, dataframe, column, target, top=True):
    if top:
        func_list = country_list(size, top=True)
        func_df = dataframe[dataframe[column].isin(func_list)]
    if not top:
        func_list = country_list(size, top=False)
        func_df = dataframe[dataframe[column].isin(func_list)]
        
    X = func_df.drop(columns=drop_list)
    y = func_df[target]
    X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)
    
    sc = StandardScaler()
    Xs_train = sc.fit_transform(X_train)
    Xs_test = sc.transform(X_test)
    
    #line reg
    lr = LinearRegression()
    lr.fit(X_train,y_train)
    print(f'basic line reg scores: train:{lr.score(X_train,y_train)}')
    print(f'basic line reg scores: test:{lr.score(X_test,y_test)}')
    
    coefs = pd.DataFrame(zip(lr.coef_, X.columns), columns=['lr_coef', 'feature'])
    coefs.sort_values('lr_coef',ascending=False, inplace=True)
    coefs.reset_index(inplace=True)

    #lasso
    lasso = Lasso(random_state = 42, selection = 'random')

    lasso_pipe_params = {
        'alpha': loguniform(.001, 100),
        'tol': loguniform(.00001, .001),
        'max_iter': uniform(100, 10000)
    }

    lasso_rs = RandomizedSearchCV(lasso, param_distributions = lasso_pipe_params, 
                             cv = 5,n_iter=1000,verbose=1, n_jobs = -2, random_state = 42)
    lasso_rs.fit(Xs_train, y_train)
    print(f'lasso line reg scores: train:{lasso_rs.score(Xs_train,y_train)}')
    print(f'lasso line reg scores: test:{lasso_rs.score(Xs_test,y_test)}')

    coefs_lasso = pd.DataFrame(zip(lasso_rs.best_estimator_.coef_, X.columns), columns=['lasso_coef', 'feature'])
    coefs_lasso.sort_values('lasso_coef',ascending=False, inplace=True)
    coefs_lasso.reset_index(inplace=True)

    
    #ridge
    ridge = Ridge(random_state = 42)

    ridge_pipe_params = {
        'alpha': loguniform(.001, 100),
        'tol': loguniform(.00001, .001),
        'max_iter': uniform(100, 10000)
    }

    ridge_rs = RandomizedSearchCV(ridge, param_distributions = ridge_pipe_params,
                                  cv = 5, n_jobs = -2,n_iter=1000,verbose=1, random_state = 42)
    ridge_rs.fit(Xs_train, y_train)
    print(f'ridge line reg scores: train:{ridge_rs.score(Xs_train,y_train)}')
    print(f'ridge line reg scores: test:{ridge_rs.score(Xs_test,y_test)}')
    
    coefs_ridge = pd.DataFrame(zip(ridge_rs.best_estimator_.coef_, X.columns), columns=['ridge_coef', 'feature'])
    coefs_ridge.sort_values('ridge_coef',ascending=False, inplace=True)
    coefs_ridge.reset_index(inplace=True)
    
    
    return coefs, coefs_lasso, coefs_ridge

In [99]:
lr_coefs,lasso_coefs,ridge_coefs = quick_regressions(10,df,'country_name','Environmental Performance Index')

basic line reg scores: train:0.78424269976424
basic line reg scores: test:0.46661868670022055
Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


  model = cd_fast.enet_coordinate_descent(


lasso line reg scores: train:0.7841529070989233
lasso line reg scores: test:0.4703925633212023
Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
ridge line reg scores: train:0.7838209221603694
ridge line reg scores: test:0.4822377302677886


In [100]:
lr_coefs

Unnamed: 0,index,lr_coef,feature
0,17,0.5341023,Child Mortality
1,1,0.479586,Unsafe drinking water
2,8,0.4622853,Unsafe sanitation
3,0,0.3326541,year
4,11,0.2567794,Fisheries
5,4,0.1490971,Wastewater Treatment
6,5,0.1490971,Water Resources
7,2,0.1220077,PM2.5 Exposure
8,7,0.1098345,Terrestrial biome protection (national weights)
9,10,0.08932496,Agriculture


In [101]:
lasso_coefs

Unnamed: 0,index,lasso_coef,feature
0,11,4.884657,Fisheries
1,5,1.893622,Water Resources
2,0,1.80344,year
3,7,1.681097,Terrestrial biome protection (national weights)
4,9,1.258563,Terrestrial biome protection (global weights)
5,2,1.189123,PM2.5 Exposure
6,10,1.102158,Agriculture
7,8,0.912962,Unsafe sanitation
8,17,0.826105,Child Mortality
9,4,0.368869,Wastewater Treatment


In [102]:
ridge_coefs

Unnamed: 0,index,ridge_coef,feature
0,11,4.561395,Fisheries
1,7,1.941755,Terrestrial biome protection (national weights)
2,0,1.800101,year
3,10,1.149651,Agriculture
4,4,1.117919,Wastewater Treatment
5,5,1.117919,Water Resources
6,2,1.10262,PM2.5 Exposure
7,8,0.89939,Unsafe sanitation
8,9,0.854548,Terrestrial biome protection (global weights)
9,17,0.791315,Child Mortality


In [88]:
quick_regressions(10,df,'country_name','Environmental Performance Index', top=False)

basic line reg scores: train:0.9179824378864077
basic line reg scores: test:0.702150309186578
Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
lasso line reg scores: train:0.9179758620234153
lasso line reg scores: test:0.7031394593845053
Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
ridge line reg scores: train:0.9179824340078377
ridge line reg scores: test:0.7022137072597212


In [72]:
quick_regressions(25,df,'country_name','Environmental Performance Index')

basic line reg scores: train:0.6486158146747809
basic line reg scores: test:0.6760579025749429
Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


  model = cd_fast.enet_coordinate_descent(


lasso line reg scores: train:0.6484350800484957
lasso line reg scores: test:0.6795102236232411
Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
ridge line reg scores: train:0.6483660583935191
ridge line reg scores: test:0.6805997270174287


In [87]:
quick_regressions(25,df,'country_name','Environmental Performance Index', top=False)

basic line reg scores: train:0.8606041008070284
basic line reg scores: test:0.8595296857024323
Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
lasso line reg scores: train:0.8594997721550421
lasso line reg scores: test:0.8653023089513517
Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
ridge line reg scores: train:0.8595035542271364
ridge line reg scores: test:0.8657230883890634


In [103]:
lr_coefs,lasso_coefs,ridge_coefs = quick_regressions(50,df,'country_name','Environmental Performance Index')

basic line reg scores: train:0.8440435852408574
basic line reg scores: test:0.8536785905608141
Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
lasso line reg scores: train:0.8434054455949893
lasso line reg scores: test:0.8523438774999561
Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
ridge line reg scores: train:0.8437841256992428
ridge line reg scores: test:0.8538292516969871


In [104]:
lr_coefs

Unnamed: 0,index,lr_coef,feature
0,1,0.1499746,Unsafe drinking water
1,10,0.1214775,Agriculture
2,9,0.1129001,Terrestrial biome protection (global weights)
3,15,0.09511861,Health Impacts
4,5,0.08132196,Water Resources
5,4,0.08132196,Wastewater Treatment
6,0,0.06549245,year
7,12,0.06516915,PM2.5 Exceedance
8,8,0.05713509,Unsafe sanitation
9,14,0.05270767,Access to Electricity


In [105]:
lasso_coefs

Unnamed: 0,index,lasso_coef,feature
0,5,3.191184,Water Resources
1,1,2.446094,Unsafe drinking water
2,9,2.329401,Terrestrial biome protection (global weights)
3,10,2.155958,Agriculture
4,12,1.454824,PM2.5 Exceedance
5,4,1.303749,Wastewater Treatment
6,8,1.179147,Unsafe sanitation
7,15,0.808976,Health Impacts
8,14,0.359556,Access to Electricity
9,11,0.351256,Fisheries


In [106]:
ridge_coefs

Unnamed: 0,index,ridge_coef,feature
0,1,2.339906,Unsafe drinking water
1,9,2.297625,Terrestrial biome protection (global weights)
2,5,2.26596,Water Resources
3,4,2.26596,Wastewater Treatment
4,10,2.24709,Agriculture
5,12,1.443858,PM2.5 Exceedance
6,8,1.220963,Unsafe sanitation
7,15,0.852199,Health Impacts
8,11,0.531188,Fisheries
9,14,0.427044,Access to Electricity


In [107]:
lr_coefs,lasso_coefs,ridge_coefs = quick_regressions(50,df,'country_name','Environmental Performance Index', top=False)

basic line reg scores: train:0.819469212085264
basic line reg scores: test:0.7872904962378657
Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
lasso line reg scores: train:0.8192660944551404
lasso line reg scores: test:0.7881463567302338
Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
ridge line reg scores: train:0.8188289163517588
ridge line reg scores: test:0.7880136478893423


In [108]:
lr_coefs

Unnamed: 0,index,lr_coef,feature
0,5,0.2556765,Water Resources
1,0,0.2222111,year
2,8,0.1762799,Unsafe sanitation
3,15,0.1199866,Health Impacts
4,2,0.09590089,PM2.5 Exposure
5,11,0.09012456,Fisheries
6,7,0.07117025,Terrestrial biome protection (national weights)
7,9,0.06323472,Terrestrial biome protection (global weights)
8,13,0.0514391,Household Air Quality
9,10,0.02469145,Agriculture


In [109]:
lasso_coefs

Unnamed: 0,index,lasso_coef,feature
0,8,2.732334,Unsafe sanitation
1,7,2.428468,Terrestrial biome protection (national weights)
2,2,2.357742,PM2.5 Exposure
3,5,2.127292,Water Resources
4,15,2.108492,Health Impacts
5,9,2.039589,Terrestrial biome protection (global weights)
6,11,1.857057,Fisheries
7,0,1.167935,year
8,13,1.125892,Household Air Quality
9,10,0.62607,Agriculture


In [110]:
ridge_coefs

Unnamed: 0,index,ridge_coef,feature
0,8,2.616663,Unsafe sanitation
1,7,2.375113,Terrestrial biome protection (national weights)
2,2,2.221111,PM2.5 Exposure
3,9,2.055893,Terrestrial biome protection (global weights)
4,15,1.910047,Health Impacts
5,11,1.747561,Fisheries
6,5,1.309347,Water Resources
7,0,1.149936,year
8,13,1.11198,Household Air Quality
9,4,0.850648,Wastewater Treatment


In [89]:
quick_regressions(75,df,'country_name','Environmental Performance Index')

basic line reg scores: train:0.8821372076389266
basic line reg scores: test:0.8918323186859808
Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
lasso line reg scores: train:0.8820027090667375
lasso line reg scores: test:0.8911694912967417
Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
ridge line reg scores: train:0.882005795219131
ridge line reg scores: test:0.8914302918166839


In [90]:
quick_regressions(75,df,'country_name','Environmental Performance Index', top=False)

basic line reg scores: train:0.8477949240990064
basic line reg scores: test:0.8484801596228363
Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
lasso line reg scores: train:0.8477722424127593
lasso line reg scores: test:0.8485043760555604
Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
ridge line reg scores: train:0.8475449574898535
ridge line reg scores: test:0.848473402887403


In [23]:
# top_10_df.drop(columns=f'country_name_{top_country_list[0]}', inplace=True)

In [24]:
X = top_10_df.drop(columns=drop_list)
y = top_10_df['Environmental Performance Index']

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)

In [26]:
lr = LinearRegression()

In [27]:
lr.fit(X_train,y_train)

LinearRegression()

In [28]:
lr.score(X_train,y_train), lr.score(X_test,y_test)

(0.78424269976424, 0.46661868670022055)

In [29]:
print(list(zip(lr.coef_, X.columns)))

[(0.33265414917253827, 'year'), (0.4795860047773833, 'Unsafe drinking water'), (0.12200769341043965, 'PM2.5 Exposure'), (-0.020880069102779668, 'Marine Protected Areas'), (0.1490971193366169, 'Wastewater Treatment'), (0.14909711933661635, 'Water Resources'), (-0.1544882113532733, 'Fish Stock Status'), (0.10983448267713258, 'Terrestrial biome protection (national weights)'), (0.4622853368208674, 'Unsafe sanitation'), (0.04523350383700828, 'Terrestrial biome protection (global weights)'), (0.08932495511847308, 'Agriculture'), (0.25677937496044556, 'Fisheries'), (-0.08317968068073256, 'PM2.5 Exceedance'), (5.551115123125783e-17, 'Household Air Quality'), (1.942890293094024e-16, 'Access to Electricity'), (-0.15430382319155625, 'Health Impacts'), (-0.1261517774057178, 'Agricultural Subsidies'), (0.5341022810712324, 'Child Mortality'), (-0.042934725765342734, 'Pesticide Regulation'), (-1.6214807274650411e-12, 'GDP')]


In [30]:
coefs = pd.DataFrame(zip(lr.coef_, X.columns), columns=['coef', 'feature'])

In [31]:
coefs = pd.DataFrame(zip(lr.coef_, X.columns), columns=['coef', 'feature'])
coefs.sort_values('coef',ascending=False, inplace=True)
coefs.reset_index(inplace=True)
coefs

Unnamed: 0,index,coef,feature
0,17,0.5341023,Child Mortality
1,1,0.479586,Unsafe drinking water
2,8,0.4622853,Unsafe sanitation
3,0,0.3326541,year
4,11,0.2567794,Fisheries
5,4,0.1490971,Wastewater Treatment
6,5,0.1490971,Water Resources
7,2,0.1220077,PM2.5 Exposure
8,7,0.1098345,Terrestrial biome protection (national weights)
9,10,0.08932496,Agriculture


In [32]:
 sc = StandardScaler()
    
Xs_train = sc.fit_transform(X_train)
Xs_test = sc.transform(X_test)

#LASSO    
lasso = Lasso(random_state = 42, selection = 'random')

lasso_pipe_params = {
    'alpha': loguniform(.001, 100),
    'tol': loguniform(.00001, .001),
    'max_iter': uniform(100, 10000)
}

lasso_rs = RandomizedSearchCV(lasso, param_distributions = lasso_pipe_params, 
                         cv = 5,n_iter=1000,verbose=1, n_jobs = -2, random_state = 42)

lasso_rs.fit(Xs_train, y_train)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


  model = cd_fast.enet_coordinate_descent(


RandomizedSearchCV(cv=5, estimator=Lasso(random_state=42, selection='random'),
                   n_iter=1000, n_jobs=-2,
                   param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000024B6D62D2E0>,
                                        'max_iter': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000024B6D62D6A0>,
                                        'tol': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000024B6D62D430>},
                   random_state=42, verbose=1)

In [33]:
lasso_rs.score(Xs_train,y_train), lasso_rs.score(Xs_test,y_test)

(0.7841529070989233, 0.4703925633212023)

In [34]:
lasso_rs.best_params_

{'alpha': 0.0011495174910584793,
 'max_iter': 190.3846907929218,
 'tol': 5.1767678294596205e-05}

In [35]:
ridge = Ridge(random_state = 42)

ridge_pipe_params = {
    'alpha': loguniform(.001, 100),
    'tol': loguniform(.00001, .001),
    'max_iter': uniform(100, 10000)
}

ridge_rs = RandomizedSearchCV(ridge, param_distributions = ridge_pipe_params, cv = 5, n_jobs = -2,n_iter=1000,verbose=1, random_state = 42)
ridge_rs.fit(Xs_train, y_train)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


RandomizedSearchCV(cv=5, estimator=Ridge(random_state=42), n_iter=1000,
                   n_jobs=-2,
                   param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000024B6DA62520>,
                                        'max_iter': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000024B6DA62970>,
                                        'tol': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000024B6D623910>},
                   random_state=42, verbose=1)

In [36]:
ridge_rs.score(Xs_train,y_train), ridge_rs.score(Xs_test,y_test)

(0.7838209221603694, 0.4822377302677886)

In [96]:
ridge_rs.best_estimator_.coef_

array([ 1.80010089,  0.32428997,  1.10262046, -0.81914827,  1.11791877,
        1.11791877, -2.93474217,  1.94175457,  0.8993905 ,  0.85454772,
        1.14965139,  4.56139542, -1.41727167,  0.        ,  0.        ,
       -0.71158904, -1.81537928,  0.79131545, -0.2644986 , -2.03326178])

In [37]:
ridge_rs.best_params_

{'alpha': 0.46756001375625666,
 'max_iter': 1521.3712379345766,
 'tol': 1.055390599306468e-05}

pre2015

In [40]:
top_10_df_pre15 = top_10_df[top_10_df['year'] <= 2015]
top_10_df_post15 = top_10_df[top_10_df['year'] > 2015]

In [39]:
top_10_df_pre15

Unnamed: 0,year,country_name,Sanitation & Drinking Water,Unsafe drinking water,PM2.5 Exposure,Air Quality,Marine Protected Areas,Biodiversity & Habitat,Ecosystem Vitality,Wastewater Treatment,...,Agriculture,Fisheries,PM2.5 Exceedance,Household Air Quality,Access to Electricity,Health Impacts,Agricultural Subsidies,Child Mortality,Pesticide Regulation,GDP
741,2002,Luxembourg,100.000,100.0,76.61,74.350,0.000,100.00,75.080,95.00,...,25.11,0.00,51.440,95.0,100.0,94.210,10.22,94.21,40.0,2.361633e+10
742,2003,Luxembourg,100.000,100.0,83.99,80.770,0.000,100.00,75.250,95.00,...,28.26,0.00,63.310,95.0,100.0,94.700,8.51,94.70,48.0,2.955733e+10
743,2004,Luxembourg,100.000,100.0,82.99,80.670,0.000,100.00,76.500,95.00,...,50.75,0.00,64.030,95.0,100.0,95.150,9.50,95.15,92.0,3.468528e+10
744,2005,Luxembourg,100.000,100.0,82.70,80.220,0.000,100.00,76.680,95.00,...,54.01,0.00,62.950,95.0,100.0,95.490,16.02,95.49,92.0,3.734739e+10
745,2006,Luxembourg,100.000,100.0,81.49,79.690,0.000,100.00,76.770,95.00,...,55.62,0.00,62.590,95.0,100.0,95.700,19.25,95.70,92.0,4.241431e+10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3276,2011,Sweden,100.000,100.0,100.00,96.650,89.540,62.38,64.120,87.86,...,65.18,25.30,94.960,95.0,100.0,100.000,38.35,100.00,92.0,5.630000e+11
3277,2012,Sweden,100.000,100.0,100.00,97.130,89.540,62.38,64.120,87.86,...,65.18,25.30,96.400,95.0,100.0,100.000,38.35,100.00,92.0,5.440000e+11
3278,2013,Sweden,100.000,100.0,100.00,97.130,89.540,62.38,64.120,87.86,...,65.18,25.30,96.400,95.0,100.0,100.000,38.35,100.00,92.0,5.790000e+11
3279,2014,Sweden,100.000,100.0,100.00,97.130,89.540,62.38,64.120,87.86,...,65.18,25.30,96.400,95.0,100.0,100.000,38.35,100.00,92.0,5.740000e+11


In [41]:
top_10_df_post15

Unnamed: 0,year,country_name,Sanitation & Drinking Water,Unsafe drinking water,PM2.5 Exposure,Air Quality,Marine Protected Areas,Biodiversity & Habitat,Ecosystem Vitality,Wastewater Treatment,...,Agriculture,Fisheries,PM2.5 Exceedance,Household Air Quality,Access to Electricity,Health Impacts,Agricultural Subsidies,Child Mortality,Pesticide Regulation,GDP
755,2016,Luxembourg,98.57,100.0,100.0,82.54,0.0,100.0,84.29,99.3,...,54.6,0.0,83.95,95.0,100.0,85.52,38.67,94.79,92.0,64406170000.0
756,2017,Luxembourg,98.935,99.295,97.085,88.08,0.0,98.27,76.385,99.53,...,46.33,0.0,86.815,95.0,100.0,85.52,38.67,94.79,92.0,67187250000.0
757,2018,Luxembourg,99.3,98.59,94.17,93.62,0.0,96.54,68.48,99.76,...,38.06,0.0,89.68,95.0,100.0,85.52,38.67,94.79,92.0,73152140000.0
758,2019,Luxembourg,98.95,98.145,87.785,90.41,0.0,91.02,71.94,99.13,...,40.13,0.0,89.68,95.0,100.0,85.52,38.67,94.79,92.0,77233180000.0
759,2020,Luxembourg,98.6,97.7,81.4,87.2,0.0,85.5,75.4,98.5,...,42.2,0.0,89.68,95.0,100.0,85.52,38.67,94.79,92.0,77233180000.0
812,2016,Germany,99.5,100.0,76.72,69.88,100.0,100.0,83.87,98.59,...,68.35,34.04,57.44,95.0,100.0,84.6,38.62,100.0,92.0,4165170000000.0
813,2017,Germany,98.12,99.055,79.26,76.985,100.0,98.46,77.685,99.12,...,64.78,40.875,71.28,95.0,100.0,84.6,38.62,100.0,92.0,4390900000000.0
814,2018,Germany,96.74,98.11,81.8,84.09,100.0,96.92,71.5,99.65,...,61.21,47.71,85.12,95.0,100.0,84.6,38.62,100.0,92.0,4699850000000.0
815,2019,Germany,97.87,98.255,76.1,82.595,100.0,92.86,70.2,98.325,...,61.555,30.855,85.12,95.0,100.0,84.6,38.62,100.0,92.0,4782660000000.0
816,2020,Germany,99.0,98.4,70.4,81.1,100.0,88.8,68.9,97.0,...,61.9,14.0,85.12,95.0,100.0,84.6,38.62,100.0,92.0,4782660000000.0


In [52]:
X = top_10_df_post15.drop(columns=drop_list)
y = top_10_df_post15['Environmental Performance Index']

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42, test_size=.5)

In [54]:
lr = LinearRegression()

In [55]:
lr.fit(X_train,y_train)

LinearRegression()

In [56]:
lr.score(X_train,y_train), lr.score(X_test,y_test)

(0.992074024864568, 0.13724681565531882)