# <div style="text-align: center"> <font size=+5> <ins>PREDICT ANSWERS from INDICATORS</ins> </font> </div>
___


### Imports and settings

In [1]:
#imports 
import numpy as np
import pandas as pd

import os

import ipywidgets as widgets

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression


from sklearn.model_selection import train_test_split

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error


import tensorflow as tf
import statsmodels as sm
from sklearn.ensemble import GradientBoostingRegressor

from hyperopt import fmin, tpe, hp, Trials, space_eval, STATUS_OK, STATUS_FAIL
from hyperopt.pyll import scope as ho_scope

2024-11-03 09:00:07.937006: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
#settings
pd.set_option('display.max_columns', 500)

plt.rcParams.update({'figure.figsize': [6.0, 4.0],
                     'figure.dpi' : 300,
                     'font.size': 12.0, 
                     'mathtext.fontset' : 'stix', 
                     'font.family' : 'STIXGeneral'
                    })

___
## Get datasets

In [3]:
#get dataset
full_dataset = pd.read_pickle("../Saved_variables/full_dataset.pickle")

#convert categorical indicators to numerical
conv_dict_respData = {
        "Sex" : {"Male" : -1, "Female" : 1},
        "Education level" : {"No degree" : 0, "Primary" : 1, "Secondary" : 2, "University" : 3},
        "Living area" : {"Rural area" : 0, "Small city" : 1, "Big city" : 2},
        "Political orientation" : {"Far Left" : -3, "Left" : -2, "Center Left" : -1, "Center" : 0, "None/Other" : 0, "Center Right" : 1, "Right" : 2, "Far Right" : 3},
        "Religious beliefs" : {"Atheist" : 0, "Christian" : 1, "Islamic" : 1, "Jewish" : 1, "Other" : 1}
    }

data_resp_numerical = full_dataset.loc[:, ("Respondent Data", slice(None))].apply(lambda col: col if col.name[1] not in conv_dict_respData else col.map(lambda x: np.nan if pd.isna(x) else conv_dict_respData[col.name[1]][x]).astype(float))
full_dataset = full_dataset.drop(columns=["Respondent Data"])
full_dataset = pd.concat([data_resp_numerical, full_dataset], axis=1)

  full_dataset = full_dataset.drop(columns=["Respondent Data"])


___

In [4]:
#rename macro indicators columns
newcols = {
    "GHG_emissions"     : "Greenhouse gases emissions",
    "PM25_exposure"     : "PM25 exposure",
    "GDP_growth"        : "GDP growth",
    "gini_index"        : "Gini index",
    "GDP_pcap_ppp"      : "GDP per capita (Purchasing Power Parity)",
    "water_withdrawals" : "Water withdrawals",
    "pesticides_use"    : "Pesticides use",
    "GDP_pcap"          : "GDP per capita",
    "internet_usage"    : "Internet usage",
    "CO2_emissions_per_capita" : "CO2 emissions per capita",
    "infant_mortality"  : "Infant Mortality"
          }

newcols = {("Country-wide indicators", key) : ("Country-wide indicators", value) for key, value in newcols.items()}

full_dataset.columns = full_dataset.columns.map(lambda x: newcols.get(x,x))

___
## TensorFlow multi-level linear model

- Fixed intercept (to start at average answer)
- Fixed slope for country-level predictors
- Slope varies by country for individual-level predictors

In [None]:
# Define inputs
macro_indicators = ["PM25 exposure", "GDP growth", "Gini index", "GDP per capita (Purchasing Power Parity)", "Pesticides use", "CO2 emissions per capita"]
micro_indicators = ['Sex', 'Age', 'Education level', 'Personal income', 'Living area', 'Political orientation', 'Religious beliefs']

In [35]:
# Set up TensorFlow Model

# Extract unique country tags
unique_countries = full_dataset.index.get_level_values("Country").unique().sort_values()


# Define custom layer which uses different kernels for each country
class CustomDenseLayer(tf.keras.layers.Layer):
    def __init__(self, unique_countries, output_dim, **kwargs):
        self.unique_countries = unique_countries
        self.num_countries = len(unique_countries)
        self.output_dim = output_dim
        super().__init__(**kwargs)
    
    def build(self, input_shape):
        # Create a kernel for each country
        self.kernels = {
            country: self.add_weight(
                name=f'kernel_{country}',
                shape=(input_shape[-1], self.output_dim),
                initializer='zeros',
                trainable=True
            ) for country in self.unique_countries
        }
        super().build(input_shape)
    
    def call(self, inputs, country_indices):
        # Apply the appropriate kernel to each input based on the country index
        outputs = []
        for i, country in enumerate(self.unique_countries):
            mask = tf.cast(tf.equal(country_indices, i), tf.float32)
            masked_input = inputs * tf.expand_dims(mask, -1)
            output = tf.matmul(masked_input, self.kernels[country])
            outputs.append(output)
        result = tf.reduce_sum(tf.stack(outputs, axis=0), axis=0)[0]
        return tf.reshape(result, (-1, self.output_dim))

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_dim)


class CustomSumLayer(tf.keras.layers.Layer):
    def __init__(self,**kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        self.bias = self.add_weight(
            name='bias',
            initializer='zeros',
            trainable=True,
            shape=(input_shape[-1])
        )
        super().build(input_shape)

    def call(self, input1, input2):
        return input1 + input2 + self.bias

    def compute_output_shape(self, input_shape):
        return (input_shape[0], 1)

num_macro_ind = len(macro_indicators)
num_micro_ind = len(micro_indicators)

#create blank model (weights initialized to zero)
def new_model_tf():
    input_macro = tf.keras.layers.Input(shape=(num_macro_ind,), name="Macro_data")
    input_micro = tf.keras.layers.Input(shape=(num_micro_ind,), name="Micro_data")
    input_country_indices = tf.keras.layers.Input(shape=(1,), dtype=tf.int32, name="Country_indices")
    
    # Define shared weight layer for macro indicators
    macro_dense_layer = tf.keras.layers.Dense(1, kernel_initializer="zeros", use_bias=False, name="Macro_dense_layer")(input_macro)
    #macro_dense_layer = tf.keras.layers.Dense(1, use_bias=False, name="Macro_dense_layer")(input_macro)
    
    # Define custom weight layer for micro indicators
    custom_dense_layer = CustomDenseLayer(unique_countries, 1, name="Micro_custom_layer")
    micro_custom_layer = custom_dense_layer(input_micro, input_country_indices)
    
    # Define the output layer
    custom_sum_layer = CustomSumLayer(name="Get_prediction")
    output = custom_sum_layer(macro_dense_layer, micro_custom_layer)
    
    # Create the model
    model = tf.keras.models.Model(inputs=[input_macro, input_micro, input_country_indices], outputs=output, name="Model")
    
    # Compile the model
    model.compile(optimizer='adam', loss='mse')

    return model

In [36]:
# Preprocessing data for the model
def preprocess_data_tf(df, year, macro_ind, micro_ind, target):
    # select year
    df = df.loc[year]
    
    # fill missing individual values with country average and macro indicator with year average. Return which countries and indicators are missing for further processing
    # if individual indicator is missing for whole country replace with 0.
    missing_values = df.groupby("Country").mean().map(lambda x: np.isnan(x)) # mark which countries have missing data
    
    df_micro_filled = df.loc[:,("Respondent Data", slice(None))].fillna(df.loc[:,("Respondent Data", slice(None))].groupby("Country").mean()) #fill individual missing data with country average
    df_micro_filled = df_micro_filled.fillna(0) #if whole individual indicator is missing fill with 0 (indicator for country will be marked as true in missing_values dataframe)
    
    df_macro_filled = df.loc[:,("Country-wide indicators", slice(None))].fillna(df.loc[:,("Country-wide indicators", slice(None))].mean()) #fill macro indicator with average
    
    df_filled = pd.concat([df_micro_filled, df_macro_filled, df.loc[:,("Environmental concern dimension", slice(None))]], axis=1) #get dataframe

    #train-test split
    df_train, df_test = train_test_split(df_filled, test_size=0.2)    

    #normalization
    scaler = StandardScaler()
    df_train.iloc[:] = scaler.fit_transform(df_train)
    df_test.iloc[:] = scaler.transform(df_test)
    

    #split data into macro and micro indicators
    X_macro_train = df_train.loc[:, df_train.columns.get_level_values("Variable Name").isin(macro_ind)].values
    X_micro_train = df_train.loc[:, df_train.columns.get_level_values("Variable Name").isin(micro_ind)].values
    y_train = df_train.loc[:, ("Environmental concern dimension", target)].values
    country_indices_train = df_train.index.get_level_values("Country").to_series().map({country: i for i, country in enumerate(unique_countries)}).values
    
    X_macro_test = df_test.loc[:, df_test.columns.get_level_values("Variable Name").isin(macro_ind)].values
    X_micro_test = df_test.loc[:, df_test.columns.get_level_values("Variable Name").isin(micro_ind)].values
    y_test = df_test.loc[:, ("Environmental concern dimension", target)].values
    country_indices_test = df_test.index.get_level_values("Country").to_series().map({country: i for i, country in enumerate(unique_countries)}).values
    
    
    return [X_macro_train, X_micro_train, country_indices_train], y_train, [X_macro_test, X_micro_test, country_indices_test], y_test, missing_values

In [47]:
# Fitting Model

index_micro = pd.MultiIndex.from_product([
    ['Awareness', 'Will to make sacrifices','Efficacy of environmental action'],
    [1993, 2000, 2010, 2020],
    unique_countries],
    names = ["Environmental Concern Dimension", "Year", "Country"]
                                )
index_macro = pd.MultiIndex.from_product([
    ['Awareness', 'Will to make sacrifices', 'Efficacy of environmental action'],
    [1993, 2000, 2010, 2020]],
    names = ["Environmental Concern Dimension", "Year"]
                                )



predictive_indicators_individual_tensorflowModel = pd.DataFrame(index=index_micro, columns=pd.Index(micro_indicators, name="Indicators"))
predictive_indicators_macro_tensorflowModel = pd.DataFrame(index=index_macro, columns=pd.Index(macro_indicators, name="Indicators"))
losses = pd.DataFrame(index = index_micro, columns=pd.Index(["r2", "mae", "mse"], name="Losses"))
histories = pd.Series(index = index_macro)

for target in ['Awareness', 'Will to make sacrifices', 'Efficacy of environmental action']:
    for year in [1993, 2000, 2010, 2020]:
        print(f"Training custom TensorFlow model for '{target}' in {year}\n")    
        X_train, y_train, X_test, y_test, missing_data = preprocess_data_tf(full_dataset, year, macro_indicators, micro_indicators, target)       
        model = new_model_tf()

        #fit and get weights
        earlystop = tf.keras.callbacks.EarlyStopping(monitor="loss", patience=8, restore_best_weights=True, start_from_epoch=5)
        
        hist = model.fit(X_train, y_train, epochs=50, batch_size=None, verbose=3, callbacks=[earlystop])
        histories.loc[(target, year)] = hist
        predictive_indicators_individual_tensorflowModel.loc[(target, year, slice(None)),:] = pd.DataFrame.from_dict(model.get_layer("Micro_custom_layer").kernels).map(lambda x: x.numpy()[0]).T.values
        predictive_indicators_macro_tensorflowModel.loc[(target, year),:] = pd.DataFrame.from_dict(model.get_layer("Macro_dense_layer").weights).map(lambda x: x.numpy()[0]).values

        #get predictions and evaluate model
        y_pred = model.predict(X_test, verbose=0).squeeze()

        for i, country in enumerate(unique_countries):
            y_test_country = [y for k,y in enumerate(y_test) if X_test[2][k] == i]
            y_pred_country = [y for k,y in enumerate(y_pred) if X_test[2][k] == i]

            if len(y_test_country) > 0:
                r2_loss = r2_score(y_test_country, y_pred_country)
                mae = mean_absolute_error(y_test_country, y_pred_country)
                mse = mean_squared_error(y_test_country, y_pred_country)
                losses.loc[(target, year, country),:] = r2_loss, mae, mse

        del(model, hist)
        
        print("\n\n")


predictive_indicators_individual_tensorflowModel = pd.concat([predictive_indicators_individual_tensorflowModel, losses], axis=1, keys=["Indicators", "Loss"])
predictive_indicators_macro_tensorflowModel = pd.concat([predictive_indicators_macro_tensorflowModel, losses.groupby(["Environmental Concern Dimension", "Year"]).mean()], axis=1, keys=["Indicators", "Loss"])

index = [idx in full_dataset.index.droplevel(2) for idx in predictive_indicators_individual_tensorflowModel.index.droplevel(0)]
predictive_indicators_individual_tensorflowModel = predictive_indicators_individual_tensorflowModel.loc[index]

Training custom TensorFlow model for 'Awareness' in 1993

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50


  histories.loc[(target, year)] = hist





Training custom TensorFlow model for 'Awareness' in 2000

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50



Training custom TensorFlow model for 'Awareness' in 2010

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50



Training custom TensorFlow model for 'Awareness' in 2020

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50



Training custom TensorFlow model for 'Will to make sacrifices' in 1993

E

In [42]:
#Save data
predictive_indicators_individual_tensorflowModel.to_pickle("../Saved_variables/predictive_indicators_individual_tensorflow.pickle")
predictive_indicators_macro_tensorflowModel.to_pickle("../Saved_variables/predictive_indicators_macro_tensorflow.pickle")

___

In [3]:
#quick load
predictive_indicators_individual_tensorflowModel = pd.read_pickle("../Saved_variables/predictive_indicators_individual_tensorflow.pickle")
predictive_indicators_macro_tensorflowModel = pd.read_pickle("../Saved_variables/predictive_indicators_macro_tensorflow.pickle")

___
### Plot results

In [4]:
def cluster_dataframe(data, method="single", row_cluster=True, col_cluster=True):
    from scipy.cluster.hierarchy import linkage, dendrogram

    if row_cluster:
        row_linkage = linkage(data, method=method, optimal_ordering=True)
        row_order = dendrogram(row_linkage, no_plot=True)['leaves']
    else:
        row_order = slice(None)

    if col_cluster:
        col_linkage = linkage(data.T, method=method,  optimal_ordering=True)
        col_order = dendrogram(col_linkage, no_plot=True)['leaves']

    else:
        col_order = slice(None)
        
    abs_max = data.abs().max(axis=None)
    
    return data.iloc[row_order, col_order]

def style_dataframe(data):
    max = data.loc[:,"Indicators"].abs().max(axis=None)
    styled_df = data.style.background_gradient(cmap="bwr", axis=None, vmin=-max, vmax=max, subset="Indicators")
    
    cm = sns.light_palette("green", reverse=True, as_cmap=True)
    max = data.loc[:,("Loss", "r2")].max(axis=None)
    min = data.loc[:,("Loss", "r2")].min(axis=None)
    styled_df = styled_df.background_gradient(cmap=cm, subset=[("Loss", "r2")], vmin=min, vmax=max)

    cm = sns.light_palette("green", reverse=False, as_cmap=True)
    max = data.loc[:,("Loss", "mae")].max(axis=None)
    min = data.loc[:,("Loss", "mae")].min(axis=None)
    styled_df = styled_df.background_gradient(cmap=cm, subset=[("Loss", "mae")], vmin=min, vmax=max)

    cm = sns.light_palette("green", reverse=False, as_cmap=True)
    max = data.loc[:,("Loss", "mse")].max(axis=None)
    min = data.loc[:,("Loss", "mse")].min(axis=None)
    styled_df = styled_df.background_gradient(cmap=cm, subset=[("Loss", "mse")], vmin=min, vmax=max)

    
    #display(styled_df)
    return styled_df


def style_dataframe_tensorflow(data):

    try:
        max = data.loc[:,["Individual indicators", "Country-level indicators"]].abs().max(axis=None)
    except:
        try:
            max = data.loc[:,["Individual indicators"]].abs().max(axis=None)
        except:
            pass
        try:
            max = data.loc[:,["Country-level indicators"]].abs().max(axis=None)
        except:
            pass

    styled_df = data.style
    
    if "Individual indicators" in data.columns.get_level_values(0):
        styled_df = styled_df.background_gradient(cmap="bwr", axis=None, vmin=-max, vmax=max, subset=["Individual indicators"])
    if "Country-level indicators" in data.columns.get_level_values(0):
        styled_df = styled_df.background_gradient(cmap="bwr", axis=None, vmin=-max, vmax=max, subset=["Country-level indicators"])
    
    cm = sns.light_palette("green", reverse=True, as_cmap=True)
    max = data.loc[:,("Metrics", "r2")].max(axis=None)
    min = data.loc[:,("Metrics", "r2")].min(axis=None)
    styled_df = styled_df.background_gradient(cmap=cm, subset=[("Metrics", "r2")], vmin=min, vmax=max)

    cm = sns.light_palette("green", reverse=False, as_cmap=True)
    max = data.loc[:,("Metrics", "mse")].max(axis=None)
    min = data.loc[:,("Metrics", "mse")].min(axis=None)
    styled_df = styled_df.background_gradient(cmap=cm, subset=[("Metrics", "mse")], vmin=min, vmax=max)

    cm = sns.light_palette("green", reverse=False, as_cmap=True)
    max = data.loc[:,("Metrics", "mae")].max(axis=None)
    min = data.loc[:,("Metrics", "mae")].min(axis=None)
    styled_df = styled_df.background_gradient(cmap=cm, subset=[("Metrics", "mae")], vmin=min, vmax=max)
    
    return styled_df

In [33]:
data_countrywide = predictive_indicators_macro_tensorflowModel.loc[:,"Indicators"]
data_indiv = predictive_indicators_individual_tensorflowModel.loc[:,"Indicators"].groupby(["Environmental Concern Dimension", "Year"]).mean()
data_metrics = predictive_indicators_macro_tensorflowModel.loc[:,"Loss"]
data = pd.concat([data_countrywide, data_indiv, data_metrics], keys=["Country-level indicators", "Individual indicators", "Metrics"], axis=1)
del(data_countrywide, data_indiv, data_metrics)

data1 = cluster_dataframe(data.loc[:, ("Country-level indicators", slice(None))], row_cluster=False)
#data2 = cluster_dataframe(data.loc[:, ("Individual indicators", slice(None))], row_cluster=False)

#what do you want to plot?
#all
cols = [*data1.columns, *data2.columns, *data.loc[:,["Metrics"]].columns]
#country-level only
#cols = [*data1.columns, *data.loc[:,["Metrics"]].columns]
#individual only
#cols = [*data2.columns, *data.loc[:,["Metrics"]].columns]

#print(style_dataframe_tensorflow(data.loc[:,cols]).to_latex(convert_css=True).replace("\color[HTML]{000000}",""))

style_dataframe_tensorflow(data.loc[:,cols])

Unnamed: 0_level_0,Unnamed: 1_level_0,Country-level indicators,Country-level indicators,Country-level indicators,Country-level indicators,Country-level indicators,Country-level indicators,Individual indicators,Individual indicators,Individual indicators,Individual indicators,Individual indicators,Individual indicators,Individual indicators,Metrics,Metrics,Metrics
Unnamed: 0_level_1,Unnamed: 1_level_1,GDP per capita (Purchasing Power Parity),CO2 emissions per capita,Gini index,Pesticides use,PM25 exposure,GDP growth,Education level,Sex,Living area,Personal income,Religious beliefs,Age,Political orientation,r2,mae,mse
Environmental Concern Dimension,Year,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
Awareness,1993,0.052034,0.009553,0.029044,-0.026504,-0.063397,-0.151005,-0.04106,0.081843,0.027653,-0.015774,-0.006911,-0.049967,-0.076347,-0.06286,0.789033,0.943644
Awareness,2000,-0.250091,-0.091209,0.164037,0.065083,0.001608,-0.010747,-0.025243,0.069559,0.020094,-0.04727,0.018518,-0.031606,-0.075081,-0.078804,0.759619,0.88714
Awareness,2010,-0.24195,-0.071173,0.028749,0.004955,-0.076958,0.085744,0.0147,0.100994,0.018036,-0.019626,0.017925,-0.058814,-0.118545,-0.031827,0.742553,0.855121
Awareness,2020,-0.147166,-0.060843,0.009377,0.058851,0.01418,-0.06376,0.06653,0.09012,-0.033533,-0.034475,0.046952,-0.053536,-0.163919,-0.045853,0.795007,0.97094
Will to make sacrifices,1993,-0.016703,-0.085523,-0.069568,0.115192,0.134532,0.151834,0.137925,0.014603,0.066095,0.031816,-0.011276,-0.048408,-0.069119,-0.017161,0.748043,0.877674
Will to make sacrifices,2000,0.389421,0.041867,0.022029,0.033032,-0.072304,-0.012487,0.170661,0.015696,-0.028246,0.05074,-0.005841,0.022277,-0.072761,-0.029222,0.749531,0.858365
Will to make sacrifices,2010,0.348985,-0.005651,-0.026976,0.022679,-0.085578,0.220595,0.163945,0.024917,0.017105,0.071522,-0.029255,0.048431,-0.075746,-0.026681,0.73421,0.83077
Will to make sacrifices,2020,0.496919,0.394627,0.004295,0.046192,-0.077867,-0.01064,0.128219,0.033483,0.032957,0.054155,-0.033703,0.031291,-0.195631,-0.007331,0.726796,0.834554
Efficacy of environmental action,1993,0.069969,-0.162129,-0.107866,0.070854,0.077267,0.154593,0.141383,0.016103,0.019271,0.011986,-0.036429,-0.167629,-0.034948,0.036069,0.726798,0.822576
Efficacy of environmental action,2000,0.250936,-0.110063,-0.034807,-0.015042,-0.061151,-0.018146,0.178604,0.047754,-0.045064,0.044915,-0.028116,-0.134934,-0.06257,0.023568,0.719365,0.804052


#### Individual indicators detail

In [24]:
#select specific country
#data = predictive_indicators_individual_tensorflowModel.query("Country == 'United States'")

#select topic and year
#data = predictive_indicators_individual_tensorflowModel.loc[("Efficacy of environmental action",2020,slice(None)),:]

#all data
data = predictive_indicators_individual_tensorflowModel

cluster_rows = False
cluster_columns = True

data_clustered = cluster_dataframe(data.fillna(0).loc[:, ("Indicators", slice(None))], row_cluster=cluster_rows, col_cluster=cluster_columns)
cols = [*data_clustered.columns, *data.loc[:,("Loss", slice(None))].columns]
rows = data_clustered.index

#print(style_dataframe(data.loc[rows,cols]).to_latex(convert_css=True).replace("\color[HTML]{000000}",""))

style_dataframe(data.loc[rows,cols])

  data_clustered = cluster_dataframe(data.fillna(0).loc[:, ("Indicators", slice(None))], row_cluster=cluster_rows, col_cluster=cluster_columns)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Loss,Loss,Loss
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Political orientation,Age,Religious beliefs,Living area,Personal income,Sex,Education level,r2,mae,mse
Environmental Concern Dimension,Year,Country,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Awareness,1993,Australia,-0.075235,-0.039227,-0.030978,0.053246,-0.02518,0.109823,-0.009077,0.052785,0.851254,1.114688
Awareness,1993,Bulgaria,-0.049912,-0.032613,0.023455,0.023832,-0.038126,0.05895,-0.01195,-0.013599,0.79377,0.903179
Awareness,1993,Canada,-0.096876,-0.054396,0.016025,0.016268,0.050197,0.108137,-0.039922,0.0138,0.829162,0.977838
Awareness,1993,Czechia,-0.091218,-0.03692,0.007238,0.054713,0.015811,0.083469,-0.038756,-0.017719,0.766617,0.904583
Awareness,1993,Germany,-0.090624,-0.032093,0.010001,-0.005846,-0.002432,0.090967,-0.042013,-0.17245,0.852101,1.107456
Awareness,1993,Hungary,-0.061185,-0.076057,0.003832,0.035504,-0.063411,0.081496,-0.064714,-0.206345,0.834981,0.972561
Awareness,1993,Ireland,-0.091898,-0.095865,0.013508,0.030971,0.015909,0.093415,-0.066736,-0.004316,0.815704,1.000954
Awareness,1993,Israel,-0.069105,-0.054731,-0.018001,0.039862,-0.001688,0.061238,-0.052751,-0.250616,0.864739,1.076637
Awareness,1993,Italy,-0.056487,-0.037765,-0.012896,0.019514,0.01451,0.079447,-0.036988,-0.146915,0.83657,1.047141
Awareness,1993,Japan,-0.050235,-0.060832,-0.025688,0.009624,-0.013123,0.069462,-0.015559,-0.153469,0.719118,0.848695


___
## StatsModel Multi-Level linear model

In [43]:
macro_indicators = ["PM25 exposure", "GDP growth", "Gini index", "GDP per capita (Purchasing Power Parity)", "Pesticides use", "CO2 emissions per capita"]
micro_indicators = ['Sex', 'Age', 'Education level', 'Personal income', 'Living area', 'Political orientation', 'Religious beliefs']

In [44]:
# Preprocessing data
def preprocess_data_mixedLM(df, year, macro_ind, micro_ind, target):
    # select year
    df = df.loc[year]
    
    # fill missing individual values with country average and macro indicator with year average. Return which countries and indicators are missing for further processing
    # if individual indicator is missing for whole country replace with 0.
    missing_values = df.groupby("Country").mean().map(lambda x: np.isnan(x)) # mark which countries have missing data
    
    df_micro_filled = df.loc[:,("Respondent Data", slice(None))].fillna(df.loc[:,("Respondent Data", slice(None))].groupby("Country").mean()) #fill individual missing data with country average
    df_micro_filled = df_micro_filled.fillna(0) #if whole individual indicator is missing fill with 0 (indicator for country will be marked as true in missing_values dataframe)
    
    df_macro_filled = df.loc[:,("Country-wide indicators", slice(None))].fillna(df.loc[:,("Country-wide indicators", slice(None))].mean()) #fill macro indicator with average
    
    df_filled = pd.concat([df_micro_filled, df_macro_filled, df.loc[:,("Environmental concern dimension", slice(None))]], axis=1) #get dataframe

    #select features
    df_filled = df_filled.loc[:,df_filled.columns.get_level_values("Variable Name").isin(macro_ind+micro_ind+[target])]
    
    #train-test split
    df_train, df_test = train_test_split(df_filled, test_size=0.2)    

    #normalization
    scaler = StandardScaler()
    df_train.iloc[:] = scaler.fit_transform(df_train)
    df_test.iloc[:] = scaler.transform(df_test)

    #set up country column for mixed model class
    df_train[("Class", "Country")] = df_train.index.get_level_values("Country")
    df_train = df_train.reset_index(drop=True)

    df_test[("Class", "Country")] = df_test.index.get_level_values("Country")
    df_test = df_test.reset_index(drop=True)

    #drop columns label
    df_train.columns = df_train.columns.droplevel("Variable Type")
    df_test.columns = df_test.columns.droplevel("Variable Type")
    
    return df_train, df_test

In [45]:
# Fitting models and getting parameters and metrics

import statsmodels.formula.api as smf

index_mixedLM = pd.MultiIndex.from_product([
    ['Awareness', 'Will to make sacrifices', 'Efficacy of environmental action'],
    [1993, 2000, 2010, 2020]],
    names = ["Environmental Concern Dimension", "Year"]
                                )
predictive_indicators_mixedLM = pd.DataFrame(index=index_mixedLM, columns = pd.Index(macro_indicators+micro_indicators, name="Indicators"))
metrics_mixedLM = pd.DataFrame(index=index_mixedLM, columns = pd.Index(["r2", "mse", "mae", "Residual variance", "ICC", "Converged"], name="Metrics"))

cols_renamed_dict = {ind: ind.replace(" ", "_") for ind in macro_indicators+micro_indicators+['Awareness', 'Will to make sacrifices', 'Efficacy of environmental action']}
cols_renamed_dict.update({"GDP per capita (Purchasing Power Parity)" : "GDP_per_capita_PPP"})    

for target in ['Awareness', 'Will to make sacrifices', 'Efficacy of environmental action']:
    for year in [1993, 2000, 2010, 2020]:
        print(f"Training mixed-effects linear model for '{target}' in {year}\n")    
        data_train, data_test = preprocess_data_mixedLM(full_dataset, year, macro_indicators, micro_indicators, target)

        #rename columns (no spaces or parenthesis)
        data_train.columns = data_train.columns.map(lambda x: cols_renamed_dict.get(x,x))
        data_test.columns = data_test.columns.map(lambda x: cols_renamed_dict.get(x,x))
        
        macro_indicators_ = [cols_renamed_dict[ind] for ind in macro_indicators]
        micro_indicators_ = [cols_renamed_dict[ind] for ind in micro_indicators]
        target_ = cols_renamed_dict[target]
        
        # Fit the random effect model
        model = smf.mixedlm(
            f"{target_} ~ {' + '.join(macro_indicators_)} + {' + '.join(micro_indicators_)}",
            data_train, 
            groups=data_train["Country"], 
            re_formula=f"1 + {' + '.join(micro_indicators_)}"
        )
        
        result = model.fit(method=["lbfgs"])
    
        #get parameters
        params = result.fe_params.drop("Intercept")
        params.index = params.index.map({k:i for i,k in cols_renamed_dict.items()})
    
        #get metrics
        res_var = result.scale
        intra_class_var = result.cov_re.values.diagonal().sum()
        icc = intra_class_var / (intra_class_var + res_var)

        
        y_pred = result.fittedvalues
        y_true = data_train[target_]
        
        r2_loss = r2_score(y_true, y_pred)
        mae = mean_absolute_error(y_true, y_pred)
        mse = mean_squared_error(y_true, y_pred)
    
        #save params+metrics
        predictive_indicators_mixedLM.loc[(target, year),:] = params
        if result.converged:
            metrics_mixedLM.loc[(target, year),:] = [r2_loss, mse, mae, res_var, icc, True]
        else:
            metrics_mixedLM.loc[(target, year),:] = [r2_loss, mse, mae, res_var, icc, False]
        
    
        del(model, result, params, res_var, intra_class_var, icc, r2_loss, mse, mae)
        
predictive_indicators_mixedLM = pd.concat([predictive_indicators_mixedLM, metrics_mixedLM], axis=1, keys=["Indicators", "Metrics"])

newLabels = {("Indicators", item) : ("Individual indicators", item) for item in micro_indicators}
newLabels.update({("Indicators", item) : ("Country-level indicators", item) for item in macro_indicators})
predictive_indicators_mixedLM.columns = predictive_indicators_mixedLM.columns.map(lambda x: newLabels.get(x,x))
del(newLabels)

predictive_indicators_mixedLM.to_pickle("../Saved_variables/predictive_indicators_mixedLM.pickle")

del(metrics_mixedLM)

Training mixed-effects linear model for 'Awareness' in 1993





Training mixed-effects linear model for 'Awareness' in 2000





Training mixed-effects linear model for 'Awareness' in 2010





Training mixed-effects linear model for 'Awareness' in 2020





Training mixed-effects linear model for 'Will to make sacrifices' in 1993





Training mixed-effects linear model for 'Will to make sacrifices' in 2000





Training mixed-effects linear model for 'Will to make sacrifices' in 2010





Training mixed-effects linear model for 'Will to make sacrifices' in 2020





Training mixed-effects linear model for 'Efficacy of environmental action' in 1993





Training mixed-effects linear model for 'Efficacy of environmental action' in 2000





Training mixed-effects linear model for 'Efficacy of environmental action' in 2010





Training mixed-effects linear model for 'Efficacy of environmental action' in 2020





___

In [28]:
# quick load
predictive_indicators_mixedLM = pd.read_pickle("../Saved_variables/predictive_indicators_mixedLM.pickle")

___
### Plot results

In [29]:
def cluster_dataframe(data, method="single", row_cluster=True, col_cluster=True):
    from scipy.cluster.hierarchy import linkage, dendrogram

    if row_cluster:
        row_linkage = linkage(data, method=method, optimal_ordering=True)
        row_order = dendrogram(row_linkage, no_plot=True)['leaves']
    else:
        row_order = slice(None)

    if col_cluster:
        col_linkage = linkage(data.T, method=method,  optimal_ordering=True)
        col_order = dendrogram(col_linkage, no_plot=True)['leaves']

    else:
        col_order = slice(None)
    
    return data.iloc[row_order, col_order]

def style_dataframe_mixedLM(data, dropConverged=False):

    if dropConverged == False:
        #convert convergence column
        data.loc[:,("Metrics", "Converged")] = data.loc[:,("Metrics", "Converged")].map(lambda x: "$\checkmark$" if x else r"$\times$")
    else:
        data = data.drop(columns=[("Metrics", "Converged")])
    
    try:
        max = data.loc[:,["Individual indicators", "Country-level indicators"]].abs().max(axis=None)
    except:
        try:
            max = data.loc[:,["Individual indicators"]].abs().max(axis=None)
        except:
            pass
        try:
            max = data.loc[:,["Country-level indicators"]].abs().max(axis=None)
        except:
            pass

    styled_df = data.style
    
    if "Individual indicators" in data.columns.get_level_values(0):
        styled_df = styled_df.background_gradient(cmap="bwr", axis=None, vmin=-max, vmax=max, subset=["Individual indicators"])
    if "Country-level indicators" in data.columns.get_level_values(0):
        styled_df = styled_df.background_gradient(cmap="bwr", axis=None, vmin=-max, vmax=max, subset=["Country-level indicators"])
    
    cm = sns.light_palette("green", reverse=False, as_cmap=True)
    max = data.loc[:,("Metrics", "Residual variance")].max(axis=None)
    min = data.loc[:,("Metrics", "Residual variance")].min(axis=None)
    styled_df = styled_df.background_gradient(cmap=cm, subset=[("Metrics", "Residual variance")], vmin=min, vmax=max)

    cm = sns.light_palette("green", reverse=True, as_cmap=True)
    max = data.loc[:,("Metrics", "r2")].max(axis=None)
    min = data.loc[:,("Metrics", "r2")].min(axis=None)
    styled_df = styled_df.background_gradient(cmap=cm, subset=[("Metrics", "r2")], vmin=min, vmax=max)

    cm = sns.light_palette("green", reverse=False, as_cmap=True)
    max = data.loc[:,("Metrics", "mse")].max(axis=None)
    min = data.loc[:,("Metrics", "mse")].min(axis=None)
    styled_df = styled_df.background_gradient(cmap=cm, subset=[("Metrics", "mse")], vmin=min, vmax=max)

    cm = sns.light_palette("green", reverse=False, as_cmap=True)
    max = data.loc[:,("Metrics", "mae")].max(axis=None)
    min = data.loc[:,("Metrics", "mae")].min(axis=None)
    styled_df = styled_df.background_gradient(cmap=cm, subset=[("Metrics", "mae")], vmin=min, vmax=max)

    cm = sns.light_palette("lime", reverse=False, as_cmap=True)
    max = data.loc[:,("Metrics", "ICC")].max(axis=None)
    min = data.loc[:,("Metrics", "ICC")].min(axis=None)
    styled_df = styled_df.background_gradient(cmap=cm, subset=[("Metrics", "ICC")], vmin=min, vmax=max)

    if dropConverged == False:
        apply_hatch = lambda row: ['background-image: repeating-linear-gradient(35deg, rgba(0,0,0,0.3) 0, rgba(0,0,0,0.3) 1px, transparent 1px, transparent 5px);'] * len(row) if row[("Metrics", "Converged")] == r"$\times$" else [''] * len(row)
        styled_df = styled_df.apply(apply_hatch, axis=1)

    
    return styled_df

In [30]:
data1 = cluster_dataframe(predictive_indicators_mixedLM.loc[:, ("Country-level indicators", slice(None))], row_cluster=False)
data2 = cluster_dataframe(predictive_indicators_mixedLM.loc[:, ("Individual indicators", slice(None))], row_cluster=False)

cols = [*data1.columns, *data2.columns, *predictive_indicators_mixedLM.loc[:,["Metrics"]].columns]
#cols = [*data1.columns, *predictive_indicators_mixedLM.loc[:,["Metrics"]].columns]
#cols = [*data2.columns, *predictive_indicators_mixedLM.loc[:,["Metrics"]].columns]

#print(style_dataframe_mixedLM(predictive_indicators_mixedLM.loc[:,cols], dropConverged=True).to_latex(convert_css=True).replace("\color[HTML]{000000}",""))

style_dataframe_mixedLM(predictive_indicators_mixedLM.loc[:,cols])

Unnamed: 0_level_0,Unnamed: 1_level_0,Country-level indicators,Country-level indicators,Country-level indicators,Country-level indicators,Country-level indicators,Country-level indicators,Individual indicators,Individual indicators,Individual indicators,Individual indicators,Individual indicators,Individual indicators,Individual indicators,Metrics,Metrics,Metrics,Metrics,Metrics,Metrics
Unnamed: 0_level_1,Unnamed: 1_level_1,GDP per capita (Purchasing Power Parity),PM25 exposure,Gini index,Pesticides use,CO2 emissions per capita,GDP growth,Education level,Sex,Living area,Personal income,Religious beliefs,Age,Political orientation,r2,mse,mae,Residual variance,ICC,Converged
Environmental Concern Dimension,Year,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
Awareness,1993,0.053372,-0.019237,0.137631,-0.043663,-0.094865,-0.154162,-0.024492,0.082733,0.038282,0.002525,-0.00608,-0.048454,-0.046662,0.113552,0.886448,0.757923,0.889156,0.400623,$\checkmark$
Awareness,2000,-0.236329,-0.131221,0.222343,0.063317,-0.003248,-0.007949,0.002015,0.070688,0.038874,-0.041733,-0.013622,-0.037641,-0.077563,0.204985,0.795015,0.717824,0.798286,0.833118,$\times$
Awareness,2010,-0.201023,-0.174672,-0.002647,-0.000557,-0.054349,0.180402,0.011281,0.097955,0.031906,-0.032341,-0.000146,-0.045499,-0.081473,0.185882,0.814118,0.718922,0.817751,0.107368,$\checkmark$
Awareness,2020,-0.146719,-0.091209,0.015187,-0.005034,0.059119,0.017658,0.015522,0.098864,0.018489,-0.034162,0.010718,-0.03451,-0.142089,0.168861,0.831139,0.729719,0.834524,0.136764,$\times$
Will to make sacrifices,1993,0.084698,-0.090075,0.037608,0.106558,0.042707,-0.003686,0.171188,0.0113,0.033163,0.019135,-0.034552,-0.030485,-0.044849,0.171596,0.828404,0.728835,0.831601,0.417919,$\checkmark$
Will to make sacrifices,2000,0.400595,0.058041,0.038744,0.077688,-0.07133,0.009998,0.206029,0.01401,0.005894,0.031993,-0.043846,0.026593,-0.039536,0.234629,0.765371,0.700465,0.767251,0.832453,$\checkmark$
Will to make sacrifices,2010,0.230397,-0.043052,-0.039858,0.033669,-0.028737,0.220866,0.198418,0.029983,0.015104,0.058767,-0.008453,0.061737,-0.073758,0.238456,0.761544,0.697572,0.765452,0.090928,$\checkmark$
Will to make sacrifices,2020,0.353364,0.257721,0.007142,0.011586,-0.058141,-0.001939,0.157677,0.035406,0.019511,0.051602,-0.023612,0.019572,-0.153673,0.244099,0.755901,0.691255,0.759247,0.114248,$\checkmark$
Efficacy of environmental action,1993,0.145738,-0.101691,0.042236,0.003555,-0.052059,0.011358,0.162368,0.011539,-0.000179,0.015999,-0.063401,-0.145507,-0.024953,0.244078,0.755922,0.693574,0.758177,0.42685,$\checkmark$
Efficacy of environmental action,2000,0.266631,-0.094414,-0.020833,0.007541,-0.051245,-0.010348,0.192237,0.044504,-0.020435,0.043589,-0.04125,-0.129703,-0.027029,0.24595,0.75405,0.691704,0.755459,0.831971,$\checkmark$


___
## Decision Trees

In [46]:
macro_indicators = ["PM25 exposure", "GDP growth", "Gini index", "GDP per capita (Purchasing Power Parity)", "Pesticides use", "CO2 emissions per capita"]
micro_indicators = ['Sex', 'Age', 'Education level', 'Personal income', 'Living area', 'Political orientation', 'Religious beliefs']

In [47]:
# preprocessing data
def preprocess_data_trees(df, year, macro_ind, micro_ind, target, use_country=False):
    # select year
    df = df.loc[year]
    
    # fill missing individual values with country average and macro indicator with year average. Return which countries and indicators are missing for further processing
    # if individual indicator is missing for whole country replace with 0.
    missing_values = df.groupby("Country").mean().map(lambda x: np.isnan(x)) # mark which countries have missing data
    
    df_micro_filled = df.loc[:,("Respondent Data", slice(None))].fillna(df.loc[:,("Respondent Data", slice(None))].groupby("Country").mean()) #fill individual missing data with country average
    df_micro_filled = df_micro_filled.fillna(0) #if whole individual indicator is missing fill with 0 (indicator for country will be marked as true in missing_values dataframe)
    
    df_macro_filled = df.loc[:,("Country-wide indicators", slice(None))].fillna(df.loc[:,("Country-wide indicators", slice(None))].mean()) #fill macro indicator with average
    
    df_filled = pd.concat([df_micro_filled, df_macro_filled, df.loc[:,("Environmental concern dimension", slice(None))]], axis=1) #get dataframe

    #select features
    df_filled = df_filled.loc[:,df_filled.columns.get_level_values("Variable Name").isin(macro_ind+micro_ind+[target])]
    
    #train-test split
    df_train, df_test = train_test_split(df_filled, test_size=0.2)
    df_train, df_val = train_test_split(df_train, test_size=0.25)

    #normalization
    scaler = StandardScaler()
    df_train.iloc[:] = scaler.fit_transform(df_train)
    df_val.iloc[:] = scaler.transform(df_val)
    df_test.iloc[:] = scaler.transform(df_test)

    #set up country column to use as feature
    if use_country:
        df_train[("Respondent Data", "Country")] = pd.factorize(df_train.index.get_level_values("Country").values)[0]
        df_val[("Respondent Data", "Country")] = pd.factorize(df_val.index.get_level_values("Country").values)[0]
        df_test[("Respondent Data", "Country")] = pd.factorize(df_test.index.get_level_values("Country").values)[0]

    #delete index
    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    #get outputs
    yX_train = [df_train.loc[:,("Environmental concern dimension", target)], df_train.drop(columns=[("Environmental concern dimension", target)])]
    yX_val = [df_val.loc[:,("Environmental concern dimension", target)], df_val.drop(columns=[("Environmental concern dimension", target)])]
    yX_test = [df_test.loc[:,("Environmental concern dimension", target)], df_test.drop(columns=[("Environmental concern dimension", target)])]
    
    return yX_train, yX_val, yX_test

In [48]:
# fitting models and saving results

def fit_trees(yX_train, learning_rate=0.1, max_rounds=500, max_depth=5, n_iter_no_change=5, random_seed=42):
    model = GradientBoostingRegressor(learning_rate=learning_rate, n_estimators=max_rounds, max_depth=5, n_iter_no_change=n_iter_no_change, random_state=random_seed)
    model.fit(yX_train[1], yX_train[0])
    return model

search_space = {
    "learning_rate" : hp.loguniform("learning_rate", np.log(10**-5), 0),
    "max_depth" : ho_scope.int(hp.quniform("max_depth", 1, 10, 1)),
}

def objective_func_hyperopt(hyperparams):
    try:
        model = fit_trees(yX_train, **hyperparams)
        return {"loss" : mean_squared_error(yX_val[0], model.predict(yX_val[1])), "status" : STATUS_OK}
    except:
        return {"status" : STATUS_FAIL}


index_trees = pd.MultiIndex.from_product([
    ['Awareness', 'Will to make sacrifices', 'Efficacy of environmental action'],
    [1993, 2000, 2010, 2020]],
    names = ["Environmental Concern Dimension", "Year"]
                                )
predictive_indicators_trees = pd.DataFrame(index=index_trees, columns = pd.Index(micro_indicators+macro_indicators, name="Feature Importance"))
metrics_trees = pd.DataFrame(index=index_trees, columns = pd.Index(["r2", "mse", "mae"], name="Metrics"))

for target in ['Awareness', 'Will to make sacrifices', 'Efficacy of environmental action']:
    for year in [1993, 2000, 2010, 2020]:
        print(f"Optimizing boosted decision trees model for '{target}' in {year}")

        #get datasets
        yX_train, yX_val, yX_test = preprocess_data_trees(full_dataset, year, macro_indicators, micro_indicators, target)

        #optimize model
        best_params = fmin(objective_func_hyperopt, space=search_space, algo=tpe.suggest, max_evals=100)

        #get metrics and feature importance
        model = fit_trees(yX_train, **best_params)

        predictive_indicators_trees.loc[(target, year),:] = model.feature_importances_

        y_test_pred = model.predict(yX_test[1])
        r2 = r2_score(yX_test[0], y_test_pred)
        mse = mean_squared_error(yX_test[0], y_test_pred)
        mae = mean_absolute_error(yX_test[0], y_test_pred)
        
        metrics_trees.loc[(target, year),:] = [r2, mse, mae]

        #delete
        del(model, yX_train, yX_val, yX_test, r2, mse, mae, best_params)

        print("\n")

predictive_indicators_trees = pd.concat([predictive_indicators_trees, metrics_trees], axis=1, keys=["Indicators", "Metrics"])
del(metrics_trees)

newLabels = {("Indicators", item) : ("Individual indicators", item) for item in micro_indicators}
newLabels.update({("Indicators", item) : ("Country-level indicators", item) for item in macro_indicators})
predictive_indicators_trees.columns = predictive_indicators_trees.columns.map(lambda x: newLabels.get(x,x))
del(newLabels)

#save results
predictive_indicators_trees.to_pickle("../Saved_variables/predictive_indicators_trees.pickle")

Optimizing boosted decision trees model for 'Awareness' in 1993
100%|███████| 100/100 [04:19<00:00,  2.59s/trial, best loss: 0.9098143798962984]


Optimizing boosted decision trees model for 'Awareness' in 2000
100%|███████| 100/100 [05:45<00:00,  3.45s/trial, best loss: 0.8090126146370932]


Optimizing boosted decision trees model for 'Awareness' in 2010
100%|███████| 100/100 [09:18<00:00,  5.59s/trial, best loss: 0.8084311361399737]


Optimizing boosted decision trees model for 'Awareness' in 2020
100%|████████| 100/100 [08:41<00:00,  5.21s/trial, best loss: 0.849510823181125]


Optimizing boosted decision trees model for 'Will to make sacrifices' in 1993
100%|████████| 100/100 [04:33<00:00,  2.73s/trial, best loss: 0.825651236044963]


Optimizing boosted decision trees model for 'Will to make sacrifices' in 2000
100%|███████| 100/100 [07:16<00:00,  4.36s/trial, best loss: 0.7940820901234638]


Optimizing boosted decision trees model for 'Will to make sacrifices' in 2010
100%|███████

___

In [20]:
# quick load
predictive_indicators_trees = pd.read_pickle("../Saved_variables/predictive_indicators_trees.pickle")

___
### Plot results

In [21]:
def cluster_dataframe(data, method="ward", row_cluster=True, col_cluster=True):
    from scipy.cluster.hierarchy import linkage, dendrogram

    if row_cluster:
        row_linkage = linkage(data, method=method, optimal_ordering=True)
        row_order = dendrogram(row_linkage, no_plot=True)['leaves']
    else:
        row_order = slice(None)

    if col_cluster:
        col_linkage = linkage(data.T, method=method,  optimal_ordering=True)
        col_order = dendrogram(col_linkage, no_plot=True)['leaves']

    else:
        col_order = slice(None)
    
    return data.iloc[row_order, col_order]

def style_dataframe_trees(data):
    try:
        1/0 #raise error (plot looks better if individual and country level colormaps have different scales)
        min1 = min2 = data.loc[:,["Individual indicators", "Country-level indicators"]].min(axis=None)
        max1 = max2 = data.loc[:,["Individual indicators", "Country-level indicators"]].max(axis=None)
    except:
        try:
            min1 = data.loc[:,"Individual indicators"].min(axis=None)
            max1 = data.loc[:,"Individual indicators"].max(axis=None)
        except:
            pass
        try:
            min2 = data.loc[:,["Country-level indicators"]].min(axis=None)
            max2 = data.loc[:,["Country-level indicators"]].max(axis=None)
        except:
            pass

    styled_df = data.style
    
    if "Individual indicators" in data.columns.get_level_values(0):
        styled_df = styled_df.background_gradient(cmap="Blues", axis=None, vmin=min1, vmax=max1, subset=["Individual indicators"])
    if "Country-level indicators" in data.columns.get_level_values(0):
        styled_df = styled_df.background_gradient(cmap="Oranges", axis=None, vmin=min2, vmax=max2, subset=["Country-level indicators"])
    
    cm = sns.light_palette("green", reverse=True, as_cmap=True)
    max = data.loc[:,("Metrics", "r2")].max(axis=None)
    min = data.loc[:,("Metrics", "r2")].min(axis=None)
    styled_df = styled_df.background_gradient(cmap=cm, subset=[("Metrics", "r2")], vmin=min, vmax=max)

    cm = sns.light_palette("green", reverse=False, as_cmap=True)
    max = data.loc[:,("Metrics", "mse")].max(axis=None)
    min = data.loc[:,("Metrics", "mse")].min(axis=None)
    styled_df = styled_df.background_gradient(cmap=cm, subset=[("Metrics", "mse")], vmin=min, vmax=max)

    cm = sns.light_palette("green", reverse=False, as_cmap=True)
    max = data.loc[:,("Metrics", "mae")].max(axis=None)
    min = data.loc[:,("Metrics", "mae")].min(axis=None)
    styled_df = styled_df.background_gradient(cmap=cm, subset=[("Metrics", "mae")], vmin=min, vmax=max)
    
    return styled_df

In [25]:
data1 = cluster_dataframe(predictive_indicators_trees.loc[:, ("Country-level indicators", slice(None))], row_cluster=False)
data2 = cluster_dataframe(predictive_indicators_trees.loc[:, ("Individual indicators", slice(None))], row_cluster=False)

cols = [*data1.columns, *data2.columns, *predictive_indicators_trees.loc[:,["Metrics"]].columns]
#cols = [*data1.columns, *predictive_indicators_trees.loc[:,["Metrics"]].columns]
#cols = [*data2.columns, *predictive_indicators_trees.loc[:,["Metrics"]].columns]

#print(style_dataframe_trees(predictive_indicators_trees.loc[:,cols]).to_latex(convert_css=True).replace("\color[HTML]{000000}",""))

style_dataframe_trees(predictive_indicators_trees.loc[:,cols])

Unnamed: 0_level_0,Unnamed: 1_level_0,Country-level indicators,Country-level indicators,Country-level indicators,Country-level indicators,Country-level indicators,Country-level indicators,Individual indicators,Individual indicators,Individual indicators,Individual indicators,Individual indicators,Individual indicators,Individual indicators,Metrics,Metrics,Metrics
Unnamed: 0_level_1,Unnamed: 1_level_1,GDP per capita (Purchasing Power Parity),Gini index,GDP growth,Pesticides use,PM25 exposure,CO2 emissions per capita,Religious beliefs,Living area,Sex,Personal income,Political orientation,Age,Education level,r2,mse,mae
Environmental Concern Dimension,Year,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
Awareness,1993,0.063668,0.080667,0.188937,0.060864,0.05185,0.09625,0.014984,0.030891,0.064093,0.071892,0.100901,0.12191,0.053094,0.112662,0.905029,0.771326
Awareness,2000,0.179367,0.40199,0.035633,0.022681,0.042597,0.054621,0.010305,0.0187,0.044012,0.055555,0.062889,0.057174,0.014475,0.190813,0.798431,0.719573
Awareness,2010,0.368068,0.124524,0.033657,0.021408,0.064649,0.028947,0.007488,0.021585,0.074105,0.051535,0.10271,0.066368,0.034955,0.176853,0.829696,0.724365
Awareness,2020,0.025571,0.030047,0.05158,0.058177,0.009821,0.242162,0.049076,0.060612,0.06325,0.072458,0.239506,0.060934,0.036807,0.169354,0.851255,0.737951
Will to make sacrifices,1993,0.363535,0.02919,0.0373,0.035765,0.092383,0.017631,0.018392,0.016331,0.01552,0.054699,0.057807,0.095908,0.165539,0.176483,0.825829,0.727241
Will to make sacrifices,2000,0.365493,0.02999,0.014361,0.097024,0.034119,0.085257,0.00979,0.011528,0.004848,0.055795,0.076768,0.07955,0.135479,0.240647,0.769874,0.704604
Will to make sacrifices,2010,0.370227,0.026015,0.110238,0.053373,0.017213,0.035453,0.012609,0.013983,0.007243,0.068828,0.0782,0.068055,0.138563,0.22497,0.781571,0.70488
Will to make sacrifices,2020,0.344541,0.01752,0.055698,0.069456,0.024416,0.067902,0.0199,0.015824,0.011813,0.06548,0.187255,0.048905,0.071289,0.252833,0.748593,0.689507
Efficacy of environmental action,1993,0.486778,0.046179,0.023613,0.002491,0.010113,0.027323,0.017428,0.00507,0.008533,0.036104,0.034497,0.166993,0.134878,0.239288,0.763282,0.701115
Efficacy of environmental action,2000,0.408046,0.075926,0.011449,0.00684,0.022257,0.023274,0.013284,0.011306,0.014321,0.061293,0.042926,0.132184,0.176893,0.240003,0.752786,0.691437


___