# <div style="text-align: center"> <font size=+5> <ins>PREDICT ANSWERS from INDICATORS</ins> </font> </div>
___


### Imports and settings

In [31]:
#imports 

import numpy as np
import pandas as pd

import os

import ipywidgets as widgets



import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt



from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import train_test_split

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error


import tensorflow as tf

In [2]:
#settings

pd.set_option('display.max_columns', 500)

plt.rcParams.update({'figure.dpi' : 300,
                     'mathtext.fontset' : 'stix', 
                     'font.family' : 'STIXGeneral'
                    })

___
## Get datasets

In [3]:
# get full dataset
data_issp = pd.read_pickle("../Processed_Data/questionnaires/issp_questionnaires_common.pickle")

# convert values to numerical data
conv_dict = {
    "Respondent Data" : {
        "Sex" : {"Male" : -1, "Female" : 1},
        "Education level" : {"No degree" : 0, "Primary" : 1, "Secondary" : 2, "University" : 3},
        "Living area" : {"Rural area" : 0, "Small city" : 1, "Big city" : 2},
        "Political orientation" : {"Far Left" : -3, "Left" : -2, "Center Left" : -1, "Center" : 0, "None/Other" : 0, "Center Right" : 1, "Right" : 2, "Far Right" : 3},
        "Religious beliefs" : {"Atheist" : 0, "Christian" : 1, "Islamic" : 1, "Jewish" : 1, "Other" : 1}
    },
    "Questions" : {
        "Strongly Disagree" : -2,
        "Disagree" : -1,
        "Neither Agree nor Disagree" : 0,
        "Agree" : 1,
        "Strongly Agree" : 2,

        "Yes" : 1,
        "No" : -1,

        'Extremely dangerous': 2,
        'Very dangerous': 1,
        'Somewhat dangerous': 0,
        'Not very dangerous': -1,
        'Not dangerous at all': -2,

        "Always" : 3,
        "Often" : 2,
        "Sometimes" : 1,
        "Never" : 0,

        'Very willing': 2,
        'Fairly willing': 1,
        'Neither willing nor unwilling': 0,
        'Fairly unwilling': -1,
        'Very unwilling': -2
    }
}

#convert values to numerical
data_resp = data_issp.loc[:,("Respondent Data", slice(None))].apply(lambda col: col if col.name[1] not in conv_dict["Respondent Data"] else col.map(lambda x: np.nan if pd.isna(x) else conv_dict["Respondent Data"][col.name[1]][x]).astype(float))
data_quest = data_issp.loc[:,("Questions", slice(None))].map(lambda x: conv_dict["Questions"].get(x, x)).astype(float)

#cluster questions based on topic
clusters = {
    #TOPIC 1: awareness of the environmental consequences of societies’ modern, industrial activities
    "Awareness" : [
        'Danger to the environment (Air pollution by cars)',
        'Danger to the environment (Air pollution by industry)',
        'Danger to the environment (Pesticides and chemicals in farming)',
        'Danger to the environment (River, lake and stream pollution)',
        'Danger to the environment (Rise in the world’s temperature)',
        'Danger to the environment (Nuclear power stations)'
    ],

    #TOPIC 2: willingness to sacrifice personally in some manner (e.g. time, money) for the environment
    "Will to make sacrifices" : [
        'Willingness to Make Trade-Offs for Environment (Pay much higher prices)',
        'Willingness to Make Trade-Offs for Environment (Pay much higher taxes)',
        'Willingness to Make Trade-Offs for Environment (Cut your standard of living)',
        'Member of a group to preserve environment',
        'In the last five years, signed a petition',
        'In the last five years, given money',
        'In the last five years, participated in an environmental demonstration',
        'I do what is right even when it costs money and takes time'
    ],
    
    #TOPIC 3: importance of addressing environmental problems and of collective effort for resolving environmental
    #issues and how environmental issues intersect with economic issues, science and progress
    # NB!! answers are rescaled so that higher scores reflect pro-environment responses
    "Efficacy of environmental action" : [
        'We worry too much about harming environment',
        'We worry too much about future environment',
        'Science solves environmental problems',
        'Too difficult to do much about environment'
    ]
        
}

#rescale
data_to_cluster = data_quest.apply(lambda x: x if x.name[1] not in clusters["Efficacy of environmental action"] else -x)
data_quest_clustered = pd.concat([pd.concat([data_to_cluster.loc[:, ("Questions", clusters[topic])].sum(axis=1) for topic in clusters], axis=1, keys=clusters.keys())], keys=["Environmental concern dimension"],  names=["Variable Type", "Variable Name"], axis=1)


#get country-wide indicators columns
macro_indicators_dict = {(ind[:-7]) : pd.read_pickle(f"Processed_Data/indicators/{ind}") for ind in os.listdir("../Processed_Data/indicators") if ind[-7:] == ".pickle"}
data_indic = pd.concat([pd.DataFrame.from_records([data_issp.index.map(lambda idx: indicator.loc[idx[1], idx[0]]) for indicator in macro_indicators_dict.values()], index=macro_indicators_dict.keys(), columns=data_issp.index).T], keys=["Country-wide indicators"], names=["Variable Type", "Variable Name"], axis=1)


#merge into single dataframes
data_issp_numerical = pd.concat([data_resp, data_indic, data_quest], axis=1) #separate questions
data_issp_numerical_clustered = pd.concat([data_resp, data_indic, data_quest_clustered], axis=1) #clustered questions


del(data_issp, data_resp, data_quest, data_to_cluster, data_quest_clustered, data_indic, conv_dict)


#dump
#data_issp_numerical.to_pickle("../Saved_variables/full_dataset_numerical_quest+indic.pickle")
#data_issp_numerical_clustered.to_pickle("../Saved_variables/full_dataset_numerical_clustered.pickle")

In [3]:
#quick load
data_issp_numerical = pd.read_pickle("../Saved_variables/full_dataset_numerical_quest+indic.pickle")
data_issp_numerical_clustered = pd.read_pickle("../Saved_variables/full_dataset_numerical_clustered.pickle")

___

In [4]:
#group by year and country (weighted mean)
data_issp_numerical_grouped = data_issp_numerical.drop(columns=("Respondent Data", "Weight")).mul(data_issp_numerical.loc[:,("Respondent Data", "Weight")], axis=0).groupby(["Year", "Country"]).sum(min_count=1).div(data_issp_numerical.loc[:,("Respondent Data", "Weight")].groupby(["Year", "Country"]).sum(), axis=0)
data_issp_numerical_clustered_grouped = data_issp_numerical_clustered.drop(columns=("Respondent Data", "Weight")).mul(data_issp_numerical_clustered.loc[:,("Respondent Data", "Weight")], axis=0).groupby(["Year", "Country"]).sum(min_count=1).div(data_issp_numerical_clustered.loc[:,("Respondent Data", "Weight")].groupby(["Year", "Country"]).sum(), axis=0)

In [5]:
#rescale so that dimensions columns are in range [-1,+1]
data_issp_numerical_clustered.loc[:,("Environmental concern dimension", slice(None))] /= [12,12,8]
data_issp_numerical_clustered_grouped.loc[:,("Environmental concern dimension", slice(None))] /= [12,12,8]

___
## Lasso

In [4]:
def fit_lasso(yX, alpha, weights=None):
    model = Lasso(alpha)
    model.fit(yX.iloc[:,1:], yX.iloc[:,0], sample_weight=weights)
    return {"alpha" : alpha, "loss" : 1-model.score(yX.iloc[:,1:], yX.iloc[:,0], sample_weight=weights), "coefficients" : model.coef_, "model" : model}

def plot_indicators(data, question, year, indicators):
    yX = data.loc[(year, slice(None)),[("Environmental concern dimension", question), *indicators]].dropna().astype(float)
    yX.iloc[:,:] = StandardScaler().fit_transform(yX.to_numpy())
    
    try:
        weights = data.loc[yX.index, ("Respondent Data", "Weight")]
    except:
        weights = None
    
    res = pd.DataFrame([fit_lasso(yX, alpha, weights) for alpha in np.logspace(-2,0,250)])
    
    fig, (ax1, ax2) = plt.subplots(1,2, figsize=(6.4*2,4.8), gridspec_kw={"width_ratios" : (5,95)}, sharey=True)
    fig.subplots_adjust(wspace=0)
    fig.suptitle(f"Most important indicators to predict '{question}' in {year}", fontsize=20)
    data = np.stack(res["coefficients"].to_numpy())
    sns.heatmap([[np.log(l)] for l in res["loss"]], ax=ax1, cbar=False)
    sns.heatmap(data, cmap="bwr", center=0, ax=ax2)
    ax2.set_title("Coefficients in LASSO regression")
    ax2.set_xticklabels([col[1] for col in yX.iloc[:,1:].columns], rotation=50, horizontalalignment="right", fontsize=8)
    ax2.set_xlabel("Indicator")

    for ax in [ax1]:
        ax.set_yticks(res.index[::50])
        ax.set_yticklabels([f"{l:.0e}" for l in res.loc[::50,"alpha"]], rotation=30, horizontalalignment="right", verticalalignment="top", fontsize=8)
        ax.tick_params(axis="y", pad=0)
        ax.set_ylabel("L1 regularization hyperparameter")
    
    ax1.set_title("Model error")
    ax1.set_xticks([])
    
    plt.tight_layout()

### Macro Indicators

In [6]:
@widgets.interact(year=[1993,2000,2010,2020], dimension=data_issp_numerical_clustered.loc[:,"Environmental concern dimension"].columns)
def plot(dimension, year=2020):
    indicators = [("Country-wide indicators", ind) for ind in ["GHG_emissions", "PM25_exposure", "GDP_growth", "gini_index", "GDP_pcap_ppp", "water_withdrawals", "pesticides_use"]]
    plot_indicators(data_issp_numerical_clustered_grouped, dimension, year, indicators)

interactive(children=(Dropdown(description='dimension', options=('Awareness', 'Will to make sacrifices', 'Effi…

### Individual indicators

In [7]:
@widgets.interact(year=[1993,2000,2010,2020], dimension=data_issp_numerical_clustered.loc[:,"Environmental concern dimension"].columns, all_countries=False, country=data_issp_numerical_clustered.index.get_level_values("Country").unique())
def plot(dimension, country, all_countries, year=2020):
    indicators = [("Respondent Data", ind) for ind in data_issp_numerical.loc[:, "Respondent Data"].drop(columns="Weight").columns]
    if all_countries:
        plot_indicators(data_issp_numerical_clustered, dimension, year, indicators)
    else:
        plot_indicators(data_issp_numerical_clustered.query("Country == @country"), dimension, year, indicators)

interactive(children=(Dropdown(description='dimension', options=('Awareness', 'Will to make sacrifices', 'Effi…

___
## Linear Fit

In [7]:
def fit_linear(yX, weights=None):
        
    #fill missing values with columns average
    yX = yX.fillna(yX.mean())
    #if whole column is missing fill with 0, then replace with NaN in output
    NaN_cols = yX.iloc[:,1:].mean().isna().to_numpy()
    yX = yX.fillna(0)

    #for large datasets: single train-test split
    if len(yX) > 500:
        yX_train, yX_test = train_test_split(yX, test_size=0.2)

        if weights is not None:
            weights_train, weights_test = weights.loc[yX_train.index], weights.loc[yX_test.index]
        else:
            weights_train, weights_test = None, None
        
        #standardize data
        scaler = StandardScaler()
        yX_train.iloc[:,:] = scaler.fit_transform(yX_train)
        yX_test.iloc[:,:] = scaler.transform(yX_test)
        
        #fit linear regression model
        model = LinearRegression(fit_intercept=True)
        model.fit(yX_train.iloc[:, 1:], yX_train.iloc[:,0], sample_weight=weights_train)
        
        #compute metrics
        yX_pred = pd.DataFrame().reindex_like(yX_test)
        yX_pred.iloc[:,0] = model.predict(yX_test.iloc[:,1:])
        yX_pred.iloc[:,:] = scaler.inverse_transform(yX_pred)
        yX_test.iloc[:,:] = scaler.inverse_transform(yX_test)
        
        r2_loss = r2_score(yX_test.iloc[:,0], yX_pred.iloc[:,0], sample_weight=weights_test)
        mae = mean_absolute_error(yX_test.iloc[:,0], yX_pred.iloc[:,0], sample_weight=weights_test)
        

        return {"coefficients" : [coef if NaN_cols[i] == False else np.nan for i,coef in enumerate(model.coef_)], "losses" : [r2_loss, mae]}


    #for small datasets, leave one out cross validation
    else:
        
        yX_pred = pd.DataFrame().reindex_like(yX) #prepare dataframe for predictions
        
        for idx_num, idx_val in enumerate(yX.index):
            yX_train, yX_test = yX.loc[yX.index != idx_val, :], yX.loc[[idx_val], :]

            if weights is not None:
                weights_train = weights.loc[weights.index != idx_val]
            else:
                weights_train = None

            #standardize data
            scaler = StandardScaler()
            yX_train.iloc[:,:] = scaler.fit_transform(yX_train)
            yX_test.iloc[:,:] = scaler.transform(yX_test)

            #fit linear regression model
            model = LinearRegression(fit_intercept=True)
            model.fit(yX_train.iloc[:, 1:], yX_train.iloc[:,0], sample_weight=weights_train)

            #get predictions
            yX_pred.iloc[idx_num,0] = model.predict(yX_test.iloc[:,1:])

        #rescale back to original predictions data
        yX_pred.iloc[:,:] = scaler.inverse_transform(yX_pred)

        #compute metrics
        r2_loss = r2_score(yX.iloc[:,0], yX_pred.iloc[:,0], sample_weight=weights)
        mae = mean_absolute_error(yX.iloc[:,0], yX_pred.iloc[:,0], sample_weight=weights)
        
        return {"coefficients" : [coef if NaN_cols[i] == False else np.nan for i,coef in enumerate(model.coef_)], "losses" : [r2_loss, mae]}

In [None]:
#get indicators dataframes


##############################################################################################################################################################################################
indicators_macro = [("Country-wide indicators", ind) for ind in ["GHG_emissions", "PM25_exposure", "GDP_growth", "gini_index", "GDP_pcap_ppp", "water_withdrawals", "pesticides_use"]]
predictive_indicators_macro_linearFit  = pd.DataFrame(columns=[("Indicators",ind) for _, ind in indicators_macro]+[("Loss","r2_score"),("Loss", "mae")]).T

for year, dimension in ((y,d) for d in data_issp_numerical_clustered_grouped.loc[:,("Environmental concern dimension", slice(None))].columns for y in [1993,2000,2010,2020]):    
    data = data_issp_numerical_clustered_grouped.loc[(year, slice(None)), [dimension, *indicators_macro]]
    res = fit_linear(data)
    predictive_indicators_macro_linearFit [(dimension[1], year)] = [*res["coefficients"], *res["losses"]]

predictive_indicators_macro_linearFit = predictive_indicators_macro_linearFit.T
predictive_indicators_macro_linearFit.index = pd.MultiIndex.from_tuples(predictive_indicators_macro_linearFit.index, names=["Environmental concern dimension", "Year"])
predictive_indicators_macro_linearFit.columns = pd.MultiIndex.from_tuples(predictive_indicators_macro_linearFit.columns)

##############################################################################################################################################################################################
indicators_individual = [("Respondent Data", ind) for ind in data_issp_numerical.loc[:, "Respondent Data"].drop(columns="Weight").columns]
predictive_indicators_individual_linearFit = pd.DataFrame(columns=[("Indicators", ind) for _, ind in indicators_individual]+[("Loss","r2_score"),("Loss", "mae")]).T

for year, dimension in ((y,d) for d in data_issp_numerical_clustered.loc[:,("Environmental concern dimension", slice(None))].columns for y in [1993,2000,2010,2020]):
    for country in data_issp_numerical_clustered.loc[year].index.get_level_values("Country").unique():
        data = data_issp_numerical_clustered.loc[(year, country, slice(None)), [dimension, *indicators_individual]]
        weights = data_issp_numerical_clustered.loc[(year, country, slice(None)), ("Respondent Data", "Weight")]
        res = fit_linear(data, weights)
        predictive_indicators_individual_linearFit[(dimension[1], year, country)] = [*res["coefficients"], *res["losses"]]
        
predictive_indicators_individual_linearFit = predictive_indicators_individual_linearFit.T
predictive_indicators_individual_linearFit.index = pd.MultiIndex.from_tuples(predictive_indicators_individual_linearFit.index, names=["Environmental concern dimension", "Year", "Country"])
predictive_indicators_individual_linearFit.columns = pd.MultiIndex.from_tuples(predictive_indicators_individual_linearFit.columns)



##############################################################################################################################################################################################
#dump results

#predictive_indicators_macro_linearFit.to_pickle("../Saved_variables/predictive_indicators_macro_linearFit.pickle")
#predictive_indicators_individual_linearFit.to_pickle("../Saved_variables/predictive_indicators_individual_linearFit.pickle")

In [15]:
#quick load

predictive_indicators_macro_linearFit = pd.read_pickle("../Saved_variables/predictive_indicators_macro_linearFit.pickle")
predictive_indicators_individual_linearFit = pd.read_pickle("../Saved_variables/predictive_indicators_individual_linearFit.pickle")

### Plot results

In [10]:
def cluster_dataframe(data, method="ward", row_cluster=True, col_cluster=True):
    from scipy.cluster.hierarchy import linkage, dendrogram

    if row_cluster:
        row_linkage = linkage(data, method=method, optimal_ordering=True)
        row_order = dendrogram(row_linkage, no_plot=True)['leaves']
    else:
        row_order = slice(None)

    if col_cluster:
        col_linkage = linkage(data.T, method=method,  optimal_ordering=True)
        col_order = dendrogram(col_linkage, no_plot=True)['leaves']

    else:
        col_order = slice(None)
        
    abs_max = data.abs().max(axis=None)
    
    return data.iloc[row_order, col_order]

def style_dataframe(data):
    max = data.loc[:,"Indicators"].abs().max(axis=None)
    styled_df = data.style.background_gradient(cmap="bwr", axis=None, vmin=-max, vmax=max, subset="Indicators")
    
    cm = sns.light_palette("green", reverse=True, as_cmap=True)
    max = data.loc[:,("Loss", "r2_score")].max(axis=None)
    min = data.loc[:,("Loss", "r2_score")].min(axis=None)
    styled_df = styled_df.background_gradient(cmap=cm, subset=[("Loss", "r2_score")], vmin=min, vmax=max)

    cm = sns.light_palette("green", reverse=False, as_cmap=True)
    max = data.loc[:,("Loss", "mae")].max(axis=None)
    min = data.loc[:,("Loss", "mae")].min(axis=None)
    styled_df = styled_df.background_gradient(cmap=cm, subset=[("Loss", "mae")], vmin=min, vmax=max)
    
    
    display(styled_df)

#### Individual indicators

In [29]:
data = predictive_indicators_individual_linearFit.groupby(["Environmental concern dimension", "Year"]).mean() #average over all countries
#data = predictive_indicators_individual_linearFit.query("Country == 'United States'") #select specific country
#data = predictive_indicators_individual_linearFit.loc[("Awareness",2020,slice(None)),:] #select topic and year
#data = predictive_indicators_individual_linearFit #all data

cluster_rows = False
cluster_columns = True

data_clustered = cluster_dataframe(data.fillna(0).loc[:, ("Indicators", slice(None))], row_cluster=cluster_rows, col_cluster=cluster_columns)
cols = [*data_clustered.columns, *data.loc[:,("Loss", slice(None))].columns]
rows = data_clustered.index
style_dataframe(data.loc[rows,cols])

Unnamed: 0_level_0,Unnamed: 1_level_0,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Loss,Loss
Unnamed: 0_level_1,Unnamed: 1_level_1,Education level,Sex,Religious beliefs,Living area,Personal income,Age,Political orientation,r2_score,mae
Environmental concern dimension,Year,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Awareness,1993,0.011283,0.07861,-0.005173,0.03566,-0.000968,-0.07299,-0.061984,0.018126,0.234003
Awareness,2000,0.056684,0.065433,-0.010168,0.06214,-0.040176,-0.05324,-0.070653,0.023967,0.221913
Awareness,2010,0.045471,0.114065,0.005075,0.039112,-0.015969,-0.045733,-0.089996,0.041698,0.239872
Awareness,2020,0.043314,0.097709,-0.002013,0.025063,-0.025102,-0.0387,-0.139488,0.060757,0.247259
Efficacy of environmental action,1993,0.174004,0.023399,-0.041384,0.000585,0.023181,-0.152805,-0.034216,0.091023,0.27091
Efficacy of environmental action,2000,0.201272,0.045326,-0.016273,-0.005087,0.044455,-0.138183,-0.036582,0.075847,0.27889
Efficacy of environmental action,2010,0.187575,0.06658,-0.021087,0.011866,0.056422,-0.066429,-0.051542,0.082727,0.252756
Efficacy of environmental action,2020,0.150309,0.075902,-0.048465,0.01319,0.039733,-0.062936,-0.105576,0.082832,0.27137
Will to make sacrifices,1993,0.159158,-0.00119,-0.009791,0.029111,0.031414,-0.058339,-0.056164,0.065342,0.261683
Will to make sacrifices,2000,0.199637,0.009184,-0.000369,0.017331,0.046487,0.007554,-0.05942,0.043336,0.254699


**Conclusions:**

1. **Political orientation** more important in recent years -> more polarized, environmental issues become political issues (left -> more pro-environment)
2. **Education level** important in predicting Will to make sacrifices and Efficacy of env. action (more educ. -> more pro-environment)
3. **Sex** correlated with awareness (females -> more aware)
4. **Age** slight correlation with all env. concern dimensions (older -> less concerned, more willing to make sacrifices)

#### Macro indicators

In [27]:
data = predictive_indicators_macro_linearFit
clustered_cols = [*cluster_dataframe(data.loc[:, ("Indicators", slice(None))], row_cluster=False).columns, *data.loc[:,("Loss", slice(None))].columns]
style_dataframe(data.loc[:,clustered_cols])

Unnamed: 0_level_0,Unnamed: 1_level_0,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Loss,Loss
Unnamed: 0_level_1,Unnamed: 1_level_1,GDP_pcap_ppp,PM25_exposure,gini_index,GDP_growth,pesticides_use,GHG_emissions,water_withdrawals,r2_score,mae
Environmental concern dimension,Year,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Awareness,1993,0.438817,0.357063,0.446461,-0.169293,-0.388325,0.290399,-0.470529,-2.346247,0.095402
Awareness,2000,-0.4572,-0.267852,0.405966,-0.016144,0.052909,0.231937,0.011248,-0.426181,0.114449
Awareness,2010,-0.594186,0.091857,0.140625,0.206022,0.086692,-0.264735,-0.149735,-0.065081,0.097155
Awareness,2020,-0.58388,-0.11282,0.221287,-0.094452,0.119735,-0.251812,-0.115362,-0.246574,0.076345
Will to make sacrifices,1993,0.477396,-0.237442,0.023186,-0.004063,0.096073,0.039636,-0.212889,-0.086094,0.08603
Will to make sacrifices,2000,1.082811,0.359969,0.259621,0.004684,0.179731,-0.004995,-0.201586,0.314981,0.076435
Will to make sacrifices,2010,0.975516,0.142393,0.225563,0.415449,0.223281,0.02005,-0.248488,0.45795,0.066642
Will to make sacrifices,2020,1.158186,1.076622,0.192342,-0.03897,0.055227,-0.040605,-0.142782,0.227512,0.090688
Efficacy of environmental action,1993,0.520898,-0.266849,-0.212424,0.15448,0.14942,0.035353,-0.106626,0.692562,0.067858
Efficacy of environmental action,2000,0.670729,-0.127434,-0.103532,-0.073141,0.038551,0.073167,-0.146331,0.042043,0.099076


**Conclusions:**

1. **GDP** most important predictor -> less aware, but more willing to act (make sacrifices, believes in environmental action)

___
## Linear fit with basis expansion (needs fixing)

In [96]:
def fit_linear_basisExpansion(yX, weights=None):

    nFeatures = len(yX.columns) - 1
    
    #fill missing values with columns average
    yX = yX.fillna(yX.mean())
    #if whole column is missing fill with 0, then replace with NaN in output
    NaN_cols = [x for x in yX.iloc[:,1:].mean().isna().to_numpy() for i in range(3)]
    yX = yX.fillna(0)
    
    #standardize data
    scaler = MinMaxScaler()
    yX.loc[:] = scaler.fit_transform(yX)
    
    #basis expansion
    col_order = [idx for i in range(nFeatures) for idx in [1+i, 1+nFeatures+i, 1+2*nFeatures+i]]
    yX = pd.concat([yX.iloc[:,0], yX.iloc[:,1:], yX.iloc[:,1:]**2, np.log(yX.iloc[:,1:]+1e-20)], axis=1).iloc[:, [0,*col_order]]
    
    #standardize again
    new_scaler = MinMaxScaler()
    yX.loc[:] = new_scaler.fit_transform(yX)


    #for large datasets: single train-test split
    if len(yX) > 500:
        yX_train, yX_test = train_test_split(yX, test_size=0.2)

        if weights is not None:
            weights_train, weights_test = weights.loc[yX_train.index], weights.loc[yX_test.index]
        else:
            weights_train, weights_test = None, None

        #fit linear regression model
        model = LinearRegression(fit_intercept=True)
        model.fit(yX_train.iloc[:, 1:], yX_train.iloc[:,0], sample_weight=weights_train)

        #compute metrics        
        y_pred = (model.predict(yX_test.iloc[:,1:])-scaler.min_[0])/scaler.scale_[0]
        y_test = (yX_test.iloc[:,0]-scaler.min_[0])/scaler.scale_[0]
        
        r2_loss = r2_score(y_test, y_pred, sample_weight=weights_test)
        mae = mean_absolute_error(y_test, y_pred, sample_weight=weights_test)        

        return {"coefficients" : [coef if NaN_cols[i] == False else np.nan for i,coef in enumerate(model.coef_)], "losses" : [r2_loss, mae]}


    #for small datasets, leave one out cross validation
    else:
        
        y_pred = pd.Series(index=yX.index)#prepare dataframe for predictions

        for idx_num, idx_val in enumerate(yX.index):
            yX_train, yX_test = yX.loc[yX.index != idx_val, :], yX.loc[[idx_val], :]

            if weights is not None:
                weights_train, weights_test = weights.loc[weights.index != idx_val], weights.loc[weights.index == idx_val]
            else:
                weights_train, weights_test = None, None

            #fit linear regression model
            model = LinearRegression(fit_intercept=True)
            model.fit(yX_train.iloc[:, 1:], yX_train.iloc[:,0], sample_weight=weights_train)

            #get predictions
            y_pred.iloc[idx_num] = model.predict(yX_test.iloc[:,1:])

        #rescale back to original predictions data
        y_pred = (y_pred - scaler.min_[0])/scaler.scale_[0]
        y_test = yX.iloc[:,0]

        #compute metrics
        r2_loss = r2_score(y_test, y_pred, sample_weight=weights_test)
        mae = mean_absolute_error(y_test, y_pred, sample_weight=weights_test)        

        return {"coefficients" : [coef if NaN_cols[i] == False else np.nan for i,coef in enumerate(model.coef_)], "losses" : [r2_loss, mae]}

In [None]:
#get indicators dataframes


##############################################################################################################################################################################################
indicators_macro = [("Country-wide indicators", ind) for ind in ["GHG_emissions", "PM25_exposure", "GDP_growth", "gini_index", "GDP_pcap_ppp", "water_withdrawals", "pesticides_use"]]
predictive_indicators_macro_linearFitBasisExp  = pd.DataFrame(columns=[("Indicators",ind, x) for _, ind in indicators_macro for x in ["x", "x^2", "log(x)"]]+[("Loss","r2_score", None),("Loss", "mae", None)]).T

for year, dimension in ((y,d) for d in data_issp_numerical_clustered_grouped.loc[:,("Environmental concern dimension", slice(None))].columns for y in [1993,2000,2010,2020]):    
    data = data_issp_numerical_clustered_grouped.loc[(year, slice(None)), [dimension, *indicators_macro]]
    res = fit_linear_basisExpansion(data)
    predictive_indicators_macro_linearFitBasisExp [(dimension[1], year)] = [*res["coefficients"], *res["losses"]]

predictive_indicators_macro_linearFitBasisExp = predictive_indicators_macro_linearFitBasisExp.T
predictive_indicators_macro_linearFitBasisExp.index = pd.MultiIndex.from_tuples(predictive_indicators_macro_linearFitBasisExp.index, names=["Environmental concern dimension", "Year"])
predictive_indicators_macro_linearFitBasisExp.columns = pd.MultiIndex.from_tuples(predictive_indicators_macro_linearFitBasisExp.columns)

##############################################################################################################################################################################################
indicators_individual = [("Respondent Data", ind) for ind in data_issp_numerical.loc[:, "Respondent Data"].drop(columns="Weight").columns]
predictive_indicators_individual_linearFitBasisExp = pd.DataFrame(columns=[("Indicators",ind, x) for _, ind in indicators_macro for x in ["x", "x^2", "log(x)"]]+[("Loss","r2_score", None),("Loss", "mae", None)]).T

for year, dimension in ((y,d) for d in data_issp_numerical_clustered.loc[:,("Environmental concern dimension", slice(None))].columns for y in [1993,2000,2010,2020]):
    for country in data_issp_numerical_clustered.loc[year].index.get_level_values("Country").unique():
        data = data_issp_numerical_clustered.loc[(year, country, slice(None)), [dimension, *indicators_individual]]
        weights = data_issp_numerical_clustered.loc[(year, country, slice(None)), ("Respondent Data", "Weight")]
        res = fit_linear_basisExpansion(data, weights)
        predictive_indicators_individual_linearFitBasisExp[(dimension[1], year, country)] = [*res["coefficients"], *res["losses"]]
        
predictive_indicators_individual_linearFitBasisExp = predictive_indicators_individual_linearFitBasisExp.T
predictive_indicators_individual_linearFitBasisExp.index = pd.MultiIndex.from_tuples(predictive_indicators_individual_linearFitBasisExp.index, names=["Environmental concern dimension", "Year", "Country"])
predictive_indicators_individual_linearFitBasisExp.columns = pd.MultiIndex.from_tuples(predictive_indicators_individual_linearFitBasisExp.columns)



##############################################################################################################################################################################################
#dump results

#predictive_indicators_macro_linearFitBasisExp.to_pickle("../Saved_variables/predictive_indicators_macro_linearFitBasisExp.pickle")
#predictive_indicators_individual_linearFitBasisExp.to_pickle("../Saved_variables/predictive_indicators_individual_linearFitBasisExp.pickle")

In [None]:
#quick load

predictive_indicators_macro_linearFitBasisExp = pd.read_pickle("../Saved_variables/predictive_indicators_macro_linearFitBasisExp.pickle")
predictive_indicators_individual_linearFitBasisExp = pd.read_pickle("../Saved_variables/predictive_indicators_individual_linearFitBasisExp.pickle")

### Plot results

In [100]:
def cluster_dataframe(data, method="ward", row_cluster=True, col_cluster=True):
    from scipy.cluster.hierarchy import linkage, dendrogram

    if row_cluster:
        row_linkage = linkage(data, method=method, optimal_ordering=True)
        row_order = dendrogram(row_linkage, no_plot=True)['leaves']
    else:
        row_order = slice(None)

    if col_cluster:
        col_linkage = linkage(data.T, method=method,  optimal_ordering=True)
        col_order = dendrogram(col_linkage, no_plot=True)['leaves']

    else:
        col_order = slice(None)
        
    abs_max = data.abs().max(axis=None)
    
    return data.iloc[row_order, col_order]

def style_dataframe(data):
    max = data.loc[:,"Indicators"].abs().max(axis=None)
    styled_df = data.style.background_gradient(cmap="bwr", axis=None, vmin=-max, vmax=max, subset="Indicators")
    
    cm = sns.light_palette("green", reverse=True, as_cmap=True)
    max = data.loc[:,("Loss", "r2_score")].max(axis=None)
    min = data.loc[:,("Loss", "r2_score")].min(axis=None)
    styled_df = styled_df.background_gradient(cmap=cm, subset=[("Loss", "r2_score", None)], vmin=min, vmax=max)

    cm = sns.light_palette("green", reverse=False, as_cmap=True)
    max = data.loc[:,("Loss", "mae")].max(axis=None)
    min = data.loc[:,("Loss", "mae")].min(axis=None)
    styled_df = styled_df.background_gradient(cmap=cm, subset=[("Loss", "mae", None)], vmin=min, vmax=max)
    
    
    display(styled_df)

#### Individual indicators

In [107]:
data = predictive_indicators_individual_linearFitBasisExp.groupby(["Environmental concern dimension", "Year"]).mean() #average over all countries
#data = predictive_indicators_individual_linearFitBasisExp.query("Country == 'United States'") #select specific country
#data = predictive_indicators_individual_linearFitBasisExp.loc[("Awareness",2020,slice(None)),:] #select topic and year
#data = predictive_indicators_individual_linearFitBasisExp #all data

cluster_rows = False
cluster_columns = True

data_clustered = cluster_dataframe(data.fillna(0).loc[:, ("Indicators", slice(None))], row_cluster=cluster_rows, col_cluster=cluster_columns)
cols = [*data_clustered.columns, *data.loc[:,("Loss", slice(None))].columns]
rows = data_clustered.index
style_dataframe(data.loc[rows,cols])

  max = data.loc[:,("Loss", "r2_score")].max(axis=None)
  min = data.loc[:,("Loss", "r2_score")].min(axis=None)
  max = data.loc[:,("Loss", "mae")].max(axis=None)
  min = data.loc[:,("Loss", "mae")].min(axis=None)


Unnamed: 0_level_0,Unnamed: 1_level_0,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Loss,Loss
Unnamed: 0_level_1,Unnamed: 1_level_1,GDP_pcap_ppp,GDP_pcap_ppp,GDP_pcap_ppp,GDP_growth,GDP_growth,water_withdrawals,gini_index,PM25_exposure,PM25_exposure,PM25_exposure,gini_index,gini_index,water_withdrawals,water_withdrawals,GDP_growth,pesticides_use,pesticides_use,GHG_emissions,GHG_emissions,pesticides_use,GHG_emissions,r2_score,mae
Unnamed: 0_level_2,Unnamed: 1_level_2,x,x^2,log(x),x^2,log(x),x^2,x^2,x^2,log(x),x,log(x),x,log(x),x,x,x^2,log(x),log(x),x^2,x,x,nan,nan
Environmental concern dimension,Year,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3,Unnamed: 23_level_3,Unnamed: 24_level_3
Awareness,1993,46111416521.00401,-29228797879.687683,-16882618641.29718,9142922141.23804,4713346951.880373,-0.106826,0.027086,-0.0096,-0.005506,-0.036269,0.049337,0.049609,-0.072914,0.146029,-13856269093.12708,601705096798.4829,313397025912.4211,-920690801656.0728,-1126132897373.644,-915102122710.9124,2046823699029.7388,0.024909,0.235873
Awareness,2000,-99915569820.26622,73930780773.14455,25984789047.136177,-372398264346.25543,-191978253453.6023,-0.074308,0.608593,-0.072297,0.015649,0.022838,-0.024929,-0.115958,-0.034426,0.06727,564376517799.8785,-171023059512.2241,-889467410743.1127,-854222149032.9856,-375661402814.694,1060490470255.326,1229883551847.7124,-0.659986,0.229466
Awareness,2010,-61133893460.25681,33764574898.495277,27369318561.780567,-3753588066.991888,-1935044682.735845,0.137206,-0.000865,-0.093518,-0.006524,0.05794,-0.015991,-0.029742,0.063511,-0.26335,5688632749.757568,956920916173.8787,342026940261.4607,163096900019.01285,-144129828282.69058,-1298947856435.3315,-18967071736.285503,0.045425,0.241392
Awareness,2020,6.323182,-4.15716,-2.152102,-230883516742.73807,-79496259567.3197,-30244202091.09485,0.289653,0.009113,0.014586,-0.027466,-0.015302,-0.312542,-11598880131.416248,41843082209.343376,310379776310.25183,190485927916.89783,168041607549.81497,124713534447.41856,-269255441542.2241,-358527535466.71704,144541907094.83875,-445.560408,0.336851
Efficacy of environmental action,1993,-753891795965.9054,497634542102.8744,256257253863.0092,-8038979248.7885,-4144243793.574575,0.21869,-0.07303,-0.088689,-0.010156,-0.040113,0.047706,0.123459,0.084474,-0.335897,12183223042.45872,-485049931612.38007,-24981069765.177208,-779563486692.4366,-583807887522.5989,510031001377.52246,1363371374215.0476,-7.995347753309013e+20,205403879.51832
Efficacy of environmental action,2000,-63144811738.31807,51385287494.87328,11759524243.418276,-104787849145.92996,-54020091359.75276,-0.160222,-0.343346,-0.12299,-0.003068,0.01179,-0.073333,0.277209,-0.029647,0.161266,158807940505.79953,1716724470885.5771,900769375040.2313,209390694094.66617,-545981719453.3775,-2617493845925.8135,336591025358.7319,-2.504856937489108e+22,1179506719.415406
Efficacy of environmental action,2010,-304201193864.0149,195076265122.73447,109124928741.28894,-0.011289,-0.04872,0.196945,-0.064037,-0.132304,-0.021399,0.070377,-0.014675,0.10337,0.101859,-0.326586,0.181863,-113320612532.17686,-154780076525.89963,-405337256733.92035,-718510406471.071,268100689058.0695,1123847663205.0156,-3.278006511926177e+20,51049264.435725
Efficacy of environmental action,2020,-25672871391.014874,16939990317.225662,8732881073.795229,181704671296.00055,85557723482.37222,-78448934358.21872,0.271647,-0.106146,-0.030002,0.045588,-0.004355,-0.241932,-30085759321.922623,108534693680.08109,-267262394774.87646,-1749950895484.7563,-991614427017.4353,-1688869954668.2463,-1956444230196.676,2741565322502.1685,3645314184864.9526,-2.0062296953264223e+20,32294780.589476
Will to make sacrifices,1993,275423679979.70264,-170457177221.0058,-104966502758.68884,70448612046.61156,36317573935.68815,0.450447,-0.459202,-0.1114,-0.104922,0.070503,0.171491,0.234539,0.13372,-0.605883,-106766185982.2052,1761104557235.3477,1036582692616.952,-118417387430.9776,-291956587001.3085,-2797687249852.3247,410373974432.29126,-2221681295606216.0,281926.10465
Will to make sacrifices,2000,-2803101163.231422,2325351786.676974,477749376.557886,5291812373.642291,2728028012.889948,-0.033936,-0.280842,-0.091414,-0.02419,0.095989,-0.007838,0.157905,-0.038441,0.020575,-8019840386.434531,-956206588246.8403,-578636688578.6533,-14631992910.599148,-460460530327.8488,1534843276825.4888,475092523238.45154,-0.075502,0.256231


#### Macro indicators

In [104]:
data = predictive_indicators_macro_linearFitBasisExp
clustered_cols = [*cluster_dataframe(data.loc[:, ("Indicators", slice(None))], row_cluster=False).columns, *data.loc[:,("Loss", slice(None))].columns]
style_dataframe(data.loc[:,clustered_cols])

  max = data.loc[:,("Loss", "r2_score")].max(axis=None)
  min = data.loc[:,("Loss", "r2_score")].min(axis=None)
  max = data.loc[:,("Loss", "mae")].max(axis=None)
  min = data.loc[:,("Loss", "mae")].min(axis=None)


Unnamed: 0_level_0,Unnamed: 1_level_0,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Loss,Loss
Unnamed: 0_level_1,Unnamed: 1_level_1,GHG_emissions,GHG_emissions,GDP_pcap_ppp,pesticides_use,PM25_exposure,GDP_growth,gini_index,GDP_growth,PM25_exposure,water_withdrawals,water_withdrawals,GDP_pcap_ppp,PM25_exposure,GHG_emissions,gini_index,gini_index,water_withdrawals,GDP_pcap_ppp,pesticides_use,GDP_growth,pesticides_use,r2_score,mae
Unnamed: 0_level_2,Unnamed: 1_level_2,x^2,log(x),x,x,log(x),x^2,x,log(x),x,x^2,log(x),log(x),x^2,x,x^2,log(x),x,x^2,x^2,x,log(x),nan,nan
Environmental concern dimension,Year,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3,Unnamed: 23_level_3,Unnamed: 24_level_3
Awareness,1993,4.47864,-1.261918,-1.040996,-1.488542,-0.450185,0.893946,-7.441031,-2.294815,-4.028455,-4.496125,-6.634643,-0.852603,4.325925,2.715578,9.961246,7.425418,4.957359,3.148691,0.61768,-2.771976,4.061523,-22.650193,0.876396
Awareness,2000,33.375711,0.080008,2.395116,-2.029393,-0.172723,1.441989,3.520596,0.911715,0.249186,-2.897887,0.267833,-0.377832,-0.562657,-7.720502,-2.196409,-0.128688,2.955638,-1.929524,1.015628,-2.134917,0.724762,-44823.030439,15.092078
Awareness,2010,-4.316248,-10.241495,0.080227,-2.178953,0.382943,-4.254137,2.443227,0.019369,0.058848,-1.834865,-3.532507,0.032227,0.794003,5.547607,-2.191163,-0.005739,1.627384,-0.446613,1.545881,3.820498,12.92852,-1550.751393,2.473044
Awareness,2020,-3.668805,-6.060263,3.373016,1.859764,0.615062,-4.001541,-0.310783,0.299408,1.0883,1.462556,5.354117,4.865803,4.451445,4.198523,1.582502,0.498796,-1.946801,-3.197764,-1.93755,2.291814,0.491332,-149.832541,1.244014
Will to make sacrifices,1993,-5.243022,0.698686,3.503071,-1.244988,-0.943202,1.141708,8.388786,2.288915,3.802926,4.339912,10.134179,-0.05029,-3.112868,-4.037519,-9.503666,-9.812287,-5.855852,-2.308945,1.300894,-0.00358,-5.307455,-90.741895,1.759887
Will to make sacrifices,2000,43.766424,1.257075,-3.30446,-1.523622,-1.163845,-3.489867,-0.46338,-0.18041,2.490316,1.452529,0.253665,0.461701,-1.552418,-4.309532,-0.248791,-0.007538,-1.870342,3.222885,1.658949,4.835954,4.178195,-27550.671479,11.369019
Will to make sacrifices,2010,-0.762224,-1.225891,3.003865,-0.669152,-0.349773,2.023062,1.684186,0.335712,0.653153,-1.257148,-0.855586,-0.413769,0.002508,0.229348,-1.008773,-0.399128,0.885867,-1.335,0.554701,-1.25205,1.913568,-461.531873,1.85113
Will to make sacrifices,2020,3.216353,3.670665,-0.056447,1.128224,0.077111,-1.344163,0.527547,0.158936,-1.383973,-0.539181,-1.581946,0.175668,2.492316,-2.921305,-0.512152,-0.385189,0.587996,0.607223,-0.798862,1.001149,-2.07234,-100.286983,1.340943
Efficacy of environmental action,1993,-0.737337,0.171414,1.329841,-0.271899,-0.122867,-1.769215,0.090977,-0.116151,-0.211945,0.652487,0.431716,-0.141304,0.198415,-1.268985,-0.616383,-0.260309,-0.941785,-0.348822,0.569766,2.487671,-1.329943,-6.317839,0.669607
Efficacy of environmental action,2000,24.113102,1.644585,-6.199449,-2.297814,-1.069311,-3.604242,-1.100648,-0.016193,1.901161,3.000168,0.225421,0.893423,-1.515854,0.881144,-0.469949,-0.221797,-3.160311,4.942272,2.713332,5.254054,4.562986,-5368.580241,8.458186


___
## Decision Trees (needs optimization)

In [21]:
def fit_trees(yX, weights=None):
        
    #fill missing values with columns average
    yX = yX.fillna(yX.mean())
    #if whole column is missing fill with 0, then replace with NaN in output
    NaN_cols = yX.iloc[:,1:].mean().isna().to_numpy()
    yX = yX.fillna(0)

    #for large datasets: single train-test split
    if len(yX) > 500:
        yX_train, yX_test = train_test_split(yX, test_size=0.2)

        if weights is not None:
            weights_train, weights_test = weights.loc[yX_train.index], weights.loc[yX_test.index]
        else:
            weights_train, weights_test = None, None
        
        #standardize data
        scaler = StandardScaler()
        yX_train.iloc[:,:] = scaler.fit_transform(yX_train)
        yX_test.iloc[:,:] = scaler.transform(yX_test)
        
        #fit linear regression model
        model = GradientBoostingRegressor()
        model.fit(yX_train.iloc[:, 1:], yX_train.iloc[:,0], sample_weight=weights_train)

        #compute metrics
        yX_pred = pd.DataFrame().reindex_like(yX_test)
        yX_pred.iloc[:,0] = model.predict(yX_test.iloc[:,1:])
        yX_pred.iloc[:,:] = scaler.inverse_transform(yX_pred)
        yX_test.iloc[:,:] = scaler.inverse_transform(yX_test)
        
        r2_loss = r2_score(yX_test.iloc[:,0], yX_pred.iloc[:,0], sample_weight=weights_test)
        mae = mean_absolute_error(yX_test.iloc[:,0], yX_pred.iloc[:,0], sample_weight=weights_test)
        

        return {"importance" : [coef if NaN_cols[i] == False else np.nan for i,coef in enumerate(model.feature_importances_)], "losses" : [r2_loss, mae]}


    #for small datasets, leave one out cross validation
    else:
        
        yX_pred = pd.DataFrame().reindex_like(yX) #prepare dataframe for predictions
        
        for idx_num, idx_val in enumerate(yX.index):
            yX_train, yX_test = yX.loc[yX.index != idx_val, :], yX.loc[[idx_val], :]

            if weights is not None:
                weights_train = weights.loc[weights.index != idx_val]
            else:
                weights_train = None

            #standardize data
            scaler = StandardScaler()
            yX_train.iloc[:,:] = scaler.fit_transform(yX_train)
            yX_test.iloc[:,:] = scaler.transform(yX_test)

            #fit linear regression model
            model = GradientBoostingRegressor()
            model.fit(yX_train.iloc[:, 1:], yX_train.iloc[:,0], sample_weight=weights_train)

            #get predictions
            yX_pred.iloc[idx_num,0] = model.predict(yX_test.iloc[:,1:])

        #rescale back to original predictions data
        yX_pred.iloc[:,:] = scaler.inverse_transform(yX_pred)

        #compute metrics
        r2_loss = r2_score(yX.iloc[:,0], yX_pred.iloc[:,0], sample_weight=weights)
        mae = mean_absolute_error(yX.iloc[:,0], yX_pred.iloc[:,0], sample_weight=weights)
        
        return {"importance" : [coef if NaN_cols[i] == False else np.nan for i,coef in enumerate(model.feature_importances_)], "losses" : [r2_loss, mae]}

In [None]:
#get indicators dataframes


##############################################################################################################################################################################################
indicators_macro = [("Country-wide indicators", ind) for ind in ["GHG_emissions", "PM25_exposure", "GDP_growth", "gini_index", "GDP_pcap_ppp", "water_withdrawals", "pesticides_use"]]
predictive_indicators_macro_boostedTrees = pd.DataFrame(columns=[("Indicators",ind) for _, ind in indicators_macro]+[("Loss","r2_score"),("Loss", "mae")]).T

for year, dimension in ((y,d) for d in data_issp_numerical_clustered_grouped.loc[:,("Environmental concern dimension", slice(None))].columns for y in [1993,2000,2010,2020]):    
    data = data_issp_numerical_clustered_grouped.loc[(year, slice(None)), [dimension, *indicators_macro]]
    res = fit_trees(data)
    predictive_indicators_macro_boostedTrees[(dimension[1], year)] = [*res["importance"], *res["losses"]]

predictive_indicators_macro_boostedTrees = predictive_indicators_macro_boostedTrees.T
predictive_indicators_macro_boostedTrees.index = pd.MultiIndex.from_tuples(predictive_indicators_macro_boostedTrees.index, names=["Environmental concern dimension", "Year"])
predictive_indicators_macro_boostedTrees.columns = pd.MultiIndex.from_tuples(predictive_indicators_macro_boostedTrees.columns)


##############################################################################################################################################################################################
indicators_individual = [("Respondent Data", ind) for ind in data_issp_numerical.loc[:, "Respondent Data"].drop(columns="Weight").columns]
predictive_indicators_individual_boostedTrees = pd.DataFrame(columns=[("Indicators", ind) for _, ind in indicators_individual]+[("Loss","r2_score"),("Loss", "mae")]).T

for year, dimension in ((y,d) for d in data_issp_numerical_clustered.loc[:,("Environmental concern dimension", slice(None))].columns for y in [1993,2000,2010,2020]):
    for country in data_issp_numerical_clustered.loc[year].index.get_level_values("Country").unique():
        data = data_issp_numerical_clustered.loc[(year, country, slice(None)), [dimension, *indicators_individual]]
        weights = data_issp_numerical_clustered.loc[(year, country, slice(None)), ("Respondent Data", "Weight")]
        res = fit_trees(data, weights)
        predictive_indicators_individual_boostedTrees[(dimension[1], year, country)] = [*res["importance"], *res["losses"]]
        
predictive_indicators_individual_boostedTrees = predictive_indicators_individual_boostedTrees.T
predictive_indicators_individual_boostedTrees.index = pd.MultiIndex.from_tuples(predictive_indicators_individual_boostedTrees.index, names=["Environmental concern dimension", "Year", "Country"])
predictive_indicators_individual_boostedTrees.columns = pd.MultiIndex.from_tuples(predictive_indicators_individual_boostedTrees.columns)



##############################################################################################################################################################################################
#dump results

#predictive_indicators_macro_boostedTrees.to_pickle("../Saved_variables/predictive_indicators_macro_boostedTrees.pickle")
#predictive_indicators_individual_boostedTrees.to_pickle("../Saved_variables/predictive_indicators_individual_boostedTrees.pickle")

In [27]:
#quick load

predictive_indicators_macro_boostedTrees = pd.read_pickle("../Saved_variables/predictive_indicators_macro_boostedTrees.pickle")
predictive_indicators_individual_boostedTrees = pd.read_pickle("../Saved_variables/predictive_indicators_individual_boostedTrees.pickle")

### Plot results

In [36]:
def cluster_dataframe(data, method="ward", row_cluster=True, col_cluster=True):
    from scipy.cluster.hierarchy import linkage, dendrogram

    if row_cluster:
        row_linkage = linkage(data, method=method, optimal_ordering=True)
        row_order = dendrogram(row_linkage, no_plot=True)['leaves']
    else:
        row_order = slice(None)

    if col_cluster:
        col_linkage = linkage(data.T, method=method,  optimal_ordering=True)
        col_order = dendrogram(col_linkage, no_plot=True)['leaves']

    else:
        col_order = slice(None)
        
    abs_max = data.abs().max(axis=None)
    
    return data.iloc[row_order, col_order]

def style_dataframe(data):
    max = data.loc[:,"Indicators"].max(axis=None)
    min = data.loc[:,"Indicators"].min(axis=None)
    styled_df = data.style.background_gradient(cmap="viridis", axis=None, vmin=min, vmax=max, subset="Indicators")
    
    cm = sns.light_palette("green", reverse=True, as_cmap=True)
    max = data.loc[:,("Loss", "r2_score")].max(axis=None)
    min = data.loc[:,("Loss", "r2_score")].min(axis=None)
    styled_df = styled_df.background_gradient(cmap=cm, subset=[("Loss", "r2_score")], vmin=min, vmax=max)

    cm = sns.light_palette("green", reverse=False, as_cmap=True)
    max = data.loc[:,("Loss", "mae")].max(axis=None)
    min = data.loc[:,("Loss", "mae")].min(axis=None)
    styled_df = styled_df.background_gradient(cmap=cm, subset=[("Loss", "mae")], vmin=min, vmax=max)
    
    
    display(styled_df)

#### Individual indicators

In [37]:
data = predictive_indicators_individual_boostedTrees.groupby(["Environmental concern dimension", "Year"]).mean() #average over all countries
#data = predictive_indicators_individual_boostedTrees.query("Country == 'United States'") #select specific country
#data = predictive_indicators_individual_boostedTrees.loc[("Awareness",2010,slice(None)),:] #select topic and year
#data = predictive_indicators_individual_boostedTrees #all data

cluster_rows = False
cluster_columns = True

data_clustered = cluster_dataframe(data.fillna(0).loc[:, ("Indicators", slice(None))], row_cluster=cluster_rows, col_cluster=cluster_columns)
cols = [*data_clustered.columns, *data.loc[:,("Loss", slice(None))].columns]
rows = data_clustered.index
style_dataframe(data.loc[rows,cols])

Unnamed: 0_level_0,Unnamed: 1_level_0,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Loss,Loss
Unnamed: 0_level_1,Unnamed: 1_level_1,Age,Education level,Political orientation,Personal income,Living area,Sex,Religious beliefs,r2_score,mae
Environmental concern dimension,Year,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Awareness,1993,0.451952,0.104821,0.158265,0.285534,0.090215,0.074355,0.053658,-0.014294,0.238123
Awareness,2000,0.390013,0.09102,0.155039,0.221143,0.072922,0.07097,0.041024,-0.023001,0.226858
Awareness,2010,0.355587,0.096632,0.140441,0.213414,0.079043,0.080213,0.042473,0.006446,0.241242
Awareness,2020,0.313071,0.070004,0.197206,0.215998,0.080678,0.082643,0.047704,0.019697,0.252424
Efficacy of environmental action,1993,0.431707,0.235142,0.116865,0.24678,0.060078,0.042455,0.050101,0.06012,0.279022
Efficacy of environmental action,2000,0.378729,0.210953,0.114485,0.200481,0.055223,0.04137,0.032955,0.041631,0.281354
Efficacy of environmental action,2010,0.33763,0.195475,0.109503,0.216946,0.062925,0.046103,0.037501,0.05015,0.261102
Efficacy of environmental action,2020,0.315755,0.144936,0.168881,0.20937,0.066337,0.053484,0.047493,0.058958,0.274015
Will to make sacrifices,1993,0.358991,0.236262,0.15406,0.25746,0.080516,0.055739,0.057667,0.029724,0.264358
Will to make sacrifices,2000,0.335075,0.193365,0.159646,0.210387,0.052973,0.040153,0.048628,0.027218,0.256471


#### Macro indicators

In [30]:
data = predictive_indicators_macro_boostedTrees
clustered_cols = [*cluster_dataframe(data.loc[:, ("Indicators", slice(None))], row_cluster=False).columns, *data.loc[:,("Loss", slice(None))].columns]
style_dataframe(data.loc[:,clustered_cols])

Unnamed: 0_level_0,Unnamed: 1_level_0,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Loss,Loss
Unnamed: 0_level_1,Unnamed: 1_level_1,GDP_pcap_ppp,PM25_exposure,pesticides_use,GDP_growth,GHG_emissions,water_withdrawals,gini_index,r2_score,mae
Environmental concern dimension,Year,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Awareness,1993,0.015794,0.117668,0.032923,0.106882,0.147605,0.50354,0.075587,-1.010101,0.077029
Awareness,2000,0.204728,0.043893,0.0211,0.005238,0.151152,0.010457,0.563433,-0.158794,0.104482
Awareness,2010,0.177183,0.067425,0.005643,0.026451,0.08061,0.122673,0.520014,0.35738,0.076323
Awareness,2020,0.088447,0.622625,0.021392,0.067439,0.050688,0.004787,0.144623,0.453572,0.052556
Will to make sacrifices,1993,0.596141,0.122672,0.041598,0.000548,0.103723,0.099081,0.036236,-0.274081,0.097369
Will to make sacrifices,2000,0.603496,0.181153,0.125718,0.020771,0.009286,0.042004,0.017571,0.21208,0.081366
Will to make sacrifices,2010,0.558734,0.036904,0.165747,0.138147,0.046812,0.017426,0.036231,0.218127,0.082613
Will to make sacrifices,2020,0.515182,0.215225,0.113679,0.026741,0.090794,0.011261,0.027118,0.277011,0.07044
Efficacy of environmental action,1993,0.82457,0.081256,0.026211,0.006118,0.030073,0.00617,0.025603,0.705011,0.066964
Efficacy of environmental action,2000,0.735878,0.044134,0.019954,0.0237,0.022469,0.06109,0.092775,0.370988,0.093847


___
## Tensorflow multi-level linear model

In [15]:
# Extract unique country tags
unique_countries = data_issp_numerical_clustered.index.get_level_values("Country").unique()

# Define inputs
macro_indicators = ["GHG_emissions", "PM25_exposure", "GDP_growth", "gini_index", "GDP_pcap_ppp", "water_withdrawals", "pesticides_use"]
micro_indicators = ['Sex', 'Age', 'Education level', 'Personal income', 'Living area', 'Political orientation', 'Religious beliefs']

# Define custom layer which uses different kernels for each country
class CustomDenseLayer(tf.keras.layers.Layer):
    def __init__(self, unique_countries, output_dim, **kwargs):
        self.unique_countries = unique_countries
        self.num_countries = len(unique_countries)
        self.output_dim = output_dim
        super().__init__(**kwargs)
    
    def build(self, input_shape):
        # Create a kernel for each country
        self.kernels = {
            country: self.add_weight(
                name=f'kernel_{country}',
                shape=(input_shape[-1], self.output_dim),
                initializer='zeros',
                trainable=True
            ) for country in self.unique_countries
        }
        super().build(input_shape)
    
    def call(self, inputs, country_indices):
        # Apply the appropriate kernel to each input based on the country index
        outputs = []
        for i, country in enumerate(self.unique_countries):
            mask = tf.cast(tf.equal(country_indices, i), tf.float32)
            masked_input = inputs * tf.expand_dims(mask, -1)
            output = tf.matmul(masked_input, self.kernels[country])
            outputs.append(output)
        result = tf.reduce_sum(tf.stack(outputs, axis=0), axis=0)[0]
        return tf.reshape(result, (-1, self.output_dim))

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_dim)


class CustomSumLayer(tf.keras.layers.Layer):
    def __init__(self,**kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        self.bias = self.add_weight(
            name='bias',
            initializer='zeros',
            trainable=True,
            shape=(input_shape[-1])
        )
        super().build(input_shape)

    def call(self, input1, input2):
        return input1 + input2 + self.bias

    def compute_output_shape(self, input_shape):
        return (input_shape[0], 1)

num_macro_ind = len(macro_indicators)
num_micro_ind = len(micro_indicators)

#create blank model (weights initialized to zero)
def new_model():
    input_macro = tf.keras.layers.Input(shape=(num_macro_ind,), name="Macro_data")
    input_micro = tf.keras.layers.Input(shape=(num_micro_ind,), name="Micro_data")
    input_country_indices = tf.keras.layers.Input(shape=(1,), dtype=tf.int32, name="Country_indices")
    
    # Define shared weight layer for macro indicators
    macro_dense_layer = tf.keras.layers.Dense(1, kernel_initializer="zeros", use_bias=False, name="Macro_dense_layer")(input_macro)
    
    # Define custom weight layer for micro indicators
    custom_dense_layer = CustomDenseLayer(unique_countries, 1, name="Micro_custom_layer")
    micro_custom_layer = custom_dense_layer(input_micro, input_country_indices)
    
    # Define the output layer
    custom_sum_layer = CustomSumLayer(name="Get_prediction")
    output = custom_sum_layer(macro_dense_layer, micro_custom_layer)
    
    # Create the model
    model = tf.keras.models.Model(inputs=[input_macro, input_micro, input_country_indices], outputs=output, name="Model")
    
    # Compile the model
    model.compile(optimizer='adam', loss='mse')

    return model

In [16]:
# Preprocessing data for the model
def preprocess_data(df, year, macro_ind, micro_ind, target):
    # select year
    df = df.loc[year]
    
    # fill missing individual values with country average and macro indicator with year average. Return which countries and indicators are missing for further processing
    # if individual indicator is missing for whole country replace with 0.
    missing_values = df.groupby("Country").mean().map(lambda x: np.isnan(x)) # mark which countries have missing data
    
    df_micro_filled = df.loc[:,("Respondent Data", slice(None))].fillna(df.loc[:,("Respondent Data", slice(None))].groupby("Country").mean()) #fill individual missing data with country average
    df_micro_filled = df_micro_filled.fillna(0) #if whole individual indicator is missing fill with 0 (indicator for country will be marked as true in missing_values dataframe)
    
    df_macro_filled = df.loc[:,("Country-wide indicators", slice(None))].fillna(df.loc[:,("Country-wide indicators", slice(None))].mean()) #fill macro indicator with average
    
    df_filled = pd.concat([df_micro_filled, df_macro_filled, df.loc[:,("Environmental concern dimension", slice(None))]], axis=1) #get dataframe

    #train-test split
    df_train, df_test = train_test_split(df_filled, test_size=0.2)    

    #normalization
    scaler = StandardScaler()
    df_train.iloc[:] = scaler.fit_transform(df_train)
    df_test.iloc[:] = scaler.transform(df_test)
    

    #split data into macro and micro indicators
    X_macro_train = df_train.loc[:, df_train.columns.get_level_values("Variable Name").isin(macro_ind)].values
    X_micro_train = df_train.loc[:, df_train.columns.get_level_values("Variable Name").isin(micro_ind)].values
    y_train = df_train.loc[:, ("Environmental concern dimension", target)].values
    country_indices_train = df_train.index.get_level_values("Country").to_series().map({country: i for i, country in enumerate(unique_countries)}).values
    
    X_macro_test = df_test.loc[:, df_test.columns.get_level_values("Variable Name").isin(macro_ind)].values
    X_micro_test = df_test.loc[:, df_test.columns.get_level_values("Variable Name").isin(micro_ind)].values
    y_test = df_test.loc[:, ("Environmental concern dimension", target)].values
    country_indices_test = df_test.index.get_level_values("Country").to_series().map({country: i for i, country in enumerate(unique_countries)}).values
    
    
    return [X_macro_train, X_micro_train, country_indices_train], y_train, [X_macro_test, X_micro_test, country_indices_test], y_test, missing_values

In [38]:
index_micro = pd.MultiIndex.from_product([
    ['Awareness', 'Will to make sacrifices','Efficacy of environmental action'],
    [1993, 2000, 2010, 2020],
    unique_countries],
    names = ["Environmental Concern Dimension", "Year", "Country"]
                                )
index_macro = pd.MultiIndex.from_product([
    ['Awareness', 'Will to make sacrifices', 'Efficacy of environmental action'],
    [1993, 2000, 2010, 2020]],
    names = ["Environmental Concern Dimension", "Year"]
                                )



predictive_indicators_individual_tensorflowModel = pd.DataFrame(index=index_micro, columns=pd.Index(micro_indicators, name="Indicators"))
predictive_indicators_macro_tensorflowModel = pd.DataFrame(index=index_macro, columns=pd.Index(macro_indicators, name="Indicators"))
losses = pd.DataFrame(index = index_micro, columns=pd.Index(["r2", "mae", "mse"], name="Losses"))

for target in ['Awareness', 'Will to make sacrifices', 'Efficacy of environmental action']:
    for year in [1993, 2000, 2010, 2020]:
        print(f"Training multi-level model for '{target}' in {year}\n")    
        X_train, y_train, X_test, y_test, missing_data = preprocess_data(data_issp_numerical_clustered, year, macro_indicators, micro_indicators, target)       
        model = new_model()

        #fit and get weights
        earlystop = tf.keras.callbacks.EarlyStopping(monitor="loss", patience=5, restore_best_weights=True, start_from_epoch=5)
        
        model.fit(X_train, y_train, epochs=30, batch_size=None, verbose=3, callbacks=[earlystop])
        predictive_indicators_individual_tensorflowModel.loc[(target, year, slice(None)),:] = pd.DataFrame.from_dict(model.get_layer("Micro_custom_layer").kernels).map(lambda x: x.numpy()[0]).T.values
        predictive_indicators_macro_tensorflowModel.loc[(target, year),:] = pd.DataFrame.from_dict(model.get_layer("Macro_dense_layer").weights).map(lambda x: x.numpy()[0]).values

        #get predictions and evaluate model
        y_pred = model.predict(X_test, verbose=0).squeeze()

        for i, country in enumerate(unique_countries):
            y_test_country = [y for k,y in enumerate(y_test) if X_test[2][k] == i]
            y_pred_country = [y for k,y in enumerate(y_pred) if X_test[2][k] == i]

            if len(y_test_country) > 0:
                r2_loss = r2_score(y_test_country, y_pred_country)
                mae = mean_absolute_error(y_test_country, y_pred_country)
                mse = mean_squared_error(y_test_country, y_pred_country)
                losses.loc[(target, year, country),:] = r2_loss, mae, mse

        del(model)
        
        print("\n\n")


predictive_indicators_individual_tensorflowModel = pd.concat([predictive_indicators_individual_tensorflowModel, losses], axis=1, keys=["Indicators", "Loss"])
predictive_indicators_macro_tensorflowModel = pd.concat([predictive_indicators_macro_tensorflowModel, losses.groupby(["Environmental Concern Dimension", "Year"]).mean()], axis=1, keys=["Indicators", "Loss"])

index = [idx in data_issp_numerical_clustered.index.droplevel(2) for idx in predictive_indicators_individual_tensorflowModel.index.droplevel(0)]
predictive_indicators_individual_tensorflowModel = predictive_indicators_individual_tensorflowModel.loc[index]

predictive_indicators_individual_tensorflowModel.to_pickle("../Saved_variables/predictive_indicators_individual_tensorflow.pickle")
predictive_indicators_macro_tensorflowModel.to_pickle("../Saved_variables/predictive_indicators_macro_tensorflow.pickle")

Training multi-level model for 'Awareness' in 1993

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30



Training multi-level model for 'Awareness' in 2000

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30



Training multi-level model for 'Awareness' in 2010

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30



Training multi-level model for 'Awareness' in 2020

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30



Training multi-level model for 'Will to make sacrifices' in 1993

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30



Training mul

In [None]:
#quick load
predictive_indicators_individual_tensorflowModel = pd.read_pickle("../Saved_variables/predictive_indicators_individual_tensorflow.pickle")
predictive_indicators_macro_tensorflowModel = pd.read_pickle("../Saved_variables/predictive_indicators_macro_tensorflow.pickle")

### Plot results

In [49]:
def cluster_dataframe(data, method="ward", row_cluster=True, col_cluster=True):
    from scipy.cluster.hierarchy import linkage, dendrogram

    if row_cluster:
        row_linkage = linkage(data, method=method, optimal_ordering=True)
        row_order = dendrogram(row_linkage, no_plot=True)['leaves']
    else:
        row_order = slice(None)

    if col_cluster:
        col_linkage = linkage(data.T, method=method,  optimal_ordering=True)
        col_order = dendrogram(col_linkage, no_plot=True)['leaves']

    else:
        col_order = slice(None)
        
    abs_max = data.abs().max(axis=None)
    
    return data.iloc[row_order, col_order]

def style_dataframe(data):
    max = data.loc[:,"Indicators"].abs().max(axis=None)
    styled_df = data.style.background_gradient(cmap="bwr", axis=None, vmin=-max, vmax=max, subset="Indicators")
    
    cm = sns.light_palette("green", reverse=True, as_cmap=True)
    max = data.loc[:,("Loss", "r2")].max(axis=None)
    min = data.loc[:,("Loss", "r2")].min(axis=None)
    styled_df = styled_df.background_gradient(cmap=cm, subset=[("Loss", "r2")], vmin=min, vmax=max)

    cm = sns.light_palette("green", reverse=False, as_cmap=True)
    max = data.loc[:,("Loss", "mae")].max(axis=None)
    min = data.loc[:,("Loss", "mae")].min(axis=None)
    styled_df = styled_df.background_gradient(cmap=cm, subset=[("Loss", "mae")], vmin=min, vmax=max)

    cm = sns.light_palette("green", reverse=False, as_cmap=True)
    max = data.loc[:,("Loss", "mse")].max(axis=None)
    min = data.loc[:,("Loss", "mse")].min(axis=None)
    styled_df = styled_df.background_gradient(cmap=cm, subset=[("Loss", "mse")], vmin=min, vmax=max)

    
    display(styled_df)

#### Individual indicators

In [70]:
data = predictive_indicators_individual_tensorflowModel.groupby(["Environmental Concern Dimension", "Year"]).mean() #average over all countries
#data = predictive_indicators_individual_tensorflowModel.query("Country == 'United States'") #select specific country
#data = predictive_indicators_individual_tensorflowModel.loc[("Awareness",2020,slice(None)),:] #select topic and year
#data = predictive_indicators_individual_tensorflowModel #all data

cluster_rows = False
cluster_columns = True

data_clustered = cluster_dataframe(data.fillna(0).loc[:, ("Indicators", slice(None))], row_cluster=cluster_rows, col_cluster=cluster_columns)
cols = [*data_clustered.columns, *data.loc[:,("Loss", slice(None))].columns]
rows = data_clustered.index
style_dataframe(data.loc[rows,cols])

  data_clustered = cluster_dataframe(data.fillna(0).loc[:, ("Indicators", slice(None))], row_cluster=cluster_rows, col_cluster=cluster_columns)


Unnamed: 0_level_0,Unnamed: 1_level_0,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Loss,Loss,Loss
Unnamed: 0_level_1,Unnamed: 1_level_1,Political orientation,Age,Sex,Personal income,Living area,Religious beliefs,Education level,r2,mae,mse
Environmental Concern Dimension,Year,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
Awareness,1993,-0.080753,-0.089896,0.075447,-0.012986,0.047652,-0.007164,-0.003885,-0.038657,0.811206,1.004713
Awareness,2000,-0.078076,-0.04429,0.073951,-0.03019,0.006348,0.012684,0.013876,-0.117144,0.797793,0.957824
Awareness,2010,-0.107308,-0.057076,0.10195,-0.011145,0.019692,0.036287,0.037067,-0.037849,0.761082,0.899797
Awareness,2020,-0.158313,-0.040977,0.098118,-0.030098,-0.003428,0.017667,0.074598,-0.027747,0.805016,0.987669
Efficacy of environmental action,1993,-0.034731,-0.175016,0.02347,0.01591,0.027341,-0.039983,0.150863,0.043977,0.720711,0.79595
Efficacy of environmental action,2000,-0.055223,-0.134837,0.041735,0.052472,-0.025314,-0.021644,0.199975,0.025899,0.744888,0.86228
Efficacy of environmental action,2010,-0.068932,-0.065719,0.063603,0.062984,0.010159,-0.030211,0.159685,0.027575,0.739654,0.868575
Efficacy of environmental action,2020,-0.125637,-0.055303,0.076475,0.041134,0.015506,-0.074412,0.133352,0.009801,0.725834,0.815234
Will to make sacrifices,1993,-0.061833,-0.07008,0.007165,0.026877,0.053151,-0.012449,0.144384,-0.000745,0.775431,0.923586
Will to make sacrifices,2000,-0.076555,0.008137,0.003208,0.037408,-0.008443,0.000946,0.188775,0.007695,0.754795,0.884531


#### Macro indicators

In [71]:
data = predictive_indicators_macro_tensorflowModel
clustered_cols = [*cluster_dataframe(data.loc[:, ("Indicators", slice(None))], row_cluster=False).columns, *data.loc[:,("Loss", slice(None))].columns]
style_dataframe(data.loc[:,clustered_cols])

Unnamed: 0_level_0,Unnamed: 1_level_0,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Indicators,Loss,Loss,Loss
Unnamed: 0_level_1,Unnamed: 1_level_1,GHG_emissions,water_withdrawals,PM25_exposure,GDP_pcap_ppp,GDP_growth,pesticides_use,gini_index,r2,mae,mse
Environmental Concern Dimension,Year,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
Awareness,1993,-0.108231,-0.058159,-0.059311,-0.109456,0.055902,0.019013,0.082061,-0.038657,0.811206,1.004713
Awareness,2000,-0.011768,0.056436,-0.001733,-0.013013,0.124308,-0.1294,-0.266441,-0.117144,0.797793,0.957824
Awareness,2010,0.042838,0.015424,-0.05087,-0.030297,0.028711,-0.031166,-0.258434,-0.037849,0.761082,0.899797
Awareness,2020,-0.019709,0.065074,-0.078541,-0.027395,0.04599,-0.009461,-0.125263,-0.027747,0.805016,0.987669
Will to make sacrifices,1993,0.013142,0.03564,-0.015224,-0.059648,0.006261,-0.063218,0.093951,-0.000745,0.775431,0.923586
Will to make sacrifices,2000,0.020586,0.086415,-0.047157,-0.056738,0.062323,0.143799,0.313589,0.007695,0.754795,0.884531
Will to make sacrifices,2010,0.173514,0.076419,-0.07759,-0.081967,0.063798,0.084377,0.301977,0.013027,0.762292,0.89978
Will to make sacrifices,2020,-0.057068,0.001437,-0.051265,-0.02295,0.023528,0.46159,0.435609,-0.005462,0.741523,0.865995
Efficacy of environmental action,1993,0.081324,0.041345,-0.036032,-0.064653,-0.066962,-0.116232,0.185612,0.043977,0.720711,0.79595
Efficacy of environmental action,2000,-0.032098,0.035553,-0.068047,-0.069265,-0.057487,-0.076333,0.207901,0.025899,0.744888,0.86228
