# <div style="text-align: center"> <font size=+5> <ins>PREDICT ANSWERS from INDICATORS</ins> </font> </div>
___


### Imports

In [1]:
import numpy as np
import pandas as pd

import os
import ipywidgets as widgets

import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.dpi' : 300,
                     'mathtext.fontset' : 'stix', 
                     'font.family' : 'STIXGeneral'
                    })

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso

In [2]:
# Raise the pandas display limit to 500 columns so wide frames are not truncated when rendered.
pd.set_option('display.max_columns', 500)

___
## Get datasets

In [268]:
# get full dataset
# NOTE(review): unpickling can execute arbitrary code; only load files produced by this project.
data_issp = pd.read_pickle("Processed_Data/questionnaires/issp_questionnaires_common.pickle")

# convert values to numerical data
# Lookup tables from categorical answers to numeric codes.
#   "Respondent Data": one table per column name (second level of the column MultiIndex).
#   "Questions":       a single table shared by every question column.
conv_dict = {
    "Respondent Data" : {
        # binary variable coded symmetrically around 0
        "Sex" : {"Male" : -1, "Female" : 1},
        # ordinal scales starting at 0
        "Education level" : {"No degree" : 0, "Primary" : 1, "Secondary" : 2, "University" : 3},
        "Living area" : {"Rural area" : 0, "Small city" : 1, "Big city" : 2},
        # left (negative) to right (positive); "None/Other" shares the code of "Center"
        "Political orientation" : {"Far Left" : -3, "Left" : -2, "Center Left" : -1, "Center" : 0, "None/Other" : 0, "Center Right" : 1, "Right" : 2, "Far Right" : 3},
        # collapsed to a binary flag: 0 for "Atheist", 1 for every listed religion
        "Religious beliefs" : {"Atheist" : 0, "Christian" : 1, "Islamic" : 1, "Jewish" : 1, "Other" : 1}
    },
    "Questions" : {
        # agreement scale, symmetric around 0
        "Strongly Disagree" : -2,
        "Disagree" : -1,
        "Neither Agree nor Disagree" : 0,
        "Agree" : 1,
        "Strongly Agree" : 2,

        # yes/no answers
        "Yes" : 1,
        "No" : -1,

        # perceived danger scale, symmetric around 0
        'Extremely dangerous': 2,
        'Very dangerous': 1,
        'Somewhat dangerous': 0,
        'Not very dangerous': -1,
        'Not dangerous at all': -2,

        # frequency scale; unlike the other scales it starts at 0 and is not symmetric
        "Always" : 3,
        "Often" : 2,
        "Sometimes" : 1,
        "Never" : 0,

        # willingness scale, symmetric around 0
        'Very willing': 2,
        'Fairly willing': 1,
        'Neither willing nor unwilling': 0,
        'Fairly unwilling': -1,
        'Very unwilling': -2
    }
}

#convert values to numerical
# Respondent data: columns that have a lookup table in conv_dict are encoded value by value
# (missing answers stay NaN) and cast to float; all other columns are passed through untouched.
data_resp = data_issp.loc[:, ("Respondent Data", slice(None))].apply(
    lambda col: (
        col.map(
            lambda answer: np.nan if pd.isna(answer) else conv_dict["Respondent Data"][col.name[1]][answer]
        ).astype(float)
        if col.name[1] in conv_dict["Respondent Data"]
        else col
    )
)

# Questions: every answer found in the shared lookup table is replaced by its code, anything
# else is kept as is, and the whole block is then cast to float.
data_quest = (
    data_issp.loc[:, ("Questions", slice(None))]
    .map(lambda answer: conv_dict["Questions"].get(answer, answer))
    .astype(float)
)

#cluster questions based on topic
# Maps each environmental-concern dimension to the question columns (second level of the
# column MultiIndex) that are summed into its score further down in this cell.
clusters = {
    #TOPIC 1: awareness of the environmental consequences of societies’ modern, industrial activities
    "Awareness" : [
        'Danger to the environment (Air pollution by cars)',
        'Danger to the environment (Air pollution by industry)',
        'Danger to the environment (Pesticides and chemicals in farming)',
        'Danger to the environment (River, lake and stream pollution)',
        'Danger to the environment (Rise in the world’s temperature)',
        'Danger to the environment (Nuclear power stations)'
    ],

    #TOPIC 2: willingness to sacrifice personally in some manner (e.g. time, money) for the environment
    "Will to make sacrifices" : [
        'Willingness to Make Trade-Offs for Environment (Pay much higher prices)',
        'Willingness to Make Trade-Offs for Environment (Pay much higher taxes)',
        'Willingness to Make Trade-Offs for Environment (Cut your standard of living)',
        'Member of a group to preserve environment',
        'In the last five years, signed a petition',
        'In the last five years, given money',
        'In the last five years, participated in an environmental demonstration',
        'I do what is right even when it costs money and takes time'
    ],
    
    #TOPIC 3: importance of addressing environmental problems and of collective effort for resolving
    #environmental issues, and how environmental issues intersect with economic issues, science and progress
    # NB: the sign of these answers is flipped in the rescaling step below, so that higher scores
    # reflect pro-environment responses
    "Efficacy of environmental action" : [
        'We worry too much about harming environment',
        'We worry too much about future environment',
        'Science solves environmental problems',
        'Too difficult to do much about environment'
    ]
        
}

#rescale
# Flip the sign of every "Efficacy of environmental action" question; all other questions
# are passed through unchanged.
data_to_cluster = data_quest.apply(
    lambda col: -col if col.name[1] in clusters["Efficacy of environmental action"] else col
)

# One summed score per topic and row, gathered under a single top-level column label.
data_quest_clustered = pd.concat(
    [
        pd.concat(
            [data_to_cluster.loc[:, ("Questions", questions)].sum(axis=1) for questions in clusters.values()],
            axis=1,
            keys=clusters.keys(),
        )
    ],
    axis=1,
    keys=["Environmental concern dimension"],
    names=["Variable Type", "Variable Name"],
)


#get country-wide indicators columns
# One frame per ".pickle" file in the indicators folder, keyed by the file name without its
# extension. The listing is sorted so the resulting column order does not depend on the
# file-system order returned by os.listdir.
macro_indicators_dict = {
    fname[:-len(".pickle")]: pd.read_pickle(f"Processed_Data/indicators/{fname}")
    for fname in sorted(os.listdir("Processed_Data/indicators"))
    if fname.endswith(".pickle")
}

# For every row of the questionnaire data, look up each indicator at (idx[1], idx[0]) of that
# row's index entry, then stack the indicators as columns under one top-level label.
# NOTE(review): presumably idx[0] is the year and idx[1] the country, with indicator frames
# indexed by country and one column per year - confirm against the preprocessing notebooks.
# (The original cell read an undefined name, `countries_indicators_dict`, here and therefore
# only ran when that name was left over in the kernel.)
data_indic = pd.concat(
    [
        pd.DataFrame.from_records(
            [
                data_issp.index.map(lambda idx: indicator.loc[idx[1], idx[0]])
                for indicator in macro_indicators_dict.values()
            ],
            index=macro_indicators_dict.keys(),
            columns=data_issp.index,
        ).T
    ],
    keys=["Country-wide indicators"],
    names=["Variable Type", "Variable Name"],
    axis=1,
)


#merge into single dataframes
# respondent data + country-wide indicators + every question as its own column
data_issp_numerical = pd.concat(
    [data_resp, data_indic, data_quest],
    axis=1,
)
# respondent data + country-wide indicators + one summed score per topic
data_issp_numerical_clustered = pd.concat(
    [data_resp, data_indic, data_quest_clustered],
    axis=1,
)


# remove the intermediate objects from the kernel namespace
del data_issp, data_resp, data_quest, data_to_cluster, data_quest_clustered, data_indic, conv_dict


#dump
#data_issp_numerical.to_pickle("Saved_variables/full_dataset_numerical_quest+indic.pickle")
#data_issp_numerical_clustered.to_pickle("Saved_variables/full_dataset_numerical_clustered.pickle")

In [3]:
#quick load
# Load the two merged datasets from disk instead of rebuilding them.
# NOTE(review): the only visible code that writes these files is the commented-out "dump"
# at the end of the preparation cell, so the frames loaded here may be older than what that
# cell currently computes - confirm the pickles are up to date.
data_issp_numerical = pd.read_pickle("Saved_variables/full_dataset_numerical_quest+indic.pickle")
data_issp_numerical_clustered = pd.read_pickle("Saved_variables/full_dataset_numerical_clustered.pickle")

___

In [4]:
#group by year and country (weighted mean)
def weighted_group_mean(data, weight_col=("Respondent Data", "Weight"), by=("Year", "Country")):
    """Return the weighted mean of every column of `data` within each group.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the value columns and the weight column. The names in `by` must be
        index level names of `data`.
    weight_col : column label
        Column with the row weights; it is removed from the result.
    by : sequence of str
        Index level names to group on.

    Returns
    -------
    pandas.DataFrame
        One row per group and one column per value column. A cell is NaN when the group
        has no non-missing value in that column.

    Notes
    -----
    The denominator only counts the weights of rows whose value is present. Dividing by the
    total group weight instead (as the previous expression did) treats every missing answer
    as a 0 and pulls the mean towards zero.
    """
    by = list(by)
    weights = data.loc[:, weight_col]
    values = data.drop(columns=weight_col)

    # min_count=1 keeps an all-missing group as NaN instead of turning it into 0
    weighted_sum = values.mul(weights, axis=0).groupby(by).sum(min_count=1)
    # weight actually used per cell: the row weight where a value exists, 0 elsewhere
    used_weight = values.notna().astype(float).mul(weights, axis=0).groupby(by).sum()

    return weighted_sum.div(used_weight)


data_issp_numerical_grouped = weighted_group_mean(data_issp_numerical)
data_issp_numerical_clustered_grouped = weighted_group_mean(data_issp_numerical_clustered)

___
## Get relevant indicators

In [91]:
def fit_lasso(yX, alpha, weights=None):
    """Fit a LASSO regression of the first column of `yX` on the remaining columns.

    Parameters
    ----------
    yX : pandas.DataFrame
        Target in column 0, predictors in columns 1 onwards.
    alpha : float
        L1 regularization strength passed to `Lasso`.
    weights : array-like or None
        Optional sample weights, used both for fitting and for scoring.

    Returns
    -------
    dict
        "alpha" (the value passed in), "loss" (1 minus the score of the fitted model on the
        same data) and "coefficients" (the fitted `coef_` array).
    """
    target = yX.iloc[:, 0]
    predictors = yX.iloc[:, 1:]

    model = Lasso(alpha)
    model.fit(predictors, target, sample_weight=weights)
    score = model.score(predictors, target, sample_weight=weights)

    return {"alpha" : alpha, "loss" : 1 - score, "coefficients" : model.coef_}

def plot_indicators(data, question, year, indicators):
    """Plot the LASSO coefficients of `indicators` for a range of regularization strengths.

    Rows of `data` for `year` that have no missing value in the target or the indicators are
    standardized, a LASSO model is fitted for 250 alpha values between 1e-2 and 1, and two
    heatmaps are drawn side by side: the log of the model loss (left) and the coefficient
    of every indicator (right), one row per alpha.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose first index level is the year and whose columns are
        (variable type, variable name) pairs.
    question : str
        Column name under "Environmental concern dimension" used as regression target.
    year : int
        Year to select on the first index level.
    indicators : list of tuple
        Column labels used as predictors.

    Raises
    ------
    ValueError
        If no row of the selected year is complete.
    """
    weight_col = ("Respondent Data", "Weight")

    yX = data.loc[(year, slice(None)),[("Environmental concern dimension", question), *indicators]].dropna().astype(float)
    if yX.empty:
        raise ValueError(f"No complete observations for '{question}' in {year}")
    yX.iloc[:,:] = StandardScaler().fit_transform(yX.to_numpy())

    # Weighted fit only when the frame still carries the weight column (the grouped frames
    # do not). Only a failed label lookup falls back to an unweighted fit; any other error
    # is no longer hidden.
    weights = None
    if weight_col in data.columns:
        try:
            weights = data.loc[yX.index, weight_col]
        except KeyError:
            weights = None

    res = pd.DataFrame([fit_lasso(yX, alpha, weights) for alpha in np.logspace(-2,0,250)])
    # separate name: do not overwrite the `data` argument
    coefficients = np.stack(res["coefficients"].to_numpy())

    fig, (ax1, ax2) = plt.subplots(1,2, figsize=(6.4*2,4.8), gridspec_kw={"width_ratios" : (5,95)}, sharey=True)
    fig.subplots_adjust(wspace=0)
    fig.suptitle(f"Most important indicators to predict '{question}' in {year}", fontsize=20)
    sns.heatmap([[np.log(l)] for l in res["loss"]], ax=ax1, cbar=False)
    sns.heatmap(coefficients, cmap="bwr", center=0, ax=ax2)
    ax2.set_title("Coefficients in LASSO regression")
    # Fix one tick per column (heatmap cells are centred on +0.5) before labelling, so the
    # number of labels always matches the number of ticks.
    ax2.set_xticks(np.arange(coefficients.shape[1]) + 0.5)
    ax2.set_xticklabels([col[1] for col in yX.iloc[:,1:].columns], rotation=50, horizontalalignment="right", fontsize=8)
    ax2.set_xlabel("Indicator")

    # y axis is shared: label every 50th alpha on the left panel
    ax1.set_yticks(res.index[::50])
    ax1.set_yticklabels([f"{l:.0e}" for l in res.loc[::50,"alpha"]], rotation=30, horizontalalignment="right", verticalalignment="top", fontsize=8)
    ax1.tick_params(axis="y", pad=0)
    ax1.set_ylabel("L1 regularization hyperparameter")

    ax1.set_title("Model error")
    ax1.set_xticks([])

    plt.tight_layout()

### Macro Indicators

In [92]:
@widgets.interact(
    year=[1993,2000,2010,2020],
    dimension=data_issp_numerical_clustered.loc[:,"Environmental concern dimension"].columns,
)
def plot(dimension, year=2020):
    """Interactive LASSO-path plot of the country-wide indicators on the grouped data."""
    macro_names = (
        "GHG_emissions",
        "PM25_exposure",
        "GDP_growth",
        "gini_index",
        "GDP_pcap_ppp",
        "water_withdrawals",
        "pesticides_use",
    )
    indicators = [("Country-wide indicators", name) for name in macro_names]
    plot_indicators(data_issp_numerical_clustered_grouped, dimension, year, indicators)

interactive(children=(Dropdown(description='dimension', options=('Awareness', 'Will to make sacrifices', 'Effi…

### Individual indicators

In [93]:
@widgets.interact(
    year=[1993,2000,2010,2020],
    dimension=data_issp_numerical_clustered.loc[:,"Environmental concern dimension"].columns,
    all_countries=False,
    country=data_issp_numerical_clustered.index.get_level_values("Country").unique(),
)
def plot(dimension, country, all_countries, year=2020):
    """Interactive LASSO-path plot of the respondent-level indicators.

    Uses every row of the clustered dataset when `all_countries` is ticked, otherwise only
    the rows of the selected `country`.
    """
    respondent_columns = data_issp_numerical.loc[:, "Respondent Data"].drop(columns="Weight").columns
    indicators = [("Respondent Data", name) for name in respondent_columns]

    if not all_countries:
        respondents = data_issp_numerical_clustered.query("Country == @country")
    else:
        respondents = data_issp_numerical_clustered

    plot_indicators(respondents, dimension, year, indicators)

interactive(children=(Dropdown(description='dimension', options=('Awareness', 'Will to make sacrifices', 'Effi…

___
## Get main indicators dataframe

In [84]:
def cluster_dataframe(data, cmap="bwr", method="ward", show=False, row_cluster=True, col_cluster=True):
    """Reorder the rows and/or columns of `data` by hierarchical clustering.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric frame to reorder.
    cmap : str
        Colormap of the background gradient (only used when `show` is True).
    method : str
        Linkage method passed to `scipy.cluster.hierarchy.linkage`.
    show : bool
        If True, display the reordered frame with a gradient symmetric around 0 (exact
        zeros get white text) and return nothing. If False, return the reordered frame.
    row_cluster, col_cluster : bool
        Whether to reorder the rows / the columns; an axis that is not clustered keeps its
        original order.

    Returns
    -------
    pandas.DataFrame or None
        The reordered frame when `show` is False, otherwise None.
    """
    from scipy.cluster.hierarchy import linkage, dendrogram

    def _leaf_order(matrix):
        # leaf order of the dendrogram built on the rows of `matrix`
        tree = linkage(matrix, method=method, optimal_ordering=True)
        return dendrogram(tree, no_plot=True)['leaves']

    row_order = _leaf_order(data) if row_cluster else slice(None)
    col_order = _leaf_order(data.T) if col_cluster else slice(None)

    # largest absolute value of the frame; makes the colour scale symmetric around 0
    abs_max = data.abs().max(axis=None)
    reordered = data.iloc[row_order, col_order]

    if not show:
        return reordered

    styled = reordered.style.background_gradient(cmap=cmap, axis=None, vmin=-abs_max, vmax=abs_max)
    display(styled.map(lambda x: "color:white" if x == 0.0 else None))

In [65]:
def get_main_indicators(data, question, year, indicators, country=None, n_coef=3, tol=0.1):
    """Select the LASSO model that keeps only the main predictors of `question`.

    The rows of `data` for `year` (and `country`, if given) are standardized and a LASSO
    model is fitted for 250 alpha values between 1e-6 and 1. Scanning from the strongest
    regularization downwards, the scan stops at the first model that has more than `n_coef`
    non-zero coefficients and whose loss, rescaled to [0, 1] between the best and the worst
    model, is below `tol`; the model one step more regularized than that one is returned.
    The result can therefore have more than `n_coef` non-zero coefficients when sparser
    models are not accurate enough.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame indexed by (year, country, ...) with (variable type, variable name) columns.
    question : str
        Column name under "Environmental concern dimension" used as regression target.
    year : int
        Year to select.
    indicators : list of tuple
        Column labels used as predictors.
    country : label or None
        Country to select; None selects every country.
    n_coef : int
        Number of non-zero coefficients above which a model counts as "too dense".
    tol : float
        Upper bound on the rescaled loss for a model to count as "accurate enough".

    Returns
    -------
    pandas.Series
        Row of the results table with entries "alpha", "loss" and "coefficients"; the
        coefficient of an indicator that is missing for every selected row is NaN.
        If no model meets the stopping criterion, the least regularized model is returned.
    """
    if country is None:
        country = slice(None)

    weight_col = ("Respondent Data", "Weight")

    yX = data.loc[(year, country),[("Environmental concern dimension", question), *indicators]].astype(float)
    yX = yX.fillna(yX.mean()) #fill missing values with columns average

    #if whole column is missing replace with 0 for fitting, then replace with NaN in output
    NaN_ind = yX.iloc[:,1:].mean().isna().to_numpy()
    yX = yX.fillna(0)

    #normalize data
    yX.iloc[:,:] = StandardScaler().fit_transform(yX.to_numpy())

    #weighted fit if weights available (for ungrouped data)
    # NOTE(review): if selecting a single country drops index levels from yX, the label
    # lookup below raises KeyError and the fit silently runs unweighted - confirm that the
    # per-country fits are weighted as intended.
    weights = None
    if weight_col in data.columns:
        try:
            weights = data.loc[yX.index, weight_col]
        except KeyError:
            weights = None

    #get results for various alpha values (L1 regularization hyperparam)
    res = pd.DataFrame([fit_lasso(yX, alpha, weights) for alpha in np.logspace(-6,0,250)])

    # loss bounds do not change during the scan
    loss_min = res["loss"].min()
    loss_range = res["loss"].max() - loss_min

    # Fallback when the criterion is never met (e.g. fewer than n_coef + 1 usable
    # indicators): keep the least regularized model instead of failing on an unbound name.
    best_idx = res.index[0]
    for idx in res.index[::-1]:
        n_nonzero = np.count_nonzero(res.loc[idx, "coefficients"])
        # all models equally good -> relative loss 0 (avoids a 0/0 division)
        rel_loss = (res.loc[idx, "loss"] - loss_min) / loss_range if loss_range > 0 else 0.0
        if n_nonzero > n_coef and rel_loss < tol:
            # step back to the next stronger regularization, staying inside the table
            best_idx = min(idx + 1, res.index[-1])
            break

    best = res.loc[best_idx,:].copy()
    best["coefficients"] = [coef if not NaN_ind[i] else np.nan for i, coef in enumerate(best["coefficients"])]
    return best

In [66]:
#get indicators dataframes
def _coefficients_frame(coefficients, indicators, index_names):
    """Build a frame with one row per key of `coefficients` and one column per indicator.

    `coefficients` maps an index tuple to the list of coefficients returned by
    `get_main_indicators`, in the order of `indicators`. Building the frame in one go
    avoids growing it column by column inside the loops.
    """
    return pd.DataFrame(
        list(coefficients.values()),
        index=pd.MultiIndex.from_tuples(list(coefficients.keys()), names=index_names),
        columns=pd.Index([name for _, name in indicators], name="Indicators"),
    )


survey_years = [1993,2000,2010,2020]

# country-wide indicators, fitted on the (year, country) grouped data
indicators_macro = [("Country-wide indicators", ind) for ind in ["GHG_emissions", "PM25_exposure", "GDP_growth", "gini_index", "GDP_pcap_ppp", "water_withdrawals", "pesticides_use"]]
macro_coefficients = {
    (question, year): get_main_indicators(data_issp_numerical_clustered_grouped, question, year, indicators_macro, n_coef=3, tol=.1)["coefficients"]
    for question in data_issp_numerical_clustered_grouped.loc[:,"Environmental concern dimension"].columns
    for year in survey_years
}
predictive_indicators_macro = _coefficients_frame(macro_coefficients, indicators_macro, ["Environmental concern dimension", "Year"])
#predictive_indicators_macro.to_pickle("./Saved_variables/predictive_indicators_macro.pickle")


# respondent-level indicators, fitted separately for every country present in each year
indicators_individual = [("Respondent Data", ind) for ind in data_issp_numerical.loc[:, "Respondent Data"].drop(columns="Weight").columns]
individual_coefficients = {
    (question, year, country): get_main_indicators(data_issp_numerical_clustered, question, year, indicators_individual, country, n_coef=3, tol=1)["coefficients"]
    for question in data_issp_numerical_clustered.loc[:,"Environmental concern dimension"].columns
    for year in survey_years
    for country in data_issp_numerical_clustered.loc[year].index.get_level_values("Country").unique()
}
predictive_indicators_individual = _coefficients_frame(individual_coefficients, indicators_individual, ["Environmental concern dimension", "Year", "Country"])
#predictive_indicators_individual.to_pickle("./Saved_variables/predictive_indicators_individual.pickle")

In [69]:
#quick load
# Load the two coefficient tables from disk instead of refitting all models.
# NOTE(review): the only visible code that writes these files is commented out in the cell
# that builds the tables, so what is loaded here may not match the current code - confirm.

predictive_indicators_macro = pd.read_pickle("./Saved_variables/predictive_indicators_macro.pickle")
predictive_indicators_individual = pd.read_pickle("./Saved_variables/predictive_indicators_individual.pickle")

### Individual indicators

In [115]:
# mean coefficient of every indicator per (dimension, year), taken over all countries
data = (
    predictive_indicators_individual
    .groupby(["Environmental concern dimension", "Year"])
    .mean()
)
#data = predictive_indicators_individual.query("Country == 'Germany'") #select specific country

# remaining missing values are shown as 0; only the columns are reordered by clustering
cluster_dataframe(data.fillna(0), row_cluster=False, show=True)

Unnamed: 0_level_0,Indicators,Education level,Sex,Personal income,Living area,Religious beliefs,Age,Political orientation
Environmental concern dimension,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Awareness,1993,0.010401,0.042134,0.002197,0.014303,-0.00168,-0.036752,-0.043531
Awareness,2000,0.023864,0.033903,-0.003917,0.025287,0.000517,-0.027529,-0.032173
Awareness,2010,0.022504,0.067807,-0.001086,0.009745,0.002166,-0.020145,-0.063706
Awareness,2020,0.0,0.064657,0.000603,0.004237,0.01112,-0.016341,-0.106811
Efficacy of environmental action,1993,0.161059,-0.001068,0.00077,0.000462,-0.022736,-0.1146,-0.006253
Efficacy of environmental action,2000,0.157502,0.004487,0.009462,0.00128,-0.003599,-0.086311,-0.013593
Efficacy of environmental action,2010,0.147067,0.009101,0.013034,0.003173,-0.005918,-0.028773,-0.026717
Efficacy of environmental action,2020,0.00106,0.015146,0.010044,0.013933,-0.01681,-0.044362,-0.069337
Will to make sacrifices,1993,0.146655,0.004382,0.010984,0.006067,-0.007428,-0.028725,-0.032368
Will to make sacrifices,2000,0.156814,-0.002326,0.011705,0.001269,-0.004698,-0.003973,-0.036725


**Conclusions:**

1. **Political orientation** has become a stronger predictor in recent years: opinions are more polarized and environmental issues have turned into political issues (further left -> more pro-environment).
2. **Education level** is important for predicting *Will to make sacrifices* and *Efficacy of environmental action* (more educated -> more pro-environment).
3. **Sex** is correlated with *Awareness* (women -> more aware).
4. **Age** is slightly correlated with all environmental concern dimensions (older -> less concerned).

### Macro indicators

In [83]:
# coefficients of the country-wide indicators, one row per (dimension, year)
data = predictive_indicators_macro

# keep the row order and reorder only the columns by clustering
cluster_dataframe(data, row_cluster=False, show=True)

Unnamed: 0_level_0,Indicators,GDP_pcap_ppp,GDP_growth,water_withdrawals,GHG_emissions,pesticides_use,gini_index,PM25_exposure
Environmental concern dimension,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Awareness,1993,0.08962,-0.368108,-0.379788,-0.209979,-0.109,0.229525,0.026715
Awareness,2000,-0.425421,-0.0,-0.0,0.0,0.036204,0.250093,-0.068043
Awareness,2010,-0.560506,0.023279,-0.0,-0.0,0.0,0.146556,0.0
Awareness,2020,-0.401892,-0.077129,-0.011686,-0.206767,0.040232,0.181721,-0.0
Will to make sacrifices,1993,0.394031,0.0,-0.069748,0.0,0.0,0.0,-0.243801
Will to make sacrifices,2000,0.770736,-0.0,-0.0,-0.052432,0.10312,0.122187,0.094458
Will to make sacrifices,2010,0.699284,0.450111,-0.0,0.0,0.034381,0.0,0.0
Will to make sacrifices,2020,0.858688,-0.0,-0.00749,0.0,0.0,0.007421,0.730018
Efficacy of environmental action,1993,0.495633,0.053475,-0.0,-0.0,0.030802,-0.093118,-0.235734
Efficacy of environmental action,2000,0.472084,-0.0,-0.0,-0.0,0.0,-0.121656,-0.154445


**Conclusions:**

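1. **GDP per capita** (`GDP_pcap_ppp`) is the most important predictor: respondents in richer countries are less aware of environmental dangers, but more willing to act (they are readier to make sacrifices and believe more in the efficacy of environmental action).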

___