In [159]:
import pandas as pd
import numpy as np

pd.options.plotting.backend = "plotly"
import plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots



import plotly.io as pio
import plotly.express as px

pio.templates.default = "simple_white"

In [160]:
# Demographic columns taken over from the demographics exports.
# "Participant id" is the join key; the other three are used as controls.
control_columns = [
    "Participant id",
    "Number of subordinates",
    "Age",
    "Sex",
]


def _load_survey_wave(responses_csv, demographics_csv):
    """Read one survey export and left-join the selected demographic columns.

    Parameters
    ----------
    responses_csv : str
        Path to the wide-format survey export. Its ``participant.label``
        column is used as the join key.
    demographics_csv : str
        Path to the demographics export. Only ``control_columns`` are kept
        and ``Participant id`` is used as the join key.

    Returns
    -------
    pandas.DataFrame
        The survey export with the demographic columns appended. Because the
        merge is a left join, survey rows without a demographic match are
        kept and get NaN in the demographic columns.
    """
    demographics = pd.read_csv(demographics_csv)[control_columns]
    return pd.read_csv(responses_csv).merge(
        demographics,
        left_on="participant.label",
        right_on="Participant id",
        how="left",
    )


data_usa_raw = _load_survey_wave(
    "../data/raw/usa/all_apps_wide_2023-11-14_USA.csv",
    "../data/raw/usa/prolific_demographics.csv",
)
data_deu_1_raw = _load_survey_wave(
    "../data/raw/deutsch/all_apps_wide_2023-11-14_DEU.csv",
    "../data/raw/deutsch/prolific_demographics.csv",
)
data_deu_2_raw = _load_survey_wave(
    "../data/raw/deutsch runde 2/all_apps_wide_2023-11-16_DEU.csv",
    "../data/raw/deutsch runde 2/prolific_demographics.csv",
)

In [161]:
# Page-time exports, one CSV per survey wave (same wave order as the
# survey exports: USA, first German wave, second German wave).
page_times_usa_raw, page_times_deu_1_raw, page_times_deu_2_raw = (
    pd.read_csv(page_times_csv)
    for page_times_csv in (
        "../data/raw/usa/PageTimes-2023-11-14_USA.csv",
        "../data/raw/deutsch/PageTimes-2023-11-14_DEU.csv",
        "../data/raw/deutsch runde 2/PageTimes-2023-11-16_DEU.csv",
    )
)

In [162]:
# All page-completion timestamps of all three waves in one long table.
merged_times = pd.concat(
    [page_times_usa_raw, page_times_deu_1_raw, page_times_deu_2_raw], axis=0
)

# Per participant: last minus first page-completion timestamp, i.e. the total
# time spent between the first and the last completed page.
# NOTE(review): presumably in seconds (later cells divide by 60) - confirm.
completion_by_participant = merged_times.groupby("participant_code")[
    "epoch_time_completed"
]
time_delta = (
    (completion_by_participant.max() - completion_by_participant.min())
    .to_frame()
    .reset_index()
)

In [217]:
# Columns kept for the analysis, in the order they appear in the cleaned frame.
relevant_columns = [
    # treatment assignment
    "survey.1.player.positive_szenarios",
    "survey.1.player.treatment_pa",
    # one rating column per scale prefix x scenario (S1-S4) x team lead (T1/T2)
    *(
        f"survey.1.player.{scale_prefix}S{scenario_nr}T{teamlead_nr}"
        for scale_prefix in ("PR", "FR", "AR", "ETH", "PRO")
        for scenario_nr in range(1, 5)
        for teamlead_nr in (1, 2)
    ),
    # participant-level scales
    "survey.1.player.algo_aversion",
    "survey.1.player.risk_propensity",
    # columns added during loading / merging
    "origin",
    "overall_time",
    "Number of subordinates",
    "Age",
    "Sex",
    "participant.label",
]

In [218]:
# Stack the three survey waves and tag each row with its country of origin
# (both German waves share the "DEU" tag).
survey_waves = pd.concat(
    [
        data_usa_raw.assign(origin="USA"),
        data_deu_1_raw.assign(origin="DEU"),
        data_deu_2_raw.assign(origin="DEU"),
    ],
    axis=0,
)

# Short analysis names: drop the "survey.1.player." prefix and snake_case the
# demographic columns.
analysis_column_names = {
    **{name: name.replace("survey.1.player.", "") for name in relevant_columns},
    "Number of subordinates": "number_of_subordinates",
    "Age": "age",
    "Sex": "sex",
}

data_all = (
    survey_waves
    # inner join: participants without page-time records are dropped
    .merge(time_delta, left_on="participant.code", right_on="participant_code")
    .rename(columns={"epoch_time_completed": "overall_time"})
    # keep only participants whose current page is the final "End" page
    .loc[
        lambda df_: df_["participant._current_page_name"] == "End",
        relevant_columns,
    ]
    .rename(columns=analysis_column_names)
    .assign(
        # numeric coding: Female -> 1, Male -> 0
        sex=lambda df_: df_.sex.replace({"Female": 1, "Male": 0}),
        # ordinal coding of the team-size brackets
        number_of_subordinates=lambda df_: df_.number_of_subordinates.replace(
            {"2-3": 0, "4-6": 1, "7-10": 2, ">10": 3}
        ),
    )
)

# Completion-time window: the 25% and 75% quantiles of overall_time, so only
# the middle half of the completion times is kept.
MIN_TIME = data_all.overall_time.quantile(0.25)  # a 5 min floor also worked ok
MAX_TIME = data_all.overall_time.quantile(0.75)  # a 15 min cap also worked ok

within_time_window = data_all.overall_time.between(
    left=MIN_TIME, right=MAX_TIME, inclusive="both"
)
# Rows whose age was replaced by the "DATA_EXPIRED" placeholder cannot be cast
# to int below, so they are removed first.
has_usable_age = data_all.age != "DATA_EXPIRED"

data_all = data_all.loc[within_time_window & has_usable_age].astype(
    {"sex": "int", "age": "int", "number_of_subordinates": "int"}
)

# Per-country views of the cleaned sample.
data_usa = data_all.loc[data_all.origin.eq("USA")]
data_deu = data_all.loc[data_all.origin.eq("DEU")]

Scale explanation:
- FRS1T1 -->  feel responsible, scenario 1, team lead 1 (familiar solution)
- ARS1T1 -->  act responsible, scenario 1, team lead 1 (familiar solution)
- FRS1T2 -->  feel responsible, scenario 1, team lead 2 (new solution)
- ARS1T2 -->  act responsible, scenario 1, team lead 2 (new solution)

In [219]:
# For each scenario number (as a string): the value of `positive_szenarios`
# that get_mean_scales selects when asked for that scenario's "positive" or
# "negative" outcome condition.
scenario_dict_outcome = {
    "1": {"positive": "1_and_4", "negative": "2_and_3"},
    "2": {"positive": "2_and_3", "negative": "1_and_4"},
    "3": {"positive": "2_and_3", "negative": "1_and_4"},
    "4": {"positive": "1_and_4", "negative": "2_and_3"},
}

# Same structure for `treatment_pa`: the value selected for the "yes" / "no"
# PA condition of each scenario.
scenario_dict_pa = {
    "1": {"yes": "1_and_4", "no": "2_and_3"},
    "2": {"yes": "2_and_3", "no": "1_and_4"},
    "3": {"yes": "2_and_3", "no": "1_and_4"},
    "4": {"yes": "1_and_4", "no": "2_and_3"},
}

# Column-name prefix of each supported rating scale.
_SCALE_PREFIXES = {"feel": "FR", "act": "AR", "risk": "PR"}


def get_mean_scales(data, scale, scenario_nr, solution_type, outcome, pa):
    """Mean rating of one scale for one scenario, team lead and treatment cell.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``positive_szenarios``, ``treatment_pa`` and the rating
        column ``<prefix>S<scenario_nr>T<1|2>``.
    scale : {"feel", "act", "risk"}
        Rating scale; mapped to the column prefixes ``FR``, ``AR`` and ``PR``.
    scenario_nr : int or str
        Scenario number, 1 to 4.
    solution_type : str
        ``"familiar"`` selects team lead 1 (``T1``); any other value selects
        team lead 2 (``T2``).
    outcome : {"positive", "negative"}
        Outcome condition, looked up in ``scenario_dict_outcome``.
    pa : {"yes", "no"}
        PA condition, looked up in ``scenario_dict_pa``.

    Returns
    -------
    float
        Mean of the rating column over the rows in the requested treatment
        cell (NaN if no row matches).

    Raises
    ------
    ValueError
        If ``scale`` is not one of the supported scales. Previously this built
        a column name without a prefix and failed with an opaque pandas
        ``KeyError``.
    KeyError
        If ``scenario_nr``, ``outcome`` or ``pa`` is not a key of the scenario
        dictionaries, or the rating column is missing from ``data``.
    """
    if scale not in _SCALE_PREFIXES:
        raise ValueError(
            f"Unknown scale {scale!r}; expected one of {sorted(_SCALE_PREFIXES)}"
        )

    teamlead_suffix = "T1" if solution_type == "familiar" else "T2"
    rating_column = f"{_SCALE_PREFIXES[scale]}S{scenario_nr}{teamlead_suffix}"

    scenario_key = str(scenario_nr)
    outcome_filter = scenario_dict_outcome[scenario_key][outcome]
    pa_filter = scenario_dict_pa[scenario_key][pa]

    in_treatment_cell = (data.positive_szenarios == outcome_filter) & (
        data.treatment_pa == pa_filter
    )
    return data.loc[in_treatment_cell, rating_column].mean()

## Plot Fig 1 from Nordbye

In [220]:
# x-axis tick labels per scenario (scenarios 1-4, in order); the first label
# belongs to team lead 1, the second to team lead 2.
x_axis_values_list = [
    ["Familiar solution", "New solution"],
    ["Follow advise", "Not follow advise"],
    ["Hold on", "Change decision"],
    ["Wait and see", "Take action"],
]

# One fixed D3 colour per (scale, outcome, PA) combination, keyed by the trace
# label used in the line plots, so a combination has the same colour in every
# subplot.
d3_colors = px.colors.qualitative.D3
colors_dict = {
    f"{scale} responsible + {outcome} outcome + PA {pa}": d3_colors[position]
    for position, (scale, outcome, pa) in enumerate(
        (scale, outcome, pa)
        for scale in ("feel", "act")
        for outcome in ("positive", "negative")
        for pa in ("yes", "no")
    )
}


def create_line_subplot(data, output_file_name, show_fig, fig_title):
    """Draw the 2x2 scenario line plot and save it as an HTML file.

    Each subplot shows one scenario. Every line connects the mean rating of
    team lead 1 with the mean rating of team lead 2 for one combination of
    responsibility scale (feel/act), outcome (positive/negative) and PA
    (yes/no).

    Parameters
    ----------
    data : pandas.DataFrame
        Cleaned survey data, passed on to ``get_mean_scales``.
    output_file_name : str
        Output path without extension; ``.html`` is appended.
    show_fig : bool
        If truthy, the figure is also displayed.
    fig_title : str
        Figure title.
    """
    fig = make_subplots(rows=2, cols=2)

    for scenario, x_axis_values in zip(range(1, 5), x_axis_values_list):
        # scenarios 1-4 fill the grid row by row: (1,1), (1,2), (2,1), (2,2)
        row_offset, col_offset = divmod(scenario - 1, 2)

        for responsibility_scale in ("feel", "act"):
            for outcome in ("positive", "negative"):
                for pa_available in ("yes", "no"):
                    trace_label = f"{responsibility_scale} responsible + {outcome} outcome + PA {pa_available}"

                    # mean rating for team lead 1 ("familiar"), then team lead 2
                    mean_ratings = [
                        get_mean_scales(
                            data=data,
                            scale=responsibility_scale,
                            scenario_nr=scenario,
                            solution_type=solution_type,
                            outcome=outcome,
                            pa=pa_available,
                        )
                        for solution_type in ("familiar", "new")
                    ]

                    trace = go.Scatter(
                        x=x_axis_values,
                        y=mean_ratings,
                        mode="lines+markers",
                        name=trace_label,
                        legendgroup=trace_label,
                        marker_color=colors_dict[trace_label],
                        # one legend entry per combination: only the traces
                        # of the first scenario are listed
                        showlegend=scenario == 1,
                    )
                    fig.add_trace(trace, row=row_offset + 1, col=col_offset + 1)

    fig.update_yaxes(range=[1, 7], dtick=1)
    fig.update_layout(width=900 + 200, height=600 + 200, title=fig_title)
    fig.write_html(f"{output_file_name}.html")
    if show_fig:
        fig.show()

In [221]:
# Pooled sample (USA + DEU); written to scenarios_all.html.
create_line_subplot(
    data=data_all,
    fig_title="All",
    output_file_name="scenarios_all",
    show_fig=True,
)

In [222]:
# German sample only; written to scenarios_deu.html.
create_line_subplot(
    data=data_deu,
    fig_title="DEU",
    output_file_name="scenarios_deu",
    show_fig=True,
)

In [223]:
# US sample only; written to scenarios_usa.html.
create_line_subplot(
    data=data_usa,
    fig_title="USA",
    output_file_name="scenarios_usa",
    show_fig=True,
)

## Who is perceived to feel / act more responsible?

In [224]:
scenario_numbers = range(1, 5)

# Who is perceived to feel more responsible?
t1_columns_feel = [f"FRS{nr}T1" for nr in scenario_numbers]
t2_columns_feel = [f"FRS{nr}T2" for nr in scenario_numbers]

# Who is perceived to act more responsible?
t1_columns_act = [f"ARS{nr}T1" for nr in scenario_numbers]
t2_columns_act = [f"ARS{nr}T2" for nr in scenario_numbers]

# Pool the four scenarios: flatten each group of four rating columns into one
# long vector per team lead and scale, then summarise.
pd.DataFrame(
    {
        pooled_name: data_all[rating_columns].to_numpy().ravel()
        for pooled_name, rating_columns in (
            ("t1_feel", t1_columns_feel),
            ("t2_feel", t2_columns_feel),
            ("t1_act", t1_columns_act),
            ("t2_act", t2_columns_act),
        )
    }
).describe()

Unnamed: 0,t1_feel,t2_feel,t1_act,t2_act
count,360.0,360.0,360.0,360.0
mean,4.713889,5.988889,5.130556,4.708333
std,1.814159,1.242196,1.587612,1.567759
min,1.0,1.0,1.0,1.0
25%,3.0,5.0,4.0,4.0
50%,5.0,6.0,6.0,5.0
75%,6.0,7.0,6.0,6.0
max,7.0,7.0,7.0,7.0


--> t2 (B) is perceived to feel more responsible (mean 5.99 vs. 4.71) but to have acted less responsibly (mean 4.71 vs. 5.13) than t1.

## Regression

In [225]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
from stargazer.stargazer import Stargazer

In [226]:
# Show the column names of the cleaned frame (prefix-stripped, snake_case demographics).
data_all.columns

Index(['positive_szenarios', 'treatment_pa', 'PRS1T1', 'PRS1T2', 'PRS2T1',
       'PRS2T2', 'PRS3T1', 'PRS3T2', 'PRS4T1', 'PRS4T2', 'FRS1T1', 'FRS1T2',
       'FRS2T1', 'FRS2T2', 'FRS3T1', 'FRS3T2', 'FRS4T1', 'FRS4T2', 'ARS1T1',
       'ARS1T2', 'ARS2T1', 'ARS2T2', 'ARS3T1', 'ARS3T2', 'ARS4T1', 'ARS4T2',
       'ETHS1T1', 'ETHS1T2', 'ETHS2T1', 'ETHS2T2', 'ETHS3T1', 'ETHS3T2',
       'ETHS4T1', 'ETHS4T2', 'PROS1T1', 'PROS1T2', 'PROS2T1', 'PROS2T2',
       'PROS3T1', 'PROS3T2', 'PROS4T1', 'PROS4T2', 'algo_aversion',
       'risk_propensity', 'origin', 'overall_time', 'number_of_subordinates',
       'age', 'sex', 'participant.label'],
      dtype='object')

In [230]:
def _teamlead_rows(data, predictors, rating_column, dependent_variable, active_teamlead):
    """Attach one team lead's rating column and indicator to the shared predictors."""
    return predictors.assign(
        **{
            dependent_variable: data[rating_column],
            "active_teamlead": active_teamlead,
        }
    )


def get_regression_df(szenario_nr, dependent_variable, data=None):
    """Build the long-format regression frame for one scenario and one scale.

    Every participant contributes two rows: one for team lead 1
    (``active_teamlead == 1``, rating column ``...T1``) and one for team lead 2
    (``active_teamlead == 0``, rating column ``...T2``). All T1 rows come
    first, followed by all T2 rows; the original row index is kept, so every
    index label occurs twice.

    Parameters
    ----------
    szenario_nr : int or str
        Scenario number, 1 to 4.
    dependent_variable : str
        Column prefix of the rating scale (e.g. ``"FR"``, ``"AR"``, ``"PR"``);
        also the name of the rating column in the result.
    data : pandas.DataFrame, optional
        Cleaned survey data. Defaults to the module-level ``data_all``.

    Returns
    -------
    pandas.DataFrame
        Columns: the dependent variable, ``outcome_positive``,
        ``pa_available``, ``active_teamlead``, ``participant``,
        ``algo_aversion``, ``origin_usa``, ``number_of_subordinates``,
        ``age``, ``sex``.
    """
    if data is None:
        data = data_all

    scenario_key = str(szenario_nr)
    outcome_codes = scenario_dict_outcome[scenario_key]
    pa_codes = scenario_dict_pa[scenario_key]

    # Predictors shared by both team-lead rows of a participant.
    # The treatment codes are translated with an explicit mapping. The
    # chained Series.replace used before only produced integer columns through
    # implicit object->int downcasting, which pandas 2.2 deprecates; an object
    # column would be treated as categorical by the formula API. Values that
    # match neither code become NaN.
    predictors = data[
        ["algo_aversion", "number_of_subordinates", "age", "sex"]
    ].assign(
        outcome_positive=data.positive_szenarios.map(
            {outcome_codes["positive"]: 1, outcome_codes["negative"]: 0}
        ),
        pa_available=data.treatment_pa.map({pa_codes["yes"]: 1, pa_codes["no"]: 0}),
        # running number per row of `data`; identical in the T1 and T2 halves,
        # so it identifies the participant
        participant=range(data.shape[0]),
        origin_usa=data.origin.eq("USA").astype("int"),
    )

    t1_df = _teamlead_rows(
        data,
        predictors,
        rating_column=f"{dependent_variable}S{szenario_nr}T1",
        dependent_variable=dependent_variable,
        active_teamlead=1,
    )
    t2_df = _teamlead_rows(
        data,
        predictors,
        rating_column=f"{dependent_variable}S{szenario_nr}T2",
        dependent_variable=dependent_variable,
        active_teamlead=0,
    )

    return pd.concat([t1_df, t2_df], axis=0).loc[
        :,
        [
            dependent_variable,
            "outcome_positive",
            "pa_available",
            "active_teamlead",
            "participant",
            "algo_aversion",
            "origin_usa",
            "number_of_subordinates",
            "age",
            "sex",
        ],
    ]

In [259]:
def get_PR_residuals():
    """Residuals of an OLS model of the PR ratings, over all four scenarios.

    The PR ratings of scenarios 1-4 are stacked (scenario by scenario, in the
    row order produced by ``get_regression_df``) and regressed on scenario
    dummies, ``pa_available * active_teamlead``, ``algo_aversion`` and ``sex``.

    Returns
    -------
    list of float
        One residual per row of the stacked frame, in stacking order. The list
        carries no index, so callers must attach it to a frame stacked in the
        same order.
        NOTE(review): if the model drops rows with missing values, the list is
        shorter than the stacked frame - confirm the inputs have no NaN.
    """
    dependent_variable = "PR"

    # Single concat instead of growing a frame seeded with an empty DataFrame.
    regression_df_all = pd.concat(
        [
            get_regression_df(scenario, dependent_variable).assign(scenario=scenario)
            for scenario in range(1, 5)
        ]
    )

    res = smf.ols(
        formula=f"{dependent_variable} ~ C(scenario) + pa_available * active_teamlead + algo_aversion + sex",
        data=regression_df_all,
    ).fit()

    return list(res.resid)

In [302]:
# The PR residuals do not depend on the dependent variable of the loop below,
# so the auxiliary OLS model is fitted once instead of once per iteration.
residuals_PR = get_PR_residuals()

# Fitted mixed models in the order ["AR", "FR"].
# (The name keeps its original spelling because later cells refer to it.)
regresion_results = []
for dependent_variable in ["AR", "FR"]:
    # Stack scenarios 1-4 in the same order as get_PR_residuals does, so the
    # positional residual list lines up row by row.
    regression_df_all = pd.concat(
        [
            get_regression_df(scenario, dependent_variable).assign(scenario=scenario)
            for scenario in range(1, 5)
        ]
    ).assign(residuals_PR=residuals_PR)

    # Random intercept per participant (8 rows each: 4 scenarios x 2 team leads).
    mod = smf.mixedlm(
        formula=f"{dependent_variable} ~ C(scenario) + outcome_positive * pa_available * active_teamlead + residuals_PR + algo_aversion + sex + origin_usa",
        groups="participant",
        data=regression_df_all,
    )

    # Default fit (the stored summary reports Method: REML) is kept for the tables.
    # NOTE(review): the stored output reports "Converged: No" for these fits;
    # resolve the convergence problem before reporting the estimates.
    regresion_results.append(mod.fit())

    # Separate maximum-likelihood fit, used only for the AIC.
    print(f"AIC: {mod.fit(reml=False).aic}")


Maximum Likelihood optimization failed to converge. Check mle_retvals


Retrying MixedLM optimization with lbfgs



AIC:



Maximum Likelihood optimization failed to converge. Check mle_retvals


Retrying MixedLM optimization with lbfgs


Maximum Likelihood optimization failed to converge. Check mle_retvals


Retrying MixedLM optimization with cg


Maximum Likelihood optimization failed to converge. Check mle_retvals


MixedLM optimization failed, trying a different optimizer may help.


Gradient optimization failed, |grad| = 26.860156



2526.6169646346953



Maximum Likelihood optimization failed to converge. Check mle_retvals


Retrying MixedLM optimization with lbfgs


Maximum Likelihood optimization failed to converge. Check mle_retvals


Retrying MixedLM optimization with cg


Maximum Likelihood optimization failed to converge. Check mle_retvals


MixedLM optimization failed, trying a different optimizer may help.


Gradient optimization failed, |grad| = 18.799420



AIC:



Maximum Likelihood optimization failed to converge. Check mle_retvals


Retrying MixedLM optimization with lbfgs


Maximum Likelihood optimization failed to converge. Check mle_retvals


Retrying MixedLM optimization with cg



2602.138616357356



Maximum Likelihood optimization failed to converge. Check mle_retvals


MixedLM optimization failed, trying a different optimizer may help.


Gradient optimization failed, |grad| = 21.872546



In [306]:
# Index 1 is the "FR" model: the fitting loop appends results in the order ["AR", "FR"].
regresion_results[1].summary()

0,1,2,3
Model:,MixedLM,Dependent Variable:,FR
No. Observations:,720,Method:,REML
No. Groups:,90,Scale:,1.9627
Min. group size:,8,Log-Likelihood:,-1283.5014
Max. group size:,8,Converged:,No
Mean group size:,8.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,5.559,0.346,16.046,0.000,4.880,6.238
C(scenario)[T.2],-0.482,0.148,-3.249,0.001,-0.773,-0.191
C(scenario)[T.3],0.196,0.148,1.319,0.187,-0.095,0.487
C(scenario)[T.4],-0.094,0.148,-0.640,0.522,-0.384,0.195
outcome_positive,0.038,0.230,0.167,0.867,-0.412,0.489
pa_available,-0.392,0.230,-1.702,0.089,-0.844,0.060
outcome_positive:pa_available,0.559,0.353,1.584,0.113,-0.133,1.250
active_teamlead,-1.825,0.209,-8.735,0.000,-2.234,-1.415
outcome_positive:active_teamlead,1.094,0.296,3.702,0.000,0.515,1.673


In [303]:
# Row order of the regression table. First come the treatment variables
# (Intercept, PA, Outcome), then the interaction PA*Outcome, then control
# variables such as algo aversion, then the sociodemographic controls.
# TODO: participant-level FE - double-check how to report them correctly; in my
# opinion they should not appear as IVs in the regression table.
covariates_order = [
    # intercept and treatment main effects
    "Intercept",
    "outcome_positive",
    "pa_available",
    "active_teamlead",
    # treatment interactions
    "outcome_positive:pa_available",
    "pa_available:active_teamlead",
    "outcome_positive:active_teamlead",
    "outcome_positive:pa_available:active_teamlead",
    # controls
    "residuals_PR",
    "algo_aversion",
    "origin_usa",
    "sex",
    # scenario dummies (scenario 1 is the reference level)
    *(f"C(scenario)[T.{level}]" for level in range(2, 5)),
]

In [304]:
# Combine the fitted models into one regression table with the covariate rows
# in the order defined by `covariates_order`.
stargazer = Stargazer(regresion_results)
stargazer.covariate_order(covariates_order)

# Render first, so a rendering error cannot leave a truncated file behind.
# The encoding is fixed explicitly so the HTML file does not depend on the
# platform default.
regression_table_html = stargazer.render_html()
with open("reg_tables_new.html", "w", encoding="utf-8") as outfile:
    outfile.write(regression_table_html)

In [None]:
# Long-format "feel responsible" (FR) data for scenarios 1-4, stacked scenario
# by scenario; `vignette` records the scenario number of each row.
# A single concat replaces growing a frame that was seeded with an empty
# DataFrame, and the one-element inner loop is gone.
all_regression_df_feel_responsible = pd.concat(
    [
        get_regression_df(scenario, "FR").assign(vignette=scenario)
        for scenario in range(1, 5)
    ]
)

In [None]:
# Long-format "act responsible" (AR) data for scenarios 1-4, stacked scenario
# by scenario; `vignette` records the scenario number of each row.
# A single concat replaces growing a frame that was seeded with an empty
# DataFrame, and the one-element inner loop is gone.
all_regression_df_act_responsible = pd.concat(
    [
        get_regression_df(scenario, "AR").assign(vignette=scenario)
        for scenario in range(1, 5)
    ]
)

In [None]:
# Long-format perceived-risk (PR) data for scenarios 1-4, stacked scenario by
# scenario; `vignette` records the scenario number of each row.
# A single concat replaces growing a frame that was seeded with an empty
# DataFrame, and the one-element inner loop is gone.
all_regression_df_perceived_risk = pd.concat(
    [
        get_regression_df(scenario, "PR").assign(vignette=scenario)
        for scenario in range(1, 5)
    ]
)

In [None]:
# Write each long-format frame to its own CSV (without the index column).
# NOTE(review): the first file name starts with "regressions_" (plural) while
# the other two start with "regression_"; left unchanged in case downstream
# scripts read the existing name - confirm and unify.
for csv_name, long_frame in (
    ("regressions_df_feel_responsible.csv", all_regression_df_feel_responsible),
    ("regression_df_act_responsible.csv", all_regression_df_act_responsible),
    ("regression_df_perceived_risk.csv", all_regression_df_perceived_risk),
):
    long_frame.to_csv(csv_name, index=False)

In [None]:
# The AR and PR columns are attached to the FR frame by index. The index labels
# are duplicated (each participant occurs in several rows), so the assignment
# only works while all three frames carry exactly the same index in the same
# order. That dependency is made explicit here instead of failing inside
# pandas with a reindexing error.
assert all(
    long_frame.index.equals(all_regression_df_feel_responsible.index)
    for long_frame in (
        all_regression_df_act_responsible,
        all_regression_df_perceived_risk,
    )
), "FR, AR and PR frames must be stacked in the same row order"

all_regression_df_feel_responsible.assign(
    AR=all_regression_df_act_responsible.AR, PR=all_regression_df_perceived_risk.PR
).loc[
    :,
    [
        "AR",
        "FR",
        "PR",
        "outcome_positive",
        "pa_available",
        "active_teamlead",
        "participant",
        "algo_aversion",
        "origin_usa",
        "number_of_subordinates",
        "age",
        "sex",
        "vignette",
    ],
].to_csv("regression_df_all_dependent_variables.csv", index=False)

## Demographics

In [None]:
# Descriptive statistics of the participant-level variables, two decimals.
demographic_columns = ["age", "sex", "algo_aversion", "risk_propensity"]
data_all.loc[:, demographic_columns].describe().round(2)

Unnamed: 0,age,sex,algo_aversion,risk_propensity
count,90.0,90.0,90.0,90.0
mean,38.41,0.27,5.13,6.13
std,11.55,0.44,1.23,2.06
min,22.0,0.0,2.0,1.0
25%,29.25,0.0,4.0,5.0
50%,34.0,0.0,5.0,6.0
75%,46.75,1.0,6.0,7.0
max,78.0,1.0,7.0,10.0


In [None]:
# Share of participants per country of origin, in percent.
(data_all["origin"].value_counts(normalize=True) * 100).round(1)

origin
USA    53.3
DEU    46.7
Name: proportion, dtype: float64

In [None]:
# Completion time divided by 60 (minutes, assuming overall_time is in seconds).
(data_all["overall_time"] / 60).describe()

count    90.000000
mean      7.155926
std       1.654941
min       4.666667
25%       5.841667
50%       6.700000
75%       8.562500
max      10.533333
Name: overall_time, dtype: float64

In [None]:
# How many started the survey? Counts the rows of the three raw waves that have
# a (non-missing) current page name.
all_waves_raw = pd.concat(
    [
        data_usa_raw.assign(origin="USA"),
        data_deu_1_raw.assign(origin="DEU"),
        data_deu_2_raw.assign(origin="DEU"),
    ],
    axis=0,
)
all_waves_raw["participant._current_page_name"].count()

213