# Required Imports and Constants

In [1]:
%matplotlib inline

import ast
import json
import os
import sys
from datetime import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import lines
from matplotlib import patches

from scipy.stats import f_oneway
from scipy.stats import ttest_1samp

import warnings
warnings.filterwarnings("ignore")

# Condition labels, in the order they are iterated and reported.
CONDITIONS = ["CTRL", "WTHN", "BTWN", "BOTH"]

# PIDs grouped by the condition label they are listed under.
ALL_PIDS = dict(
    CTRL=[
        "0jjk2LtlRwXu",
        "BDIPCGcOdllu",
        "fimUvwh9JPRU",
        "jHPt9jOOmoXM",
        "nyEXxPkdgP81",
        "WvfHfFBIfx6m",
        "YqgrTs5hzcsj",
    ],
    WTHN=[
        "6BSJnNllOaUQ",
        "8wC3YK6TgqRm",
        "db2aF23Z9hnH",
        "F0xDdtLRrhtQ",
        "nRJBgq4Tg2LG",
        "YvLb2lkRiYyh",
    ],
    BTWN=[
        "38vsTr4jwSxV",
        "fHevN3Wo38TA",
        "iswbEgz7w3KE",
        "kdSlRblQt77j",
        "mYSzFPXnkOdd",
        "QEpQRLrqn7CX",
        "Tcrfm9xpHN59",
    ],
    BOTH=[
        "1w4I0l6f60JG",
        "7UR5LIyKmQiz",
        "C9pIAv6kBqr6",
        "KK2JFLnabEl9",
        "MjcBTpGzYCD9",
        "TlJaxdq1DM23",
    ],
)

# Per-condition names; each one is the very same list object held in ALL_PIDS.
CTRL_PIDS = ALL_PIDS["CTRL"]
WTHN_PIDS = ALL_PIDS["WTHN"]
BTWN_PIDS = ALL_PIDS["BTWN"]
BOTH_PIDS = ALL_PIDS["BOTH"]

# ANOVA

In [None]:
# Build one row per PID (PID, condition, Phase_1 value, Phase_2 value) and
# collect the rows into df_ratio_of_by_phase for the tests further down.
#
# NOTE(review): this cell cannot run as written. attr_counts_phase_1,
# attr_counts_phase_2 and ratio_of are read below but are not assigned anywhere
# in the visible cells, and the CSV loaded into `df` is never used. The step
# that derives the per-phase counts from `df` appears to be missing -- restore
# it (and define ratio_of) before relying on this cell.
task = "hiring"  # not read anywhere in this cell
rows = []

for condition in CONDITIONS:
    PIDS = ALL_PIDS[condition]
    for pid in PIDS:
        basepath = os.path.join(condition, pid)  # relative directory: <condition>/<pid>
        # Exactly one of task_PM.csv / task_MP.csv is expected per PID; appOrder
        # records which one was found. If neither exists, the second read_csv
        # raises FileNotFoundError and the cell stops.
        # presumably "PM"/"MP" encode the order in which two apps were shown -- TODO confirm
        try:
            df = pd.read_csv(os.path.join(basepath, f"task_PM.csv"))
            appOrder = "PM"
        except FileNotFoundError:
            df = pd.read_csv(os.path.join(basepath, f"task_MP.csv"))
            appOrder = "MP"


        # Add to list of rows to create DataFrame from
        # (uses the undefined names flagged at the top of this cell)
        rows.append([pid, condition, attr_counts_phase_1.loc[ratio_of], attr_counts_phase_2.loc[ratio_of]])

# Combine phase 1 and 2 into single dataframe
df_ratio_of_by_phase = pd.DataFrame(
    rows,
    columns=['PID', 'Condition', 'Phase_1', 'Phase_2']
)

### ACROSS ALL CONDITIONS

# The header is built from CONDITIONS so the printed labels always match the
# values stored in the 'Condition' column (the previous hard-coded header named
# SUM / RT / RTSUM, which are not condition labels in this notebook).
header = f"Across all conditions => {{{', '.join(CONDITIONS)}}}"
print(header)
print('-' * len(header))

# Summary statistics of Phase_2 per condition
# (describe() reports count, mean, std, min, quartiles and max).
phase_2_by_condition = df_ratio_of_by_phase.groupby('Condition')['Phase_2']
print(phase_2_by_condition.describe())
print()

# Perform ANOVA 1-way test: one sample (list of Phase_2 values) per condition
vals = phase_2_by_condition.apply(list).tolist()
f_stat, p_val = f_oneway(*vals)
print(f"One-Way ANOVA\t|\tGroups: Condition\t|\tF Statistic: {f_stat:.04f}\t|\tp-value: {p_val:.04f}")
print()

### BETWEEN CONTROL AND INTERVENTION ###

# Every non-control condition is pooled into a single 'INTV' group. The labels
# are taken from CONDITIONS because those are the values written to the
# 'Condition' column; replacing labels that never occur there (the previous
# 'SUM', 'RT', 'RTSUM') is a no-op, leaves the original groups in place, and
# makes the vals['INTV'] lookup below raise KeyError.
intervention_labels = [label for label in CONDITIONS if label != 'CTRL']

header = f"Between CTRL (CONTROL) and {' + '.join(intervention_labels)} (INTERVENTION) conditions"
print(header)
print('-' * len(header))

# Pool once and reuse the grouped Phase_2 values for the summary and the t-tests
phase_2_ctrl_vs_intv = (
    df_ratio_of_by_phase.replace(intervention_labels, 'INTV')
        .groupby('Condition')['Phase_2']
)

# Summary statistics of Phase_2 for CTRL vs INTV
print(phase_2_ctrl_vs_intv.describe())
print()

# Compare to baseline ratios: one-sample t-test of each group against the
# expected mean for the attribute named by ratio_of.
# NOTE(review): ratio_of is not assigned in the visible cells -- confirm it is
# defined before this cell runs.
vals = phase_2_ctrl_vs_intv.apply(list)
print("One-Sample t-TEST | Within each condition")

# Expected (baseline) mean per attribute value; any other ratio_of prints no test.
expected_means = {"Male": 0.68, "Democrat": 0.41}
expected_mean = expected_means.get(ratio_of)
if expected_mean is not None:
    print(f"Expected Mean: {expected_mean}")
    print()
    for display_name, group_key in (("CONTROL", "CTRL"), ("INTERV", "INTV")):
        t_stat, p_val = ttest_1samp(vals[group_key], expected_mean)
        print(f"{display_name}\t|\tT Statistic: {t_stat:.04f}\t|\tp-value: {p_val:.08f}")
print()
print()

# TODO