### Loading packages and data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse

import statsmodels.api as sm
from statsmodels.formula.api import ols

from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

from scipy.stats import spearmanr

import plotly.figure_factory as ff

In [None]:
# Columns to load from the CSV. Order matters: `features` below is built by
# slicing off the last five entries (the two parent-education columns and the
# three target variables).
columns = [
    # 'survey_year',
    # demographics
    'age',
    'gender',
    'race',
    # soc-eco background
    'fincur',
    'gpa_sr',
    'alc_any',
    # other indicators
    'exerc',
    'degree',
    'international',
    'residenc',
    # educ parents
    'educ_par1',
    'educ_par2',
    # target variables go at the end
    'flourish',
    'anx_score',
    'deprawsc',
    ]

# Test hypothesis on TEST data
df = pd.read_csv('../../data/processed/all_data_test.csv', usecols=columns)[columns]
# Collapse the two parent-education columns into their row-wise maximum.
df['educ_par_max'] = df[['educ_par1', 'educ_par2']].max(axis=1)
# DataFrame.drop returns a new frame; rebind `df` so the two source columns
# are actually removed (the bare call was a no-op).
df = df.drop(columns=['educ_par1', 'educ_par2'])

# All non-target columns except educ_par1/educ_par2, plus the combined column.
features = columns[:-5] + ['educ_par_max']
# Features wrapped in C(...) in the model formulas; the rest enter as-is.
cat_features = ['gender', 'race', 'degree']
monotone_features = ['age', 'fincur', 'gpa_sr', 'alc_any', 'exerc', 'educ_par_max', 'international', 'residenc']
# Sanity check: every feature is in exactly the union of the two groups.
print(set(features) == set(cat_features).union(set(monotone_features)))
targets = ['flourish', 'anx_score', 'deprawsc']
targets_names = ['Flourish score', 'Anxiety score', 'Depression score']

# Replace numeric codes with string labels for race and gender.
dict_race = {1: 'black', 2: 'ainaan', 3: 'asian', 4: 'his', 5: 'pi', 6: 'mides', 7: 'white'}
dict_gender = {1: 'female', 2: 'male', 3: 'trans', 4: 'non_binary'}

df['race'] = df['race'].replace(dict_race)
df['gender'] = df['gender'].replace(dict_gender)


def var_ols(feat):
    """Return the formula term for *feat*.

    Features listed in ``cat_features`` are wrapped as ``C(feat)``; any other
    feature name is returned unchanged.
    """
    return f'C({feat})' if feat in cat_features else feat

### Two-way ANOVA to detect intersectionality

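Simple intersectionality: for each target and each pair of features, test whether the interaction term between the two features is significant.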

In [None]:
# For every target and every unordered pair of features, fit an OLS model with
# both main effects and their interaction, then report the type-II ANOVA
# p-value of the interaction term against the 0.05 threshold.
for target_var in targets:
    for i, feat1 in enumerate(features):
        for feat2 in features[i+1:]:
            term1, term2 = var_ols(feat1), var_ols(feat2)
            specification = f'{target_var} ~ {term1} * {term2}'
            model = ols(specification, data=df).fit()
            anova_table = sm.stats.anova_lm(model, typ=2)
            # Row label of the interaction term is "<term1>:<term2>".
            p_value = anova_table.loc[f'{term1}:{term2}', 'PR(>F)']
            message = (
                f'{p_value:.3f} significant intersectionality between {feat1} and {feat2} for {target_var}'
                if p_value < 0.05 else
                f'{p_value:.3f} NO significant intersectionality between {feat1} and {feat2} for {target_var}'
            )
            print(message)

In [None]:
# Print nice anova table
def nice_display_anova(target_var, feat1, feat2):
    """Render the two-way ANOVA table for one target and a feature pair.

    Fits ``target_var ~ feat1 * feat2`` on the module-level ``df`` (features
    listed in ``cat_features`` are wrapped in ``C(...)`` by ``var_ols``),
    computes the type-II ANOVA table and shows it as a plotly table figure.

    Parameters
    ----------
    target_var : str
        Name of the response column in ``df``.
    feat1, feat2 : str
        Names of the two feature columns whose main effects and interaction
        enter the model.
    """
    model = ols(f'{target_var} ~ {var_ols(feat1)} * {var_ols(feat2)}', data=df).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)
    # Pad the index title with five leading spaces. Built by concatenation
    # because reusing the enclosing quote inside an f-string expression is a
    # SyntaxError before Python 3.12.
    index_title = ' ' * 5 + target_var
    fig = ff.create_table(anova_table.round(4), index=True, index_title=index_title)
    fig.update_layout(
        autosize=False,
        width=560,
        height=200,
    )
    fig.show()

# Show the two-way ANOVA table for anx_score with age and alc_any as factors.
nice_display_anova('anx_score', 'age', 'alc_any')