In [None]:
import sys
sys.path.append('src/')
from percentparser import parse_percentage
from plot_functions import single_var_catplot, multi_var_catplot
from stat_tests import check_normality_for_groups, check_variance_homogeneity, kruskal_wallis_test, dunns_test, detailed_dunns_test

import os
import json
from tqdm import tqdm
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import datetime


import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning) raised by later cells.
warnings.filterwarnings("ignore")

# Global plot theme and the maximum number of DataFrame rows shown in cell outputs.
sns.set_theme(context='notebook', style='whitegrid')
pd.set_option("display.max_rows", 100)

In [None]:
# Wall-clock timestamp taken at the start of the run; the final cell subtracts it to report elapsed time.
start_run = datetime.datetime.now()

# Analysis

In [None]:
# Load the pre-processed dataset from its zipped CSV file.
df = pd.read_csv(
    "processed_data/age_name_edu_data.csv.zip",
    compression='zip',
)

In [None]:
# Number of rows in the loaded dataset.
df.shape[0]

In [None]:
# Sanity check: every row must carry a distinct `custom_id`.
assert df['custom_id'].is_unique

In [None]:
# First distinct value of the `model` column; passed to the plotting helpers below.
# NOTE(review): presumably the file holds a single model — confirm.
model = df['model'].drop_duplicates().iloc[0]

### Response Length and Refusals

In [None]:
# Length (via `len`) of every raw response, stored as a new column.
df['response_len'] = df['query_response_raw'].map(len)
print(f"Median response length: {df['response_len'].median()}")
print(f"Mean response length: {df['response_len'].mean()}")

In [None]:
# Rows whose parsed `query_response` is missing are treated as refusals.
refusals = df.loc[df['query_response'].isna()]
print(f"Median refusal response length: {refusals['response_len'].median()}")
print(f"Mean refusal response length: {refusals['response_len'].mean()}")

In [None]:
# Display the rows whose parsed `query_response` is missing, for manual inspection.
refusals

In [None]:
# Print up to two raw refusal texts as examples.
# `head(2)` avoids the IndexError that fixed `.iloc[0]` / `.iloc[1]` lookups
# raise when the data contains fewer than two refusals.
refusal_examples = refusals['query_response_raw'].head(2)
print("\n\n".join(
    f"Refusal {position}: \n{text}"
    for position, text in enumerate(refusal_examples, start=1)
))

In [None]:
# Keep only rows with a parsed response.
# `.copy()` makes `df_valid` an independent frame: later cells assign columns on it,
# which on a bare filtered slice triggers pandas' SettingWithCopyWarning
# (silenced in this notebook by the global warning filter).
df_valid = df[df['query_response'].notna()].copy()
print(f"Median valid response length: {df_valid['response_len'].median()}")
print(f"Mean valid response length: {df_valid['response_len'].mean()}")

In [None]:
# 0/1 flag marking rows without a parsed response; its mean is the share of such rows.
df['invalid_or_refused'] = pd.isna(df['query_response']).astype(int)
print(f"Invalid responses: {df['invalid_or_refused'].mean()}")

### Response Distribution

In [None]:
# Coerce responses to numbers; anything unparseable becomes NaN.
# `assign` builds a new frame instead of writing a column into a filtered slice.
df_valid = df_valid.assign(
    query_response=pd.to_numeric(df_valid['query_response'], errors='coerce')
)
# Use the NaN-aware Series reductions: the builtin min()/max() give order-dependent
# results as soon as the coerced column contains a NaN.
print(f"Min hiring probability: {df_valid['query_response'].min()}\nMax hiring probability: {df_valid['query_response'].max()}")

In [None]:
# Frequency of each distinct response value, kept as a dict for the bar chart below.
response_value_counts = df_valid['query_response'].value_counts().to_dict()
# Same counts as a table, ordered by response value.
pd.DataFrame(
    list(response_value_counts.items()),
    columns=['hiring_prob', 'count'],
).sort_values(by='hiring_prob')

In [None]:
# Bar chart of how often each response value occurs.
# Axis labels and a title are added so the figure can be read on its own.
fig, ax = plt.subplots()
ax.bar(list(response_value_counts.keys()), list(response_value_counts.values()))
ax.set(
    xlabel='Hiring probability',
    ylabel='Number of responses',
    title='Distribution of hiring probabilities',
)
plt.show()

## Visualizations

In [None]:
# Number of non-missing responses in every age × gender × education cell.
df_valid.groupby(['age', 'gender', 'education'])['query_response'].count()

In [None]:
# Summary statistics of the response per age group.
df_valid.groupby('age')['query_response'].agg(
    ['min', 'median', 'max', 'mean', 'std']
)

In [None]:
# Summary statistics of the response per gender.
df_valid.groupby('gender')['query_response'].agg(
    ['min', 'median', 'max', 'mean', 'std']
)

In [None]:
# Summary statistics of the response per education level.
df_valid.groupby('education')['query_response'].agg(
    ['min', 'median', 'max', 'mean', 'std']
)

### Single Variable Plots

In [None]:
# Cast age to string before plotting.
# `assign` returns a new frame rather than writing a column into a filtered slice,
# which is what triggers pandas' SettingWithCopyWarning.
df_valid = df_valid.assign(age=df_valid['age'].astype(str))
plot = single_var_catplot(df_valid, 'age', model)

In [None]:
# Mean and median response per gender; used below to compute the gender gap.
gender_stats_diff = (
    df_valid
    .groupby('gender')['query_response']
    .agg(['mean', 'median'])
)
gender_stats_diff

In [None]:
# Difference in mean response between the 'Man' and 'Woman' groups (Man minus Woman).
mean_by_gender = gender_stats_diff['mean']
gender_mean_gap = mean_by_gender.loc['Man'] - mean_by_gender.loc['Woman']

print(f"Mean gender gap (male vs female): {gender_mean_gap}")

In [None]:
# Single-variable plot for `gender` (helper imported from plot_functions).
single_var_catplot(df_valid, 'gender', model)

In [None]:
# Single-variable plot for `education` (helper imported from plot_functions).
single_var_catplot(df_valid, 'education', model)

### Multi Variable Plots

In [None]:
# Draw one two-variable plot for every ordered pair of distinct variables.
categories = ['age', 'gender', 'education']
category_pairs = [
    (first, second)
    for first in categories
    for second in categories
    if first != second
]
for var1, var2 in category_pairs:
    multi_var_catplot(df_valid, var1, var2, model)

In [None]:
# Mean response for every education (rows) × age (columns) combination.
heatmap_data = df_valid.pivot_table(
    values='query_response',
    index='education',
    columns='age',
    aggfunc='mean',
)

# Annotated heatmap of those means, printed with one decimal place.
sns.heatmap(
    heatmap_data,
    annot=True,
    fmt='.1f',
    cmap='plasma',
    cbar_kws={'label': 'Query Response'},
)

## Tests

### Assumption Checks (for ANOVA and t-tests)

In [None]:
# Normality and variance-homogeneity checks with age as the grouping variable.
age_groups = df_valid['age']
check_normality_for_groups(df_valid, 'query_response', age_groups)
check_variance_homogeneity(df_valid, 'query_response', age_groups)

In [None]:
# Normality and variance-homogeneity checks with gender as the grouping variable.
gender_groups = df_valid['gender']
check_normality_for_groups(df_valid, 'query_response', gender_groups)
check_variance_homogeneity(df_valid, 'query_response', gender_groups)

In [None]:
# Normality and variance-homogeneity checks with education as the grouping variable.
education_groups = df_valid['education']
check_normality_for_groups(df_valid, 'query_response', education_groups)
check_variance_homogeneity(df_valid, 'query_response', education_groups)

### Kruskal–Wallis Test

In [None]:
# Kruskal–Wallis test of `query_response` across age groups (helper imported from stat_tests).
kruskal_wallis_test(df_valid, 'query_response', 'age')

In [None]:
# Kruskal–Wallis test of `query_response` across genders (helper imported from stat_tests).
kruskal_wallis_test(df_valid, 'query_response', 'gender')

In [None]:
# Kruskal–Wallis test of `query_response` across education levels (helper imported from stat_tests).
kruskal_wallis_test(df_valid, 'query_response', 'education')

### Dunn's Test

In [None]:
# Dunn's test on `query_response` grouped by age (helper imported from stat_tests).
dunns_test(df_valid, 'query_response', 'age')

In [None]:
# Detailed Dunn's test on `query_response` grouped by age.
# NOTE(review): 110 is presumably the total number of comparisons used for the
# multiple-testing correction — confirm where this figure comes from.
detailed_dunns_test(df_valid, 'query_response', 'age', total_comparisons=110)

In [None]:
# Detailed Dunn's test on `query_response` grouped by gender.
# NOTE(review): 110 is presumably the total number of comparisons used for the
# multiple-testing correction — confirm where this figure comes from.
detailed_dunns_test(df_valid, 'query_response', 'gender', total_comparisons=110)

In [None]:
# Detailed Dunn's test on `query_response` grouped by education.
# NOTE(review): 110 is presumably the total number of comparisons used for the
# multiple-testing correction — confirm where this figure comes from.
detailed_dunns_test(df_valid, 'query_response', 'education', total_comparisons=110)

In [None]:
# Run the detailed Dunn's test across education levels separately within each age group.
age_edu_dunns = []
for age_value in df_valid['age'].unique():
    # Filter the validated frame, not the raw `df`: `df` still contains refusal rows
    # and uncoerced responses, and its `age` column was never cast to str, so comparing
    # it with the string age values taken from `df_valid` can match nothing.
    df_age = df_valid[df_valid['age'] == age_value]
    curr_df = detailed_dunns_test(df_age, 'query_response', 'education', total_comparisons=110)
    curr_df['age'] = age_value
    # Collapse to one row per (age, education1, education2) combination.
    curr_df = curr_df.groupby(['age', 'education1', 'education2']).mean()
    age_edu_dunns.append(curr_df)
age_edu_dunns_results = pd.concat(age_edu_dunns)
# The groupby mean turns the rejection flags into numbers; cast them back to booleans.
age_edu_dunns_results['reject_p05'] = age_edu_dunns_results['reject_p05'].astype(bool)
age_edu_dunns_results['reject_p0005'] = age_edu_dunns_results['reject_p0005'].astype(bool)

age_edu_dunns_results

In [None]:
# Report the wall-clock time since `start_run` was recorded at the top of the notebook.
print("Elapsed time:", datetime.datetime.now() - start_run)