In [13]:
import dask.dataframe as dd
import pandas as pd
from random import choices
from scipy import stats 

In [2]:
# Location of the integrated 2020 dataset (plain string: nothing is interpolated here).
path = '../datasets/integrated_datas_2020.parquet.gzip'

# Lazy dask frame over the 2020 data; ignore_metadata_file=True tells dask not to
# rely on a global _metadata file when reading the parquet dataset.
df_2020 = dd.read_parquet(path, ignore_metadata_file=True)

In [6]:
def select_sample(year, uf, k=5000, seed=None):
    """Draw random samples of women and men for one year and state.

    Reads ``../datasets/integrated_datas_{year}.parquet.gzip`` with dask, keeps
    the rows where ``NU_IDADE == 2`` and ``SG_UF_RESIDENCIA == uf``, and draws
    ``k`` rows *with replacement* from each ``TP_SEXO`` group ('F' and 'M').

    Parameters
    ----------
    year : int or str
        Interpolated into the dataset file name.
    uf : str
        Value matched against the ``SG_UF_RESIDENCIA`` column.
    k : int, default 5000
        Number of rows drawn per group.
    seed : optional
        When given, the draw uses a private ``random.Random(seed)`` and is
        reproducible. When ``None`` (default) the module-level ``choices`` is
        used, exactly as before.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(women_sample, man_sample)``, each holding ``k`` rows ordered by
        their position in the filtered data (duplicates are possible).

    Raises
    ------
    IndexError
        If either group is empty, because ``choices`` cannot draw from an
        empty population.
    """
    from random import Random

    path = f'../datasets/integrated_datas_{year}.parquet.gzip'
    df = dd.read_parquet(path, ignore_metadata_file=True)

    # NOTE(review): NU_IDADE == 2 looks like an age-bracket code rather than an
    # age in years -- confirm against the dataset dictionary.
    # Materialise the age/state subset once. Calling len() on the dask frames and
    # then .compute() on them again re-read the parquet file four times per call.
    subset = df[(df['NU_IDADE'] == 2) & (df['SG_UF_RESIDENCIA'] == uf)].compute()
    women = subset[subset['TP_SEXO'] == 'F']
    man = subset[subset['TP_SEXO'] == 'M']

    # Keep honouring a global random.seed() when no explicit seed is passed.
    draw = choices if seed is None else Random(seed).choices

    # Sorted positional indices, sampled with replacement.
    ids_women = sorted(draw(range(len(women)), k=k))
    ids_man = sorted(draw(range(len(man)), k=k))

    women_sample = women.iloc[ids_women]
    man_sample = man.iloc[ids_man]

    return women_sample, man_sample

In [7]:
%%time

# Draw the women's and men's samples for year 2020 and uf='CE'.
women_ce, man_ce = select_sample(2020, 'CE')

Wall time: 38.7 s


In [10]:
# Shapiro-Wilk normality test on the NU_NOTA_MT column of the women's CE sample;
# the result is displayed as the cell output.
stats.shapiro(women_ce['NU_NOTA_MT'])

ShapiroResult(statistic=0.9512751698493958, pvalue=7.00159282167342e-38)

In [17]:
%%time

YEAR = 2020

# Distinct SG_UF_RESIDENCIA values in the 2020 data. A missing value would match
# no rows inside select_sample and abort the whole run, so it is dropped up front.
ufs = df_2020['SG_UF_RESIDENCIA'].unique().compute().dropna()

# Collect one record per UF and build the frame once at the end: DataFrame.append
# re-copied the frame on every iteration and no longer exists in pandas >= 2.0.
records = []

for uf in ufs:
    w_sample, m_sample = select_sample(YEAR, uf)

    # Normality check for each group, then an independent two-sample t-test
    # comparing the NU_NOTA_MT values of the two samples.
    shp_w = stats.shapiro(w_sample['NU_NOTA_MT'])
    shp_m = stats.shapiro(m_sample['NU_NOTA_MT'])
    ttest_ = stats.ttest_ind(w_sample['NU_NOTA_MT'], m_sample['NU_NOTA_MT'])

    records.append({'ano': YEAR, 'estado': uf, 'shapiro_w': shp_w, 'shapiro_m': shp_m, 'ttest': ttest_})

# The column names now match the record keys; the previous 'year'/'uf' header
# never matched 'ano'/'estado' and left two all-NaN columns in the result.
tests = pd.DataFrame(records, columns=['ano', 'estado', 'shapiro_w', 'shapiro_m', 'ttest'])
    

Wall time: 17min 53s


In [15]:
# Show the collected per-UF test results as the cell output.
tests

Unnamed: 0,ano,estado,shapiro_w,shapiro_m,ttest
0,2020,RS,"(0.988560140132904, 1.0299667831537581e-19)","(0.9950118660926819, 4.18360918824634e-12)","(-27.80409114599032, 5.993860345060439e-164)"
1,2020,PB,"(0.9512351155281067, 6.809578498300848e-38)","(0.9864070415496826, 1.5031625651870745e-21)","(-25.920427568365174, 1.9967592958354716e-143)"
2,2020,BA,"(0.9528319835662842, 2.0920552019523553e-37)","(0.9881572127342224, 4.480981811689521e-20)","(-29.84100287835722, 1.6236670118889494e-187)"
3,2020,AL,"(0.9494891166687012, 2.0630106568381105e-38)","(0.9827902317047119, 3.333153078016353e-24)","(-25.10293372043569, 6.563479714640053e-135)"
4,2020,PA,"(0.9518539905548096, 1.048392208056818e-37)","(0.9769991636276245, 1.0809822036222762e-27)","(-25.510189651057008, 4.018865099243244e-139)"
5,2020,TO,"(0.9497838616371155, 2.5179520123704126e-38)","(0.9790095090866089, 1.4486009958456902e-26)","(-24.242337087997345, 3.337690040534875e-126)"


In [16]:
# Persist the results to tests.csv in the working directory. With the default
# arguments the row index is written too, as an unnamed first column.
tests.to_csv('tests.csv')