Reference: [Stack Overflow — "How do I do a F-test in Python"](https://stackoverflow.com/questions/21494141/how-do-i-do-a-f-test-in-python), background on testing for equal variances in Python.

In [1]:
# import dependencies 
import pandas as pd
# from scipy.stats import ttest_ind
# from scipy.stats import bartlett
from scipy.stats import levene
from collections import Counter

## Tests

1. `ttest_ind`
Calculate the T-test for the means of two independent samples of scores.

This is a two-sided test for the null hypothesis that 2 independent samples have identical average (expected) values. This test assumes that the populations have identical variances by default.

2. `bartlett`
Perform Bartlett’s test for equal variances.

Bartlett’s test tests the null hypothesis that all input samples are from populations with equal variances. For samples from significantly non-normal populations, Levene’s test (`levene`) is more robust.

3. **`levene`**
Perform Levene test for equal variances.

The Levene test tests the null hypothesis that all input samples are from populations with equal variances. Levene’s test is an alternative to Bartlett’s test (`bartlett`) in the case where there are significant deviations from normality.

In [15]:
# Region labels, entered by hand to match the headquarters-region filter.
# Each label is interpolated into the aggregated CSV file names read below.
regions = [
    'US',
    'EU',
    'Europe',
    'China',
]

### Run Significance Tests + Export Results
Compare the gender difference (female vs. not-female company counts) within each region.

In [16]:
def test_diff(n):
    '''Compare female vs. not-female company counts within each region.

    For every entry of the module-level ``regions`` list, and for both
    aggregation levels (``industry_groups`` first, then ``industries``), the
    file ``../data/crunchbase-aggregated/{region}-{n}-{level}.csv`` is read.
    From its ``#female_co`` and ``#not_female_co`` columns this records:

    * the standard deviation of each column, and
    * the statistic and p-value of a Levene test (``center='mean'``) between
      the two columns, with NaN values dropped beforehand.

    Parameters
    ----------
    n : value interpolated into the file names and appended to the names of
        the result columns.

    Returns
    -------
    pandas.DataFrame
        One row per region: a ``region`` column followed by the four
        ``*_group_{n}`` columns and then the four ``*_{n}`` columns.
    '''
    # (aggregation level, infix used in the result column names);
    # the tuple order determines the order of the output columns
    level_infixes = (
        ('industry_groups', '_group'),
        ('industries', ''),
    )

    df_metrics = pd.DataFrame()
    df_metrics['region'] = regions

    for level, infix in level_infixes:
        # one (female std, not-female std, levene stat, levene p) tuple per region
        per_region = []

        for region in regions:
            counts = pd.read_csv(f'../data/crunchbase-aggregated/{region}-{n}-{level}.csv')
            female = counts['#female_co']
            not_female = counts['#not_female_co']

            # significance test on the non-missing values of both columns
            stat, p_value = levene(female.dropna(), not_female.dropna(), center='mean')

            per_region.append((female.std(), not_female.std(), stat, p_value))

        # add data to df, one column per recorded quantity
        df_metrics[f'#female_co_std{infix}_{n}'] = [row[0] for row in per_region]
        df_metrics[f'#not_female_co_std{infix}_{n}'] = [row[1] for row in per_region]
        df_metrics[f'levene_stat{infix}_{n}'] = [row[2] for row in per_region]
        df_metrics[f'levene_p{infix}_{n}'] = [row[3] for row in per_region]

    return df_metrics

In [17]:
def present_test_diff(df_metrics, axis=0):
    p_cols  = [col for col in df_metrics.columns if '_p' in col]
    df_p = df_metrics[['region']+p_cols]
    return df_p.style.background_gradient(cmap='Blues', axis=axis)

In [18]:
# Within-region Levene p-values computed from the `{region}-98-{level}.csv` files.
present_test_diff(test_diff(98))

Unnamed: 0,region,levene_p_group_98,levene_p_98
0,US,0.649598,0.555815
1,EU,0.625603,0.856606
2,Europe,0.417509,0.083795
3,China,0.067952,0.014168


In [19]:
# Within-region Levene p-values computed from the `{region}-32-{level}.csv` files.
present_test_diff(test_diff(32))

Unnamed: 0,region,levene_p_group_32,levene_p_32
0,US,0.848429,0.94504
1,EU,0.638911,0.741564
2,Europe,0.840758,0.214286
3,China,0.023067,0.108388


### Run Significance Tests + Export Results
Compare the industry difference across regions (one test per pair of regions).

In [82]:
def test_diff_region(n=''):
    '''compare between regions'''

    df_metrics = pd.DataFrame()
    regions = ['US', 'EU', 'Europe', 'China']

    by_industry_group = ['industry_groups', 'industries']

    for big in by_industry_group:
        
            region_pair_data = []

            # significance test
            levene_stat_female_data = []
            levene_p_female_data = []
            levene_stat_all_data = []
            levene_p_all_data = []

            for region1 in regions:
                for region2 in regions:
                    
                    if region1==region2:
                        pass
                    
                    else:
                        region_pair_data.append([region1, region2])

                        if isinstance(n, int):
                            df1 = pd.read_csv(f'../data/crunchbase-aggregated/{region1}-{n}-{big}.csv')
                            df2 = pd.read_csv(f'../data/crunchbase-aggregated/{region2}-{n}-{big}.csv')

                        else: #full data
                            df1 = pd.read_csv(f'../data/crunchbase-aggregated/{region1}-{big}_equal.csv')
                            df2 = pd.read_csv(f'../data/crunchbase-aggregated/{region2}-{big}_equal.csv')

                        # combined
                        df1['#all_co'] = df1['#female_co'] + df1['#not_female_co']
                        df2['#all_co'] = df2['#female_co'] + df2['#not_female_co']

                        # significance test
                        stat1, p1 = levene(df1['#female_co'].dropna(), df2['#female_co'].dropna())
                        levene_stat_female_data.append(stat1)
                        levene_p_female_data.append(p1)

                        stat2, p2 = levene(df1['#all_co'].dropna(), df2['#all_co'].dropna())
                        levene_stat_all_data.append(stat2)
                        levene_p_all_data.append(p2)


            # add data to df
            if (big=='industries'):
                df_metrics[f'levene_stat_f{n}'] = levene_stat_female_data
                df_metrics[f'levene_p_f{n}'] = levene_p_female_data
                df_metrics[f'levene_stat_all{n}'] = levene_stat_all_data
                df_metrics[f'levene_p_all{n}'] = levene_p_all_data

            elif (big=='industry_groups'):
                df_metrics[f'levene_stat_group_f{n}'] = levene_stat_female_data
                df_metrics[f'levene_p_group_f{n}'] = levene_p_female_data
                df_metrics[f'levene_stat_group_all{n}'] = levene_stat_all_data
                df_metrics[f'levene_p_group_all{n}'] = levene_p_all_data

    df_metrics['region'] = region_pair_data
    return df_metrics

In [83]:
# Between-region Levene p-values computed from the `{region}-98-{level}.csv` files.
present_test_diff(test_diff_region(98))

Unnamed: 0,region,levene_p_group_f98,levene_p_group_all98,levene_p_f98,levene_p_all98
0,"['US', 'EU']",0.735366,0.877441,0.631559,0.817958
1,"['US', 'Europe']",0.543414,0.992644,0.357452,0.821754
2,"['US', 'China']",0.93827,0.668532,0.697657,0.520438
3,"['EU', 'US']",0.735366,0.877441,0.631559,0.817958
4,"['EU', 'Europe']",0.812232,0.866564,0.628524,0.995705
5,"['EU', 'China']",0.787677,0.793488,0.949908,0.638621
6,"['Europe', 'US']",0.543414,0.992644,0.357452,0.821754
7,"['Europe', 'EU']",0.812232,0.866564,0.628524,0.995705
8,"['Europe', 'China']",0.58347,0.646357,0.590276,0.63767
9,"['China', 'US']",0.93827,0.668532,0.697657,0.520438


In [84]:
# Between-region Levene p-values computed from the `{region}-32-{level}.csv` files.
present_test_diff(test_diff_region(32))

Unnamed: 0,region,levene_p_group_f32,levene_p_group_all32,levene_p_f32,levene_p_all32
0,"['US', 'EU']",0.499817,0.448983,0.851834,0.931535
1,"['US', 'Europe']",0.444848,0.451643,0.109068,0.214902
2,"['US', 'China']",0.90531,0.163014,0.93873,0.434139
3,"['EU', 'US']",0.499817,0.448983,0.851834,0.931535
4,"['EU', 'Europe']",0.906503,0.974327,0.104703,0.156634
5,"['EU', 'China']",0.593851,0.454094,0.809041,0.311157
6,"['Europe', 'US']",0.444848,0.451643,0.109068,0.214902
7,"['Europe', 'EU']",0.906503,0.974327,0.104703,0.156634
8,"['Europe', 'China']",0.532174,0.500734,0.137592,0.656724
9,"['China', 'US']",0.90531,0.163014,0.93873,0.434139


In [85]:
# Between-region Levene p-values on the full data: with no integer `n`,
# the `{region}-{level}_equal.csv` files are read.
present_test_diff(test_diff_region())

Unnamed: 0,region,levene_p_group_f,levene_p_group_all,levene_p_f,levene_p_all
0,"['US', 'EU']",0.139609,0.198675,0.096929,0.146754
1,"['US', 'Europe']",0.17274,0.359467,0.064217,0.188595
2,"['US', 'China']",0.022264,0.019831,0.011741,0.01373
3,"['EU', 'US']",0.139609,0.198675,0.096929,0.146754
4,"['EU', 'Europe']",0.910166,0.727516,0.878763,0.883936
5,"['EU', 'China']",0.32816,0.183714,0.16643,0.09838
6,"['Europe', 'US']",0.17274,0.359467,0.064217,0.188595
7,"['Europe', 'EU']",0.910166,0.727516,0.878763,0.883936
8,"['Europe', 'China']",0.285331,0.110024,0.206585,0.10803
9,"['China', 'US']",0.022264,0.019831,0.011741,0.01373


### Top Industries

In [10]:
# Number of top-ranked industries kept per gender and region.
n = 20

# One row per region; each cell added below holds a list of industry names.
df_industries = pd.DataFrame()
df_industries['region'] = regions

by_industry_group = ['industry_groups', 'industries']
by_gender_equal = ['_equal', '']

for big in by_industry_group:

    for bge in by_gender_equal:

        # Column-name suffix for this combination, e.g. '_group_equal' for
        # industry groups read from the `_equal` files, '' for plain industries.
        suffix = ('_group' if big == 'industry_groups' else '') + bge

        # initiate containers
        # top_n
        top_n_female_data = []
        top_n_not_female_data = []
        # diff
        female_diff_data = []  # in the female top n but not the not-female top n
        male_diff_data = []  # in the not-female top n but not the female top n

        for region in regions:

            df = pd.read_csv(f'../data/crunchbase-aggregated/{region}-{big}{bge}.csv')

            # get data
            top_n_female = list(df.sort_values(by=['#female_co'], ascending=False)[:n]['industry'])
            top_n_not_female = list(df.sort_values(by=['#not_female_co'], ascending=False)[:n]['industry'])

            # Differences keep the ranking order of the source list and drop
            # duplicates. A plain set difference would give an element order
            # that can change between kernel sessions (string hash
            # randomization), making the displayed lists non-reproducible.
            female_lookup = set(top_n_female)
            not_female_lookup = set(top_n_not_female)
            female_diff = list(dict.fromkeys(
                industry for industry in top_n_female if industry not in not_female_lookup
            ))
            male_diff = list(dict.fromkeys(
                industry for industry in top_n_not_female if industry not in female_lookup
            ))

            # record data
            top_n_female_data.append(top_n_female)
            top_n_not_female_data.append(top_n_not_female)
            female_diff_data.append(female_diff)
            male_diff_data.append(male_diff)

        # add data to df
        df_industries[f'top_n_female{suffix}'] = top_n_female_data
        df_industries[f'top_n_not_female{suffix}'] = top_n_not_female_data
        df_industries[f'female_diff{suffix}'] = female_diff_data
        df_industries[f'male_diff{suffix}'] = male_diff_data

In [13]:
# Flip the table so that regions become columns and each metric becomes a row.
df_industries_T = df_industries.set_index('region').T
df_industries_T

region,US,EU,Europe,China
top_n_female_group_equal,"[Software, Health Care, Science and Engineerin...","[Software, Internet Services, Science and Engi...","[Software, Internet Services, Health Care, Inf...","[Software, Hardware, Media and Entertainment, ..."
top_n_not_female_group_equal,"[Software, Information Technology, Internet Se...","[Software, Internet Services, Information Tech...","[Software, Internet Services, Science and Engi...","[Software, Other, Media and Entertainment, Com..."
female_diff_group_equal,"[Design, Community and Lifestyle, Real Estate]","[Community and Lifestyle, Biotechnology, Adver...","[Design, Advertising, Real Estate]","[Consumer Goods, Food and Beverage, Artificial..."
male_diff_group_equal,"[Consumer Electronics, Privacy and Security, T...","[Consumer Electronics, Transportation, Manufac...","[Consumer Electronics, Community and Lifestyle...","[Financial Services, Manufacturing, Content an..."
top_n_female_group,"[Software, Health Care, Science and Engineerin...","[Software, Internet Services, Commerce and Sho...","[Software, Internet Services, Information Tech...","[Software, Hardware, Media and Entertainment, ..."
top_n_not_female_group,"[Software, Information Technology, Internet Se...","[Software, Internet Services, Information Tech...","[Software, Information Technology, Internet Se...","[Software, Internet Services, Information Tech..."
female_diff_group,"[Design, Community and Lifestyle, Real Estate]","[Community and Lifestyle, Design, Advertising,...","[Design, Payments, Real Estate]","[Consumer Goods, Food and Beverage, Artificial..."
male_diff_group,"[Consumer Electronics, Privacy and Security, T...","[Consumer Electronics, Transportation, Profess...","[Consumer Electronics, Transportation, Profess...","[Education, Financial Services, Manufacturing,..."
top_n_female_equal,"[Software, Health Care, Biotechnology, SaaS, I...","[Software, Health Care, E-Commerce, Internet, ...","[Software, Health Care, E-Commerce, SaaS, Info...","[Artificial Intelligence, E-Commerce, Informat..."
top_n_not_female_equal,"[Software, Information Technology, SaaS, Healt...","[Software, Information Technology, Internet, S...","[Software, Information Technology, SaaS, Finan...","[E-Commerce, Consumer, Internet, Information T..."


In [16]:
# df_industries_T.to_csv(f'../data/crunchbase-aggregated/region_top_industries.csv')