In [86]:
import pandas as pd
import numpy as np
import matplotlib as plt
from scipy.stats import pearsonr
import numpy as np

csv_fem_lab = "../Resources/labor_force_female.csv"
csv_male_lab = "../Resources/labor_male.csv"
csv_total_lab = "../Resources/labor_total.csv"
csv_lab_perc_fem = "../Resources/labor_perc_fem.csv"


# Labor force participation rate(% of population ages 15-64); among women, men, and total
fem_lab = pd.read_csv(csv_fem_lab)
male_lab = pd.read_csv(csv_male_lab)
total_lab = pd.read_csv(csv_total_lab)

# % of labor force that are women (will reflect any skew in gender proportions in a given country)
lab_perc_fem = pd.read_csv(csv_lab_perc_fem)
fem_lab.columns

Index(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code',
       '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968',
       '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986',
       '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995',
       '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017'],
      dtype='object')

In [87]:
# Function to clean up data sets and use only country-level data, no regions or WB categories
country_list = ['AFG', 'ALB', 'DZA', 'ASM', 'AND', 'AGO', 'AIA', 'ATG', 'ARG', 'ARM', 'ABW', 'AUS', 'AUT', 'AZE', 'BHS', 'BHR', 'BGD', 'BRB', 'BLR', 'BEL', 'BLX', 'BLZ', 'BEN', 'BMU', 'BTN', 'BOL', 'BIH', 'BWA', 'BAT', 'BRA', 'IOT', 'VGB', 'BRN', 'BGR', 'BFA', 'BDI', 'KHM', 'CMR', 'CAN', 'CPV', 'CYM', 'CAF', 'TCD', 'CHL', 'CHN', 'CXR', 'CCK', 'COL', 'COM', 'ZAR', 'COG', 'COK', 'CRI', 'CIV', 'HRV', 'CUB', 'CYP', 'CZE', 'CSK', 'DNK', 'DJI', 'DMA', 'DOM', 'TMP', 'ECU', 'EGY', 'SLV', 'GNQ', 'ERI', 'EST', 'ETH', 'ETF', 'EUN', 'FRO', 'FLK', 'FJI', 'FIN', 'PCZ', 'ZW1', 'TAN', 'VDR', 'SVR', 'ZPM', 'ATF', 'FRA', 'FRE', 'GUF', 'PYF', 'GAB', 'GMB', 'GAZ', 'GEO', 'DDR', 'DEU', 'GHA', 'GIB', 'GRC', 'GRL', 'GRD', 'GLP', 'GUM', 'GTM', 'GIN', 'GNB', 'GUY', 'HTI', 'VAT', 'HND', 'HKG', 'HUN', 'ISL', 'IND', 'IDN', 'IRN', 'IRQ', 'IRL', 'ISR', 'ITA', 'JAM', 'JPN', 'JTN', 'JOR', 'KAZ', 'KEN', 'KIR', 'PRK', 'KOR', 'KWT', 'KGZ', 'LAO', 'LVA', 'LBN', 'LSO', 'LBR', 'LBY', 'LIE', 'LTU', 'LUX', 'MAC', 'MKD', 'MDG', 'MWI', 'MYS', 'MDV', 'MLI', 'MLT', 'MHL', 'MTQ', 'MRT', 'MUS', 'MEX', 'FSM', 'MID', 'MDA', 'MCO', 'MNG', 'MSR', 'MAR', 'MOZ', 'MMR', 'NAM', 'NRU', 'NPL', 'NLD', 'ANT', 'NZE', 'NCL', 'NZL', 'NIC', 'NER', 'NGA', 'NIU', 'NFK', 'MNP', 'NOR', 'OMN', 'PCE', 'PAK', 'PLW', 'PAN', 'PNG', 'PRY', 'PMY', 'PER', 'PHL', 'PCN', 'POL', 'PRT', 'PRI', 'QAT', 'REU', 'ROM', 'RUS', 'RWA', 'RYU', 'SBH', 'SHN', 'KN1', 'SPM', 'WSM', 'SMR', 'STP', 'SWK', 'SAU', 'SEN', 'SYC', 'SLE', 'SIK', 'SGP', 'SVK', 'SVN', 'SLB', 'SOM', 'ZAF', 'SVU', 'ESP', 'SPE', 'LKA', 'KNA', 'LCA', 'VCT', 'SDN', 'SUR', 'SJM', 'SWZ', 'SWE', 'CHE', 'SYR', 'TWN', 'TJK', 'TZA', 'THA', 'TGO', 'TKL', 'TON', 'TTO', 'TUN', 'TUR', 'TKM', 'TCA', 'TUV', 'UGA', 'UKR', 'ARE', 'GBR', 'USA', 'UNS', 'URY', 'USP', 'UZB', 'VUT', 'VEN', 'VNM', 'VIR', 'WAK', 'WLF', 'ESH', 'WLD', 'YDR', 'YEM', 'SER', 'YUG', 'ZMB', 'ZWE']

def clean_df(df):
    df.index = df['Country Code']
    df = df[df['Country Code'].isin(country_list)]
    df = df.drop(columns=['Indicator Code', 'Country Code',
       '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968',
       '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986',
       '1987', '1988', '1989'])
    df = df.dropna(how='any', thresh=3)
    return df

fem_lab = clean_df(fem_lab)
male_lab = clean_df(male_lab)
total_lab = clean_df(total_lab)
lab_perc_fem = clean_df(lab_perc_fem)

In [134]:
fem_lab.min()

Country Name                                            Afghanistan
Indicator Name    Labor force participation rate, female (% of f...
1990                                                          8.116
1991                                                          8.182
1992                                                          8.252
1993                                                          8.325
1994                                                          8.398
1995                                                          8.472
1996                                                          8.558
1997                                                          8.644
1998                                                          9.229
1999                                                           9.85
2000                                                         10.507
2001                                                         11.201
2002                                            

In [89]:
lab_ratio = fem_lab[['1990', '1991', '1992', '1993', '1994', '1995',
       '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017']] / male_lab[['1990', '1991', '1992', '1993', '1994', '1995',
       '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017']]
lab_ratio_dif = (fem_lab['2017'] - fem_lab['1990']) / fem_lab['1990']

In [90]:
lab_ratio.to_csv('lab_ratio.csv')
lab_ratio_dif.to_csv('lab_ratio_dif.csv')

In [106]:
# generating correlation and test of significance between two dataframes

# dropping all columns aside from years
df1 = total_lab[['1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998',
       '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007',
       '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016',
       '2017']]
df2 = lab_perc_fem[['1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998',
       '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007',
       '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016',
       '2017']]

coeffmat = np.zeros((df1.shape[1], 2))

for i in range(df1.shape[1]):
    corrtest = pearsonr(df1[df1.columns[i]], df2[df2.columns[i]])  
    coeffmat[i,0] = corrtest[0]
    coeffmat[i,1] = corrtest[1]

dfcorrelation = pd.DataFrame(coeffmat, columns={'coeff', 'pval'}, index=df1.columns)
# dfpvals = pd.DataFrame(pvalmat, columns=df2.columns, index=df1.columns)

In [100]:
#To correlate all columns with each other, not just the corresponding columns of each dataframe:
# coeffmat = np.zeros((df1.shape[1], df2.shape[1]))
# pvalmat = np.zeros((df1.shape[1], df2.shape[1]))

# for i in range(df1.shape[1]):    
#     for j in range(df2.shape[1]):        
#         corrtest = pearsonr(df1[df1.columns[i]], df2[df2.columns[j]])  

#         coeffmat[i,j] = corrtest[0]
#         pvalmat[i,j] = corrtest[1]

# dfcoeff = pd.DataFrame(coeffmat, columns=df2.columns, index=df1.columns)
# print(dfcoeff)

# dfpvals = pd.DataFrame(pvalmat, columns=df2.columns, index=df1.columns)
# print(dfpvals)

In [126]:
total_lab_mean = total_lab.mean()
total_lab_min = total_lab.min()
total_lab_max = total_lab.max()
lab_perc_fem_mean = lab_perc_fem.mean()

dfcorrelation['mean total labor participation'] = total_lab_mean
dfcorrelation['min labor participation'] = total_lab_min
dfcorrelation['max labor participation'] = total_lab_max
dfcorrelation['mean % labor force female'] = lab_perc_fem_mean
dfcorrelation['min % labor force female'] = dfcorrelation['mean % labor force female'] - ((dfcorrelation['mean total labor participation'] 
                                             - dfcorrelation['min labor participation']) * dfcorrelation['coeff'])
dfcorrelation['max % labor force female'] = dfcorrelation['mean % labor force female'] + ((dfcorrelation['max labor participation']
                                            - dfcorrelation['mean total labor participation']) * dfcorrelation['coeff'])
dfcorrelation = dfcorrelation.reindex(['min labor participation', 'mean total labor participation', 
                                       'max labor participation', 'min % labor force female', 'mean % labor force female', 
                                       'max % labor force female', 'coeff', 'pval'], axis=1)

In [127]:
dfcorrelation.to_csv('dfcorrelation.csv')
dfcorrelation

Unnamed: 0,min labor participation,mean total labor participation,max labor participation,min % labor force female,mean % labor force female,max % labor force female,coeff,pval
1990,35.406,66.471932,91.542,16.9843,38.231723,55.3783,0.683947,3.6327689999999995e-26
1991,35.914,66.557536,91.109,17.4143,38.381283,55.18,0.684224,3.408637e-26
1992,36.266,66.612174,90.59,17.7258,38.510794,54.9339,0.68493,2.89607e-26
1993,36.254,66.631768,89.987,17.9079,38.603669,54.5151,0.681281,6.68891e-26
1994,36.607,66.70035,89.335,18.5179,38.783534,54.0263,0.673427,3.892272e-25
1995,36.249,66.712914,88.615,18.5106,38.891609,53.5446,0.669023,1.020752e-24
1996,36.07,66.733239,88.55,18.4173,39.054441,53.7377,0.673025,4.253303e-25
1997,36.293,66.739258,88.489,18.7542,39.228579,53.8547,0.672475,4.80128e-25
1998,36.352,66.808593,88.693,18.9338,39.395047,54.0973,0.671815,5.549732e-25
1999,36.347,66.849743,88.912,19.2197,39.597421,54.3364,0.668063,1.256565e-24
