In [141]:
import pandas as pd
import numpy as np
import matplotlib as plt
from scipy.stats import pearsonr
import numpy as np

# Paths to the input CSV files, relative to the notebook's working directory.
csv_fem_lab = "../Resources/labor_force_female.csv"
csv_male_lab = "../Resources/labor_male.csv"
csv_total_lab = "../Resources/labor_total.csv"
csv_lab_perc_fem = "../Resources/labor_perc_fem.csv"

# Labor force participation rate (% of population ages 15-64) among women,
# among men, and in total -- read in that order.
fem_lab, male_lab, total_lab = (
    pd.read_csv(path) for path in (csv_fem_lab, csv_male_lab, csv_total_lab)
)

# % of the labor force that are women (reflects any skew in gender
# proportions in a given country).
lab_perc_fem = pd.read_csv(csv_lab_perc_fem)

# Display the column labels of the freshly loaded female table.
fem_lab.columns

Index(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code',
       '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968',
       '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986',
       '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995',
       '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017'],
      dtype='object')

In [142]:
# Three-letter codes treated as "country-level" rows. clean_df() keeps only the
# rows whose 'Country Code' value appears in this list, which is how regions
# and World Bank income/aggregate categories are meant to be excluded.
# NOTE(review): 'WLD' is in the list and looks like the World aggregate rather
# than a country -- confirm whether it should be filtered out.
# NOTE(review): several entries look like legacy/alternate codes (e.g. 'ZAR',
# 'ROM', 'TMP', 'YUG'); presumably any country whose code in the CSVs differs
# from the spelling here is silently dropped -- verify against the files.
country_list = ['AFG', 'ALB', 'DZA', 'ASM', 'AND', 'AGO', 'AIA', 'ATG', 'ARG', 'ARM', 'ABW', 'AUS', 'AUT', 'AZE', 'BHS', 'BHR', 'BGD', 'BRB', 'BLR', 'BEL', 'BLX', 'BLZ', 'BEN', 'BMU', 'BTN', 'BOL', 'BIH', 'BWA', 'BAT', 'BRA', 'IOT', 'VGB', 'BRN', 'BGR', 'BFA', 'BDI', 'KHM', 'CMR', 'CAN', 'CPV', 'CYM', 'CAF', 'TCD', 'CHL', 'CHN', 'CXR', 'CCK', 'COL', 'COM', 'ZAR', 'COG', 'COK', 'CRI', 'CIV', 'HRV', 'CUB', 'CYP', 'CZE', 'CSK', 'DNK', 'DJI', 'DMA', 'DOM', 'TMP', 'ECU', 'EGY', 'SLV', 'GNQ', 'ERI', 'EST', 'ETH', 'ETF', 'EUN', 'FRO', 'FLK', 'FJI', 'FIN', 'PCZ', 'ZW1', 'TAN', 'VDR', 'SVR', 'ZPM', 'ATF', 'FRA', 'FRE', 'GUF', 'PYF', 'GAB', 'GMB', 'GAZ', 'GEO', 'DDR', 'DEU', 'GHA', 'GIB', 'GRC', 'GRL', 'GRD', 'GLP', 'GUM', 'GTM', 'GIN', 'GNB', 'GUY', 'HTI', 'VAT', 'HND', 'HKG', 'HUN', 'ISL', 'IND', 'IDN', 'IRN', 'IRQ', 'IRL', 'ISR', 'ITA', 'JAM', 'JPN', 'JTN', 'JOR', 'KAZ', 'KEN', 'KIR', 'PRK', 'KOR', 'KWT', 'KGZ', 'LAO', 'LVA', 'LBN', 'LSO', 'LBR', 'LBY', 'LIE', 'LTU', 'LUX', 'MAC', 'MKD', 'MDG', 'MWI', 'MYS', 'MDV', 'MLI', 'MLT', 'MHL', 'MTQ', 'MRT', 'MUS', 'MEX', 'FSM', 'MID', 'MDA', 'MCO', 'MNG', 'MSR', 'MAR', 'MOZ', 'MMR', 'NAM', 'NRU', 'NPL', 'NLD', 'ANT', 'NZE', 'NCL', 'NZL', 'NIC', 'NER', 'NGA', 'NIU', 'NFK', 'MNP', 'NOR', 'OMN', 'PCE', 'PAK', 'PLW', 'PAN', 'PNG', 'PRY', 'PMY', 'PER', 'PHL', 'PCN', 'POL', 'PRT', 'PRI', 'QAT', 'REU', 'ROM', 'RUS', 'RWA', 'RYU', 'SBH', 'SHN', 'KN1', 'SPM', 'WSM', 'SMR', 'STP', 'SWK', 'SAU', 'SEN', 'SYC', 'SLE', 'SIK', 'SGP', 'SVK', 'SVN', 'SLB', 'SOM', 'ZAF', 'SVU', 'ESP', 'SPE', 'LKA', 'KNA', 'LCA', 'VCT', 'SDN', 'SUR', 'SJM', 'SWZ', 'SWE', 'CHE', 'SYR', 'TWN', 'TJK', 'TZA', 'THA', 'TGO', 'TKL', 'TON', 'TTO', 'TUN', 'TUR', 'TKM', 'TCA', 'TUV', 'UGA', 'UKR', 'ARE', 'GBR', 'USA', 'UNS', 'URY', 'USP', 'UZB', 'VUT', 'VEN', 'VNM', 'VIR', 'WAK', 'WLF', 'ESH', 'WLD', 'YDR', 'YEM', 'SER', 'YUG', 'ZMB', 'ZWE']

def clean_df(df, countries=None):
    """Return a country-level, 1990-onwards view of an indicator table.

    The input frame is left untouched; the previous implementation replaced
    the caller's index in place as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns 'Country Code', 'Indicator Code' and
        one column per year labelled '1960' ... '1989'.
    countries : iterable of str, optional
        Country codes to keep. Defaults to the notebook-level ``country_list``.

    Returns
    -------
    pandas.DataFrame
        Rows whose 'Country Code' is in ``countries``, indexed by
        'Country Code', without the 'Indicator Code', 'Country Code' and
        1960-1989 columns. All remaining columns keep their original order.

    Raises
    ------
    KeyError
        If 'Country Code' or any of the columns to drop is missing.
    """
    if countries is None:
        countries = country_list

    # Year columns before 1990 are discarded.
    pre_1990 = [str(year) for year in range(1960, 1990)]

    # Boolean indexing yields a new frame, so the caller's data is never modified.
    country_rows = df[df['Country Code'].isin(countries)]

    # set_index moves 'Country Code' out of the columns and into the index.
    return (
        country_rows
        .set_index('Country Code')
        .drop(columns=['Indicator Code'] + pre_1990)
    )
#     df = df.dropna(how='any', thresh=3)

# Run every loaded table through clean_df and rebind the same names to the
# cleaned results.
fem_lab, male_lab, total_lab, lab_perc_fem = map(
    clean_df, (fem_lab, male_lab, total_lab, lab_perc_fem)
)

In [143]:
# Column-by-column minimum of the cleaned female participation table.
fem_lab.min(axis=0)

Country Name                                            Afghanistan
Indicator Name    Labor force participation rate, female (% of f...
1990                                                          8.116
1991                                                          8.182
1992                                                          8.252
1993                                                          8.325
1994                                                          8.398
1995                                                          8.472
1996                                                          8.558
1997                                                          8.644
1998                                                          9.229
1999                                                           9.85
2000                                                         10.507
2001                                                         11.201
2002                                            

In [144]:
# Female-to-male ratio of the participation rate for every year 1990-2017.
# Both tables are indexed by country code, so the division is aligned per
# country and per year.
ratio_years = [str(year) for year in range(1990, 2018)]
lab_ratio = fem_lab[ratio_years].div(male_lab[ratio_years])
# lab_ratio_dif = (fem_lab['2017'] - fem_lab['1990']) / fem_lab['1990']
lab_ratio.head()

Unnamed: 0_level_0,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
Country Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ABW,,,,,,,,,,,...,,,,,,,,,,
AFG,0.17058,0.171387,0.172137,0.172345,0.172433,0.173226,0.171362,0.169687,0.168608,0.168045,...,0.171435,0.169559,0.169958,0.176328,0.184791,0.194735,0.205303,0.215845,0.223935,0.224571
AGO,0.921183,0.919662,0.918335,0.914296,0.915948,0.918874,0.919928,0.921029,0.92241,0.923605,...,0.935467,0.935406,0.93575,0.935299,0.935699,0.936869,0.938037,0.939203,0.939573,0.940468
ALB,0.720141,0.745257,0.749689,0.740724,0.734181,0.72614,0.718269,0.722567,0.71531,0.706948,...,0.72284,0.699076,0.72547,0.731304,0.738176,0.712735,0.701121,0.731665,0.729236,0.727965
AND,,,,,,,,,,,...,,,,,,,,,,


In [149]:
# Write the ratio table to a CSV file in the current working directory.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError for
# this relative path; presumably the kernel's working directory was missing or
# had been moved -- confirm before relying on the saved file.
lab_ratio.to_csv(path_or_buf='lab_ratio.csv')
# lab_ratio_dif.to_csv('lab_ratio_dif.csv')

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/anaconda3/envs/gwarl/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-149-243080ea3544>", line 1, in <module>
    lab_ratio.to_csv('lab_ratio.csv')
  File "/anaconda3/envs/gwarl/lib/python3.6/site-packages/pandas/core/frame.py", line 1745, in to_csv
    formatter.save()
  File "/anaconda3/envs/gwarl/lib/python3.6/site-packages/pandas/io/formats/csvs.py", line 136, in save
    compression=None)
  File "/anaconda3/envs/gwarl/lib/python3.6/site-packages/pandas/io/common.py", line 400, in _get_handle
    f = open(path_or_buf, mode, encoding=encoding)
FileNotFoundError: [Errno 2] No such file or directory: 'lab_ratio.csv'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/anaconda3/envs/gwarl/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 1863, in showt

FileNotFoundError: [Errno 2] No such file or directory: 'lab_ratio.csv'

In [138]:
# Year-by-year Pearson correlation, with its p-value, between the male and
# female participation rates across countries.

# Keep only the year columns (1990-2017); every other column is a label.
year_cols = [str(year) for year in range(1990, 2018)]
df1 = male_lab[year_cols]
df2 = fem_lab[year_cols]

# One row per year: column 0 holds the coefficient, column 1 the p-value.
# Rows stay NaN for years where the correlation cannot be computed.
coeffmat = np.full((len(year_cols), 2), np.nan)

for i, year in enumerate(year_cols):
    # pearsonr pairs its inputs by position and returns NaN as soon as either
    # input contains NaN. Pair the two series by country code instead and keep
    # only the countries that have a value in both tables.
    pair = pd.concat([df1[year], df2[year]], axis=1, keys=['male', 'female']).dropna()
    if len(pair) < 2:
        continue  # fewer than two complete pairs: correlation is undefined
    coeffmat[i, 0], coeffmat[i, 1] = pearsonr(pair['male'], pair['female'])

# The column labels must be an ordered list: a set has no defined order, so
# 'coeff' and 'pval' could end up attached to the wrong data column.
dfcorrelation = pd.DataFrame(coeffmat, columns=['coeff', 'pval'], index=df1.columns)

In [100]:
#To correlate all columns with each other, not just the corresponding columns of each dataframe:

# coeffmat = np.zeros((df1.shape[1], df2.shape[1]))
# pvalmat = np.zeros((df1.shape[1], df2.shape[1]))

# for i in range(df1.shape[1]):    
#     for j in range(df2.shape[1]):        
#         corrtest = pearsonr(df1[df1.columns[i]], df2[df2.columns[j]])  

#         coeffmat[i,j] = corrtest[0]
#         pvalmat[i,j] = corrtest[1]

# dfcoeff = pd.DataFrame(coeffmat, columns=df2.columns, index=df1.columns)
# print(dfcoeff)

# dfpvals = pd.DataFrame(pvalmat, columns=df2.columns, index=df1.columns)
# print(dfpvals)

In [139]:
# Per-year summary statistics (taken across countries) and a rough predicted
# range for female participation, added as columns to dfcorrelation.

# dfcorrelation is indexed by the year labels; restricting the statistics to
# those columns keeps the text label columns out of mean/min/max.
stat_years = dfcorrelation.index

male_lab_mean = male_lab[stat_years].mean()
male_lab_min = male_lab[stat_years].min()
male_lab_max = male_lab[stat_years].max()
# The total_lab_* statistics are used below but were never defined in this
# cell, so it raised NameError on a fresh kernel.
total_lab_mean = total_lab[stat_years].mean()
total_lab_min = total_lab[stat_years].min()
total_lab_max = total_lab[stat_years].max()
fem_lab_mean = fem_lab[stat_years].mean()
fem_lab_min = fem_lab[stat_years].min()
fem_lab_max = fem_lab[stat_years].max()

dfcorrelation['mean total labor participation'] = total_lab_mean
dfcorrelation['min labor participation'] = total_lab_min
dfcorrelation['max labor participation'] = total_lab_max
dfcorrelation['mean female labor participation'] = fem_lab_mean
dfcorrelation['min female labor participation'] = fem_lab_min
dfcorrelation['max female labor participation'] = fem_lab_max

# Predicted female extremes: shift the female mean by the distance between the
# total mean and the total min/max, scaled by the correlation coefficient.
# NOTE(review): a Pearson coefficient is not a regression slope (the slope
# would be r * std_y / std_x) -- confirm this scaling is what is intended.
dfcorrelation['predicted min female labor participation'] = (
    dfcorrelation['mean female labor participation']
    - (dfcorrelation['mean total labor participation']
       - dfcorrelation['min labor participation']) * dfcorrelation['coeff']
)
dfcorrelation['predicted max female labor participation'] = (
    dfcorrelation['mean female labor participation']
    + (dfcorrelation['max labor participation']
       - dfcorrelation['mean total labor participation']) * dfcorrelation['coeff']
)

# Final column order for display and export.
column_order = [
    'min labor participation',
    'mean total labor participation',
    'max labor participation',
    'min female labor participation',
    'predicted min female labor participation',
    'mean female labor participation',
    'max female labor participation',
    'predicted max female labor participation',
    'coeff',
    'pval',
]
dfcorrelation = dfcorrelation.reindex(column_order, axis=1)

In [140]:
# Export the per-year correlation summary to CSV in the working directory,
# then display the table as the cell's output.
dfcorrelation.to_csv(path_or_buf='dfcorrelation.csv')
dfcorrelation

Unnamed: 0,min labor participation,mean total labor participation,max labor participation,min female labor participation,predicted min female labor participation,mean female labor participation,max female labor participation,predicted max female labor participation,coeff,pval
1990,35.406,66.471932,91.542,8.116,40.0798,48.221812,90.784,54.7924,0.262088,0.000379
1991,35.914,66.557536,91.109,8.182,40.3265,48.411794,90.345,54.8897,0.263851,0.000345
1992,36.266,66.612174,90.59,8.252,40.3449,48.551201,89.845,55.0354,0.270423,0.000241
1993,36.254,66.631768,89.987,8.325,40.4544,48.618906,89.283,54.896,0.268767,0.000264
1994,36.607,66.70035,89.335,8.398,40.8038,48.821452,88.665,54.8519,0.266427,0.0003
1995,36.249,66.712914,88.615,8.472,40.5628,48.925758,87.993,54.9383,0.27452,0.000192
1996,36.07,66.733239,88.55,8.558,40.551,49.083246,87.262,55.1539,0.278255,0.000155
1997,36.293,66.739258,88.489,8.644,40.6429,49.255479,86.925,55.408,0.282879,0.000119
1998,36.352,66.808593,88.693,9.229,40.5827,49.447344,87.223,55.817,0.291058,7.4e-05
1999,36.347,66.849743,88.912,9.85,40.7586,49.665088,87.448,56.107,0.291989,7e-05
