# Monthly wages for females and males
In LOCAL CURRENCY UNITS

In [1]:
import pandas as pd
import numpy as np
import os
import glob
import seaborn as sns
import matplotlib.pyplot as plt
import json
import csv
from collections import defaultdict
import functools
import operator

In [2]:
# Read csv files from No Ceilings project folder
path = "/Users/ericaxia/Downloads/Github/project-girlboss/data/noceilings-data-master/csv"
extension = 'csv'
os.chdir(path)
result = glob.glob('*.{}'.format(extension))
# Narrow down to just the files I want
files = ['MONWAGFE.csv', 'MONWAGMA.csv']
dfs = []
for f in files:
    df = pd.read_csv(f)
    dfs.append((f, df))    

## Let's narrow down to a chosen year

In [3]:
os.chdir("/Users/ericaxia/Downloads/Github/dsci554/554_Project_Code/Project/d3layout_data")

In [4]:
# import US specific data (median dollar earnings per month)
us_data = pd.read_csv("median_US_earnings.csv")
us_data.head()  

Unnamed: 0,year,female,male
0,1995,3133,4386
1,1996,3216,4360
2,1997,3316,4471
3,1998,3388,4630
4,1999,3376,4668


In [5]:
## NEW: map ISO -> Country name for better understandability in graph
with open('iso_to_country_names.csv', mode='r') as infile:
    reader = csv.reader(infile)
    country_names = {rows[0]: rows[1] for rows in reader}
    # print(country_names)

country_names_dict = { v: k for k, v in country_names.items()}
# country_names_dict


In [6]:
def convert_iso_to_name(iso):
    # print(x)
    if iso in country_names_dict:
        return country_names_dict[iso]
    else:
        return iso

In [7]:
wages_fem = dfs[0][1]  # monthly wages for females
wages_m = dfs[1][1] # monthly wages for males


In [8]:
wages_fem = wages_fem.iloc[0:85, :]  # rm bottom descrip rows
wages_m = wages_m.iloc[0:85, :]  # rm bottom descrip rows

In [9]:
## Missing values by year
# wages_fem.isna().sum().sort_values()[0:5]

In [10]:
# narrow down to countries that DO have data for 2011
wages_fem = wages_fem[~wages_fem['2011'].isna()]
wages_m = wages_m[~wages_m['2011'].isna()]
print(wages_fem.shape, wages_m.shape) # (26,18) -> we have 26 countries

(26, 18) (26, 18)


In [11]:
wages_fem = wages_fem.fillna(axis=0, method='pad')
wages_m = wages_m.fillna(axis=0, method='pad')

In [12]:
print(wages_fem.isna().sum().sum())
print(wages_m.isna().sum().sum())

0
0


In [47]:
# process us data separartely
us_data2 = us_data.transpose()
us_data2.columns = us_data2.iloc[0,:]
us_data2['ISO'] = 'USA'
wages_fem_us = pd.DataFrame(us_data2.iloc[1,:])
wages_m_us = pd.DataFrame(us_data2.iloc[2,:])


In [51]:
wages_m_us.rename(columns={'male': 'USA'}, inplace=True)
wages_fem_us.rename(columns={'female': 'USA'}, inplace=True)


In [52]:
wages_fem_us['gender'] = 'female'
wages_fem_us.reset_index(inplace=True)

In [53]:
wages_m_us['gender'] = 'male'
wages_m_us.reset_index(inplace=True)

In [55]:
wages_us = pd.concat([wages_fem_us, wages_m_us], axis=0)
wages_us2 = wages_us[wages_us['year'] != 'ISO']
# wages_us2

Unnamed: 0,year,USA,gender
0,1995,3133,female
1,1996,3216,female
2,1997,3316,female
3,1998,3388,female
4,1999,3376,female
5,2000,3407,female
6,2001,3524,female
7,2002,3587,female
8,2003,3568,female
9,2004,3532,female


In [62]:
## combine BOTH genders one dataset for ONE country
""" 
1. Narrow down country
2. Separately for each M / F dataset, melt the dataset
3. concat the melted datasets together
"""
all_countries = pd.DataFrame()

## Choose a country
for c in wages_fem.ISO.unique():
    f1 = wages_fem[wages_fem['ISO'] == c]
    f2 = f1.melt().iloc[1:, :]
    f2.columns = ['year', c]
    f2['gender'] = 'female'
    m1 = wages_m[wages_m['ISO'] == c]
    m2 = m1.melt().iloc[1:, :]
    m2.columns = ['year', c]
    m2['gender'] = 'male'
    all = pd.concat([f2, m2], axis=0)
    all2 = all[all['year'] != 'gender']
    # all2.head()
    all_countries = pd.concat([all_countries, all2], axis=1)
    # EXPORT to CSV
all_countries.reset_index(inplace=True)
wages_us2.reset_index(inplace=True)
all_countries2 = pd.concat([all_countries, wages_us2], axis=1)
print(all_countries2.shape)

(54, 83)


In [63]:
all_countries2.head()

Unnamed: 0,index,year,AUS,gender,year.1,CAN,gender.1,year.2,CZE,gender.2,...,year.3,GBR,gender.3,year.4,URY,gender.4,index.1,year.5,USA,gender.5
0,1.0,1995,1867.23,female,1995,1867.23,female,1995,1867.23,female,...,1995,66.0,female,1995,66.0,female,0,1995,3133,female
1,2.0,1996,1923.89,female,1996,1923.89,female,1996,1923.89,female,...,1996,114.0,female,1996,114.0,female,1,1996,3216,female
2,3.0,1997,1996.15,female,1997,2015.22,female,1997,2015.22,female,...,1997,971.533,female,1997,971.533,female,2,1997,3316,female
3,4.0,1998,2043.6,female,1998,2048.76,female,1998,2048.76,female,...,1998,1016.6,female,1998,1016.6,female,3,1998,3388,female
4,5.0,1999,2081.95,female,1999,2109.29,female,1999,2109.29,female,...,1999,1066.43,female,1999,1066.43,female,4,1999,3376,female


In [64]:
df = all_countries2.loc[:,~all_countries2.columns.duplicated()]

In [65]:
df.shape

(54, 30)

In [33]:
## check for missing values
# df.isna().sum().sort_values(ascending=False)

In [68]:
df.drop(labels='index', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [69]:
df.head()

Unnamed: 0,year,AUS,gender,CAN,CZE,ECU,DEU,GTM,ISL,LVA,...,SGP,SVK,ESP,SWE,SYR,THA,UKR,GBR,URY,USA
0,1995,1867.23,female,1867.23,1867.23,150.97,1858.0,1858.0,1858.0,1858.0,...,1850.0,1850.0,1850.0,14700.0,14700.0,14700.0,66.0,66.0,66.0,3133
1,1996,1923.89,female,1923.89,1923.89,161.673,1933.0,1933.0,1933.0,1933.0,...,1979.0,1979.0,1979.0,15500.0,15500.0,15500.0,114.0,114.0,114.0,3216
2,1997,1996.15,female,2015.22,2015.22,192.329,1985.0,1985.0,1985.0,1985.0,...,2114.0,2114.0,2114.0,16200.0,16200.0,16200.0,132.0,971.533,971.533,3316
3,1998,2043.6,female,2048.76,2048.76,158.344,2039.0,2039.0,107000.0,107000.0,...,2256.0,2256.0,2256.0,16800.0,16800.0,16800.0,131.0,1016.6,1016.6,3388
4,1999,2081.95,female,2109.29,2109.29,105.445,2105.0,2105.0,117000.0,117000.0,...,2327.0,2327.0,2327.0,17600.0,17600.0,17600.0,149.0,1066.43,1066.43,3376


In [70]:
df.to_csv("wages.csv", index=False)

In [71]:
df.columns

Index(['year', 'AUS', 'gender', 'CAN', 'CZE', 'ECU', 'DEU', 'GTM', 'ISL',
       'LVA', 'LUX', 'MLT', 'MEX', 'MNG', 'NZL', 'NOR', 'PAK', 'PHL', 'QAT',
       'SGP', 'SVK', 'ESP', 'SWE', 'SYR', 'THA', 'UKR', 'GBR', 'URY', 'USA'],
      dtype='object')