# Gender Income Inequality

## 1. Dataframe 

In [1]:
# Load Dataset and import libaries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import statistics as st
import cufflinks as cf
from IPython.display import display, HTML

cf.set_config_file(sharing = 'public', theme = 'ggplot', offline = True)

os.chdir('C:\\Users\\Gamer\\Python\\DataScience Python\\Portafolio\\Base de Datos')

df = pd.read_csv('Casen_2017.3.csv', delimiter = ';')

In [2]:
# Also, look at datatypes of columns
df.head()

Unnamed: 0,region,pco1,sexo,edad,pco2,oficio4,oficio1,o10,yoprcor,yoprcorh,ytrabajocor,esc,educ,activ
0,1,1,2,56,1,9112,9,8,180000,180000,250000,12,5,1
1,1,1,2,21,1,9131,9,59,200000,200000,200000,7,1,1
2,1,1,1,24,1,5123,5,8,270000,270000,281750,12,5,1
3,1,3,1,28,3,5123,5,8,300000,300000,311750,12,5,1
4,1,1,1,26,1,9141,9,8,320000,320000,341667,15,8,1


In [3]:
# look at the column types
df.dtypes

region          int64
pco1            int64
sexo            int64
edad            int64
pco2            int64
oficio4        object
oficio1        object
o10            object
yoprcor        object
yoprcorh       object
ytrabajocor    object
esc            object
educ            int64
activ          object
dtype: object

In [4]:
# And how many columns and rows have our dataframe
df.shape

(216439, 14)

## 2. Data Cleaning

### 2. 1 Change columns to STR

In [5]:
# The first thing I did, is chance datatypes of columns.
# Transforming columns in STR
df['region'] = df['region'].astype('str')
df['pco1'] = df['pco1'].astype('str')
df['sexo'] = df['sexo'].astype('str')
df['pco2'] = df['pco2'].astype('str')
df['educ'] = df['educ'].astype('str')
df['activ'] = df['activ'].astype('str')
df['oficio1'] = df['oficio1'].astype('str')

In [6]:
# I replaced numerical values with text
df['sexo'] = df['sexo'].replace(['1', '2'],['Hombre', 'Mujer'])
df['educ'] = df['educ'].replace(['0', '1', '2', '3', '4', '5', '6', '7', '8','9','10','11', '12', '99'],
                                ['Sin Educ. Formal', 'Basica Incom.', 'Basica Compl.', 'M. Hum. Incom.', 'M. Tec. Incom.',
                                 'M. Hum Compl.', ' M. Tec Compl.', 'Tecnico Superior Incom.', 'Tecnico Superior Compl.', 
                                 'Profesional Icom.', 'Prostgrado Icom.', 'Profesional Compl.', 'Postgrado Compl.', np.nan])
df['pco1'] = df['pco1'].replace(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10','11', '12','13', '14', '15'],
                                ['Jefe(a) de Hogar', 'Esposo(a) de distinto sexo', 'Esposo(a) de mismo sexo', ' Hijo(a) de ambos',
                                 'Hijo(a), solo del jefe', 'Hijo(a) solo de la esposo(a)/pareja', 'Padre o madre', 'Suegro(a)', 
                                 'Yerno o Nuera', 'Nieto', 'Hermano', 'Cuñado', 'Otro familiar', 'No familiar', 'Servicio Domestico'])
df['pco2'] = df['pco2'].replace(['1', '2', '3', '4', '5', '6', '13', '14'],
                                ['Jefe(a) de Nucleo', 'Esposo(a) de distinto sexo', 'Esposo(a) de mismo sexo', ' Hijo(a) de ambos',
                                 'Hijo(a), solo del jefe', 'Hijo(a) solo de la esposo(a)/pareja', 'Otro familiar', 'No familiar'])
df['activ'] = df['activ'].replace(['1', '2', '3'], ['Ocupados', 'Desocupados', 'Inactivos'])

In [7]:
df.head()

Unnamed: 0,region,pco1,sexo,edad,pco2,oficio4,oficio1,o10,yoprcor,yoprcorh,ytrabajocor,esc,educ,activ
0,1,Jefe(a) de Hogar,Mujer,56,Jefe(a) de Nucleo,9112,9,8,180000,180000,250000,12,M. Hum Compl.,Ocupados
1,1,Jefe(a) de Hogar,Mujer,21,Jefe(a) de Nucleo,9131,9,59,200000,200000,200000,7,Basica Incom.,Ocupados
2,1,Jefe(a) de Hogar,Hombre,24,Jefe(a) de Nucleo,5123,5,8,270000,270000,281750,12,M. Hum Compl.,Ocupados
3,1,Esposo(a) de mismo sexo,Hombre,28,Esposo(a) de mismo sexo,5123,5,8,300000,300000,311750,12,M. Hum Compl.,Ocupados
4,1,Jefe(a) de Hogar,Hombre,26,Jefe(a) de Nucleo,9141,9,8,320000,320000,341667,15,Tecnico Superior Compl.,Ocupados


### 2. 2 Fill empty cells and drop NaN.

In [8]:
# I need to change these columns to INT, but these have empty cells.
# Therefore, I filled the empty cells with NaN.
df['esc'].replace(r'\s+', np.nan, regex=True, inplace = True)
df['ytrabajocor'].replace(r'\s+', np.nan, regex=True, inplace = True)
df['yoprcor'].replace(r'\s+', np.nan, regex=True, inplace = True)
df['yoprcorh'].replace(r'\s+', np.nan, regex=True, inplace = True)
df['o10'].replace(r'\s+', np.nan, regex=True, inplace = True)

In [9]:
# check Nan
for col in df.columns:
    pct_missing = np.mean(df[col].isnull())
    print('{} - {}%'.format(col, pct_missing))

region - 0.0%
pco1 - 0.0%
sexo - 0.0%
edad - 0.0%
pco2 - 0.0%
oficio4 - 0.0%
oficio1 - 0.0%
o10 - 0.5735703824172168%
yoprcor - 0.585476739404636%
yoprcorh - 0.585476739404636%
ytrabajocor - 0.5507510199178521%
esc - 0.1958103668932124%
educ - 0.005230111024353282%
activ - 0.0%


In [10]:
# I convert the column in FlOATF.
df['esc'] = df['esc'].astype('float')
df['yoprcor'] = df['yoprcor'].astype('float')
df['yoprcorh'] = df['yoprcorh'].astype('float')
df['ytrabajocor'] = df['ytrabajocor'].astype('float')
df['o10'] = df['o10'].astype('float')

In [11]:
df.dtypes

region          object
pco1            object
sexo            object
edad             int64
pco2            object
oficio4         object
oficio1         object
o10            float64
yoprcor        float64
yoprcorh       float64
ytrabajocor    float64
esc            float64
educ            object
activ           object
dtype: object

In [12]:
for col in df.columns:
    pct_missing = np.mean(df[col].isnull())
    print('{} - {}%'.format(col, pct_missing))

region - 0.0%
pco1 - 0.0%
sexo - 0.0%
edad - 0.0%
pco2 - 0.0%
oficio4 - 0.0%
oficio1 - 0.0%
o10 - 0.5735703824172168%
yoprcor - 0.585476739404636%
yoprcorh - 0.585476739404636%
ytrabajocor - 0.5507510199178521%
esc - 0.1958103668932124%
educ - 0.005230111024353282%
activ - 0.0%


## 3. Operacionalización

In [13]:
round(df.describe(), 2)

Unnamed: 0,edad,o10,yoprcor,yoprcorh,ytrabajocor,esc
count,216439.0,92296.0,89719.0,89719.0,97235.0,174058.0
mean,37.78,42.9,491951.78,491951.78,514796.08,10.83
std,22.95,14.82,658468.5,658468.5,800524.17,4.29
min,0.0,1.0,2000.0,2000.0,42.0,0.0
25%,19.0,40.0,250000.0,250000.0,250000.0,8.0
50%,36.0,45.0,320000.0,320000.0,333333.0,12.0
75%,56.0,45.0,500000.0,500000.0,554167.0,14.0
max,117.0,220.0,40000000.0,40000000.0,80000000.0,22.0


In [16]:
table1 = round(df.pivot_table(index = 'sexo', aggfunc = 'mean', values = 'yoprcor'), 2)

table1.head()

Unnamed: 0_level_0,yoprcor
sexo,Unnamed: 1_level_1
Hombre,542960.69
Mujer,421704.04


In [17]:
table1.iplot(kind = 'bar')