## This file has been modified based on new understandings and methodologies since the original implementation (2 years later)

In [66]:
# Import the relevant python libraries for the analysis
import pandas as pd
from pandas import DataFrame
import numpy as np
import statistics

In [67]:
# Load the dataset
mortalitad_materna = pd.read_csv('../data/mortalidad_materna.csv')
#materna.info()

### Create a **materna** subdataset *in English* from the **maternalidad_materna** dataset *in Spanish* with information including: 

- Residence Area
- Local Community Size
- Total Education Completed 
- Last Recorded Age
- Reason for Mortality
- Medical Assistance

### Step 1: Explore the Data and Create Clean Sub-Dataframe for Analysis

In [68]:
# 1. Combine patient birthdate information into one column
birth = DataFrame(mortalitad_materna, columns=['Año de nacimiento', 'Mes de nacimiento', 'Día de nacimiento'])
birth = mortalitad_materna['Año de nacimiento'].map(str) + '-' + mortalitad_materna['Mes de nacimiento'].map(str) + '-' + mortalitad_materna['Mes de nacimiento'].map(str)
print(birth.sort_values(ascending=True).head(2))
len(birth)

13088    0-0-0
12833    0-0-0
dtype: object


18163

In [69]:
# Combine patient date of death information into one column
death = DataFrame(mortalitad_materna, columns=['Año de la defunción', 'Mes de la defunción', 'Día de la defunción'])
death = mortalitad_materna['Año de la defunción'].map(str) + '-' + mortalitad_materna['Mes de la defunción'].map(str) + '-' + mortalitad_materna['Mes de la defunción'].map(str)
print(death.sort_values(ascending=True).head(2))
len(death)

17886       0-0-0
17516    1914-3-3
dtype: object


18163

#### Create variables for important location, economic, and educational factors associated with the instance of maternal mortality:
- Residence Area
- Local Community Size
- Total Education Completed 
- Last Recorded Age
- Reason for Mortality
- Medical Assistance

In [70]:
# Create variable to store:

#residence information 
residence_code = mortalitad_materna['Entidad de residencia']
residence_name = mortalitad_materna['Descripción de entidad de residencia']

#local community info
local_size = mortalitad_materna['Descripción del tamaño de localidad']
local_size_code = mortalitad_materna['Tamaño de localidad']

#educational level
edu_reached_code = mortalitad_materna['Escolaridad'] 
edu_reached = mortalitad_materna['Descripción de la escolaridad']

#age fulfilled by patient
last_age = mortalitad_materna['Edad cumplida']

#mortality reason
mortality_reason = mortalitad_materna['Razón de mortalidad materna']

#medical assistance
medical_received = mortalitad_materna['Descripción de la asistencia médica']

In [71]:
# Create a sub-dataframe to hold all date- information 
materna = pd.concat([birth, 
                   death, 
                   residence_code,
                   residence_name,
                   local_size,
                   local_size_code,
                   edu_reached_code,
                   edu_reached,
                   last_age,
                   mortality_reason,
                   medical_received], axis=1)
materna.columns = ['Date of Birth', 
                 'Date of Mortality', 
                 'Residence Code',
                 'Residence Name',
                 'Local Community Size',
                 'Local Size Code',
                 'Education Code',
                 'Education Completed',
                 'Age at Death',
                 'Reason for Mortality',
                 'Medical Assistance Received']
    
materna.head(2)

Unnamed: 0,Date of Birth,Date of Mortality,Residence Code,Residence Name,Local Community Size,Local Size Code,Education Code,Education Completed,Age at Death,Reason for Mortality,Medical Assistance Received
0,1972-7-7,2002-3-3,29,Tlaxcala,40000 a 49999 habitantes,10,7,PROFESIONAL,29,1,CON ATENCION MEDICA
1,1967-4-4,2003-5-5,29,Tlaxcala,40000 a 49999 habitantes,10,6,BACHILLERATO O PREPARATORIA COMPLETA,36,1,CON ATENCION MEDICA


### Data Wrangling: 

#### Part 1: Check for null or errors within materna

In [72]:
# Order dataframe to list in ascending order of approx. age at death
materna = materna.sort_values(by=['Age at Death'],ascending=True)
materna.head()

Unnamed: 0,Date of Birth,Date of Mortality,Residence Code,Residence Name,Local Community Size,Local Size Code,Education Code,Education Completed,Age at Death,Reason for Mortality,Medical Assistance Received
9598,1998-7-7,2009-7-7,7,Chiapas,1 a 999 habitantes,1,2,PRIMARIA INCOMPLETA,11,1,CON ATENCION MEDICA
6407,1996-1-1,2008-12-12,30,Veracruz de Ignacio de la Llave,50000 a 74999 habitantes,11,3,PRIMARIA COMPLETA,12,1,CON ATENCION MEDICA
9084,2001-12-12,2014-5-5,27,Tabasco,250000 a 499999 habitantes,14,4,SECUNDARIA INCOMPLETA,12,1,CON ATENCION MEDICA
10883,1983-11-11,2002-6-6,15,México,2500 a 4999 habitantes,4,5,SECUNDARIA COMPLETA,12,1,CON ATENCION MEDICA
1266,1993-0-0,2005-1-1,12,Guerrero,2500 a 4999 habitantes,4,2,PRIMARIA INCOMPLETA,12,1,SIN ATENCION MEDICA


In [73]:
# Reset Index 
materna = materna.reset_index(drop=True)
materna.head()
materna.tail()

Unnamed: 0,Date of Birth,Date of Mortality,Residence Code,Residence Name,Local Community Size,Local Size Code,Education Code,Education Completed,Age at Death,Reason for Mortality,Medical Assistance Received
18158,0-0-0,1975-2-2,25,Sinaloa,1 a 999 habitantes,1,0,NO ESPECIFICADO,998,0,NO ESPECIFICADO
18159,0-0-0,2006-9-9,27,Tabasco,75000 a 99999 habitantes,12,1,NINGUNA,998,1,NO ESPECIFICADO
18160,0-0-0,2000-6-6,16,Michoacán de Ocampo,No Especificado.,0,0,NO ESPECIFICADO,998,0,NO ESPECIFICADO
18161,0-0-0,0-0-0,0,No especificado,No Especificado.,0,0,NO ESPECIFICADO,998,0,SIN ATENCION MEDICA
18162,0-0-0,2002-4-4,8,Chihuahua,1 a 999 habitantes,1,2,PRIMARIA INCOMPLETA,998,1,CON ATENCION MEDICA


#### Check if all values make sense
- Case 1: It is biologically impossible to reach 998 years of age. These data entries appear to all have '0-0-0' in the 'Date of Birth' column and should therefore be removed from the sub-dataset, dates.
- Case 2: What do the '0' and '1' entires for 'Reason for Mortality' mean? If one of these keys is not associated with maternal death, then those entries should also be removed from the sub-dataset, dates.

In [74]:
# Remove rows with NaN / '0-0-0' values in Date of Birth
materna = materna[materna['Date of Birth'] != '0-0-0']
materna.tail()

Unnamed: 0,Date of Birth,Date of Mortality,Residence Code,Residence Name,Local Community Size,Local Size Code,Education Code,Education Completed,Age at Death,Reason for Mortality,Medical Assistance Received
18134,1933-12-12,2015-8-8,9,Distrito Federal,500000 a 999999 habitantes,15,3,PRIMARIA COMPLETA,81,0,CON ATENCION MEDICA
18135,1933-11-11,2015-12-12,15,México,250000 a 499999 habitantes,14,2,PRIMARIA INCOMPLETA,82,0,CON ATENCION MEDICA
18136,1933-8-8,2015-9-9,24,San Luis Potosí,500000 a 999999 habitantes,15,2,PRIMARIA INCOMPLETA,82,0,CON ATENCION MEDICA
18137,1930-1-1,2016-2-2,14,Jalisco,500000 a 999999 habitantes,15,2,PRIMARIA INCOMPLETA,86,0,CON ATENCION MEDICA
18138,1930-8-8,2017-1-1,30,Veracruz de Ignacio de la Llave,1000 a 1999 habitantes,2,2,PRIMARIA INCOMPLETA,86,0,CON ATENCION MEDICA


In [75]:
materna.head()

Unnamed: 0,Date of Birth,Date of Mortality,Residence Code,Residence Name,Local Community Size,Local Size Code,Education Code,Education Completed,Age at Death,Reason for Mortality,Medical Assistance Received
0,1998-7-7,2009-7-7,7,Chiapas,1 a 999 habitantes,1,2,PRIMARIA INCOMPLETA,11,1,CON ATENCION MEDICA
1,1996-1-1,2008-12-12,30,Veracruz de Ignacio de la Llave,50000 a 74999 habitantes,11,3,PRIMARIA COMPLETA,12,1,CON ATENCION MEDICA
2,2001-12-12,2014-5-5,27,Tabasco,250000 a 499999 habitantes,14,4,SECUNDARIA INCOMPLETA,12,1,CON ATENCION MEDICA
3,1983-11-11,2002-6-6,15,México,2500 a 4999 habitantes,4,5,SECUNDARIA COMPLETA,12,1,CON ATENCION MEDICA
4,1993-0-0,2005-1-1,12,Guerrero,2500 a 4999 habitantes,4,2,PRIMARIA INCOMPLETA,12,1,SIN ATENCION MEDICA


**Case 1 Conclusion**: It appears that removing rows with '0-0-0' in the 'Date of Birth' column did remove all entries outside of the biological scope of maximum age at death. However, it is unlikely that a women who reached the age of 81 died due to maternal reasons since this age is outside the childe-bearing years. The analysis of the '0' and '1' in the 'Reason for Mortality' column may shed more light since all younger ages seem to be associated with '1' while older ages are associated with '0'. 

In [76]:
# Create a variable for the description of Reason for Mortality Description
mortality_description = mortalitad_materna['Descripción de la razón de mortalidad materna']

# Create a sub-dataframe to show interaction of Reason for Mortality Code and Description
mortality = pd.concat([mortality_reason, mortality_description], axis=1)
mortality.columns = ['Reason Mortality Code', 'Reason Mortality Description']
mortality.head()

Unnamed: 0,Reason Mortality Code,Reason Mortality Description
0,1,Muertes Maternas para la razón de Mortalidad M...
1,1,Muertes Maternas para la razón de Mortalidad M...
2,1,Muertes Maternas para la razón de Mortalidad M...
3,1,Muertes Maternas para la razón de Mortalidad M...
4,1,Muertes Maternas para la razón de Mortalidad M...


In [77]:
mortality.tail()

Unnamed: 0,Reason Mortality Code,Reason Mortality Description
18158,0,Muertes Maternas excluidas para la razón de Mo...
18159,0,Muertes Maternas excluidas para la razón de Mo...
18160,0,Muertes Maternas excluidas para la razón de Mo...
18161,0,Muertes Maternas excluidas para la razón de Mo...
18162,0,Muertes Maternas excluidas para la razón de Mo...


In [78]:
print('0 Description:')
print('Spanish: Muertes Maternas excluidas para la razón de Mortalidad Materna')
print('English: Maternal deaths excluded for the reason of Maternal Mortality')

0 Description:
Spanish: Muertes Maternas excluidas para la razón de Mortalidad Materna
English: Maternal deaths excluded for the reason of Maternal Mortality


In [79]:
print('1 Description:')
print('Spanish: Muertes Maternas para la razón de Mortalidad Materna')
print('English: Maternal deaths for the reason of Maternal Mortality')

1 Description:
Spanish: Muertes Maternas para la razón de Mortalidad Materna
English: Maternal deaths for the reason of Maternal Mortality


**Case 2 Conclusion**: Since '1' refers to recorded maternal-deaths and '0' refers to recorded deaths that are *not* associated with maternity, all rows containing '0' in the 'Reason for Mortality' column should be removed.

In [80]:
# Remove rows with 0 values in Reason for Mortality
materna = materna[materna['Reason for Mortality'] != 0 ]
materna.tail()

Unnamed: 0,Date of Birth,Date of Mortality,Residence Code,Residence Name,Local Community Size,Local Size Code,Education Code,Education Completed,Age at Death,Reason for Mortality,Medical Assistance Received
18113,1960-3-3,2010-10-10,12,Guerrero,1 a 999 habitantes,1,7,PROFESIONAL,50,1,CON ATENCION MEDICA
18116,1960-1-1,2011-10-10,12,Guerrero,1 a 999 habitantes,1,0,NO ESPECIFICADO,51,1,CON ATENCION MEDICA
18121,1963-12-12,2017-10-10,15,México,10000 a 14999 habitantes,6,7,PROFESIONAL,53,1,CON ATENCION MEDICA
18123,1961-8-8,2015-5-5,7,Chiapas,1000 a 1999 habitantes,2,1,NINGUNA,53,1,CON ATENCION MEDICA
18125,1942-12-12,2002-12-12,20,Oaxaca,2000 a 2499 habitantes,3,2,PRIMARIA INCOMPLETA,54,1,CON ATENCION MEDICA


#### Assess Data Range for Maternal Mortality Instances

In [81]:
date_min = materna['Date of Mortality'].min()
date_max = materna['Date of Mortality'].max()
print('Maternal Mortality Date Range:', date_min, ' to ', date_max)

Maternal Mortality Date Range: 1938-12-12  to  2017-9-9


#### Part 2: Translating Important Columns in **materna** from Spanish *using translation_english.txt*
- Important columns that need translation include: 
    - *Education Completed*
        - Translate Spanish Descriptions into Integer Values that Are Comparable
    - *Local Community Size*
        - Translate Spanish Descriptions into Integer Values that Are Comparable
    - *Medical Assistance Received*
        - Translate Medical Assistance Received into a Binary Column

**Education Completed**
- 9
    - 'POSGRADO' = Post-Graduate Education
- 8
    - 'PROFESIONAL' = Professional School
- 7
    - 'BACHILLERATO O PREPARATORIA COMPLETA' = High School (grades 10-12) complete
- 6
    - 'BACHILLERATO O PREPARATORIA INCOMPLETA' = High School (grades 10-12) incomplete
- 5 
    - 'PRIMARIA COMPLETA' = Elementary School (grades 1-6) complete 
- 4 
    - 'PRIMARIA INCOMPLETA' = Elementary School (grades 1-6) incomplete
- 3 
    - 'SECUNDARIA COMPLETA' = Junior High (grades 7-9) complete
- 2
    - 'SECUNDARIA INCOMPLETA' = Junior High (grades 7-9) incomplete
- 1
   - 'PREESCOLAR' = Preschool complete
-  0 
    - Combine the following entries:'SE IGNORA' = It was 'ignored' / 'NO ESPECIFICADO' = Not specified / 'NINGUNA' = NONE

In [82]:
# Create a sub-dataframe to show interaction of Education Code and Education Completed
education = materna[['Education Code', 'Education Completed']].sort_values(by='Education Code').drop_duplicates()
education_dict = dict(zip(education['Education Code'], education['Education Completed']))
education_dict

{0: 'NO ESPECIFICADO',
 1: 'NINGUNA',
 2: 'PRIMARIA INCOMPLETA',
 3: 'PRIMARIA COMPLETA',
 4: 'SECUNDARIA INCOMPLETA',
 5: 'SECUNDARIA COMPLETA',
 6: 'BACHILLERATO O PREPARATORIA COMPLETA',
 7: 'PROFESIONAL',
 10: 'POSGRADO',
 11: 'BACHILLERATO O PREPARATORIA INCOMPLETA',
 12: 'PREESCOLAR',
 99: 'SE IGNORA'}

In [83]:
# # Overwriting column with replaced value of Education

# # SE IGNORA / NINGUNA / NO ESPECIFICADO
# materna["Education Completed"]= materna["Education Completed"].replace(['SE IGNORA', 'NINGUNA', 'NO ESPECIFICADO'], 0)

# # PREESCOLAR
# materna["Education Completed"]= materna["Education Completed"].replace('PREESCOLAR', 1)

# # PRIMARIA
# #INCOMPLETA
# materna["Education Completed"]= materna["Education Completed"].replace('PRIMARIA INCOMPLETA', 2)
# #COMPLETA
# materna["Education Completed"]= materna["Education Completed"].replace('PRIMARIA COMPLETA', 3)

# # SECUNDARIA
# #INCOMPLETA
# materna["Education Completed"]= materna["Education Completed"].replace('SECUNDARIA INCOMPLETA', 4)
# #COMPLETA
# materna["Education Completed"]= materna["Education Completed"].replace('SECUNDARIA COMPLETA', 5)

# # BACHILLERATO O PREPARATORIA
# #INCOMPLETA
# materna["Education Completed"]= materna["Education Completed"].replace('BACHILLERATO O PREPARATORIA INCOMPLETA', 6)
# #COMPLETA
# materna["Education Completed"]= materna["Education Completed"].replace('BACHILLERATO O PREPARATORIA COMPLETA', 7)

# # PROFESIONAL
# materna["Education Completed"]= materna["Education Completed"].replace('PROFESIONAL', 8)

# #POSGRADO
# materna["Education Completed"]= materna["Education Completed"].replace('POSGRADO', 9)

In [84]:
# Test output
list(materna['Education Completed'].sort_values().unique())

['BACHILLERATO O PREPARATORIA COMPLETA',
 'BACHILLERATO O PREPARATORIA INCOMPLETA',
 'NINGUNA',
 'NO ESPECIFICADO',
 'POSGRADO',
 'PREESCOLAR',
 'PRIMARIA COMPLETA',
 'PRIMARIA INCOMPLETA',
 'PROFESIONAL',
 'SE IGNORA',
 'SECUNDARIA COMPLETA',
 'SECUNDARIA INCOMPLETA']

**Local Community Size**

In [85]:
# Create a sub-dataframe to show interaction of Education Code and Education Completed
local_community = materna[['Local Size Code', 'Local Community Size']].sort_values(by='Local Size Code').drop_duplicates() 
local_community_dict = dict(zip(local_community['Local Size Code'], local_community['Local Community Size']))
local_community_dict

{0: 'No Especificado.',
 1: '1 a 999            habitantes',
 2: '1000 a 1999    habitantes',
 3: '2000 a 2499    habitantes',
 4: '2500 a 4999    habitantes',
 5: '5000 a 9999    habitantes',
 6: '10000 a 14999  habitantes',
 7: '15000 a 19999   habitantes',
 8: '20000 a 29999   habitantes',
 9: '30000 a 39999   habitantes',
 10: '40000 a 49999   habitantes',
 11: '50000 a 74999   habitantes',
 12: '75000 a 99999   habitantes',
 13: '100000 a 249999 habitantes',
 14: '250000 a 499999 habitantes',
 15: '500000 a 999999  habitantes',
 16: '1000000 a 1499999  habitantes',
 17: '1500000 y más habitantes'}

*Observations*: Local Community Size appears to have corresponding codes ordered from least to greatest community size already. Thereofore the 'Local Size Code' values can remain while the 'Local Community Size' column can be dropped.

#### Medical Assistance Received
- 0: WITH Medical Assistance
- 1: Unspecified/WITHOUT Medical Assistance

In [86]:
# Create a list item to hold comparison response
binary_medassist = []

# Create an iteration function to compare region mean to popupation mean
for medassist in materna['Medical Assistance Received']:
    
    #test for assistance
    if medassist == 'CON ATENCION MEDICA':
        binary_medassist.append(1)
    else:
        binary_medassist.append(0)

In [87]:
# Test output
np.unique(binary_medassist)

array([0, 1])

In [88]:
# Convert the list to a Series and add as new column
materna['Received Medical Assistance'] = pd.Series(binary_medassist)
materna.head()

Unnamed: 0,Date of Birth,Date of Mortality,Residence Code,Residence Name,Local Community Size,Local Size Code,Education Code,Education Completed,Age at Death,Reason for Mortality,Medical Assistance Received,Received Medical Assistance
0,1998-7-7,2009-7-7,7,Chiapas,1 a 999 habitantes,1,2,PRIMARIA INCOMPLETA,11,1,CON ATENCION MEDICA,1.0
1,1996-1-1,2008-12-12,30,Veracruz de Ignacio de la Llave,50000 a 74999 habitantes,11,3,PRIMARIA COMPLETA,12,1,CON ATENCION MEDICA,1.0
2,2001-12-12,2014-5-5,27,Tabasco,250000 a 499999 habitantes,14,4,SECUNDARIA INCOMPLETA,12,1,CON ATENCION MEDICA,1.0
3,1983-11-11,2002-6-6,15,México,2500 a 4999 habitantes,4,5,SECUNDARIA COMPLETA,12,1,CON ATENCION MEDICA,1.0
4,1993-0-0,2005-1-1,12,Guerrero,2500 a 4999 habitantes,4,2,PRIMARIA INCOMPLETA,12,1,SIN ATENCION MEDICA,0.0


In [90]:
materna['Medical Assistance Received'].isnull().sum()

0

In [89]:
materna['Received Medical Assistance'].isnull().sum()

1334

In [24]:
materna[['Received Medical Assistance', 'Medical Assistance Received']].groupby('Received Medical Assistance', as_index=False).count()

Unnamed: 0,Received Medical Assistance,Medical Assistance Received
0,0.0,2177
1,1.0,13125


In [91]:
# Drop columns that are unnecessary
materna = materna.drop(columns=['Date of Mortality', 'Received Medical Assistance', 'Reason for Mortality'])
materna.head()

Unnamed: 0,Date of Birth,Residence Code,Residence Name,Local Community Size,Local Size Code,Education Code,Education Completed,Age at Death,Medical Assistance Received
0,1998-7-7,7,Chiapas,1 a 999 habitantes,1,2,PRIMARIA INCOMPLETA,11,CON ATENCION MEDICA
1,1996-1-1,30,Veracruz de Ignacio de la Llave,50000 a 74999 habitantes,11,3,PRIMARIA COMPLETA,12,CON ATENCION MEDICA
2,2001-12-12,27,Tabasco,250000 a 499999 habitantes,14,4,SECUNDARIA INCOMPLETA,12,CON ATENCION MEDICA
3,1983-11-11,15,México,2500 a 4999 habitantes,4,5,SECUNDARIA COMPLETA,12,CON ATENCION MEDICA
4,1993-0-0,12,Guerrero,2500 a 4999 habitantes,4,2,PRIMARIA INCOMPLETA,12,SIN ATENCION MEDICA


In [92]:
# Analyze shape of cleaned data
materna.describe()

Unnamed: 0,Residence Code,Local Size Code,Education Code,Age at Death
count,16636.0,16636.0,16636.0,16636.0
mean,16.279574,7.81005,4.210207,28.356756
std,7.85017,5.912597,4.966106,7.323171
min,1.0,0.0,0.0,11.0
25%,10.0,2.0,2.0,22.0
50%,15.0,6.0,4.0,28.0
75%,21.0,14.0,5.0,34.0
max,34.0,17.0,99.0,54.0


In [93]:
print('There are '+ str(len(np.unique(materna['Residence Name']))) + ' Provinces in Mexico.')
list(np.unique(materna['Residence Name']))

There are 34 Provinces in Mexico.


['Aguascalientes',
 'Baja California',
 'Baja California Sur',
 'Campeche',
 'Chiapas',
 'Chihuahua',
 'Coahuila de Zaragoza',
 'Colima',
 'Distrito Federal',
 'Durango',
 'Estados Unidos de Norteamérica',
 'Guanajuato',
 'Guerrero',
 'Hidalgo',
 'Jalisco',
 'Michoacán de Ocampo',
 'Morelos',
 'México',
 'Nayarit',
 'Nuevo León',
 'Oaxaca',
 'Otros paises latinoamericanos',
 'Puebla',
 'Querétaro Arteaga',
 'Quintana Roo',
 'San Luis Potosí',
 'Sinaloa',
 'Sonora',
 'Tabasco',
 'Tamaulipas',
 'Tlaxcala',
 'Veracruz de Ignacio de la Llave',
 'Yucatán',
 'Zacatecas']

### Additional Cleaning of Data: 
There are 31 states and 1 federal entity in Mexico, so the length of the 'Regions' column should be 32, not 34. When translating the list of regions within the sub-dataset, it becomes apparent that not all entries are Provinces within Mexico, and need to be removed. Namely:
- Estados Unidos de Norteamérica - 'United States of America'
- Otros paises latinoamericanos - 'Other Latin American countries'
- No especificado - 'Not Specified'
- Otros paises - 'Other Countries'

In [94]:
# Remove unnecessary rows from region_ages sub-dataset
materna = materna[materna['Residence Name'] != 'Estados Unidos de Norteamérica' ]
materna = materna[materna['Residence Name'] != 'Otros paises latinoamericanos' ]
materna = materna[materna['Residence Name'] != 'No especificado' ]
materna = materna[materna['Residence Name'] != 'Otros paises' ]

print('There are '+ str(len(np.unique(materna['Residence Name']))) + ' Provinces in Mexico.')
list(np.unique(materna['Residence Name']))

There are 32 Provinces in Mexico.


['Aguascalientes',
 'Baja California',
 'Baja California Sur',
 'Campeche',
 'Chiapas',
 'Chihuahua',
 'Coahuila de Zaragoza',
 'Colima',
 'Distrito Federal',
 'Durango',
 'Guanajuato',
 'Guerrero',
 'Hidalgo',
 'Jalisco',
 'Michoacán de Ocampo',
 'Morelos',
 'México',
 'Nayarit',
 'Nuevo León',
 'Oaxaca',
 'Puebla',
 'Querétaro Arteaga',
 'Quintana Roo',
 'San Luis Potosí',
 'Sinaloa',
 'Sonora',
 'Tabasco',
 'Tamaulipas',
 'Tlaxcala',
 'Veracruz de Ignacio de la Llave',
 'Yucatán',
 'Zacatecas']

In [105]:
# Store as a global variable that can be uploaded to other Jupyter Notebooks
%store materna
materna.head()

Stored 'materna' (DataFrame)


Unnamed: 0,Date of Birth,Residence Code,Residence Name,Local Community Size,Local Size Code,Education Code,Education Completed,Age at Death,Medical Assistance Received
0,1998-7-7,7,Chiapas,1 a 999 habitantes,1,2,PRIMARIA INCOMPLETA,11,CON ATENCION MEDICA
1,1996-1-1,30,Veracruz de Ignacio de la Llave,50000 a 74999 habitantes,11,3,PRIMARIA COMPLETA,12,CON ATENCION MEDICA
2,2001-12-12,27,Tabasco,250000 a 499999 habitantes,14,4,SECUNDARIA INCOMPLETA,12,CON ATENCION MEDICA
3,1983-11-11,15,México,2500 a 4999 habitantes,4,5,SECUNDARIA COMPLETA,12,CON ATENCION MEDICA
4,1993-0-0,12,Guerrero,2500 a 4999 habitantes,4,2,PRIMARIA INCOMPLETA,12,SIN ATENCION MEDICA


In [96]:
materna.isnull().sum()

Date of Birth                  0
Residence Code                 0
Residence Name                 0
Local Community Size           0
Local Size Code                0
Education Code                 0
Education Completed            0
Age at Death                   0
Medical Assistance Received    0
dtype: int64

### Preparation of Data for Machine Learning Analysis
- Create variables for age distribution by region
- Create a sub-dataframe for Machine Learning model

### Part 1: Create a sample region array variables to hold age distribution per region 

In [97]:
# Test code to create function
aqua = materna[materna['Residence Name'] == 'Aguascalientes']
aqua = aqua['Age at Death']
aqua = np.array(aqua)

mex = materna[materna['Residence Name'] == 'México']
mex = mex['Age at Death']
mex = np.array(mex)

print('Aguascalientes Sample Length: '+ str(aqua))
print('México Sample Length: ' + str(mex))

Aguascalientes Sample Length: [15 15 16 16 17 17 17 17 17 17 18 18 18 18 19 19 19 19 20 20 20 20 20 20
 20 21 21 21 21 22 22 22 22 22 22 23 23 23 23 23 23 24 24 24 25 25 25 25
 25 25 26 26 26 27 27 27 28 28 28 28 28 28 29 29 29 29 29 29 30 30 30 30
 30 30 31 31 31 31 31 31 31 32 32 32 32 33 33 33 33 33 33 34 34 34 35 35
 35 35 35 35 35 35 35 36 36 36 36 36 37 37 37 38 38 38 39 39 39 39 39 40
 40 41 42 42 42 43 43]
México Sample Length: [12 12 14 ... 49 49 53]


Since it appears that the sample size of ages of maternal death within the Provinces varies, the total sample per Province should be stored in unique age array variables. The process of creating the age array is repeatable, so a function should be created then applied to each Province. The array of ages variable can then be stored in a dictionary as a value with the associated Province as the key.

In [98]:
# Create a function to group all ages associated with materna death within a Province and store the ages in an array
def age_array(str):
    
    """Create arrays for all Ages of Maternal Death within a Region"""
    
    ages = materna[materna['Residence Name'] == str] # select the region 'str' from the 'Region' column
    ages = ages['Age at Death'] # select the ages within the region
    ages = np.array(ages) # store the ages in an array
    return ages # return the unique array

In [99]:
# Test output
print('Aguascalientes', age_array('Aguascalientes'))
print('México', age_array('México'))

Aguascalientes [15 15 16 16 17 17 17 17 17 17 18 18 18 18 19 19 19 19 20 20 20 20 20 20
 20 21 21 21 21 22 22 22 22 22 22 23 23 23 23 23 23 24 24 24 25 25 25 25
 25 25 26 26 26 27 27 27 28 28 28 28 28 28 29 29 29 29 29 29 30 30 30 30
 30 30 31 31 31 31 31 31 31 32 32 32 32 33 33 33 33 33 33 34 34 34 35 35
 35 35 35 35 35 35 35 36 36 36 36 36 37 37 37 38 38 38 39 39 39 39 39 40
 40 41 42 42 42 43 43]
México [12 12 14 ... 49 49 53]


In [100]:
# Create a variable for 'Region' names using np.unique()
list_regions = np.unique(materna['Residence Name'])

# Create an empty dictionary to hold the {Region : region_age_array} key pairs
age_by_state = {}

In [101]:
# Use the age_array function with iteration over residence to create the {Region : region_age_array} key pairs
for region in list_regions:
    age_by_state[region] = age_array(region) # add arrays as values in dictionary with region-key

In [102]:
# Test output
print('Aguascalientes', age_by_state['Aguascalientes'])

Aguascalientes [15 15 16 16 17 17 17 17 17 17 18 18 18 18 19 19 19 19 20 20 20 20 20 20
 20 21 21 21 21 22 22 22 22 22 22 23 23 23 23 23 23 24 24 24 25 25 25 25
 25 25 26 26 26 27 27 27 28 28 28 28 28 28 29 29 29 29 29 29 30 30 30 30
 30 30 31 31 31 31 31 31 31 32 32 32 32 33 33 33 33 33 33 34 34 34 35 35
 35 35 35 35 35 35 35 36 36 36 36 36 37 37 37 38 38 38 39 39 39 39 39 40
 40 41 42 42 42 43 43]


In [103]:
# Store as a global variable 
%store age_by_state

Stored 'age_by_state' (dict)


### Part 2: Create a sub-dataframe for the Machine Learning Model
- residence name
- residence code (index)
- region mean
- region sample size (n)
- region variance
- binary target
- mean educational level
- mean local community size
- mean presence of medical care

In [104]:
# Var for residence name 
residence_uniq = np.unique(materna['Residence Name'])

# Var for residence code
residence_code = np.unique(materna['Residence Code'])

# Create the sub-dateframe for region and region code
res_dataset = pd.DataFrame(residence_uniq, index=residence_code)
res_dataset = res_dataset.rename(columns={0:'Region'})
res_dataset.head()

Unnamed: 0,Region
1,Aguascalientes
2,Baja California
3,Baja California Sur
4,Campeche
5,Chiapas


#### Calculate the Mean Age per Region

In [38]:
# Test Code
mean_death_list_trial = []

aguas = materna[materna['Residence Code'] == 1 ]
aguas = aguas[['Residence Code', 'Age at Death']]
aguas_mean = aguas['Age at Death'].mean()
aguas_mean = '{0:0.2f}'.format(aguas_mean)
print(aguas_mean)
mean_death_list_trial.append(aguas_mean)
print(mean_death_list_trial)

baja = materna[materna['Residence Code'] == 2 ]
baja = baja[['Residence Code', 'Age at Death']]
baja_mean = baja['Age at Death'].mean()
baja_mean = '{0:0.2f}'.format(baja_mean)
print(baja_mean)
mean_death_list_trial.append(baja_mean)
print(mean_death_list_trial)

28.36
['28.36']
27.15
['28.36', '27.15']


In [39]:
# Create an empty list to store region sample size and mean age of maternal death
region_mean = []
region_n = []

# Calculate the mean age of maternal death per region
for i in materna['Residence Code'].sort_values().unique():
    """Calculate Length of Age Array and Mean Age per Region"""

    sub_df = materna[materna['Residence Code'] == (i - 1)] # select one region
    n = len(sub_df['Age at Death']) # calculate sample length
    mean = sub_df['Age at Death'].mean() # calculate mean of region
    region_n.append(round(n, 2)) # append n to list
    region_mean.append(round(mean, 2)) # append mean to list

In [40]:
# Test output
print(region_mean[1], region_n[1], type(region_mean[1]))
print(region_mean[2], region_n[2], type(region_mean[2]))

28.36 127 <class 'numpy.float64'>
27.15 361 <class 'numpy.float64'>


In [41]:
# Convert the list to a Series and add as new column
res_dataset['μ Age Maternal Mortality'] = pd.Series(region_mean)
res_dataset['Region (n)'] = pd.Series(region_n)
res_dataset.tail()

Unnamed: 0,Region,μ Age Maternal Mortality,Region (n)
28,Tamaulipas,28.26,377.0
29,Tlaxcala,28.45,178.0
30,Veracruz de Ignacio de la Llave,27.94,1239.0
31,Yucatán,27.5,258.0
32,Zacatecas,,


*Preliminary Observation*: Since the region 'Zacatecas' has a NaN value for mean age of maternal death, the contents of 'Zacatecas' need to be adjusted from NaN to the actual mean of the data for the res_dataset. *With a quick reference, the initial values in the res_dataset match the individually calculated mean for both Aguascalientes and Baja California, so we know the NaN is not due to shifted values.*

#### Clean Data by Replacing NaN/Null values with the Correct Data

In [42]:
# Calculate the mean Age of Death for region 'Zacatecas'
zaca = materna[materna['Residence Code'] == 32 ]
zaca = zaca['Age at Death']

# Calculate sample size
zaca_n = len(zaca)

# Calculate mean
zaca_mean = zaca.mean()
zaca_mean = round(mean, 2)
print(zaca_mean)

# Change contents of res_dataset NaN to calculated mean
res_dataset['μ Age Maternal Mortality'] = res_dataset['μ Age Maternal Mortality'].replace(np.nan, zaca_mean)
res_dataset['Region (n)'] = res_dataset['Region (n)'].replace(np.nan, zaca_n)
res_dataset.tail()

27.5


Unnamed: 0,Region,μ Age Maternal Mortality,Region (n)
28,Tamaulipas,28.26,377.0
29,Tlaxcala,28.45,178.0
30,Veracruz de Ignacio de la Llave,27.94,1239.0
31,Yucatán,27.5,258.0
32,Zacatecas,27.5,191.0


#### Calculate the Age Variance by Region

In [43]:
# Test code
aguas = materna[materna['Residence Code'] == 1 ]
aguas = aguas[['Residence Code', 'Age at Death']]
aguas_var = statistics.pvariance(aguas['Age at Death'])
print('Aguas', aguas_var)

baja = materna[materna['Residence Code'] == 2 ]
baja = baja[['Residence Code', 'Age at Death']]
baja_var = statistics.pvariance(baja['Age at Death'])
print('Baja Cal', baja_var)

Aguas 55.16014632029264
Baja Cal 46.31196814020764


In [44]:
# Create an empty list to store age of maternal death variance per region
region_var = []

for i in materna['Residence Code'].sort_values().unique():
    """Calculate Age Standard Deviation and Age Variance per Region"""
    
    sub_df = materna[materna['Residence Code'] == i]
    age = list(sub_df['Age at Death'])
    var = statistics.pvariance(age) # calculate age variance of region pop
    
    for region in sub_df['Residence Name'].unique(): # prevent repeat entries in lists
        region_var.append(round(var, 2)) # append var to region list

In [45]:
# Test output - Make sure it matches Test Results
print('Test Results - Aguas', round(aguas_var,2))
print('Function Results - Aguas', region_var[0])

Test Results - Aguas 55.16
Function Results - Aguas 55.16


In [46]:
# Convert the list to a Series and add as new column
res_dataset['μ Age Variance'] = pd.Series(region_var, index=np.arange(1,33))
res_dataset.head()

Unnamed: 0,Region,μ Age Maternal Mortality,Region (n),μ Age Variance
1,Aguascalientes,28.36,127.0,55.16
2,Baja California,27.15,361.0,46.31
3,Baja California Sur,27.56,66.0,55.37
4,Campeche,26.87,126.0,44.29
5,Chiapas,28.02,310.0,45.02


#### Create Binary Columns: 

μ Age 
- 0 : Region μ Age Maternal Mortality is *greater than or equal to* the population mean
- 1 : Region μ Age Maternal Mortality is *less than* the population mean

In [47]:
res_mean_age = res_dataset['μ Age Maternal Mortality'].mean()
print('The population mean is: ', round(res_mean_age, 2))

The population mean is:  28.16


In [48]:
# Create a dictionary item to hold comparison response
binary_mean = []

# Compare region mean to population mean
for mean in res_dataset['μ Age Maternal Mortality']:
    if mean >= res_mean_age:
        binary_mean.append(True)
    else:
        binary_mean.append(False)

In [49]:
# Test output
#binary_mean

In [50]:
# Convert the list to a Series and add as new column
res_dataset['Equal or Above μ Age in Mexico'] = pd.Series(binary_mean, index=np.arange(1,33))
res_dataset.head()

Unnamed: 0,Region,μ Age Maternal Mortality,Region (n),μ Age Variance,Equal or Above μ Age in Mexico
1,Aguascalientes,28.36,127.0,55.16,True
2,Baja California,27.15,361.0,46.31,False
3,Baja California Sur,27.56,66.0,55.37,False
4,Campeche,26.87,126.0,44.29,False
5,Chiapas,28.02,310.0,45.02,False


#### Calculate Mean Educational Level per Region

In [51]:
from scipy.stats.mstats import mode

In [52]:
# Create an empty list to store region sample size and mean age of maternal death
region_education = []
edu_dict = {}

# Create an iteration function to calculate the mean age of maternal death per region
for i in materna['Residence Code'].sort_values().unique():
    """Calculate Mean Education per Region"""

    sub_df = materna[materna['Residence Code'] == i ]
    region = str(sub_df['Residence Name'].unique()[0])
    education = mode(sub_df['Education Code'])[0][0]
    #mean_edu = round(education, 2)
    region_education.append(education)
    edu_dict[region] = education

In [53]:
# Test output
print(len(region_education))
#region_education
print(edu_dict)

# Store as a global variable
%store edu_dict

32
{'Aguascalientes': 5.0, 'Baja California': 5.0, 'Baja California Sur': 6.0, 'Campeche': 5.0, 'Coahuila de Zaragoza': 5.0, 'Colima': 5.0, 'Chiapas': 2.0, 'Chihuahua': 1.0, 'Distrito Federal': 5.0, 'Durango': 5.0, 'Guanajuato': 3.0, 'Guerrero': 3.0, 'Hidalgo': 5.0, 'Jalisco': 5.0, 'México': 5.0, 'Michoacán de Ocampo': 3.0, 'Morelos': 5.0, 'Nayarit': 5.0, 'Nuevo León': 5.0, 'Oaxaca': 3.0, 'Puebla': 3.0, 'Querétaro Arteaga': 5.0, 'Quintana Roo': 5.0, 'San Luis Potosí': 3.0, 'Sinaloa': 6.0, 'Sonora': 5.0, 'Tabasco': 5.0, 'Tamaulipas': 5.0, 'Tlaxcala': 5.0, 'Veracruz de Ignacio de la Llave': 2.0, 'Yucatán': 5.0, 'Zacatecas': 5.0}
Stored 'edu_dict' (dict)


In [54]:
# Convert the list to a Series and add as new column
res_dataset['Region Mode Education Level'] = pd.Series(region_education, index=np.arange(1,33))
res_dataset['Education Definition'] = res_dataset['Region Mode Education Level'].map(education_dict)
res_dataset.tail()

Unnamed: 0,Region,μ Age Maternal Mortality,Region (n),μ Age Variance,Equal or Above μ Age in Mexico,Region Mode Education Level,Education Definition
28,Tamaulipas,28.26,377.0,45.45,True,5.0,SECUNDARIA COMPLETA
29,Tlaxcala,28.45,178.0,55.16,True,5.0,SECUNDARIA COMPLETA
30,Veracruz de Ignacio de la Llave,27.94,1239.0,53.6,False,2.0,PRIMARIA INCOMPLETA
31,Yucatán,27.5,258.0,54.89,False,5.0,SECUNDARIA COMPLETA
32,Zacatecas,27.5,191.0,48.1,False,5.0,SECUNDARIA COMPLETA


#### Calculate Mean Educational Level per Region

In [55]:
# Create an empty list to store region sample size and mean age of maternal death
region_community_size = []
size_dict = {}

# Create an iteration function to calculate the mean age of maternal death per region
for i in materna['Residence Code'].sort_values().unique():
    """Calculate Mean Local Community Size per Region"""

    sub_df = materna[materna['Residence Code'] == i ]
    region = str(sub_df['Residence Name'].unique()[0])
    local_community = mode(sub_df['Local Size Code'])[0][0]
    mean_size = round(local_community, 2)
    region_community_size.append(mean_size)
    size_dict[region] = mean_size

In [56]:
# Test output
print(len(region_community_size))
#region_education
print(size_dict)

# Store as a global variable
%store size_dict

32
{'Aguascalientes': 15.0, 'Baja California': 16.0, 'Baja California Sur': 13.0, 'Campeche': 13.0, 'Coahuila de Zaragoza': 15.0, 'Colima': 13.0, 'Chiapas': 1.0, 'Chihuahua': 1.0, 'Distrito Federal': 14.0, 'Durango': 1.0, 'Guanajuato': 1.0, 'Guerrero': 1.0, 'Hidalgo': 1.0, 'Jalisco': 16.0, 'México': 14.0, 'Michoacán de Ocampo': 1.0, 'Morelos': 13.0, 'Nayarit': 1.0, 'Nuevo León': 14.0, 'Oaxaca': 1.0, 'Puebla': 1.0, 'Querétaro Arteaga': 15.0, 'Quintana Roo': 15.0, 'San Luis Potosí': 1.0, 'Sinaloa': 15.0, 'Sonora': 15.0, 'Tabasco': 1.0, 'Tamaulipas': 14.0, 'Tlaxcala': 5.0, 'Veracruz de Ignacio de la Llave': 1.0, 'Yucatán': 15.0, 'Zacatecas': 1.0}
Stored 'size_dict' (dict)


In [57]:
# Convert the list to a Series and add as new column
res_dataset['Most Common Region Local Community Size'] = pd.Series(region_community_size, index=np.arange(1,33))
res_dataset['Local Community Definition'] = res_dataset['Most Common Region Local Community Size'].map(local_community_dict)
res_dataset.tail()

Unnamed: 0,Region,μ Age Maternal Mortality,Region (n),μ Age Variance,Equal or Above μ Age in Mexico,Region Mode Education Level,Education Definition,Most Common Region Local Community Size,Local Community Definition
28,Tamaulipas,28.26,377.0,45.45,True,5.0,SECUNDARIA COMPLETA,14.0,250000 a 499999 habitantes
29,Tlaxcala,28.45,178.0,55.16,True,5.0,SECUNDARIA COMPLETA,5.0,5000 a 9999 habitantes
30,Veracruz de Ignacio de la Llave,27.94,1239.0,53.6,False,2.0,PRIMARIA INCOMPLETA,1.0,1 a 999 habitantes
31,Yucatán,27.5,258.0,54.89,False,5.0,SECUNDARIA COMPLETA,15.0,500000 a 999999 habitantes
32,Zacatecas,27.5,191.0,48.1,False,5.0,SECUNDARIA COMPLETA,1.0,1 a 999 habitantes


#### Calculate Mean Presence of Medical Care per Region

In [58]:
# Create an empty list to store region sample size and mean age of maternal death
region_medical = []
medical_dict = {}

# Create an iteration function to calculate the mean age of maternal death per region
for i in materna['Residence Code'].sort_values().unique():
    """Calculate Mean Education per Region"""
    
    sub_df = materna[materna['Residence Code'] == i ]
    region = str(sub_df['Residence Name'].unique()[0])
    med_assist = sub_df['Received Medical Assistance'].mean()
    region_medical.append(med_assist) 
    medical_dict[region] = med_assist

In [59]:
# Test output
print(len(region_medical))
#region_medical

# Store as a global variable
%store medical_dict

32
Stored 'medical_dict' (dict)


In [60]:
# Convert the list to a Series and add as new column
res_dataset['Received Medical Assist'] = pd.Series(region_medical, index=np.arange(1,33))
res_dataset.head()

Unnamed: 0,Region,μ Age Maternal Mortality,Region (n),μ Age Variance,Equal or Above μ Age in Mexico,Region Mode Education Level,Education Definition,Most Common Region Local Community Size,Local Community Definition,Received Medical Assist
1,Aguascalientes,28.36,127.0,55.16,True,5.0,SECUNDARIA COMPLETA,15.0,500000 a 999999 habitantes,0.842105
2,Baja California,27.15,361.0,46.31,False,5.0,SECUNDARIA COMPLETA,16.0,1000000 a 1499999 habitantes,0.844575
3,Baja California Sur,27.56,66.0,55.37,False,6.0,BACHILLERATO O PREPARATORIA COMPLETA,13.0,100000 a 249999 habitantes,0.84127
4,Campeche,26.87,126.0,44.29,False,5.0,SECUNDARIA COMPLETA,13.0,100000 a 249999 habitantes,0.859504
5,Chiapas,28.02,310.0,45.02,False,5.0,SECUNDARIA COMPLETA,15.0,500000 a 999999 habitantes,0.877133


In [61]:
res_dataset[['Received Medical Assist', 'Region (n)']].groupby('Received Medical Assist',
                                                 as_index=False).count()

Unnamed: 0,Received Medical Assist,Region (n)
0,0.814346,1
1,0.822086,1
2,0.82684,1
3,0.836237,1
4,0.84127,1
5,0.842105,1
6,0.844318,1
7,0.844575,1
8,0.844893,1
9,0.845426,1


In [62]:
# Store as a global variable 
%store res_dataset

Stored 'res_dataset' (DataFrame)
