In [1]:
# Import the relevant python libraries for the analysis
import pandas as pd
from pandas import DataFrame
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
import scipy.stats as stats
import pylab as pl
import math

In [2]:
# Load and test dataset and set the index if applicable
metro_by_region = pd.read_excel('metro_area_by_state.xlsx')
metro_by_region.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76 entries, 0 to 75
Data columns (total 5 columns):
Name               74 non-null object
Status             74 non-null object
State(s)           74 non-null object
Population 2010    74 non-null float64
Population 2015    74 non-null float64
dtypes: float64(2), object(3)
memory usage: 3.0+ KB


In [3]:
metro_by_region

Unnamed: 0,Name,Status,State(s),Population 2010,Population 2015
0,,,,,
1,,,,,
2,Acapulco,Metropolitan Area,Guerrero,863431.0,886975.0
3,Acayucan,Metropolitan Area,Veracruz de Ignacio de la Llave,112996.0,120340.0
4,Aguascalientes,Metropolitan Area,Aguascalientes,932369.0,1044049.0
5,Campeche,Metropolitan Area,Campeche,259005.0,283025.0
6,Cancún,Metropolitan Area,Quintana Roo,677379.0,763121.0
7,Celaya,Metropolitan Area,Guanajuato,690442.0,731667.0
8,Chetumal,Metropolitan Area,Quintana Roo,207810.0,224080.0
9,Chihuahua,Metropolitan Area,Chihuahua,852533.0,918339.0


##### Remove Irregular Values
- Case 1: Remove all rows with 'NaN' in the column values.

In [4]:
#Drop NaN values 
metro_by_region = metro_by_region.dropna()
metro_by_region.head()

Unnamed: 0,Name,Status,State(s),Population 2010,Population 2015
2,Acapulco,Metropolitan Area,Guerrero,863431.0,886975.0
3,Acayucan,Metropolitan Area,Veracruz de Ignacio de la Llave,112996.0,120340.0
4,Aguascalientes,Metropolitan Area,Aguascalientes,932369.0,1044049.0
5,Campeche,Metropolitan Area,Campeche,259005.0,283025.0
6,Cancún,Metropolitan Area,Quintana Roo,677379.0,763121.0


In [5]:
# Organize dataset by State in alphabetical order
metro_by_region = metro_by_region.sort_values(by=['State(s)'],ascending=True)
metro_by_region.head()

Unnamed: 0,Name,Status,State(s),Population 2010,Population 2015
4,Aguascalientes,Metropolitan Area,Aguascalientes,932369.0,1044049.0
64,Tijuana,Metropolitan Area,Baja California,1751430.0,1840710.0
20,Ensenada,Metropolitan Area,Baja California,466814.0,486639.0
34,Mexicali,Metropolitan Area,Baja California,936826.0,988417.0
28,La Paz,Metropolitan Area,Baja California Sur,251871.0,272711.0


In [6]:
# Reset index 
metro_by_region = metro_by_region.reset_index()
metro_by_region.head()

Unnamed: 0,index,Name,Status,State(s),Population 2010,Population 2015
0,4,Aguascalientes,Metropolitan Area,Aguascalientes,932369.0,1044049.0
1,64,Tijuana,Metropolitan Area,Baja California,1751430.0,1840710.0
2,20,Ensenada,Metropolitan Area,Baja California,466814.0,486639.0
3,34,Mexicali,Metropolitan Area,Baja California,936826.0,988417.0
4,28,La Paz,Metropolitan Area,Baja California Sur,251871.0,272711.0


In [7]:
# Remove unnecessary columns 
metro_by_region = metro_by_region.drop(['index', 'Status'], axis=1)
metro_by_region.head()

Unnamed: 0,Name,State(s),Population 2010,Population 2015
0,Aguascalientes,Aguascalientes,932369.0,1044049.0
1,Tijuana,Baja California,1751430.0,1840710.0
2,Ensenada,Baja California,466814.0,486639.0
3,Mexicali,Baja California,936826.0,988417.0
4,La Paz,Baja California Sur,251871.0,272711.0


### List the Metropolitan Areas in each Province

In [8]:
# Create sample variables using metro_by_region array hold metropolitan areas per region 
# as a model to construct a function to parse through metro_by_region 
aqua = metro_by_region[metro_by_region['State(s)'] == 'Aguascalientes']
aqua = np.array(aqua['Name'])
print(aqua)

baja_cal = metro_by_region[metro_by_region['State(s)'] == 'Baja California']
baja_cal = np.array(baja_cal['Name'])
print(baja_cal)

['Aguascalientes']
['Tijuana' 'Ensenada' 'Mexicali']


In [9]:
# Organize Metro areas into Provinces

# Create an empty dictionary to store metropolitan areas per region
metro_region_dict = {}

# Create an iteration function to create metropolitan groups by region
for i in metro_by_region['State(s)'].sort_values().unique():
    """Organize Metropolitan Areas by Region"""
    
    #select one region
    region_sub_df = metro_by_region[metro_by_region['State(s)'] == i]
    
    #select all metro names and store in an array
    metro_array = np.array(region_sub_df['Name'])
    
    #add the metro_array as the value to the region key
    key = i
    metro_region_dict[key] = metro_array 
    
    print(i + ': ' + str(metro_array))

Aguascalientes: ['Aguascalientes']
Baja California: ['Tijuana' 'Ensenada' 'Mexicali']
Baja California Sur: ['La Paz']
Campeche: ['Campeche']
Chiapas: ['Tuxtla Gutiérrez' 'Tapachula']
Chihuahua: ['Delicias' 'Juárez' 'Chihuahua' 'Hidalgo del Parral']
Ciudad de México / Hidalgo / México: ['Valle de México\xa0[Greater Mexico City]']
Coahuila de Zaragoza: ['Piedras Negras' 'Saltillo' 'Monclova - Frontera']
Coahuila de Zaragoza / Durango: ['La Laguna\xa0(Comarca Lagunera, Torreón - Gómez Palacio)']
Colima: ['Tecomán' 'Colima - Villa de Álvarez']
Durango: ['Durango']
Guanajuato: ['Moroleón - Uriangato' 'Guanajuato' 'León' 'Celaya'
 'San Francisco del Rincón']
Guanajuato / Michoacán de Ocampo: ['La Piedad - Pénjamo']
Guerrero: ['Acapulco' 'Chilpancingo']
Hidalgo: ['Tula' 'Pachuca' 'Tulancingo']
Jalisco: ['Ocotlán' 'Guadalajara']
Jalisco / Nayarit: ['Puerto Vallarta']
Michoacán de Ocampo: ['Morelia' 'Zamora']
Morelos: ['Cuautla' 'Cuernavaca']
México: ['Toluca' 'Tianguistenco']
Nayarit: ['Tepic'

In [10]:
# Check to verify function worked as intended 
print(metro_region_dict['Aguascalientes'], metro_region_dict['Baja California'], metro_region_dict['Ciudad de México / Hidalgo / México'])

['Aguascalientes'] ['Tijuana' 'Ensenada' 'Mexicali'] ['Valle de México\xa0[Greater Mexico City]']


In [11]:
# Store new dictionary as a global variable that can be uploaded to other Jupyter Notebooks
%store metro_region_dict

Stored 'metro_region_dict' (dict)


In [12]:
len(metro_region_dict)

37

In [15]:
# Combine regions that are in multiple variables into one

# Coahuila de Zaragoza *and* Coahuila de Zaragoza / Durango
metro_region_dict['Coahuila de Zaragoza'] = metro_region_dict['Coahuila de Zaragoza'] + metro_region_dict['Coahuila de Zaragoza / Durango']

# Guanajuato *and* Guanajuato / Michoacán de Ocampo
metro_region_dict['Guanajuato'] = metro_region_dict['Guanajuato'] + metro_region_dict['Guanajuato / Michoacán de Ocampo']

# Jalisco *and* Jalisco / Nayarit
metro_region_dict['Jalisco'] = metro_region_dict['Jalisco'] + metro_region_dict['Jalisco / Nayarit']

# Puebla *and* Puebla / Tlaxcala
metro_region_dict['Puebla'] = metro_region_dict['Puebla'] + metro_region_dict['Puebla / Tlaxcala']

# Tamaulipas *and* Tamaulipas / Veracruz de Ignacio de la Llave
metro_region_dict['Tamaulipas'] = metro_region_dict['Tamaulipas'] + metro_region_dict['Tamaulipas / Veracruz de Ignacio de la Llave']


In [18]:
# Upload metro_by_region dictionary from metro_by_region_initial_examination Jupyter File
%store -r mexico_gdp

In [27]:
mexico_gdp.head(31)

Unnamed: 0,Metropolitan Areas,Year_2010,Year_2015
0,MEX01: Mexico City,21553.0,22587.0
1,MEX02: Guadalajara,16572.0,17636.0
2,MEX03: Monterrey,31365.0,28251.0
3,MEX04: Puebla,10304.0,10681.0
4,MEX05: Toluca,10423.0,10633.0
5,MEX06: Tijuana,17388.0,19047.0
6,MEX07: Leon,12476.0,14428.0
7,MEX08: Queretaro,19732.0,21741.0
8,MEX09: Merida,12688.0,13672.0
9,MEX10: Juarez,15330.0,18101.0


In [26]:
mexico_gdp.tail(33)

Unnamed: 0,Metropolitan Areas,Year_2010,Year_2015
31,MEX32: Pachuca de Soto,11391.0,11366.0
32,MEX33: Irapuato,12673.0,14428.0
33,MEX34: Ahome,13975.0,14894.0
34,MEX35: Matamoros,17302.0,17580.0
35,MEX36: Ensenada,17723.0,19041.0
36,MEX37: Poza Rica de Hidalgo,12093.0,12105.0
37,MEX38: Tepic,11481.0,11647.0
38,MEX39: Orizaba,12337.0,12105.0
39,MEX40: Mazatlan,13850.0,14894.0
40,MEX41: Cajeme,20208.0,22840.0


In [23]:
# Create a function that parses through the mexico_gdp['Metropolita Areas'] and the metro_region_dict
# to store the GDP values 

# Sample data for function

In [25]:
print(metro_region_dict['Aguascalientes'])

['Aguascalientes']


In [34]:
aqua_gdp_2010 = int(mexico_gdp[mexico_gdp['Metropolitan Areas'] == 'MEX21: Aguascalientes']['Year_2010'])
print('Aguascalientes 2010 GDP: ' + str(aqua_gdp_2010))

aqua_gdp_2015 = int(mexico_gdp[mexico_gdp['Metropolitan Areas'] == 'MEX21: Aguascalientes']['Year_2015'])
print('Aguascalientes 2015 GDP: ' + str(aqua_gdp_2015))

Aguascalientes 2010 GDP: 16597
Aguascalientes 2015 GDP: 19528


In [24]:
print(metro_region_dict['Baja California'])

['Tijuana' 'Ensenada' 'Mexicali']


In [33]:
baja_cali_gdp_2010 = int(mexico_gdp[mexico_gdp['Metropolitan Areas'] == 'MEX06: Tijuana']['Year_2010'])
baja_cali_gdp_2010 += int(mexico_gdp[mexico_gdp['Metropolitan Areas'] == 'MEX36: Ensenada']['Year_2010'])
baja_cali_gdp_2010 += int(mexico_gdp[mexico_gdp['Metropolitan Areas'] == 'MEX13: Mexicali']['Year_2010'])
baja_cali_gdp_2010 = baja_cali_gdp_2010 / 3
print('Baja California 2010 GDP: ' + str(baja_cali_gdp_2010))

baja_cali_gdp_2015 = int(mexico_gdp[mexico_gdp['Metropolitan Areas'] == 'MEX06: Tijuana']['Year_2015'])
baja_cali_gdp_2015 += int(mexico_gdp[mexico_gdp['Metropolitan Areas'] == 'MEX36: Ensenada']['Year_2015'])
baja_cali_gdp_2015 += int(mexico_gdp[mexico_gdp['Metropolitan Areas'] == 'MEX13: Mexicali']['Year_2015'])
baja_cali_gdp_2015 = baja_cali_gdp_2015 / 3
print('Baja California 2015 GDP: ' + str(baja_cali_gdp_2015))

Baja California 2010 GDP: 17526.333333333332
Baja California 2015 GDP: 19045.333333333332


In [35]:
print(metro_region_dict['Baja California Sur'])

['La Paz']


In [37]:
baja_cali_sur_gdp_2010 = int(mexico_gdp[mexico_gdp['Metropolitan Areas'] == 'MEX57: La Paz']['Year_2010'])
print('Baja California Sur 2010 GDP: ' + str(baja_cali_sur_gdp_2010))

baja_cali_sur_gdp_2015 = int(mexico_gdp[mexico_gdp['Metropolitan Areas'] == 'MEX57: La Paz']['Year_2015'])
print('Baja California Sur 2015 GDP: ' + str(baja_cali_sur_gdp_2015))

Baja California Sur 2010 GDP: 21260
Baja California Sur 2015 GDP: 21431


In [38]:
print(metro_region_dict['Campeche'])

['Campeche']


In [43]:
campe_gdp_2010 = int(mexico_gdp[mexico_gdp['Metropolitan Areas'] == 'MEX58: Campeche']['Year_2010'])
print('Campeche 2010 GDP: ' + str(campe_gdp_2010))

campe_gdp_2015 = int(mexico_gdp[mexico_gdp['Metropolitan Areas'] == 'MEX58: Campeche']['Year_2015'])
print('Campeche 2015 GDP: ' + str(campe_gdp_2015))

ValueError: cannot convert float NaN to integer

In [40]:
print(metro_region_dict['Chiapas'])

['Tuxtla Gutiérrez' 'Tapachula']


In [42]:
chiapas_gdp_2010 = int(mexico_gdp[mexico_gdp['Metropolitan Areas'] == 'MEX45: Tapachula']['Year_2010'])
print('Chiapas 2010 GDP: ' + str(chiapas_gdp_2010))

chiapas_gdp_2015 = int(mexico_gdp[mexico_gdp['Metropolitan Areas'] == 'MEX45: Tapachula']['Year_2015'])
print('Chiapas 2015 GDP: ' + str(chiapas_gdp_2015))

Chiapas 2010 GDP: 7169
Chiapas 2015 GDP: 6696


In [44]:
print(metro_region_dict['Chihuahua'])

['Delicias' 'Juárez' 'Chihuahua' 'Hidalgo del Parral']


In [45]:
chihua_gdp_2010 = int(mexico_gdp[mexico_gdp['Metropolitan Areas'] == 'MEX10: Juarez']['Year_2010'])
chihua_gdp_2010 += int(mexico_gdp[mexico_gdp['Metropolitan Areas'] == 'MEX18: Chihuahua']['Year_2010'])
chihua_gdp_2010 += int(mexico_gdp[mexico_gdp['Metropolitan Areas'] == 'MEX13: Mexicali']['Year_2010'])
chihua_gdp_2010 = chihua_gdp_2010 / 3
print('Chihuahua 2010 GDP: ' + str(chihua_gdp_2010))

chihua_gdp_2015 = int(mexico_gdp[mexico_gdp['Metropolitan Areas'] == 'MEX10: Juarez']['Year_2015'])
chihua_gdp_2015 += int(mexico_gdp[mexico_gdp['Metropolitan Areas'] == 'MEX18: Chihuahua']['Year_2015'])
chihua_gdp_2015 += int(mexico_gdp[mexico_gdp['Metropolitan Areas'] == 'MEX13: Mexicali']['Year_2015'])
chihua_gdp_2015 = chihua_gdp_2015 / 3
print('Chihuahua 2015 GDP: ' + str(chihua_gdp_2015))

Chihuahua 2010 GDP: 16296.0
Chihuahua 2015 GDP: 18416.666666666668


In [46]:
print(metro_region_dict['Ciudad de México / Hidalgo / México'])

['Valle de México\xa0[Greater Mexico City]']


In [47]:
ciudad_gdp_2010 = int(mexico_gdp[mexico_gdp['Metropolitan Areas'] == 'MEX01: Mexico City']['Year_2010'])
print('Ciudad de México 2010 GDP: ' + str(ciudad_gdp_2010))

ciudad_gdp_2015 = int(mexico_gdp[mexico_gdp['Metropolitan Areas'] == 'MEX01: Mexico City']['Year_2015'])
print('Ciudad de México 2015 GDP: ' + str(ciudad_gdp_2015))

Ciudad de México 2010 GDP: 21553
Ciudad de México 2015 GDP: 22587


In [48]:
print(metro_region_dict['Coahuila de Zaragoza'])

['Piedras NegrasLa Laguna\xa0(Comarca Lagunera, Torreón - Gómez Palacio)La Laguna\xa0(Comarca Lagunera, Torreón - Gómez Palacio)La Laguna\xa0(Comarca Lagunera, Torreón - Gómez Palacio)'
 'SaltilloLa Laguna\xa0(Comarca Lagunera, Torreón - Gómez Palacio)La Laguna\xa0(Comarca Lagunera, Torreón - Gómez Palacio)La Laguna\xa0(Comarca Lagunera, Torreón - Gómez Palacio)'
 'Monclova - FronteraLa Laguna\xa0(Comarca Lagunera, Torreón - Gómez Palacio)La Laguna\xa0(Comarca Lagunera, Torreón - Gómez Palacio)La Laguna\xa0(Comarca Lagunera, Torreón - Gómez Palacio)']


In [None]:
# split string method after space in 'MEX ##: ' to choose the region name
# if no match for region name, skip
# if match for region name, store value per year in year variable
# for each string match per region, add value to year variable then divide sum by len(#_matches)
# add year var to region_year as key (dictionary)
# print dictionary as a bar chart 
# plot dictionary region up/down in a boxplot 