In [29]:
# Import the relevant python libraries for the analysis
import pandas as pd
from pandas import DataFrame
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
import scipy.stats as stats
import pylab as pl
import math

In [30]:
# Read the metro-area workbook into a DataFrame, then print its column/dtype summary
metro_by_region = pd.read_excel(io='metro_area_by_state.xlsx')
metro_by_region.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76 entries, 0 to 75
Data columns (total 5 columns):
Name               74 non-null object
Status             74 non-null object
State(s)           74 non-null object
Population 2010    74 non-null float64
Population 2015    74 non-null float64
dtypes: float64(2), object(3)
memory usage: 3.0+ KB


In [31]:
# Preview only the first rows instead of rendering the whole frame;
# the leading all-NaN rows are still visible in this slice.
metro_by_region.head(10)

Unnamed: 0,Name,Status,State(s),Population 2010,Population 2015
0,,,,,
1,,,,,
2,Acapulco,Metropolitan Area,Guerrero,863431.0,886975.0
3,Acayucan,Metropolitan Area,Veracruz de Ignacio de la Llave,112996.0,120340.0
4,Aguascalientes,Metropolitan Area,Aguascalientes,932369.0,1044049.0
5,Campeche,Metropolitan Area,Campeche,259005.0,283025.0
6,Cancún,Metropolitan Area,Quintana Roo,677379.0,763121.0
7,Celaya,Metropolitan Area,Guanajuato,690442.0,731667.0
8,Chetumal,Metropolitan Area,Quintana Roo,207810.0,224080.0
9,Chihuahua,Metropolitan Area,Chihuahua,852533.0,918339.0


##### Remove Irregular Values
- Case 1: Remove every row that contains `NaN` in any column.

In [33]:
# Keep only the rows that have a value in every column
metro_by_region = metro_by_region.dropna(axis=0, how='any')
metro_by_region.head()

Unnamed: 0,Name,Status,State(s),Population 2010,Population 2015
2,Acapulco,Metropolitan Area,Guerrero,863431.0,886975.0
3,Acayucan,Metropolitan Area,Veracruz de Ignacio de la Llave,112996.0,120340.0
4,Aguascalientes,Metropolitan Area,Aguascalientes,932369.0,1044049.0
5,Campeche,Metropolitan Area,Campeche,259005.0,283025.0
6,Cancún,Metropolitan Area,Quintana Roo,677379.0,763121.0


In [35]:
# Order the rows alphabetically by state
metro_by_region = metro_by_region.sort_values('State(s)')
metro_by_region.head()

Unnamed: 0,Name,Status,State(s),Population 2010,Population 2015
4,Aguascalientes,Metropolitan Area,Aguascalientes,932369.0,1044049.0
64,Tijuana,Metropolitan Area,Baja California,1751430.0,1840710.0
20,Ensenada,Metropolitan Area,Baja California,466814.0,486639.0
34,Mexicali,Metropolitan Area,Baja California,936826.0,988417.0
28,La Paz,Metropolitan Area,Baja California Sur,251871.0,272711.0


In [36]:
# Renumber the rows from 0 after sorting; the previous labels are kept as an 'index' column
metro_by_region = metro_by_region.reset_index(drop=False)
metro_by_region.head()

Unnamed: 0,index,Name,Status,State(s),Population 2010,Population 2015
0,4,Aguascalientes,Metropolitan Area,Aguascalientes,932369.0,1044049.0
1,64,Tijuana,Metropolitan Area,Baja California,1751430.0,1840710.0
2,20,Ensenada,Metropolitan Area,Baja California,466814.0,486639.0
3,34,Mexicali,Metropolitan Area,Baja California,936826.0,988417.0
4,28,La Paz,Metropolitan Area,Baja California Sur,251871.0,272711.0


In [37]:
# Discard the leftover 'index' column (the pre-sort row labels) and 'Status'
metro_by_region = metro_by_region.drop(columns=['index', 'Status'])
metro_by_region.head()

Unnamed: 0,Name,State(s),Population 2010,Population 2015
0,Aguascalientes,Aguascalientes,932369.0,1044049.0
1,Tijuana,Baja California,1751430.0,1840710.0
2,Ensenada,Baja California,466814.0,486639.0
3,Mexicali,Baja California,936826.0,988417.0
4,La Paz,Baja California Sur,251871.0,272711.0


### Step 2: Organize the Data for Analysis (in other files)

In [45]:
# Prototype the per-state lookup on two states before generalizing it:
# select the 'Name' values of the rows for one state and store them as an array.
aqua = np.array(
    metro_by_region.loc[metro_by_region['State(s)'] == 'Aguascalientes', 'Name']
)
print(aqua)

baja_cal = np.array(
    metro_by_region.loc[metro_by_region['State(s)'] == 'Baja California', 'Name']
)
print(baja_cal)

['Aguascalientes']
['Tijuana' 'Ensenada' 'Mexicali']


In [47]:
# Organize metro areas by state

def build_metro_region_dict(metro_frame):
    """Map each 'State(s)' value to a NumPy array of its metro-area names.

    Parameters
    ----------
    metro_frame : pandas.DataFrame
        Frame with a 'State(s)' column and a 'Name' column.

    Returns
    -------
    dict
        Keys are the distinct 'State(s)' values in sorted order; each value is
        an array of the 'Name' entries for that state, in the frame's row
        order. Rows with a missing 'State(s)' are skipped (groupby default).
    """
    # groupby sorts the keys and walks the frame once, instead of re-filtering
    # the whole frame for every state.
    return {
        state: np.array(names)
        for state, names in metro_frame.groupby('State(s)')['Name']
    }


# Dictionary of metropolitan areas per region
metro_region_dict = build_metro_region_dict(metro_by_region)

In [63]:
# Spot-check the dictionary on three known states (printed on one line)
sample_states = (
    'Aguascalientes',
    'Baja California',
    'Ciudad de México / Hidalgo / México',
)
print(*(metro_region_dict[state] for state in sample_states))

['Aguascalientes'] ['Tijuana' 'Ensenada' 'Mexicali'] ['Valle de México\xa0[Greater Mexico City]']


## List the Metropolitan Areas in each State

In [62]:
# Print every region key stored in metro_region_dict, one per line.
# TODO: also print each region's metro areas; look them up with
# metro_region_dict[region], not by indexing metro_by_region.
for region in metro_region_dict.keys():
    print(region)

Aguascalientes
Baja California
Baja California Sur
Campeche
Chiapas
Chihuahua
Ciudad de México / Hidalgo / México
Coahuila de Zaragoza
Coahuila de Zaragoza / Durango
Colima
Durango
Guanajuato
Guanajuato / Michoacán de Ocampo
Guerrero
Hidalgo
Jalisco
Jalisco / Nayarit
Michoacán de Ocampo
Morelos
México
Nayarit
Nuevo León
Oaxaca
Puebla
Puebla / Tlaxcala
Querétaro / Guanajuato
Quintana Roo
San Luis Potosí
Sinaloa
Sonora
Tabasco
Tamaulipas
Tamaulipas / Veracruz de Ignacio de la Llave
Tlaxcala
Veracruz de Ignacio de la Llave
Yucatán
Zacatecas
