In [34]:
# Step 1: Import the relevant python libraries for the analysis
import pandas as pd
from pandas import DataFrame
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [35]:
# Step 2: Read the regional GDP-per-capita CSV into a DataFrame.
# No index column is set here, so pandas keeps its default integer index.
gdp_csv_path = 'mexico_region_gdp_per_capita.csv'
mexico_gdp = pd.read_csv(gdp_csv_path)

# Quick sanity check: column names, non-null counts and dtypes.
mexico_gdp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 3 columns):
Metropolitan Areas    64 non-null object
Year_2010             64 non-null object
Year_2015             64 non-null object
dtypes: object(3)
memory usage: 1.6+ KB


In [36]:
# Preview the first five rows to eyeball the raw values in each column.
mexico_gdp.head(n=5)

Unnamed: 0,Metropolitan Areas,Year_2010,Year_2015
0,MEX51: Minatitlan,12108,12105
1,MEX58: Campeche,..,..
2,MEX46: Victoria,17832,17581
3,MEX30: Durango,13269,13849
4,MEX04: Puebla,10304,10681


In [37]:
# Step 3: Build a per-column profiling table for mexico_gdp.
# Each Series below is indexed by column name, so concatenating them
# side by side yields one row per column of the original frame.
mex_gdp_dtypes = pd.concat(
    [
        mexico_gdp.dtypes,          # dtype pandas assigned to the column
        mexico_gdp.isnull().sum(),  # number of null (NaN/None) entries
        mexico_gdp.nunique(),       # number of distinct values
        mexico_gdp.count(),         # number of non-null entries
    ],
    axis=1,
    keys=['DataType', 'MissingVal', 'NumUnique', 'Count'],
)
mex_gdp_dtypes

Unnamed: 0,DataType,MissingVal,NumUnique,Count
Metropolitan Areas,object,0,64,64
Year_2010,object,0,63,64
Year_2015,object,0,43,64


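*Preliminary Observations*: At first glance the dataset seems rather clean, as `isnull()` reports no missing values. Note, however, that some rows (e.g. 'MEX58: Campeche') contain the placeholder `..` instead of a number; these are effectively missing values that `isnull()` does not detect, and they are likely the reason the year columns load as `object` rather than as numbers. In addition, the list of 'Metropolitan Areas' appears to be twice as long as expected: Mexico has 32 states (federal entities), but the list contains 64 entries. This requires further exploration.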

In [42]:
# List every distinct metropolitan-area label in sorted order.
sorted(mexico_gdp['Metropolitan Areas'].unique())

['MEX01: Mexico City',
 'MEX02: Guadalajara',
 'MEX03: Monterrey',
 'MEX04: Puebla',
 'MEX05: Toluca',
 'MEX06: Tijuana',
 'MEX07: Leon',
 'MEX08: Queretaro',
 'MEX09: Merida',
 'MEX10: Juarez',
 'MEX11: San Luis Potosi',
 'MEX12: Torreon',
 'MEX13: Mexicali',
 'MEX14: Cuernavaca',
 'MEX15: Centro',
 'MEX16: Culiacan',
 'MEX17: Morelia',
 'MEX18: Chihuahua',
 'MEX19: Veracruz',
 'MEX20: Hermosillo',
 'MEX21: Aguascalientes',
 'MEX22: Acapulco de Juarez',
 'MEX23: Tampico',
 'MEX24: Oaxaca de Juarez',
 'MEX25: Reynosa',
 'MEX26: Xalapa',
 'MEX27: Saltillo',
 'MEX28: Benito Juarez',
 'MEX29: Celaya',
 'MEX30: Durango',
 'MEX31: Tuxtla Gutierrez',
 'MEX32: Pachuca de Soto',
 'MEX33: Irapuato',
 'MEX34: Ahome',
 'MEX35: Matamoros',
 'MEX36: Ensenada',
 'MEX37: Poza Rica de Hidalgo',
 'MEX38: Tepic',
 'MEX39: Orizaba',
 'MEX40: Mazatlan',
 'MEX41: Cajeme',
 'MEX42: Nuevo Laredo',
 'MEX43: Guadalupe',
 'MEX44: Coatzacoalcos',
 'MEX45: Tapachula',
 'MEX46: Victoria',
 'MEX47: Tlaxcala',
 'MEX

In [49]:
# Cast the two year columns from strings to numbers.
# errors='coerce' turns any value that cannot be parsed as a number into NaN.
for year_col in ('Year_2010', 'Year_2015'):
    mexico_gdp[year_col] = pd.to_numeric(mexico_gdp[year_col], errors='coerce')

# Confirm the year columns are now numeric dtypes.
mexico_gdp.dtypes

Metropolitan Areas     object
Year_2010             float64
Year_2015             float64
dtype: object

In [51]:
# Step 4: Summary statistics (count, mean, std, quartiles, min/max)
# restricted to the numeric columns of mexico_gdp.
numeric_summary = mexico_gdp.describe(include=['number'])
numeric_summary

Unnamed: 0,Year_2010,Year_2015
count,62.0,62.0
mean,15115.596774,15444.177419
std,5268.897192,4965.949402
min,7102.0,6696.0
25%,12094.5,11947.0
50%,13550.5,14661.0
75%,17804.75,19045.5
max,31370.0,28251.0
