# Preprocess original census data 2014
- Open original census data
- Extract all rows for maize
- Rename variables to English
- Add variables of interest (yield, country_code, harvest_year)
- Save file as CSV

In [1]:
# Imports
import pandas as pd
from pathlib import Path

In [2]:
# Folder with the untouched source files: 'original_data', a sibling of the current working directory
original_path = Path.cwd().parent.joinpath('original_data')
original_path

PosixPath('/home/vant/Documents/valencia/agml_workshop/inegi_censos/original_data')

In [3]:
# Workbook to load, located inside the original-data folder
file_path = original_path.joinpath('ena14_agri02.xlsx')

# Load the sheet into a DataFrame, skipping the first 5 rows of the file
df = pd.read_excel(io=file_path, skiprows=5)

In [4]:
# Preview the first rows as loaded; the leading rows still hold spreadsheet sub-headers and units
df.head()

Unnamed: 0,Entidad,Cultivo,Entidad federativa y cultivo con representatividad en la muestra,Superficie cultivada,Unnamed: 4,Producción
0,,,,Superficie sembrada,Superficie cosechada,
1,,,,Hectáreas,,Toneladas
2,,,,,,
3,Ags.,,Aguascalientes,,,
4,Ags.,Alfalfa,Alfalfa,6461.311717,6420.884037,


In [5]:
# Inspect the column labels pandas picked up from the sheet before renaming them
df.columns

Index(['Entidad', 'Cultivo',
       'Entidad federativa y cultivo con representatividad en la muestra',
       'Superficie cultivada', 'Unnamed: 4', 'Producción'],
      dtype='object')

In [6]:
# Explicit labels for the six columns, listed in the same order as they appear in the sheet
column_names = [
    'Entidad federativa',
    'Cultivo',
    'Entidad federativa y cultivo',
    'Total superficie sembrada',
    'Total superficie cosechada',
    'Producción total',
]

# Apply the labels positionally
df.columns = column_names

# The first three rows carry no data (sub-header / unit rows), so remove them and renumber the index
df = df.drop(index=[0, 1, 2]).reset_index(drop=True)

In [7]:
# Preview the frame after renaming the columns and dropping the leading non-data rows
df.head()

Unnamed: 0,Entidad federativa,Cultivo,Entidad federativa y cultivo,Total superficie sembrada,Total superficie cosechada,Producción total
0,Ags.,,Aguascalientes,,,
1,Ags.,Alfalfa,Alfalfa,6461.311717,6420.884037,
2,Ags.,Frijol,Frijol,10156.525639,8594.411379,3797.035222
3,Ags.,Maíz blanco,Maíz blanco,74292.287463,67277.37109,274683.292887
4,BC.,,Baja California,,,


In [8]:
# Build the state-code -> state-name lookup table.
# Rows without a state code or without a label cannot be used, so they are removed from df in place.
df.dropna(subset=['Entidad federativa', 'Entidad federativa y cultivo'], inplace=True)

# State header rows are the ones whose 'Cultivo' is NaN; the "Perennes" and "Anuales" labels are left out
is_header_row = df['Cultivo'].isna()
is_excluded_label = df['Entidad federativa y cultivo'].isin(["Perennes", "Anuales"])
estado_codigo_nombre = df.loc[
    is_header_row & ~is_excluded_label,
    ['Entidad federativa', 'Entidad federativa y cultivo'],
]
estado_codigo_nombre.columns = ["codigo", "nombre"]
# Show the resulting lookup table
print(estado_codigo_nombre)


     codigo                           nombre
0      Ags.                   Aguascalientes
4       BC.                  Baja California
9      BCS.              Baja California Sur
14    Camp.                         Campeche
17    Coah.             Coahuila de Zaragoza
25     Col.                           Colima
29    Chis.                          Chiapas
33    Chih.                        Chihuahua
39      DF.                 Distrito Federal
42     Dgo.                          Durango
48     Gto.                       Guanajuato
54     Gro.                         Guerrero
58     Hgo.                          Hidalgo
62     Jal.                          Jalisco
66     Mex.                           México
69    Mich.              Michoacán de Ocampo
76     Mor.                          Morelos
80     Nay.                          Nayarit
86      NL.                       Nuevo León
91     Oax.                           Oaxaca
96     Pue.                           Puebla
100    Qro

In [9]:
# Load the state key table used to derive adm_id.
# adm_id is read as text (dtype str) so pandas does not parse it as a number.
state_key = pd.read_csv("state_key.csv", dtype={'adm_id': str})

# Show up to 32 rows of the key table
state_key.head(32)

Unnamed: 0,state_code,state_name,adm_id,state_acrs
0,01 Ags,Aguascalientes,1,Ags
1,02 BC,Baja California,2,BC
2,03 BCS,Baja California Sur,3,BCS
3,05 Coa,Coahuila de Zaragoza,5,Coa
4,06 Col,Colima,6,Col
5,07 Chs,Chiapas,7,Chs
6,08 Chi,Chihuahua,8,Chi
7,09 CMX,Ciudad de México,9,CMX
8,10 Dgo,Durango,10,Dgo
9,11 Gto,Guanajuato,11,Gto


In [10]:
# Keep only crop-level rows: rows with no crop (the state header rows) are removed from df in place
df.dropna(subset=['Cultivo'], inplace=True)
# Leave out the combined state/crop label column (dropped by name); in the rows previewed above
# it repeats the value of 'Cultivo'
df_clean = df.drop(columns=["Entidad federativa y cultivo"])
df_clean.head()


Unnamed: 0,Entidad federativa,Cultivo,Total superficie sembrada,Total superficie cosechada,Producción total
1,Ags.,Alfalfa,6461.311717,6420.884037,
2,Ags.,Frijol,10156.525639,8594.411379,3797.035222
3,Ags.,Maíz blanco,74292.287463,67277.37109,274683.292887
5,BC.,Alfalfa,28979.202828,28780.852218,
6,BC.,Algodón,28204.512802,28146.903202,84765.363171


In [11]:
# Select every crop whose name contains "Maíz" (case-insensitive) and work on an independent copy.
# State codes keep their trailing dot (e.g. 'Ags.'): the code -> name lookup used later is keyed
# on that dotted form.
is_maize = df_clean['Cultivo'].str.contains('Maíz', case=False)
maiz_df = df_clean.loc[is_maize].copy()

maiz_df.head(10)

Unnamed: 0,Entidad federativa,Cultivo,Total superficie sembrada,Total superficie cosechada,Producción total
3,Ags.,Maíz blanco,74292.287463,67277.37109,274683.292887
12,BCS.,Maíz blanco,5351.2698,5147.8698,35171.0482
16,Camp.,Maíz blanco,167105.616512,148953.002642,408859.462636
20,Coah.,Maíz blanco,49616.524801,37750.320798,70988.073773
28,Col.,Maíz blanco,16981.536845,16320.717689,56074.4033
32,Chis.,Maíz blanco,572650.95816,543991.815373,1165423.163722
38,Chih.,Maíz blanco,151801.744651,141981.005084,488235.542461
41,DF.,Maíz blanco,4630.825465,4460.835624,8731.303384
46,Dgo.,Maíz blanco,147282.294479,142243.870047,507384.326924
47,Dgo.,Maíz forrajero,37609.693629,36616.705641,


In [12]:
# (rows, columns) of the maize-only table
maiz_df.shape

(33, 5)

In [13]:
# Derive adm_id in two lookups: state code -> state name -> adm_id
code_to_name = estado_codigo_nombre.set_index('codigo')['nombre']
name_to_adm_id = state_key.set_index('state_name')['adm_id']
maiz_df['adm_id'] = maiz_df['Entidad federativa'].map(code_to_name).map(name_to_adm_id)

# The capital was renamed: this survey still lists it as "Distrito Federal" (DF.), while the key
# table lists it as "Ciudad de México", so the name lookup cannot match and the id is set by hand.
maiz_df.loc[maiz_df['Entidad federativa'] == 'DF.', 'adm_id'] = '09'

# Fail fast on any state that did not get an id. Otherwise the NaN would only surface later,
# as an opaque TypeError when the country prefix is concatenated to adm_id.
unmapped_states = maiz_df.loc[maiz_df['adm_id'].isna(), 'Entidad federativa'].unique()
if len(unmapped_states) > 0:
    raise ValueError(f"No adm_id found for state code(s): {list(unmapped_states)}")

maiz_df.head(20)

Unnamed: 0,Entidad federativa,Cultivo,Total superficie sembrada,Total superficie cosechada,Producción total,adm_id
3,Ags.,Maíz blanco,74292.287463,67277.37109,274683.292887,1
12,BCS.,Maíz blanco,5351.2698,5147.8698,35171.0482,3
16,Camp.,Maíz blanco,167105.616512,148953.002642,408859.462636,4
20,Coah.,Maíz blanco,49616.524801,37750.320798,70988.073773,5
28,Col.,Maíz blanco,16981.536845,16320.717689,56074.4033,6
32,Chis.,Maíz blanco,572650.95816,543991.815373,1165423.163722,7
38,Chih.,Maíz blanco,151801.744651,141981.005084,488235.542461,8
41,DF.,Maíz blanco,4630.825465,4460.835624,8731.303384,9
46,Dgo.,Maíz blanco,147282.294479,142243.870047,507384.326924,10
47,Dgo.,Maíz forrajero,37609.693629,36616.705641,,10


In [14]:
# English column labels, in the current column order of maiz_df
english_col_names = [
    'state',
    'crop_name',
    'planted_area',
    'harvest_area',
    'production',
    'adm_id',
]
maiz_df.columns = english_col_names

# Spanish -> English crop names
translations = {
    'Maíz forrajero': 'Forage corn',
    'Maíz amarillo': 'Yellow corn',
    'Maíz blanco': 'White corn'
}

# Translate the values of the "crop_name" column; names not listed above are left untouched
translated_crops = maiz_df["crop_name"].replace(translations)
maiz_df.loc[:, "crop_name"] = translated_crops

maiz_df.head(5)

Unnamed: 0,state,crop_name,planted_area,harvest_area,production,adm_id
3,Ags.,White corn,74292.287463,67277.37109,274683.292887,1
12,BCS.,White corn,5351.2698,5147.8698,35171.0482,3
16,Camp.,White corn,167105.616512,148953.002642,408859.462636,4
20,Coah.,White corn,49616.524801,37750.320798,70988.073773,5
28,Col.,White corn,16981.536845,16320.717689,56074.4033,6


In [15]:
# NOTE: this cell is not idempotent (production is rescaled and adm_id is prefixed on every run);
# re-run from the maize extraction cell before executing it again.

# Create country and year columns
maiz_df["country_code"] = "MX"
maiz_df["harvest_year"] = "2014"

# Coerce the measures to numeric BEFORE deriving anything from them, so the yield is computed
# on real floats and any non-numeric cell becomes NaN instead of breaking the division.
for measure in ("planted_area", "harvest_area", "production"):
    maiz_df[measure] = pd.to_numeric(maiz_df[measure], errors="coerce")

# Yield in tonnes/ha: computed while production is still expressed in tonnes
maiz_df["yield"] = maiz_df["production"] / maiz_df["harvest_area"]

# Rescale production from tonnes to megatonnes
TONNES_PER_MEGATONNE = 1_000_000
maiz_df["production"] = maiz_df["production"] / TONNES_PER_MEGATONNE

# Target columns, in output order (the 'state' column is intentionally left out)
ordered_columns = [
    'crop_name', 'country_code', 'adm_id',
    'planted_area', 'harvest_area', 'harvest_year', 'yield', 'production'
]

# Take an explicit copy so the column assignment below does not write into a slice
# (avoids pandas' SettingWithCopyWarning)
maiz_df = maiz_df[ordered_columns].copy()

# Prefix adm_id with the country code "MX" (no separator), e.g. '01' -> 'MX01'
maiz_df["adm_id"] = "MX" + maiz_df["adm_id"]

print(maiz_df.shape)
maiz_df.head()

(33, 8)


Unnamed: 0,crop_name,country_code,adm_id,planted_area,harvest_area,harvest_year,yield,production
3,White corn,MX,MX01,74292.287463,67277.37109,2014,4.082848,0.274683
12,White corn,MX,MX03,5351.2698,5147.8698,2014,6.832156,0.035171
16,White corn,MX,MX04,167105.616512,148953.002642,2014,2.744889,0.408859
20,White corn,MX,MX05,49616.524801,37750.320798,2014,1.880463,0.070988
28,White corn,MX,MX06,16981.536845,16320.717689,2014,3.43578,0.056074


In [16]:
# Provenance and units describing the exported table
metadata = {
    "source": "INEGI Encuesta Nacional Agropecuaria 2014",
    "yield": "tonnes/ha",
    "Production": "megatonnes",
    "Areas": "hectares",
}

# Attach the description to the DataFrame under the 'metadata' attribute
maiz_df.attrs.update({'metadata': metadata})

# Show the attributes now stored on the DataFrame
maiz_df.attrs

{'metadata': {'source': 'INEGI Encuesta Nacional Agropecuaria 2014',
  'yield': 'tonnes/ha',
  'Production': 'megatonnes',
  'Areas': 'hectares'}}

In [17]:
# Persist the results
import json

# Table -> CSV, without writing the row index
maiz_df.to_csv('maize_data_2014.csv', index=False)

# Metadata -> companion JSON file
with open('maize_metadata_2014.json', 'w') as metadata_file:
    metadata_file.write(json.dumps(metadata))

In [18]:
# Check the saved data by reading both files back.
# The CSV was written with index=False, so it has no index column: do not pass index_col=0,
# which would turn 'crop_name' into the index instead of keeping it as a regular column.
maiz_df2 = pd.read_csv('maize_data_2014.csv')

# Load metadata from JSON
with open('maize_metadata_2014.json', 'r') as file:
    metadata = json.load(file)

# Assign metadata back to the reloaded DataFrame
maiz_df2.attrs['metadata'] = metadata

maiz_df2.attrs

{'metadata': {'source': 'INEGI Encuesta Nacional Agropecuaria 2014',
  'yield': 'tonnes/ha',
  'Production': 'megatonnes',
  'Areas': 'hectares'}}

In [19]:
# Distinct crop names present in the final maize table
valores_unicos = maiz_df.loc[:, "crop_name"].unique()
print(valores_unicos)


['White corn' 'Forage corn']
