# Preprocess original census data 2019
- Open original census data
- Extract all rows for maize
- Rename variables to english
- Add variables of interest (yield, country_code, harvest_year)
- Save file as csv

In [1]:
# Imports
import pandas as pd
from pathlib import Path

In [2]:
# Paths
original_path = Path.cwd().parent / 'original_data' 
original_path

PosixPath('/home/vant/Documents/valencia/agml_workshop/inegi_censos/original_data')

In [3]:
# Replace 'file_path.xlsx' with the path to your Excel file
file_path = original_path/'ena19_ent_agri02.xlsx'

# Read the Excel file into a Pandas DataFrame
df = pd.read_excel(file_path,skiprows=4)

In [4]:
df.head()

Unnamed: 0,Entidad federativa,Cultivo seleccionado,Superficie cultivada,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Producción,Unnamed: 9,Unnamed: 10
0,,,Total,,Modalidad hídrica,,,,,,
1,,,,,De temporal,,De riego,,Total,Modalidad hídrica,
2,,,Superficie sembrada,Superficie cosechada,Superficie sembrada,Superficie cosechada,Superficie sembrada,Superficie cosechada,,Bajo temporal,Bajo riego
3,,,,,,,,,,,
4,,,Hectáreas,,,,,,Toneladas,,


In [5]:
df.columns

Index(['Entidad federativa', 'Cultivo seleccionado', 'Superficie cultivada',
       'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7',
       'Producción', 'Unnamed: 9', 'Unnamed: 10'],
      dtype='object')

In [6]:
# Define new column names
column_names = ['Entidad federativa', 'Cultivo','Total superficie sembrada','Total superficie cosechada',
                'MH-temporal superficie sembrada','MH-temporal superficie cosechada','MH-riego superficie sembrada','MH-riego superficie cosechada',
                'Producción total', 'MH-temporal producción','MH-riego producción']

# Rename the columns
df.columns = column_names

# Delete rows with initial no data
df = df.drop(index=range(6)).reset_index(drop=True)

In [7]:
df.dropna(subset=['Cultivo'], inplace=True)
df.head()

Unnamed: 0,Entidad federativa,Cultivo,Total superficie sembrada,Total superficie cosechada,MH-temporal superficie sembrada,MH-temporal superficie cosechada,MH-riego superficie sembrada,MH-riego superficie cosechada,Producción total,MH-temporal producción,MH-riego producción
0,Baja California,Anuales,,,,,,,,,
1,Baja California,Cebolla,3722.67205,3721.64905,0.0,0.0,3722.67205,3721.64905,89666.876496,0.0,89666.876496
2,Baja California,Maíz blanco,635.8649,635.8649,0.0,0.0,635.8649,635.8649,6286.43536,0.0,6286.43536
3,Baja California,Trigo grano,82894.463903,80554.267803,3846.1392,3474.2976,79048.324703,77079.970203,427174.268979,12188.23215,414986.036829
4,Baja California Sur,Anuales,,,,,,,,,


In [8]:
# Extract only maize
maiz_df = df[df['Cultivo'].str.contains('Maíz', case=False)].copy()
maiz_df.head(5)

Unnamed: 0,Entidad federativa,Cultivo,Total superficie sembrada,Total superficie cosechada,MH-temporal superficie sembrada,MH-temporal superficie cosechada,MH-riego superficie sembrada,MH-riego superficie cosechada,Producción total,MH-temporal producción,MH-riego producción
2,Baja California,Maíz blanco,635.8649,635.8649,0.0,0.0,635.8649,635.8649,6286.43536,0.0,6286.43536
5,Baja California Sur,Maíz blanco,5596.0026,5511.54,102.6676,102.6676,5493.335,5408.8724,37869.150169,44.0004,37825.149769
16,Chiapas,Maíz blanco,412950.845178,395995.111668,396831.302078,379875.568568,16119.5431,16119.5431,1084385.734431,1024625.797131,59759.9373
21,Chihuahua,Maíz amarillo,287635.764182,281470.829138,51017.316192,48237.309148,236618.44799,233233.51999,2481299.064627,232423.322577,2248875.74205
22,Chihuahua,Maíz blanco,43774.04167,38248.48574,33206.05267,27989.88314,10567.989,10258.6026,101107.46144,33516.17514,67591.2863


In [9]:
maiz_df.shape

(19, 11)

In [10]:
# translate colnames to english
english_col_names = ['state_name', 'crop_name','planted_area','harvest_area',
                'Water Modality - Temporary - Cultivated area - Sown',
                'Water Modality - Temporary - Cultivated area - Harvested',
                'Water Modality - Irrigation - Cultivated area - Sown',
                'Water Modality - Irrigation - Cultivated area - Harvested',
                'production',
                'Water Modality - Temporary - Production',
                'Water Modality - Irrigation - Production']

maiz_df.columns = english_col_names

# translate to English crop names
# Define translations
translations = {
    'Maíz forrajero': 'Forage corn',
    'Maíz amarillo': 'Yellow corn',
    'Maíz blanco': 'White corn'
}

# Replace the values in the "Cultivo" column with their English translations
maiz_df.loc[:, "crop_name"] = maiz_df["crop_name"].replace(translations)

maiz_df.head(5)

Unnamed: 0,state_name,crop_name,planted_area,harvest_area,Water Modality - Temporary - Cultivated area - Sown,Water Modality - Temporary - Cultivated area - Harvested,Water Modality - Irrigation - Cultivated area - Sown,Water Modality - Irrigation - Cultivated area - Harvested,production,Water Modality - Temporary - Production,Water Modality - Irrigation - Production
2,Baja California,White corn,635.8649,635.8649,0.0,0.0,635.8649,635.8649,6286.43536,0.0,6286.43536
5,Baja California Sur,White corn,5596.0026,5511.54,102.6676,102.6676,5493.335,5408.8724,37869.150169,44.0004,37825.149769
16,Chiapas,White corn,412950.845178,395995.111668,396831.302078,379875.568568,16119.5431,16119.5431,1084385.734431,1024625.797131,59759.9373
21,Chihuahua,Yellow corn,287635.764182,281470.829138,51017.316192,48237.309148,236618.44799,233233.51999,2481299.064627,232423.322577,2248875.74205
22,Chihuahua,White corn,43774.04167,38248.48574,33206.05267,27989.88314,10567.989,10258.6026,101107.46144,33516.17514,67591.2863


In [11]:
# Read key value to create adm_id
state_key = pd.read_csv("state_key.csv", dtype={'adm_id': str})

state_key.head()

Unnamed: 0,state_code,state_name,adm_id,state_acrs
0,01 Ags,Aguascalientes,1,Ags
1,02 BC,Baja California,2,BC
2,03 BCS,Baja California Sur,3,BCS
3,05 Coa,Coahuila de Zaragoza,5,Coa
4,06 Col,Colima,6,Col


In [12]:
# Add adm_id,year, change names and drop columns
maiz_df['adm_id'] = maiz_df['state_name'].map(state_key.set_index('state_name')['adm_id'])

# Add columns
maiz_df["country_code"] = "MX"
maiz_df["harvest_year"] = "2019"

#create yield
maiz_df["yield"] = maiz_df["production"] / maiz_df["harvest_area"]

# tranform production in tonnes to mtonnes
maiz_df['production'] = maiz_df['production']/1000000

# Reorder
ordered_columns = [
    'crop_name','country_code','adm_id', 
    'planted_area', 'harvest_area','harvest_year', 'yield','production'
]

# Reorder the DataFrame columns
maiz_df = maiz_df[ordered_columns]

# Add prefix "MX-" to the 'adm_id' column# Convert 'adm_id' column to string type
maiz_df['adm_id'] = maiz_df['adm_id'].astype(str)
maiz_df['adm_id'] = maiz_df['adm_id'].apply(lambda x: 'MX' + x)

print(maiz_df.shape)

maiz_df.head()


(19, 8)


Unnamed: 0,crop_name,country_code,adm_id,planted_area,harvest_area,harvest_year,yield,production
2,White corn,MX,MX02,635.8649,635.8649,2019,9.886432,0.006286
5,White corn,MX,MX03,5596.0026,5511.54,2019,6.870884,0.037869
16,White corn,MX,MX07,412950.845178,395995.111668,2019,2.738382,1.084386
21,Yellow corn,MX,MX08,287635.764182,281470.829138,2019,8.815475,2.481299
22,White corn,MX,MX08,43774.04167,38248.48574,2019,2.643437,0.101107


In [13]:
# Define metadata
metadata = {
    "source":"INEGI Encuesta Nacional Agropecuaria 2019",
    "Production": "megatonnes",
    "yield": "tonnes/ha",
    "Areas": "hectares",
    "Note": "Data for states of Aguascalientes, Coahuila, and Quintana Roo is not published because the collected information from the selected crops is insufficient to obtain estimated data."
}

# Store metadata in attributes or dictionaries
maiz_df.attrs['metadata'] = metadata

# Display the modified DataFrame
maiz_df.attrs

{'metadata': {'source': 'INEGI Encuesta Nacional Agropecuaria 2019',
  'Production': 'megatonnes',
  'yield': 'tonnes/ha',
  'Areas': 'hectares',
  'Note': 'Data for states of Aguascalientes, Coahuila, and Quintana Roo is not published because the collected information from the selected crops is insufficient to obtain estimated data.'}}

In [14]:
# Saving data
# Save DataFrame to CSV
maiz_df.to_csv('maize_data_2019.csv')

# Save metadata to a separate file (e.g., JSON)
import json
with open('maize_metadata_2019.json', 'w') as file:
    json.dump(metadata, file)

In [15]:
#Check saved data
# Load DataFrame from CSV
maiz_df2 = pd.read_csv('maize_data_2019.csv', index_col=0)

# Load metadata from JSON
with open('maize_metadata_2019.json', 'r') as file:
    metadata = json.load(file)

# Assign metadata back to the DataFrame
maiz_df2.attrs['metadata'] = metadata

maiz_df2.attrs
#maiz_df2.head()

{'metadata': {'source': 'INEGI Encuesta Nacional Agropecuaria 2019',
  'Production': 'megatonnes',
  'yield': 'tonnes/ha',
  'Areas': 'hectares',
  'Note': 'Data for states of Aguascalientes, Coahuila, and Quintana Roo is not published because the collected information from the selected crops is insufficient to obtain estimated data.'}}