# Preprocess original census data 2022
- Open original census data
- Extract all rows for maize
- Rename variables to english
- Add variables of interest (yield, country_code, harvest_year)
- Save file as csv

In [1]:
# Imports
import pandas as pd
from pathlib import Path
import numpy as np

In [2]:
# Paths
original_path = Path.cwd().parent / 'original_data' 
original_path

PosixPath('/home/vant/Documents/valencia/agml_workshop/inegi_censos/original_data')

In [3]:
# Replace 'file_path.xlsx' with the path to your Excel file
file_path = original_path/'ca2022_agr01.xlsx'

# Read the Excel file into a Pandas DataFrame
df = pd.read_excel(file_path,skiprows=4)


In [4]:
# Now you can work with your DataFrame 'df'
# For example, you can print the first few rows:
df.head(3)

Unnamed: 0,Entidad federativa,Municipio,Cultivo,"Entidad federativa, municipio y cultivo",Unidades de producción agropecuaria activas,Unnamed: 5,Superficie cultivada,Unnamed: 7,Producción,Modalidad hídrica,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16
0,,,,,,,,,,Temporal,,,,Riego,,,
1,,,,,Total,Con agricultura a cielo abierto,Superficie sembrada,Superficie cosechada,,Unidades de producción,Superficie sembrada,Superficie cosechada,Producción,Unidades de producción,Superficie sembrada,Superficie cosechada,Producción
2,,,,,,,,,,,,,,,,,


In [5]:
df.iloc[0:5,0:6]

Unnamed: 0,Entidad federativa,Municipio,Cultivo,"Entidad federativa, municipio y cultivo",Unidades de producción agropecuaria activas,Unnamed: 5
0,,,,,,
1,,,,,Total,Con agricultura a cielo abierto
2,,,,,,
3,,,,,,
4,,,,,A,B<=A


In [6]:
df.columns

Index(['Entidad federativa', 'Municipio', 'Cultivo',
       'Entidad federativa, municipio y cultivo',
       'Unidades de producción agropecuaria activas', 'Unnamed: 5',
       'Superficie cultivada', 'Unnamed: 7', 'Producción', 'Modalidad hídrica',
       'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13',
       'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16'],
      dtype='object')

In [7]:
# Define new column names
column_names = ['Entidad federativa', 'Municipio', 'Cultivo',
       'Entidad federativa, municipio y cultivo',
       'Unidades de producción agropecuaria activas-Total', 'Unidades de producción agropecuaria activas-Con agricultura a cielo abierto',
       'Superficie cultivada-sembrada', 'Superficie cultivada-cosechada', 'Producción', 'MH-temporal-unidad-produccion',
       'MH-temporal-superficie-sembrada', 'MH-temporal-superficie-cosechada', 'MH-temporal-produccion', 'MH-riego-unidad-produccion',
       'MH-riego-superficie-sembrada', 'MH-riego-superficie-cosechada', 'MH-riego-produccion']

# Rename the columns
df.columns = column_names

# Delete rows with no data
df = df.drop(index=range(5)).reset_index(drop=True)



In [8]:
df.head(5)

Unnamed: 0,Entidad federativa,Municipio,Cultivo,"Entidad federativa, municipio y cultivo",Unidades de producción agropecuaria activas-Total,Unidades de producción agropecuaria activas-Con agricultura a cielo abierto,Superficie cultivada-sembrada,Superficie cultivada-cosechada,Producción,MH-temporal-unidad-produccion,MH-temporal-superficie-sembrada,MH-temporal-superficie-cosechada,MH-temporal-produccion,MH-riego-unidad-produccion,MH-riego-superficie-sembrada,MH-riego-superficie-cosechada,MH-riego-produccion
0,00 NAL,,,Estados Unidos Mexicanos,4629134.0,4366995.0,22937029.2021,18864519.5362,,3673541.0,16788300.4445,13081544.6158,,831305.0,6148728.7576,5782974.9204,
1,00 NAL,,Anuales,Anuales,,,,,,,,,,,,,
2,00 NAL,,Algodón,Algodón,4666.0,4665.0,169271.2457,157702.2879,686092.3256,75.0,1075.7468,1040.0138,2089.543,4592.0,168195.4989,156662.2741,684002.7826
3,00 NAL,,Amaranto,Amaranto,4532.0,4461.0,4933.7853,4659.1731,6265.1429,4119.0,4615.0523,4349.2943,5690.5142,366.0,318.733,309.8788,574.6287
4,00 NAL,,Arroz,Arroz,2952.0,2950.0,31854.1177,30417.4643,172702.6519,1393.0,15804.7797,14862.1306,71087.1147,1565.0,16049.338,15555.3337,101615.5372


In [9]:
# Drop the column at index 3
df_clean = df.drop("Entidad federativa, municipio y cultivo", axis=1)

# Display the modified DataFrame
df_clean.head()


Unnamed: 0,Entidad federativa,Municipio,Cultivo,Unidades de producción agropecuaria activas-Total,Unidades de producción agropecuaria activas-Con agricultura a cielo abierto,Superficie cultivada-sembrada,Superficie cultivada-cosechada,Producción,MH-temporal-unidad-produccion,MH-temporal-superficie-sembrada,MH-temporal-superficie-cosechada,MH-temporal-produccion,MH-riego-unidad-produccion,MH-riego-superficie-sembrada,MH-riego-superficie-cosechada,MH-riego-produccion
0,00 NAL,,,4629134.0,4366995.0,22937029.2021,18864519.5362,,3673541.0,16788300.4445,13081544.6158,,831305.0,6148728.7576,5782974.9204,
1,00 NAL,,Anuales,,,,,,,,,,,,,
2,00 NAL,,Algodón,4666.0,4665.0,169271.2457,157702.2879,686092.3256,75.0,1075.7468,1040.0138,2089.543,4592.0,168195.4989,156662.2741,684002.7826
3,00 NAL,,Amaranto,4532.0,4461.0,4933.7853,4659.1731,6265.1429,4119.0,4615.0523,4349.2943,5690.5142,366.0,318.733,309.8788,574.6287
4,00 NAL,,Arroz,2952.0,2950.0,31854.1177,30417.4643,172702.6519,1393.0,15804.7797,14862.1306,71087.1147,1565.0,16049.338,15555.3337,101615.5372


In [10]:
# Filter records for maize (in spanish maíz or Maíz)
# obtener todos los registros con cultivo que contenga: Maíz forrajero Maíz grano amarillo Maíz grano blanco
# Filter the DataFrame based on the condition
df_clean.dropna(subset=['Cultivo'], inplace=True)
df_clean.dropna(subset=['Municipio'], inplace=True)
maiz_df = df_clean[df_clean['Cultivo'].str.contains('Maíz', case=False)].copy()
maiz_df.head(5)

Unnamed: 0,Entidad federativa,Municipio,Cultivo,Unidades de producción agropecuaria activas-Total,Unidades de producción agropecuaria activas-Con agricultura a cielo abierto,Superficie cultivada-sembrada,Superficie cultivada-cosechada,Producción,MH-temporal-unidad-produccion,MH-temporal-superficie-sembrada,MH-temporal-superficie-cosechada,MH-temporal-produccion,MH-riego-unidad-produccion,MH-riego-superficie-sembrada,MH-riego-superficie-cosechada,MH-riego-produccion
93,01 AGS,001 Aguascalientes,Maíz forrajero,1591,1591,13719.6093,13083.3126,339873.3523,1257,9406.1929,8855.2401,104486.6145,380,4313.4164,4228.0725,235386.7378
94,01 AGS,001 Aguascalientes,Maíz grano amarillo,29,29,233.835,230.8305,997.6648,29,233.835,230.8305,997.6648,0,0.0,0.0,0.0
95,01 AGS,001 Aguascalientes,Maíz grano blanco,802,802,6019.7733,5826.5034,22008.7552,754,5556.7985,5364.0286,18359.1613,55,462.9748,462.4748,3649.5939
122,01 AGS,002 Asientos,Maíz forrajero,690,690,4587.2771,4320.7626,152522.1552,392,2152.3979,1961.1395,14881.8086,348,2434.8792,2359.6231,137640.3466
123,01 AGS,002 Asientos,Maíz grano amarillo,52,52,177.805,144.952,506.9692,47,157.805,125.952,363.4392,5,20.0,19.0,143.53


In [11]:
maiz_df.shape

(6433, 16)

In [12]:
maiz_df.columns

Index(['Entidad federativa', 'Municipio', 'Cultivo',
       'Unidades de producción agropecuaria activas-Total',
       'Unidades de producción agropecuaria activas-Con agricultura a cielo abierto',
       'Superficie cultivada-sembrada', 'Superficie cultivada-cosechada',
       'Producción', 'MH-temporal-unidad-produccion',
       'MH-temporal-superficie-sembrada', 'MH-temporal-superficie-cosechada',
       'MH-temporal-produccion', 'MH-riego-unidad-produccion',
       'MH-riego-superficie-sembrada', 'MH-riego-superficie-cosechada',
       'MH-riego-produccion'],
      dtype='object')

In [13]:
english_col_names = ['State', 'Municipality', 'crop_name',
 'Active agricultural production units - Total',
 'Active agricultural production units - With open agriculture',
 'planted_area',
 'harvest_area',
 'production',
 'Water Modality - Temporary - Production unit',
 'Water Modality - Temporary - planted_area',
 'Water Modality - Temporary - harvest_area',
 'Water Modality - Temporary - yield',
 'Water Modality - Irrigation - Production unit',
 'Water Modality - Irrigation - planted_area',
 'Water Modality - Irrigation - harvest_area',
 'Water Modality - Irrigation - yield']



In [14]:
maiz_df.columns = english_col_names

# Define translations for crops
translations = {
    'Maíz forrajero': 'Forage corn',
    'Maíz grano amarillo': 'Yellow corn',
    'Maíz grano blanco': 'White corn'
}

# Replace the values in the "Cultivo" column with their English translations
maiz_df.loc[:, "crop_name"] = maiz_df["crop_name"].replace(translations)

maiz_df.head(5)

Unnamed: 0,State,Municipality,crop_name,Active agricultural production units - Total,Active agricultural production units - With open agriculture,planted_area,harvest_area,production,Water Modality - Temporary - Production unit,Water Modality - Temporary - planted_area,Water Modality - Temporary - harvest_area,Water Modality - Temporary - yield,Water Modality - Irrigation - Production unit,Water Modality - Irrigation - planted_area,Water Modality - Irrigation - harvest_area,Water Modality - Irrigation - yield
93,01 AGS,001 Aguascalientes,Forage corn,1591,1591,13719.6093,13083.3126,339873.3523,1257,9406.1929,8855.2401,104486.6145,380,4313.4164,4228.0725,235386.7378
94,01 AGS,001 Aguascalientes,Yellow corn,29,29,233.835,230.8305,997.6648,29,233.835,230.8305,997.6648,0,0.0,0.0,0.0
95,01 AGS,001 Aguascalientes,White corn,802,802,6019.7733,5826.5034,22008.7552,754,5556.7985,5364.0286,18359.1613,55,462.9748,462.4748,3649.5939
122,01 AGS,002 Asientos,Forage corn,690,690,4587.2771,4320.7626,152522.1552,392,2152.3979,1961.1395,14881.8086,348,2434.8792,2359.6231,137640.3466
123,01 AGS,002 Asientos,Yellow corn,52,52,177.805,144.952,506.9692,47,157.805,125.952,363.4392,5,20.0,19.0,143.53


In [15]:
# Add country_code -  2 letter code of the country (MX)

maiz_df.loc[:,"country_code"] = "MX"

# reorder columns
maiz_df = maiz_df[['country_code'] + [col for col in maiz_df.columns if col != 'country_code']]

print(maiz_df.iloc[:3,:3])


   country_code   State        Municipality
93           MX  01 AGS  001 Aguascalientes
94           MX  01 AGS  001 Aguascalientes
95           MX  01 AGS  001 Aguascalientes


In [16]:
# Split 'State' into 'adm_id' and 'state_acrs'
maiz_df[['adm_id', 'state_acrs']] = maiz_df['State'].str.split(' ', expand=True)

maiz_df.head(5)

Unnamed: 0,country_code,State,Municipality,crop_name,Active agricultural production units - Total,Active agricultural production units - With open agriculture,planted_area,harvest_area,production,Water Modality - Temporary - Production unit,Water Modality - Temporary - planted_area,Water Modality - Temporary - harvest_area,Water Modality - Temporary - yield,Water Modality - Irrigation - Production unit,Water Modality - Irrigation - planted_area,Water Modality - Irrigation - harvest_area,Water Modality - Irrigation - yield,adm_id,state_acrs
93,MX,01 AGS,001 Aguascalientes,Forage corn,1591,1591,13719.6093,13083.3126,339873.3523,1257,9406.1929,8855.2401,104486.6145,380,4313.4164,4228.0725,235386.7378,1,AGS
94,MX,01 AGS,001 Aguascalientes,Yellow corn,29,29,233.835,230.8305,997.6648,29,233.835,230.8305,997.6648,0,0.0,0.0,0.0,1,AGS
95,MX,01 AGS,001 Aguascalientes,White corn,802,802,6019.7733,5826.5034,22008.7552,754,5556.7985,5364.0286,18359.1613,55,462.9748,462.4748,3649.5939,1,AGS
122,MX,01 AGS,002 Asientos,Forage corn,690,690,4587.2771,4320.7626,152522.1552,392,2152.3979,1961.1395,14881.8086,348,2434.8792,2359.6231,137640.3466,1,AGS
123,MX,01 AGS,002 Asientos,Yellow corn,52,52,177.805,144.952,506.9692,47,157.805,125.952,363.4392,5,20.0,19.0,143.53,1,AGS


In [17]:
# Reordenar las columnas si es necesario
# Define the desired column order
desired_columns = [
    'crop_name','country_code','adm_id', 
    'planted_area', 'harvest_area', 'production'
]

# Reorder the DataFrame columns
maiz_df = maiz_df[desired_columns]

maiz_df.head(5)

Unnamed: 0,crop_name,country_code,adm_id,planted_area,harvest_area,production
93,Forage corn,MX,1,13719.6093,13083.3126,339873.3523
94,Yellow corn,MX,1,233.835,230.8305,997.6648
95,White corn,MX,1,6019.7733,5826.5034,22008.7552
122,Forage corn,MX,1,4587.2771,4320.7626,152522.1552
123,Yellow corn,MX,1,177.805,144.952,506.9692


In [18]:
# Aggregate by state and crop_name (to be as data from other years)

agg_df = maiz_df.groupby(['adm_id', 'crop_name'], as_index=False).sum()
agg_df.loc[:,"country_code"] = "MX"
agg_df["harvest_year"] = "2022"



In [19]:
# Create yield and drop production

agg_df["yield"] = agg_df["production"] / agg_df["harvest_area"]


# Reorder
ordered_columns = [
    'crop_name','country_code','adm_id', 
    'planted_area', 'harvest_area','harvest_year', 'yield','production'
]

# Reorder the DataFrame columns
agg_df = agg_df[ordered_columns]
agg_df.head()



Unnamed: 0,crop_name,country_code,adm_id,planted_area,harvest_area,harvest_year,yield,production
0,Forage corn,MX,1,40577.2736,38703.2584,2022,34.146456,1321579.1063
1,White corn,MX,1,44560.6275,38898.8835,2022,2.446934,95182.9957
2,Yellow corn,MX,1,981.777,904.4691,2022,4.535275,4102.0158
3,Forage corn,MX,2,1356.88,1234.88,2022,45.316427,55960.3488
4,White corn,MX,2,1370.7305,1350.2059,2022,11.157839,15065.3798


In [20]:
# Transform to numeric and round
agg_df['planted_area'] = pd.to_numeric(agg_df['planted_area'], errors='coerce')
agg_df['harvest_area'] = pd.to_numeric(agg_df['harvest_area'], errors='coerce')
agg_df['yield'] = pd.to_numeric(agg_df['yield'], errors='coerce')
agg_df['production'] = pd.to_numeric(agg_df['production'], errors='coerce')
agg_df['production'] = agg_df['production']/1000000
# Round
#agg_df= agg_df.round(2)
# agg_df['harvest_area'] = agg_df['harvest_area'].round(2)
# agg_df['yield'] = agg_df['yield'].round(2)

# Add prefix "MX-" to the 'adm_id' column
agg_df['adm_id'] = agg_df['adm_id'].apply(lambda x: 'MX' + x)

agg_df.head()

Unnamed: 0,crop_name,country_code,adm_id,planted_area,harvest_area,harvest_year,yield,production
0,Forage corn,MX,MX01,40577.2736,38703.2584,2022,34.146456,1.321579
1,White corn,MX,MX01,44560.6275,38898.8835,2022,2.446934,0.095183
2,Yellow corn,MX,MX01,981.777,904.4691,2022,4.535275,0.004102
3,Forage corn,MX,MX02,1356.88,1234.88,2022,45.316427,0.05596
4,White corn,MX,MX02,1370.7305,1350.2059,2022,11.157839,0.015065


In [21]:
agg_df.shape

(96, 8)

In [22]:
# Define metadata
metadata = {
    "source":"INEGI Censo Agropecuario 2022",
    "yield": "tonnes/ha",
    "production": "megatonnes",
    "Areas": "hectares"
}

# Store metadata in attributes or dictionaries
agg_df.attrs['metadata'] = metadata

# Display the modified DataFrame
agg_df.attrs


{'metadata': {'source': 'INEGI Censo Agropecuario 2022',
  'yield': 'tonnes/ha',
  'production': 'megatonnes',
  'Areas': 'hectares'}}

In [23]:
# Saving data
# Save DataFrame to CSV
agg_df.to_csv('maize_data_2022.csv', index=False)

# Save metadata to a separate file (e.g., JSON)
import json
with open('maize_metadata_2022.json', 'w') as file:
    json.dump(metadata, file)


In [24]:
#Check saved data
# Load DataFrame from CSV
maiz_df2 = pd.read_csv('maize_data_2022.csv', index_col=0)

# Load metadata from JSON
with open('maize_metadata_2022.json', 'r') as file:
    metadata = json.load(file)

# Assign metadata back to the DataFrame
maiz_df2.attrs['metadata'] = metadata

#maiz_df2.attrs
maiz_df2.head()



Unnamed: 0_level_0,country_code,adm_id,planted_area,harvest_area,harvest_year,yield,production
crop_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Forage corn,MX,MX01,40577.2736,38703.2584,2022,34.146456,1.321579
White corn,MX,MX01,44560.6275,38898.8835,2022,2.446934,0.095183
Yellow corn,MX,MX01,981.777,904.4691,2022,4.535275,0.004102
Forage corn,MX,MX02,1356.88,1234.88,2022,45.316427,0.05596
White corn,MX,MX02,1370.7305,1350.2059,2022,11.157839,0.015065
