# Preprocess original census data 2022
- Open original census data
- Extract all rows for maize
- Rename variables to English
- Save file as CSV

In [1]:
# Imports
import json
from pathlib import Path

import pandas as pd

In [2]:
# Location of the raw census files: the 'original_data' folder that sits next to
# the directory the notebook is run from.
original_path = Path.cwd().parent.joinpath('original_data')
original_path

PosixPath('/home/vant/Documents/valencia/agml_workshop/inegi_censos/original_data')

In [3]:
# Census workbook to load, located inside the original-data folder
file_path = original_path.joinpath('ca2022_agr01.xlsx')

# Load the sheet into a DataFrame. skiprows=4 makes pandas ignore the first four
# spreadsheet rows, so the fifth row is used as the header
# (presumably the skipped rows are title lines -- TODO confirm against the workbook).
df = pd.read_excel(file_path, skiprows=4)


In [4]:
# Preview the first three rows of the freshly loaded sheet. The rows shown hold
# leftover sub-header labels (e.g. "Temporal", "Riego", "Total") rather than data.
df.head(3)

Unnamed: 0,Entidad federativa,Municipio,Cultivo,"Entidad federativa, municipio y cultivo",Unidades de producción agropecuaria activas,Unnamed: 5,Superficie cultivada,Unnamed: 7,Producción,Modalidad hídrica,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16
0,,,,,,,,,,Temporal,,,,Riego,,,
1,,,,,Total,Con agricultura a cielo abierto,Superficie sembrada,Superficie cosechada,,Unidades de producción,Superficie sembrada,Superficie cosechada,Producción,Unidades de producción,Superficie sembrada,Superficie cosechada,Producción
2,,,,,,,,,,,,,,,,,


In [5]:
# Look at the top-left corner (first five rows, first six columns) to see how
# many leading rows are header residue.
df.iloc[:5, :6]

Unnamed: 0,Entidad federativa,Municipio,Cultivo,"Entidad federativa, municipio y cultivo",Unidades de producción agropecuaria activas,Unnamed: 5
0,,,,,,
1,,,,,Total,Con agricultura a cielo abierto
2,,,,,,
3,,,,,,
4,,,,,A,B<=A


In [6]:
# List the column labels pandas derived from the sheet; the "Unnamed: N" entries
# are columns whose header cell was empty.
df.columns

Index(['Entidad federativa', 'Municipio', 'Cultivo',
       'Entidad federativa, municipio y cultivo',
       'Unidades de producción agropecuaria activas', 'Unnamed: 5',
       'Superficie cultivada', 'Unnamed: 7', 'Producción', 'Modalidad hídrica',
       'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13',
       'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16'],
      dtype='object')

In [7]:
# Flat column names that merge the sheet's multi-row header into one label per
# column, listed in the sheet's column order. The "MH-" prefix marks the columns
# that sit under the 'Modalidad hídrica' header, split into "temporal" and "riego".
column_names = ['Entidad federativa', 'Municipio', 'Cultivo',
       'Entidad federativa, municipio y cultivo',
       'Unidades de producción agropecuaria activas-Total', 'Unidades de producción agropecuaria activas-Con agricultura a cielo abierto',
       'Superficie cultivada-sembrada', 'Superficie cultivada-cosechada', 'Producción', 'MH-temporal-unidad-produccion',
       'MH-temporal-superficie-sembrada', 'MH-temporal-superficie-cosechada', 'MH-temporal-produccion', 'MH-riego-unidad-produccion',
       'MH-riego-superficie-sembrada', 'MH-riego-superficie-cosechada', 'MH-riego-produccion']

# Rename the columns. The assignment is positional, so the list above must have
# exactly one entry per existing column (pandas raises ValueError otherwise).
df.columns = column_names

# Delete the leading rows with no data: the first five rows are header residue
# and carry no 'Entidad federativa' value. The check makes the cell safe to
# re-run -- once the residue is gone the first rows are real records and the
# drop is skipped instead of silently discarding five more data rows.
N_HEADER_ROWS = 5
if df['Entidad federativa'].iloc[:N_HEADER_ROWS].isna().all():
    df = df.drop(index=range(N_HEADER_ROWS)).reset_index(drop=True)



In [8]:
# Check the renamed columns and confirm the first rows are now real records.
df.head(5)

Unnamed: 0,Entidad federativa,Municipio,Cultivo,"Entidad federativa, municipio y cultivo",Unidades de producción agropecuaria activas-Total,Unidades de producción agropecuaria activas-Con agricultura a cielo abierto,Superficie cultivada-sembrada,Superficie cultivada-cosechada,Producción,MH-temporal-unidad-produccion,MH-temporal-superficie-sembrada,MH-temporal-superficie-cosechada,MH-temporal-produccion,MH-riego-unidad-produccion,MH-riego-superficie-sembrada,MH-riego-superficie-cosechada,MH-riego-produccion
0,00 NAL,,,Estados Unidos Mexicanos,4629134.0,4366995.0,22937029.2021,18864519.5362,,3673541.0,16788300.4445,13081544.6158,,831305.0,6148728.7576,5782974.9204,
1,00 NAL,,Anuales,Anuales,,,,,,,,,,,,,
2,00 NAL,,Algodón,Algodón,4666.0,4665.0,169271.2457,157702.2879,686092.3256,75.0,1075.7468,1040.0138,2089.543,4592.0,168195.4989,156662.2741,684002.7826
3,00 NAL,,Amaranto,Amaranto,4532.0,4461.0,4933.7853,4659.1731,6265.1429,4119.0,4615.0523,4349.2943,5690.5142,366.0,318.733,309.8788,574.6287
4,00 NAL,,Arroz,Arroz,2952.0,2950.0,31854.1177,30417.4643,172702.6519,1393.0,15804.7797,14862.1306,71087.1147,1565.0,16049.338,15555.3337,101615.5372


In [9]:
# Remove the combined "Entidad federativa, municipio y cultivo" column; the
# result is a new frame and `df` itself is left unchanged.
df_clean = df.drop(columns=["Entidad federativa, municipio y cultivo"])

# Preview the frame without that column
df_clean.head()


Unnamed: 0,Entidad federativa,Municipio,Cultivo,Unidades de producción agropecuaria activas-Total,Unidades de producción agropecuaria activas-Con agricultura a cielo abierto,Superficie cultivada-sembrada,Superficie cultivada-cosechada,Producción,MH-temporal-unidad-produccion,MH-temporal-superficie-sembrada,MH-temporal-superficie-cosechada,MH-temporal-produccion,MH-riego-unidad-produccion,MH-riego-superficie-sembrada,MH-riego-superficie-cosechada,MH-riego-produccion
0,00 NAL,,,4629134.0,4366995.0,22937029.2021,18864519.5362,,3673541.0,16788300.4445,13081544.6158,,831305.0,6148728.7576,5782974.9204,
1,00 NAL,,Anuales,,,,,,,,,,,,,
2,00 NAL,,Algodón,4666.0,4665.0,169271.2457,157702.2879,686092.3256,75.0,1075.7468,1040.0138,2089.543,4592.0,168195.4989,156662.2741,684002.7826
3,00 NAL,,Amaranto,4532.0,4461.0,4933.7853,4659.1731,6265.1429,4119.0,4615.0523,4349.2943,5690.5142,366.0,318.733,309.8788,574.6287
4,00 NAL,,Arroz,2952.0,2950.0,31854.1177,30417.4643,172702.6519,1393.0,15804.7797,14862.1306,71087.1147,1565.0,16049.338,15555.3337,101615.5372


In [10]:
# Filter records for maize (in Spanish "maíz" or "Maíz").
# Goal: keep every record whose crop name contains one of
# "Maíz forrajero", "Maíz grano amarillo" or "Maíz grano blanco".
#
# Rows without a crop or without a municipality are discarded first, then the
# crop name is matched case-insensitively against "Maíz".
# The whole step is one non-mutating chain: `df_clean` is left untouched so the
# cell can be re-run safely, and the trailing .copy() makes `maiz_df` an
# independent frame, so later column edits do not raise chained-assignment
# warnings.
maiz_df = (
    df_clean
    .dropna(subset=['Cultivo', 'Municipio'])
    .loc[lambda frame: frame['Cultivo'].str.contains('Maíz', case=False)]
    .copy()
)
maiz_df.head(20)

Unnamed: 0,Entidad federativa,Municipio,Cultivo,Unidades de producción agropecuaria activas-Total,Unidades de producción agropecuaria activas-Con agricultura a cielo abierto,Superficie cultivada-sembrada,Superficie cultivada-cosechada,Producción,MH-temporal-unidad-produccion,MH-temporal-superficie-sembrada,MH-temporal-superficie-cosechada,MH-temporal-produccion,MH-riego-unidad-produccion,MH-riego-superficie-sembrada,MH-riego-superficie-cosechada,MH-riego-produccion
93,01 AGS,001 Aguascalientes,Maíz forrajero,1591,1591,13719.6093,13083.3126,339873.3523,1257,9406.1929,8855.2401,104486.6145,380,4313.4164,4228.0725,235386.7378
94,01 AGS,001 Aguascalientes,Maíz grano amarillo,29,29,233.835,230.8305,997.6648,29,233.835,230.8305,997.6648,0,0.0,0.0,0.0
95,01 AGS,001 Aguascalientes,Maíz grano blanco,802,802,6019.7733,5826.5034,22008.7552,754,5556.7985,5364.0286,18359.1613,55,462.9748,462.4748,3649.5939
122,01 AGS,002 Asientos,Maíz forrajero,690,690,4587.2771,4320.7626,152522.1552,392,2152.3979,1961.1395,14881.8086,348,2434.8792,2359.6231,137640.3466
123,01 AGS,002 Asientos,Maíz grano amarillo,52,52,177.805,144.952,506.9692,47,157.805,125.952,363.4392,5,20.0,19.0,143.53
124,01 AGS,002 Asientos,Maíz grano blanco,2053,2053,9618.8066,8185.7351,13855.6288,1777,8322.2844,6922.3776,6065.2098,312,1296.5222,1263.3575,7790.419
148,01 AGS,003 Calvillo,Maíz forrajero,262,262,972.6302,857.1883,14532.262,209,784.7755,678.2547,4719.2554,59,187.8547,178.9336,9813.0066
149,01 AGS,003 Calvillo,Maíz grano amarillo,15,15,47.44,45.44,196.4423,12,42.19,40.19,150.4148,3,5.25,5.25,46.0275
150,01 AGS,003 Calvillo,Maíz grano blanco,836,836,2016.9773,1797.7592,4869.0259,672,1668.2322,1449.2141,2707.3644,182,348.7451,348.5451,2161.6615
175,01 AGS,004 Cosío,Maíz forrajero,337,337,1450.8873,1418.2261,60974.8832,154,458.1881,427.9796,3637.1987,194,992.6992,990.2465,57337.6845


In [11]:
# Number of maize records (rows) and columns kept after filtering.
maiz_df.shape

(6433, 16)

In [12]:
# Current (Spanish) column labels, in the order the English names must follow.
maiz_df.columns

Index(['Entidad federativa', 'Municipio', 'Cultivo',
       'Unidades de producción agropecuaria activas-Total',
       'Unidades de producción agropecuaria activas-Con agricultura a cielo abierto',
       'Superficie cultivada-sembrada', 'Superficie cultivada-cosechada',
       'Producción', 'MH-temporal-unidad-produccion',
       'MH-temporal-superficie-sembrada', 'MH-temporal-superficie-cosechada',
       'MH-temporal-produccion', 'MH-riego-unidad-produccion',
       'MH-riego-superficie-sembrada', 'MH-riego-superficie-cosechada',
       'MH-riego-produccion'],
      dtype='object')

In [13]:
# English replacements for the Spanish column labels, in the same order as the
# columns of the maize frame (they are assigned positionally).
# NOTE(review): the Spanish label behind "Temporary" is "temporal"; this may be
# a mistranslation (possibly "rainfed") -- confirm with the data owner before
# renaming, since the exported CSV headers depend on these strings.
english_col_names = [
    # identifiers
    "State",
    "Municipality",
    "Crop",
    # totals
    "Active agricultural production units - Total",
    "Active agricultural production units - With open agriculture",
    "Cultivated area - Sown",
    "Cultivated area - Harvested",
    "Production",
    # water modality: "temporal"
    "Water Modality - Temporary - Production unit",
    "Water Modality - Temporary - Cultivated area - Sown",
    "Water Modality - Temporary - Cultivated area - Harvested",
    "Water Modality - Temporary - Production",
    # water modality: "riego"
    "Water Modality - Irrigation - Production unit",
    "Water Modality - Irrigation - Cultivated area - Sown",
    "Water Modality - Irrigation - Cultivated area - Harvested",
    "Water Modality - Irrigation - Production",
]



In [14]:
# Swap the Spanish column labels for the English ones. The assignment is
# positional, so english_col_names must follow the frame's current column order.
maiz_df.columns = english_col_names

# Confirm the new headers
maiz_df.head(5)

Unnamed: 0,State,Municipality,Crop,Active agricultural production units - Total,Active agricultural production units - With open agriculture,Cultivated area - Sown,Cultivated area - Harvested,Production,Water Modality - Temporary - Production unit,Water Modality - Temporary - Cultivated area - Sown,Water Modality - Temporary - Cultivated area - Harvested,Water Modality - Temporary - Production,Water Modality - Irrigation - Production unit,Water Modality - Irrigation - Cultivated area - Sown,Water Modality - Irrigation - Cultivated area - Harvested,Water Modality - Irrigation - Production
93,01 AGS,001 Aguascalientes,Maíz forrajero,1591,1591,13719.6093,13083.3126,339873.3523,1257,9406.1929,8855.2401,104486.6145,380,4313.4164,4228.0725,235386.7378
94,01 AGS,001 Aguascalientes,Maíz grano amarillo,29,29,233.835,230.8305,997.6648,29,233.835,230.8305,997.6648,0,0.0,0.0,0.0
95,01 AGS,001 Aguascalientes,Maíz grano blanco,802,802,6019.7733,5826.5034,22008.7552,754,5556.7985,5364.0286,18359.1613,55,462.9748,462.4748,3649.5939
122,01 AGS,002 Asientos,Maíz forrajero,690,690,4587.2771,4320.7626,152522.1552,392,2152.3979,1961.1395,14881.8086,348,2434.8792,2359.6231,137640.3466
123,01 AGS,002 Asientos,Maíz grano amarillo,52,52,177.805,144.952,506.9692,47,157.805,125.952,363.4392,5,20.0,19.0,143.53


In [15]:
# Translate the crop names to English
# Mapping from the Spanish crop labels to their English names
translations = {
    'Maíz forrajero': 'Forage corn',
    'Maíz grano amarillo': 'Yellow corn',
    'Maíz grano blanco': 'White corn'
}

# Replace the values in the "Crop" column with their English translations.
# Labels that are not keys of `translations` are left as they are.
# assign() builds a new frame instead of writing into the existing one, which
# avoids chained-assignment warnings when `maiz_df` was produced by filtering.
maiz_df = maiz_df.assign(Crop=maiz_df["Crop"].replace(translations))

# Display the modified DataFrame
maiz_df.head()


Unnamed: 0,State,Municipality,Crop,Active agricultural production units - Total,Active agricultural production units - With open agriculture,Cultivated area - Sown,Cultivated area - Harvested,Production,Water Modality - Temporary - Production unit,Water Modality - Temporary - Cultivated area - Sown,Water Modality - Temporary - Cultivated area - Harvested,Water Modality - Temporary - Production,Water Modality - Irrigation - Production unit,Water Modality - Irrigation - Cultivated area - Sown,Water Modality - Irrigation - Cultivated area - Harvested,Water Modality - Irrigation - Production
93,01 AGS,001 Aguascalientes,Forage corn,1591,1591,13719.6093,13083.3126,339873.3523,1257,9406.1929,8855.2401,104486.6145,380,4313.4164,4228.0725,235386.7378
94,01 AGS,001 Aguascalientes,Yellow corn,29,29,233.835,230.8305,997.6648,29,233.835,230.8305,997.6648,0,0.0,0.0,0.0
95,01 AGS,001 Aguascalientes,White corn,802,802,6019.7733,5826.5034,22008.7552,754,5556.7985,5364.0286,18359.1613,55,462.9748,462.4748,3649.5939
122,01 AGS,002 Asientos,Forage corn,690,690,4587.2771,4320.7626,152522.1552,392,2152.3979,1961.1395,14881.8086,348,2434.8792,2359.6231,137640.3466
123,01 AGS,002 Asientos,Yellow corn,52,52,177.805,144.952,506.9692,47,157.805,125.952,363.4392,5,20.0,19.0,143.53


In [16]:
# Provenance and measurement units that accompany the exported table
metadata = dict(
    source="INEGI Censo Agropecuario 2022",
    Production="tonnes",
    Areas="hectares",
)

# Attach the metadata to the frame under the 'metadata' key of DataFrame.attrs
maiz_df.attrs['metadata'] = metadata

# Show the attributes now stored on the frame
maiz_df.attrs


{'metadata': {'source': 'INEGI Censo Agropecuario 2022',
  'Production': 'tonnes',
  'Areas': 'hectares'}}

In [27]:
# Saving data
# Save DataFrame to CSV in the current working directory; the row index is
# written as the first column.
maiz_df.to_csv('maize_data_2022.csv')

# Save metadata to a separate JSON file next to the CSV.
# The encoding is given explicitly so the file does not depend on the
# platform's default text encoding. (`json` is imported in the imports cell.)
with open('maize_metadata_2022.json', 'w', encoding='utf-8') as file:
    json.dump(metadata, file)


In [28]:
# Check saved data
# Load DataFrame from CSV; the first column is the row index written on save
maiz_df2 = pd.read_csv('maize_data_2022.csv', index_col=0)

# Load metadata from JSON into its own name so the in-memory `metadata`
# dictionary is not overwritten by the check
with open('maize_metadata_2022.json', 'r', encoding='utf-8') as file:
    metadata_loaded = json.load(file)

# Round-trip checks: the files on disk must describe the same table and metadata
assert maiz_df2.shape == maiz_df.shape, "saved CSV does not match maiz_df shape"
assert metadata_loaded == metadata, "saved metadata differs from the in-memory metadata"

# Assign metadata back to the DataFrame
maiz_df2.attrs['metadata'] = metadata_loaded

# Preview the reloaded table
maiz_df2.head()



Unnamed: 0,State,Municipality,Crop,Active agricultural production units - Total,Active agricultural production units - With open agriculture,Cultivated area - Sown,Cultivated area - Harvested,Production,Water Modality - Temporary - Production unit,Water Modality - Temporary - Cultivated area - Sown,Water Modality - Temporary - Cultivated area - Harvested,Water Modality - Temporary - Production,Water Modality - Irrigation - Production unit,Water Modality - Irrigation - Cultivated area - Sown,Water Modality - Irrigation - Cultivated area - Harvested,Water Modality - Irrigation - Production
93,01 AGS,001 Aguascalientes,Forage corn,1591,1591,13719.6093,13083.3126,339873.3523,1257,9406.1929,8855.2401,104486.6145,380,4313.4164,4228.0725,235386.7378
94,01 AGS,001 Aguascalientes,Yellow corn,29,29,233.835,230.8305,997.6648,29,233.835,230.8305,997.6648,0,0.0,0.0,0.0
95,01 AGS,001 Aguascalientes,White corn,802,802,6019.7733,5826.5034,22008.7552,754,5556.7985,5364.0286,18359.1613,55,462.9748,462.4748,3649.5939
122,01 AGS,002 Asientos,Forage corn,690,690,4587.2771,4320.7626,152522.1552,392,2152.3979,1961.1395,14881.8086,348,2434.8792,2359.6231,137640.3466
123,01 AGS,002 Asientos,Yellow corn,52,52,177.805,144.952,506.9692,47,157.805,125.952,363.4392,5,20.0,19.0,143.53
