The following notebook has last been tested on April 4, 2024 for wheat data.

In [8]:
# Import necessary libraries
import glob
import pandas as pd  # Version 2.0.2
import numpy as np  # Version 1.25.0

# Set display options for pandas
pd.set_option("max_colwidth", 20)

# Define a dictionary for translating Portuguese column names to English.
translation_dict = {
    "Cód.":"adm_id", 
    "Município":"adm_name",
    "Ano":"planting_year",
    "Área plantada (Hectares)": "planted_area",
    "Área colhida (Hectares)": "harvested_area",
    "Quantidade produzida (Toneladas)": "production",
    "Rendimento médio da produção (Quilogramas por Hectare)": "yield"
}

# Define a list of variables
crop_variables = ["yield", "production", "harvested_area", "planted_area"]

# Define the path to the folder containing only the datasets
data_path = "C:/Users/Max Zachow/../crop_data.csv"

# Define metadata of your dataset
harvest_year_offset = 0 # 0 if harvest is in same year as planting, 1 if harvest is in the year after planting 
crop_name = "spring wheat"
crop_id = np.nan
season_name = np.nan
source = "https://sidra.ibge.gov.br/tabela/1612"
harvest_month = "12" # Check e.g. https://ipad.fas.usda.gov/countrysummary/default.aspx?id=BR&crop=Wheat
planting_month = "05" # Check e.g. https://ipad.fas.usda.gov/countrysummary/default.aspx?id=BR&crop=Wheat

### 1. Read data

In [2]:
df = (
        pd
        .read_csv(data_path, skiprows=3, skipfooter=24, engine="python")
        .rename(columns=translation_dict)
        .assign(harvest_year=lambda x: x.planting_year + harvest_year_offset, crop_id=crop_id, season_name=season_name, 
                planting_month=planting_month, crop_name=crop_name, source=source,  harvest_month=harvest_month, country_code="BRA")
        )

df.head(2)

Unnamed: 0,adm_id,adm_name,planting_year,planted_area,harvested_area,production,yield,harvest_year,crop_id,season_name,planting_month,crop_name,source,harvest_month,country_code
0,1100015,Alta Floresta D'...,1974,...,...,...,...,1974,,,5,spring wheat,https://sidra.ib...,12,BRA
1,1100015,Alta Floresta D'...,1975,...,...,...,...,1975,,,5,spring wheat,https://sidra.ib...,12,BRA


### 2. Preprocessing

In [3]:
# Define harvest and planting dates
df["harvest_date"] = df["harvest_year"].astype(str) + "-" + df["harvest_month"]
df["planting_date"] = df["planting_year"].astype(str) + "-" + df["planting_month"]

# Filter columns, replace NaNs, drop rows with all NaNs, and convert yield to t / ha
df = df[["crop_name", "crop_id", "country_code", "adm_id", "adm_name", "season_name", "planting_year", "planting_date", "harvest_year", "harvest_date", "source"] + crop_variables]
df[crop_variables] = df[crop_variables].replace({"-":np.nan, "..":np.nan, "...":np.nan})
df = df.dropna(subset=crop_variables, how="all").fillna(0).reset_index(drop=True)
df[crop_variables] = df[crop_variables].astype("float64")
df["yield"] = df["yield"] / 1000

#Adjust adm_id to include country code in it
df['adm_id'] = 'BR' + df['adm_id'].astype(str)

df.head()

Unnamed: 0,crop_name,crop_id,country_code,adm_id,adm_name,season_name,planting_year,planting_date,harvest_year,harvest_date,source,yield,production,harvested_area,planted_area
0,spring wheat,0.0,BRA,1300805,Borba (AM),0.0,2022,2022-05,2022,2022-12,https://sidra.ib...,14.0,196.0,14.0,14.0
1,spring wheat,0.0,BRA,1708205,Formoso do Aragu...,0.0,1983,1983-05,1983,1983-12,https://sidra.ib...,1.0,4.0,4.0,0.0
2,spring wheat,0.0,BRA,2311504,Quixeré (CE),0.0,2020,2020-05,2020,2020-12,https://sidra.ib...,5.4,27.0,5.0,5.0
3,spring wheat,0.0,BRA,2311504,Quixeré (CE),0.0,2021,2021-05,2021,2021-12,https://sidra.ib...,0.0,0.0,0.0,18.0
4,spring wheat,0.0,BRA,2902807,Barra da Estiva ...,0.0,1986,1986-05,1986,1986-12,https://sidra.ib...,1.2,4800.0,4000.0,0.0


**Check if data was merged correctly from different variables**

Here, we validate if production divided by harvested area equals yield with respect to some rounding error.

In [7]:
df.loc[abs((df["yield"]) - (df["production"] / df["harvested_area"])) > 0.001]

Unnamed: 0,crop_name,crop_id,country_code,adm_id,adm_name,season_name,planting_year,planting_date,harvest_year,harvest_date,source,yield,production,harvested_area,planted_area


If DataFrame is empty, proceed to export. Otherwise de-bugging is required during the preprocessing steps.

### Export

In [111]:
export_path = "C:/Users/Max Zachow/../preprocessed_data_brazil.csv"
df.to_csv(export_path, index=False)

### Next steps

If you downloaded several chunks of years from IBGE (2022-2000, 1980-1999, ..), you could run them separately through this notebook, save them through the step above and then import them here at the end for concatenation.

In [None]:
import_file_1 = "your_path/data/../...csv"
import_file_2 = "your_path/data/../...csv"

preprocessed_brazil_data_2000_2022 = pd.read_csv(import_file_1)
preprocessed_brazil_data_1980_1999 = pd.read_csv(import_file_2)

brazil_data_all_combined = pd.concat([preprocessed_brazil_data_1980_1999, preprocessed_brazil_data_2000_2022], ignore_index=True)

# Export the combined dataset
export_path = "C:/Users/Max Zachow/../all_years_preprocessed_data_brazil.csv"
brazil_data_all_combined.to_csv(export_path, index=False)