In [199]:
# Import necessary libraries
import pandas as pd  # Version 2.0.2
import numpy as np  # Version 1.25.0

# Location of the input workbook (placeholders: replace with your own values).
# data_path is concatenated directly with file_name, so keep the trailing slash.
data_path = "your_directory/"
file_name = "your_filename.xlsx"

# Portuguese column headers of the source table -> short English identifiers
translation_dict = dict(
    [
        ("Área plantada (Hectares)", "planted_area"),
        ("Área colhida (Hectares)", "harvested_area"),
        ("Quantidade produzida (Toneladas)", "production"),
        ("Rendimento médio da produção (Quilogramas por Hectare)", "yield"),
    ]
)

# Truncate long cell contents to 10 characters when frames are displayed
pd.set_option("max_colwidth", 10)

#### 1. Read data

It is recommended to pass the appropriate keyword arguments to *read_excel* or *read_csv* right away, e.g. to assign the correct header rows. Find them iteratively: start with no parameters to get a general overview of the file, then refine the call.

In [201]:
df = pd.read_excel(
    data_path + file_name,
    header=[0, 1],  # first two rows after the skipped ones form a two-level header
    skiprows=3,     # ignore the three rows above the header
    skipfooter=1,   # footer is data source and can be skipped
)

df.head(2)

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,2020,2020,2020,2020,2021,2021,2021,2021,2022,2022,2022,2022
Unnamed: 0_level_1,Unnamed: 0_level_1.1,Unnamed: 1_level_1,Área plantada (Hectares),Área colhida (Hectares),Quantidade produzida (Toneladas),Rendimento médio da produção (Quilogramas por Hectare),Área plantada (Hectares),Área colhida (Hectares),Quantidade produzida (Toneladas),Rendimento médio da produção (Quilogramas por Hectare),Área plantada (Hectares),Área colhida (Hectares),Quantidade produzida (Toneladas),Rendimento médio da produção (Quilogramas por Hectare)
0,1100015,Alta F...,-,-,-,-,-,-,-,-,-,-,-,-
1,1100023,Arique...,-,-,-,-,-,-,-,-,-,-,-,-


#### 2. Basic preprocessing

**2.1 Column names**

In [202]:
# Translate and flatten only while the columns are still the two-level header
# produced by read_excel. If this cell is re-run, the columns are already flat
# strings and the join below would combine their first two characters instead,
# mangling every column name.
if isinstance(df.columns, pd.MultiIndex):
    df = df.rename(columns=translation_dict)  # translate
    df.columns = ["{}_{}".format(s[0], s[1]) for s in df.columns]  # join multi-level columns to single level

# Give the two unnamed leading columns readable names (no-op if already renamed)
df = df.rename(columns={"Unnamed: 0_level_0_Unnamed: 0_level_1": "code", "Unnamed: 1_level_0_Unnamed: 1_level_1": "name"})

df.head(2)

Unnamed: 0,code,name,2020_planted_area,2020_harvested_area,2020_production,2020_yield,2021_planted_area,2021_harvested_area,2021_production,2021_yield,2022_planted_area,2022_harvested_area,2022_production,2022_yield
0,1100015,Alta F...,-,-,-,-,-,-,-,-,-,-,-,-
1,1100023,Arique...,-,-,-,-,-,-,-,-,-,-,-,-


**2.2 Reshape and filter**

In [203]:
def obtain_dataset_for_quantity(data, quantity="yield", drop_empty="all", stack=False):
    """
    takes the dataframe and returns a preprocessed version of the specified quantity.

    Columns are selected by substring match of `quantity` against the column
    names, which are expected to look like "<year>_<quantity>" (e.g.
    "2020_production"). The part before the first underscore becomes the new
    column name.

    params:
     - data: dataframe to perform the operations on; must contain the columns "code" and "name"
     - quantity: {"yield", "production", "harvested", "planted"}, default "yield", determine which quantity is requested
     - drop_empty: {"all", "any", "keep"}, default "all", determine if rows with missing values should be dropped
         - "all": replace the placeholders "-", "..", "..." by NaN and drop rows where every value is missing
         - "any": replace the placeholders by NaN and drop rows where at least one value is missing
         - "keep": leave the values untouched (placeholders are not replaced, no rows are dropped)
     - stack: bool, default False, If False, years will be columns and if True, results are stacked with columns (code, name, year, value)

    returns:
     - quantity_df: dataframe, preprocessed for specified quantity

    raises:
     - ValueError: if drop_empty is not one of "all", "any", "keep"
    """

    if drop_empty not in ("all", "any", "keep"):
        raise ValueError(
            'drop_empty must be one of "all", "any" or "keep", got {!r}'.format(drop_empty)
        )

    indexed = data.set_index(["code", "name"])
    selected = [c for c in indexed.columns if quantity in c]
    # copy so that relabelling the columns never touches the caller's frame
    quantity_df = indexed[selected].copy()
    quantity_df.columns = [c.split("_")[0] for c in selected]

    if drop_empty != "keep":
        quantity_df = quantity_df.replace({"-": np.nan, "..": np.nan, "...": np.nan})
        # honour the requested mode; previously "any" was silently treated as "all"
        quantity_df = quantity_df.dropna(how=drop_empty)

    if stack:
        quantity_df = quantity_df.stack().reset_index()
        quantity_df.columns = ["code", "name", "year", "value"]
    else:
        # plain else: `~stack` on a bool is a bitwise inversion that is always truthy
        quantity_df = quantity_df.reset_index()

    return quantity_df


In [204]:
# Production values with one column per year; rows without any value are dropped
obtain_dataset_for_quantity(
    data=df,
    quantity="production",
    drop_empty="all",
    stack=False,
)

Unnamed: 0,code,name,2020,2021,2022
0,1300805,Borba ...,,,196.0
1,2311504,Quixer...,27.0,,
2,2919553,Luís E...,6500.0,6490.0,6100.0
3,2926202,Riachã...,10500.0,11247.0,10433.0
4,3100104,Abadia...,,,374.0
...,...,...,...,...,...
1056,5214606,Niquel...,1290.0,875.0,1071.0
1057,5215603,Padre ...,,703.0,703.0
1058,5220009,São Jo...,22000.0,14068.0,11670.0
1059,5222005,Vianóp...,855.0,550.0,549.0
