The following notebook has last been tested on Apr 4, 2024 for maize data.

In [75]:
# Import necessary libraries
import glob
import pandas as pd  # Version 2.0.2
import numpy as np  # Version 1.25.0

# Set display options for pandas
pd.set_option("max_colwidth", 20)

# Define a dictionary for translating Spanish column names to English.
translation_dict = {
    "idDepartamento":"adm_id", 
    "Departamento":"adm_name",
    "Campaña":"planting_year",
    "Sup. Sembrada": "planted_area",
    "Sup. Cosechada": "harvested_area",
    "Producción": "production",
    "Rendimiento": "yield"
}

# Define a list of variables
crop_variables = ["yield", "production", "harvested_area", "planted_area"]

# Define the path to the folder containing only the datasets
data_path = "C:/Users/rhd630/Downloads/maize_AR.csv"


# Define metadata of your dataset
harvest_year_offset = 1 # 0 if harvest is in same year as planting, 1 if harvest is in the year after planting 
crop_name = "Maize"
crop_id = np.nan
season_name = np.nan
source = "https://datosestimaciones.magyp.gob.ar/"
harvest_month = "06" # Check e.g. https://ipad.fas.usda.gov/countrysummary/default.aspx?id=AR&crop=Corn
planting_month = "09" # Check e.g. https://ipad.fas.usda.gov/countrysummary/default.aspx?id=AR&crop=Corn

### 1. Read data

In [76]:
df = (
        pd
        .read_csv(data_path, skiprows=0, skipfooter=0, engine="python",encoding="latin1", delimiter=';')
        .rename(columns=translation_dict)
        .assign(harvest_year=lambda x: x.planting_year.str.split('/').str[0].astype(int) + harvest_year_offset, crop_id=crop_id, season_name=season_name, 
                planting_month=planting_month, crop_name=crop_name, source=source,  harvest_month=harvest_month, country_code="ARG")
        )

df.head(2)



Unnamed: 0,Cultivo,planting_year,Provincia,adm_name,idProvincia,adm_id,planted_area,harvested_area,production,yield,harvest_year,crop_id,season_name,planting_month,crop_name,source,harvest_month,country_code
0,Maíz,1969/70,BUENOS AIRES,25 DE MAYO,6,854,36000,33000,102300,3100,1970,,,10,Maize,https://datosest...,3,ARG
1,Maíz,1970/71,BUENOS AIRES,25 DE MAYO,6,854,55000,49000,161700,3300,1971,,,10,Maize,https://datosest...,3,ARG


### 2. Preprocessing

In [77]:
# Define harvest and planting dates
df["harvest_date"] = df["harvest_year"].astype(str) + "-" + df["harvest_month"]
df["planting_date"] = df["planting_year"].astype(str) + "-" + df["planting_month"]

# Adjust adm_id to include country code
df['adm_id'] = 'AR' + df['adm1_id'].astype(str).str.zfill(3) + df['adm_id'].astype(str).str.zfill(3)

# Prepare to filter columns
filtered_columns = ["crop_name", "crop_id", "country_code", "adm_id", "adm_name", "season_name", "planting_year", "planting_date", "harvest_year", "harvest_date", "source"] + crop_variables
df = df[filtered_columns]

# Replace placeholders with NaN and inspect unique values in crop variables
placeholders = ["SD"]  
df[crop_variables] = df[crop_variables].replace(placeholders, np.nan)

# Drop rows with all NaNs in crop_variables, fill other NaNs with 0, and reset index
df = df.dropna(subset=crop_variables, how="all").fillna(0).reset_index(drop=True)

# Convert crop variables to float
df[crop_variables] = df[crop_variables].astype("float64")

# Convert yield to tonnes per hectare (assuming original unit is kg/ha)
df["yield"] = df["yield"] / 1000

# Display the first few rows to confirm changes
df.head()


Unnamed: 0,crop_name,crop_id,country_code,adm_id,adm_name,season_name,planting_year,planting_date,harvest_year,harvest_date,source,yield,production,harvested_area,planted_area
0,Maize,0.0,ARG,AR854,25 DE MAYO,0.0,1969/70,1969/70-10,1970,1970-03,https://datosest...,3.1,102300.0,33000.0,36000.0
1,Maize,0.0,ARG,AR854,25 DE MAYO,0.0,1970/71,1970/71-10,1971,1971-03,https://datosest...,3.3,161700.0,49000.0,55000.0
2,Maize,0.0,ARG,AR854,25 DE MAYO,0.0,1971/72,1971/72-10,1972,1972-03,https://datosest...,2.0,76000.0,38000.0,42000.0
3,Maize,0.0,ARG,AR854,25 DE MAYO,0.0,1972/73,1972/73-10,1973,1973-03,https://datosest...,3.3,33000.0,10000.0,40000.0
4,Maize,0.0,ARG,AR854,25 DE MAYO,0.0,1973/74,1973/74-10,1974,1974-03,https://datosest...,3.0,84000.0,28000.0,28000.0


**Check if data was merged correctly from different variables**

Here, we validate if production divided by harvested area equals yield with respect to some rounding error.

In [78]:
df.loc[abs((df["yield"]) - (df["production"] / df["harvested_area"])) > 0.001]

Unnamed: 0,crop_name,crop_id,country_code,adm_id,adm_name,season_name,planting_year,planting_date,harvest_year,harvest_date,source,yield,production,harvested_area,planted_area


If DataFrame is empty, proceed to export. Otherwise de-bugging is required during the preprocessing steps.

### Export

In [111]:
export_path = "C:/Users/rhd630/../preprocessed_data_argentina.csv"
df.to_csv(export_path, index=False)