In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Known use-case names (the same keys appear in UC_LINK).
USE_CASES = ["andalusia", "italy", "greece", "poland"]

# Use case analysed by the cells below.
use_case = "andalusia"

# Data folder of the selected use case, relative to the notebook's working directory.
BASE_PATH = f"./../data/use_case_{use_case}"

In [3]:
# Maps each use-case name to the prefix of its microdata file names
# (files are read as "<prefix><year>.csv" from the "microdata" folder).
UC_LINK = {
    "andalusia": "AND",
    "italy": "ITA", 
    "greece": "ELL", 
    "poland": "POL"
}

In [4]:
# Read one microdata file per accounting year (2014-2020) and stack them,
# tagging every row with the year it was read from.
# The file prefix comes from UC_LINK so the cell follows `use_case` instead of
# being hard-wired to Andalusia, and the frames are concatenated once at the
# end rather than re-copying the accumulated frame on every iteration.
yearly_frames = []

for y in range(2014, 2021):
    MICRODATA_FILEPATH = os.path.join(BASE_PATH, f"microdata/{UC_LINK[use_case]}{y}.csv")

    df_ = pd.read_csv(MICRODATA_FILEPATH)

    df_["YEAR"] = y

    yearly_frames.append(df_)

microdata_ = pd.concat(yearly_frames)


In [None]:
# Show the stacked multi-year microdata built in the previous cell.
microdata_

## Relevant variables to perform crop representativeness analysis
| Variable      | Description | Formula |
| :------------ | :------- | :------- |
| I_A_{code}_TA | Total area |  |
| I_A_{code}_IR | Irrigated area |  |
| I_PR_{code}_Q | Production quantity |  |
| I_SA_{code}_Q | Sales quantity |  |
| I_SA_{code}_V | Sales value |  |
| SE025         | Total Utilised Agricultural Area | (B_UO_10_A + B_UT_20_A + B_US_30_A) / 100 |
  

In [50]:
class RepresentativenessComputer():
    """
    Compute weighted, per-crop indicators from one year of farm microdata.

    The constructor reads the product mapping of the use case and prepares
    ``self.results`` (one row per non-organic product code). ``main`` loads the
    microdata file, restricts ``self.results`` to the crop codes that have a
    total-area column (``I_A_{code}_TA``) and fills in the indicator columns.

    Parameters
    ----------
    data_path: str
        folder of the use case; must contain ``microdata/`` and ``metadata/``
    use_case: str
        one of the keys of ``UC_LINK``
    year: int
        year used to build the microdata file name
    """

    # Use-case name -> prefix of the microdata file names ("<prefix><year>.csv").
    UC_LINK = {
        "andalusia": "AND",
        "italy": "ITA",
        "greece": "ELL",
        "poland": "POL"}

    def __init__(self, data_path, use_case, year):

        self.MICRODATA_FILEPATH = os.path.join(data_path, f"microdata/{self.UC_LINK[use_case]}{year}.csv")
        self.METADATA_PATH = os.path.join(data_path, "metadata")

        product_mapping = pd.read_csv(os.path.join(self.METADATA_PATH, "Product_Mapping.csv"))[["product_code", "Description", "CUSTOM GROUP (EN)"]]

        # Discard products whose custom group contains "ORG_".
        # astype(str) keeps rows with a missing group instead of raising on NaN.
        is_organic = product_mapping["CUSTOM GROUP (EN)"].astype(str).str.contains("ORG_", regex=False)

        # One row per retained product; indicator columns are added later.
        self.results = (
            product_mapping.loc[~is_organic, ["product_code", "Description", "CUSTOM GROUP (EN)"]]
            .copy(deep=True)
            .rename(columns={"product_code": "fadn_code", "CUSTOM GROUP (EN)": "product_group"})
        )

    def _load_external_files(self):
        """
        Read the microdata file and keep only the variables used by the analysis.

        Kept columns, in this order: the farm-level variables (location,
        economic size, type of farming, weight) that are present in the file,
        every column starting with "B", and the per-crop "I_*" columns for
        total area, production quantity, sales quantity/value and the
        OV/CV/FC/FU values. Columns that are entirely empty are dropped.

        Returns
        ----------
        microdata: pd.DataFrame
            filtered microdata
        """

        microdata = pd.read_csv(self.MICRODATA_FILEPATH).reset_index(drop=True)
        columns = list(microdata.columns)

        # Farm-level variables are optional here: _get_weights/_get_ancillary
        # provide a fallback for the ones that are missing.
        farm_variables = ["A_LO_40_N", "A_TY_90_ES", "A_TY_90_TF", "A_TY_80_W"]
        used_variables = [var for var in farm_variables if var in columns]

        used_variables += [var for var in columns if var.startswith("B")]

        crop_patterns = [
            ("I_A", "_TA"),   # total area
            ("I_PR", "_Q"),   # production quantity
            ("I_SA", "_Q"),   # sales quantity
            ("I_SA", "_V"),   # sales value
            ("I_OV", "_V"),
            ("I_CV", "_V"),
            ("I_FC", "_V"),
            ("I_FU", "_V"),
        ]
        for prefix, suffix in crop_patterns:
            used_variables += [var for var in columns if var.startswith(prefix) and var.endswith(suffix)]

        microdata = microdata[used_variables]

        microdata = microdata.dropna(axis=1, how='all')

        return microdata

    def _get_codes_available(self, microdata):
        """
        Get the crop codes that have a total-area column in the dataset and
        restrict ``self.results`` to them (indexed by code).

        Parameters
        ----------
        microdata: pd.DataFrame
            dataframe with the microdata of the sample to analyse

        Returns
        ----------
        codes: list
            list of crop codes (int), in column order, without duplicates
        """

        codes = []

        for ta_var in microdata.columns:
            if not (ta_var.startswith("I_A") and ta_var.endswith("_TA")):
                continue

            code = int(ta_var.replace("I_A_", "").replace("_TA", ""))

            if code not in codes:
                codes.append(code)

        # Set codes as index for results dataframe.
        # .copy() makes the filtered frame independent so that later column
        # assignments do not act on a slice of the original.
        self.results = self.results[self.results["fadn_code"].isin(codes)].copy()
        self.results.index = self.results["fadn_code"].tolist()

        return codes

    def _get_weights(self, microdata, weights_var="A_TY_80_W"):
        """
        Make sure the microdata has a weights column, adding a fallback one if needed.

        Parameters
        ----------
        microdata: pd.DataFrame
            dataframe with the microdata of the sample to analyse
        weights_var: str
            FADN variable expressing the representativeness of the farm in the population

        Returns
        ----------
        microdata: pd.DataFrame
            dataframe with the weights linked to each farm in the population sample
        """

        if weights_var not in microdata.columns:
            weights = self._compute_weights(microdata, weights_var)
            microdata = pd.concat([microdata, weights], axis=1)

        return microdata

    def _compute_weights(self, microdata, weights_var="A_TY_80_W"):
        """
        Fallback weights used when the microdata has no weights column.

        Every farm gets a weight of 1.0, so the indicators describe the sample
        itself rather than an extrapolated population. A warning is emitted
        because this is a placeholder, not an estimate of the real weights.

        Parameters
        ----------
        microdata: pd.DataFrame
            dataframe with the microdata of the sample to analyse
        weights_var: str
            name given to the generated weights column

        Returns
        ----------
        weights: pd.DataFrame
            single-column dataframe aligned with the microdata index
        """
        import warnings

        warnings.warn(
            f"Weights variable '{weights_var}' not found in the microdata; "
            "using a unit weight for every farm.",
            stacklevel=2,
        )

        return pd.Series(1.0, index=microdata.index, name=weights_var).to_frame()

    def _get_ancillary(self, microdata, location_var="A_LO_40_N", economic_size_var="A_TY_90_ES", ote_var="A_TY_90_TF"):
        """
        Make sure the location, economic-size and type-of-farming variables
        exist, adding a synthetic column for each one that is missing.

        Parameters
        ----------
        microdata: pd.DataFrame
            dataframe with the microdata of the sample to analyse
        location_var: str
            FADN variable expressing the location of the farm
        economic_size_var: str
            FADN variable expressing the economic size of the farm
        ote_var: str
            FADN variable expressing the techno-economical orientation of the farm

        Returns
        ----------
        microdata: pd.DataFrame
            microdata with the three variables present
        """

        for var in [location_var, economic_size_var, ote_var]:
            if var not in microdata.columns:
                synthetic_var_data = self._get_synthetic_var_data(microdata, var)

                microdata = pd.concat([microdata, synthetic_var_data], axis=1)

        return microdata

    def _get_synthetic_var_data(self, microdata, var):
        """
        Build a constant (all zeros) column for a missing ancillary variable.

        Parameters
        ----------
        microdata: pd.DataFrame
            dataframe with the microdata of the sample to analyse
        var: str
            variable for which synthetic data is computed

        Returns
        ----------
        synthetic_var_data: pd.DataFrame
            single-column dataframe of zeros aligned with the microdata index
        """

        # Reuse the microdata index so the axis=1 concat aligns row by row.
        synthetic_var_data = pd.Series(0, index=microdata.index, name=var).to_frame()

        return synthetic_var_data

    def _extrapolate_sample(self, microdata, weight_var="A_TY_80_W", location_var="A_LO_40_N", economic_size_var="A_TY_90_ES", ote_var="A_TY_90_TF"):
        """
        Unfinished: walk the (location, economic size, type of farming) strata
        of the sample. The replication of farms is not implemented, so the
        returned population is always empty.

        Parameters
        ----------
        microdata: pd.DataFrame
            dataframe with the microdata of the sample to analyse
        weight_var: str
            variable holding the farm weights
        location_var: str
            FADN variable expressing the location of the farm
        economic_size_var: str
            FADN variable expressing the economic size of the farm
        ote_var: str
            FADN variable expressing the techno-economical orientation of the farm

        Returns
        ----------
        population: pd.DataFrame
            extrapolated sample (currently an empty dataframe)
        """

        population = pd.DataFrame()

        # Number of farms the extrapolated population would contain.
        # NOTE: accumulated for the future implementation, not used yet.
        n_farms = 0

        for _, sample in microdata.groupby([location_var, economic_size_var, ote_var]):
            # The first weight found in the stratum is taken as the stratum weight.
            weight = sample[weight_var].unique()[0]

            n_farms += int(weight*sample.shape[0])

        population = population.reset_index(drop=True)

        return population

    def _utilised_agricultural_area(self, microdata):
        """
        Return SE025 per farm: the existing column if present, otherwise
        (B_UO_10_A + B_UT_20_A + B_US_30_A) / 100, with 0 where that sum is
        missing or not positive.
        """

        if "SE025" in microdata.columns:
            return microdata["SE025"]

        components = ["B_UO_10_A", "B_UT_20_A", "B_US_30_A"]
        missing = [var for var in components if var not in microdata.columns]
        if missing:
            raise KeyError(f"Cannot compute SE025, missing column(s): {missing}")

        total = microdata["B_UO_10_A"] + microdata["B_UT_20_A"] + microdata["B_US_30_A"]

        return (total/100).where(total > 0, 0)

    def _compute_indicators(self, microdata, weight_var="A_TY_80_W", location_var="A_LO_40_N", economic_size_var="A_TY_90_ES", ote_var="A_TY_90_TF"):
        """
        Fill ``self.results`` with the weighted indicators of every crop in its index.

        Columns written: n_appearances_abs, n_appearances_rel, total_area,
        average_area, production_quantity, sales_quantity, sales_value,
        share_area, share_income, n_crops_combination. Areas are divided by 100
        before being stored. Missing values are skipped by the sums and means.

        Parameters
        ----------
        microdata: pd.DataFrame
            dataframe with the microdata of the sample to analyse; it is not modified
        weight_var: str
            variable holding the farm weights
        location_var, economic_size_var, ote_var: str
            accepted for signature compatibility; not used by the computation
        """

        self.results["n_appearances_abs"] = 0
        # Float columns are created as floats so that per-crop assignments do
        # not have to change the column dtype.
        for column in ["n_appearances_rel", "total_area", "production_quantity", "sales_quantity", "sales_value"]:
            self.results[column] = 0.0
        # These stay NaN for crops whose source columns are not available.
        for column in ["average_area", "share_area", "share_income", "n_crops_combination"]:
            self.results[column] = float("nan")

        available = set(microdata.columns)
        weights = microdata[weight_var]

        # Loop-invariant inputs of the crop-combination indicator.
        ta_variables = [cta for cta in microdata.columns if cta.startswith("I_A") and cta.endswith("_TA")]
        grows_crop = microdata[ta_variables].fillna(0) > 0
        crops_per_farm = grows_crop.sum(axis=1)
        weights_filled = weights.fillna(0)

        # SE025 is computed on first use and shared by all crops.
        se025 = None

        for crop in self.results.index:

            area_var = f"I_A_{crop}_TA"
            sales_value_var = f"I_SA_{crop}_V"

            if area_var in available:
                area = microdata[area_var]
                weighted_area = weights*area/100

                # 1. Crop counter: sum of the weights of the farms growing the crop
                count_ext = weights.where(area > 0, 0).sum()
                self.results.at[crop, "n_appearances_abs"] = int(round(count_ext))

                # 2. Total area
                self.results.at[crop, "total_area"] = weighted_area.sum()

                # 3. Average area (kept in its own column; it used to overwrite total_area)
                self.results.at[crop, "average_area"] = weighted_area.mean()

            # 4. Production quantity
            if f"I_PR_{crop}_Q" in available:
                self.results.at[crop, "production_quantity"] = (weights*microdata[f"I_PR_{crop}_Q"]).sum()

            # 5. Sales quantity
            if f"I_SA_{crop}_Q" in available:
                self.results.at[crop, "sales_quantity"] = (weights*microdata[f"I_SA_{crop}_Q"]).sum()

            # 6. Sales value
            if sales_value_var in available:
                self.results.at[crop, "sales_value"] = (weights*microdata[sales_value_var]).sum()

            # 7. Share area: mean of weight * crop area / SE025 (0 where SE025 is not positive)
            if area_var in available:
                if se025 is None:
                    se025 = self._utilised_agricultural_area(microdata)

                share_area = (weights*(microdata[area_var]/100) / se025).where(se025 > 0, 0)
                self.results.at[crop, "share_area"] = share_area.mean()

            # 8. Share income: mean of weight * sales value / crop output, where
            #    crop output = -I_OV + I_CV + I_SA + I_FC + I_FU of this crop
            #    (0 where the output is not positive or a component is missing).
            if sales_value_var in available:
                output_vars = [f"I_OV_{crop}_V", f"I_CV_{crop}_V", sales_value_var, f"I_FC_{crop}_V", f"I_FU_{crop}_V"]

                if all(var in available for var in output_vars):
                    crop_output = (
                        -microdata[f"I_OV_{crop}_V"]
                        + microdata[f"I_CV_{crop}_V"]
                        + microdata[sales_value_var]
                        + microdata[f"I_FC_{crop}_V"]
                        + microdata[f"I_FU_{crop}_V"]
                    )
                    share_income = (weights*microdata[sales_value_var] / crop_output).where(crop_output > 0, 0).mean()
                else:
                    share_income = 0

                self.results.at[crop, "share_income"] = share_income

            # 9. n crops rotation: weighted average number of crops grown on
            #    the farms that grow this crop
            if area_var in available:
                growers = grows_crop[area_var]
                growers_weight = weights_filled[growers]
                total_weight = growers_weight.sum()

                if total_weight > 0:
                    n_crops_mean = (crops_per_farm[growers].mul(growers_weight)).sum() / total_weight
                else:
                    n_crops_mean = float("nan")

                self.results.at[crop, "n_crops_combination"] = n_crops_mean

        n_farms = round(weights.sum())

        if n_farms:
            self.results["n_appearances_rel"] = self.results["n_appearances_abs"] / n_farms
        else:
            self.results["n_appearances_rel"] = float("nan")

    def _compute_total_area_representation(self, microdata, weight_var="A_TY_80_W"):
        """
        Unfinished: computes, per crop, the summed weight of the farms growing
        it, but the value is neither stored nor returned.
        """

        for crop in self.results.index:

            count_ext = weights_sum = microdata[weight_var].where(microdata[f"I_A_{crop}_TA"] > 0, 0).sum()

    def main(self):
        """
        Run the whole computation and return the results table.

        Returns
        ----------
        results: pd.DataFrame
            ``self.results`` with one row per crop and the indicator columns
        """

        # 0. Import external files
        microdata = self._load_external_files()

        # 1. Check codes available (also restricts self.results to those codes)
        self._get_codes_available(microdata)

        # 2. Include weights variable if not available
        microdata = self._get_weights(microdata)

        # 3. Check location, economic size and OTE variables for weights extrapolation
        microdata = self._get_ancillary(microdata)

        # 4. Extrapolating the sample to the population is not implemented
        #    (see _extrapolate_sample); indicators are weighted instead.

        # 5. Compute indicators
        self._compute_indicators(microdata)

        return self.results
        

In [None]:
# Preview the product code -> custom group mapping of the selected use case.
# The path is derived from BASE_PATH instead of repeating the Andalusia folder.
product_mapping = pd.read_csv(os.path.join(BASE_PATH, "metadata", "Product_Mapping.csv"))[["product_code", "CUSTOM GROUP (EN)"]]
display(product_mapping)

In [None]:
# Run the representativeness computation for one use case and one year.
use_case = "andalusia"
data_path = f"./../data/use_case_{use_case}"

year = 2015

rep = RepresentativenessComputer(data_path, use_case, year)
# NOTE: main() returns the per-crop results table (the same object as rep.results).
population = rep.main()

In [None]:
# Sort keys in priority order; all of them are sorted from highest to lowest.
order = ["n_appearances_abs", "total_area", "sales_value", "share_area", ]

rep.results.sort_values(by=order, ascending=False)

In [None]:
# Histogram of the number of crops grown together with each crop.
# The results column is named "n_crops_combination" (with underscores).
rep.results["n_crops_combination"].hist(bins=int(rep.results["n_crops_combination"].max()))

In [None]:
# Histogram of the area share of each crop.
# The results column is named "share_area" (with an underscore).
rep.results["share_area"].hist()

In [None]:
# Histogram of the income share of each crop.
# The results column is named "share_income" (with an underscore).
rep.results["share_income"].hist(bins=30)