In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

BASE_DIR = "./.."

# Remember the directory the kernel started in the first time this cell runs,
# so that re-running the cell resolves BASE_DIR from the same place instead of
# climbing one more directory level on every execution.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()

# All relative paths used below ("./data/...") are resolved from this directory.
os.chdir(os.path.join(_NOTEBOOK_START_DIR, BASE_DIR))
from src.report_functions import *


In [2]:
# Folder holding the data of every use case (relative to the working directory).
DATA_PATH = "./data"

# Render floats with two decimals whenever pandas displays them.
pd.set_option("display.float_format", '{:.2f}'.format)


In [4]:
# Use case and reference year used by the cells below.
USE_CASE = "italy"
YEAR = 2019

# Root folder of the selected use case and the "figures" folder inside it.
BASE_PATH = f"./data/use_case_{USE_CASE}"
FIGURES_DEST_PATH = os.path.join(BASE_PATH, "figures")

TOTALS_VARIABLES = ["cultivatedArea"]

# Short codes keyed by use-case name.
microdata_abrev = dict(andalusia="AND", italy="ITA")

In [6]:
# Folder that receives the generated report for the selected use case.
REPORT_PATH = f"./data/use_case_{USE_CASE}/report"

# os.path.isdir (instead of looking the name up in os.listdir) makes sure a
# stray *file* called "report" is not mistaken for the directory, and
# os.makedirs also creates the use-case folder if it does not exist yet.
if os.path.isdir(REPORT_PATH):
    print("Report directory already exists")
else:
    os.makedirs(REPORT_PATH, exist_ok=True)
    print("Report directory created")

### 1 Load original data

In [7]:
# Location of the cached, already converted microdata for this use case/year.
microdata_dir = f"./data/use_case_{USE_CASE}/microdata_agricore_format"
microdata_csv = f"{microdata_dir}/microdata_agricore_format_{YEAR}.csv"
weights_csv = f"{microdata_dir}/weights_{YEAR}.csv"
categoricals_csv = f"{microdata_dir}/categoricals.csv"

try:
    # Fast path: reuse the files written by a previous run of this cell.
    original_ = pd.read_csv(microdata_csv)
    weights = pd.read_csv(weights_csv)
    # categoricals.csv is written below from a plain list, hence the single column named "0".
    categoricals = pd.read_csv(categoricals_csv)["0"].tolist()

    print("Data already loaded")

# Only a missing or unreadable cache triggers the rebuild; anything else
# (keyboard interrupt, typo, out-of-memory...) is allowed to surface.
except (FileNotFoundError, KeyError, pd.errors.EmptyDataError, pd.errors.ParserError) as cache_error:
    print(f"Cached microdata not usable ({cache_error!r}); rebuilding it from the source data")

    # Imported here because the converter is only needed when the cache has to be rebuilt.
    from src.italian_name_conversion import italian_name_conversion

    crops_variables = ["quantitySold", "valueSales", "cropProduction", "irrigatedArea", "cultivatedArea", "organicProductionType", "variableCostsCrops", "landValue", "quantityUsed", "sellingPrice"]

    METADATA_PATH = os.path.join(BASE_PATH, "metadata")
    CROPS_CODES_PATH = "Product_Mapping.csv"

    # Distinct, non-missing crop groups listed in the product mapping.
    crop_codes = pd.read_csv(os.path.join(METADATA_PATH, CROPS_CODES_PATH))
    crop_codes = crop_codes["CUSTOM GROUP (EN)"].dropna().unique().tolist()

    # NOTE(review): this list differs from the notebook-level TOTALS_VARIABLES — confirm that is intended.
    totals_variables = ["cultivatedArea", "irrigatedArea", "cropProduction"]

    original_, categoricals, weights = italian_name_conversion(
                    BASE_PATH, USE_CASE, YEAR, crops_variables, crop_codes, totals_variables)

    if os.path.isdir(microdata_dir):
        print("Microdata directory already exists")
    else:
        os.makedirs(microdata_dir, exist_ok=True)
        print("Microdata directory created")

    # Persist the converted data so the next run takes the fast path above.
    original_.to_csv(microdata_csv, index=False)
    weights.to_csv(weights_csv, index=False)
    pd.DataFrame(categoricals).to_csv(categoricals_csv, index=False)


### Upsample original data

In [8]:
# Seed NumPy's global random generator.
np.random.seed(28)

# Repeat every row of the sample as many times as its weight indicates.
repeat_counts = weights.to_numpy().flatten()
expanded_index = original_.index.repeat(repeat_counts)
original_data = original_.loc[expanded_index].reset_index(drop=True)

print(f"Shape of the original data {original_.shape}")
print(f"Shape of the upsampled data {original_data.shape}")

### Load synthetic data

In [9]:
SYNTHETIC_DATA_PATH = os.path.join(BASE_PATH, "synthetic_population")

# Print the synthetic population files available for this use case and year.
synthetic_file_prefix = f'Synthetic-Population-{USE_CASE}-{YEAR}'
for file in os.listdir(SYNTHETIC_DATA_PATH):
    if not file.startswith(synthetic_file_prefix):
        continue
    print(file)


In [10]:
# File to analyse, picked by hand among the ones printed by the previous cell.
SYNTHETIC_DATA_FILE = "Synthetic-Population-italy-2019-8-20-0-3.csv"

synthetic_data_file_path = os.path.join(SYNTHETIC_DATA_PATH, SYNTHETIC_DATA_FILE)
synthetic_data = pd.read_csv(synthetic_data_file_path)

In [11]:
# Preview the first rows of the synthetic population that was just loaded.
synthetic_data.head()

In [14]:
# NOTE(review): no cleaning step is visible between loading the file and this
# print (execution counts jump from 11 to 14) — confirm the label still applies.
print(f"Synthetic data after cleaning: {synthetic_data.shape}")

#### Resample original data according to the synthetic data size

In [15]:
# Number of farms represented by the sample: the sum of all weights.
# The scalar is taken from the underlying array because calling int() on the
# one-element Series returned by weights.sum() is deprecated in pandas.
n_farms_original = int(weights.to_numpy().sum())
print(f"Number of farms original: {n_farms_original}")

n_farms_synthetic = synthetic_data.shape[0]
print(f"Number of farms synthetic: {n_farms_synthetic}")

sampling_ratio = n_farms_synthetic/n_farms_original
print(f"Sampling ratio: {sampling_ratio}")
# Scale to a percentage before rounding: round(ratio, 4)*100 leaks binary
# floating point noise into the message (e.g. 7.000000000000001).
print(f"Sampling {round(sampling_ratio*100, 2)} % of original farms")

# Set to True to shrink the upsampled original data to the synthetic population size.
RESAMPLE = False

if sampling_ratio < 1 and RESAMPLE:
    original_data = original_data.sample(frac=sampling_ratio, replace=False, random_state=28)
    #synthetic_data = synthetic_data.sample(n=n_farms_original, replace=False, random_state=28)
else:
    print("No need to resample")

In [16]:
# Row counts of both populations as they stand after the optional resampling.
n_farms_original = len(original_data.index)
n_farms_synthetic = len(synthetic_data.index)

print(f"Number of farms original: {n_farms_original}")
print(f"Number of farms synthetic: {n_farms_synthetic}")

In [17]:
def fill_zero_records(df, var):
    """
    Replace the zero values of a column with values drawn from its non-zero ones.

    Each zero is replaced by a value sampled with NumPy's global random
    generator, using the relative frequency of the non-zero values of the
    column as probabilities. The DataFrame is modified in place.

    Args:
        df (pd.DataFrame): DataFrame with the data.
        var (str): Name of the column whose zeros are filled.
    Returns:
        df (pd.DataFrame): The same DataFrame, with the zeros filled. It is
            returned unchanged when the column has no zeros or when it has no
            non-zero value to sample from.
    """
    # Compute the mask once; it selects the rows to fill and, negated, the donors
    zero_mask = df[var] == 0
    n_zeros = int(zero_mask.sum())

    # Frequency of each category different from zero (missing values are not counted)
    category_counts = df.loc[~zero_mask, var].value_counts()

    # Nothing to fill, or nothing to sample from: sampling from an empty set of
    # categories would make np.random.choice raise
    if n_zeros == 0 or category_counts.empty:
        return df

    # Normalize the frequencies into probabilities
    probabilities = (category_counts / category_counts.sum()).tolist()

    # Generate random values to fill the zeros
    var_random = np.random.choice(category_counts.index.tolist(), size=n_zeros, p=probabilities)

    # Fill the zeros with random values
    df.loc[zero_mask, var] = var_random

    return df


#### Fix regionLevel2

In [18]:
def fix_region_level2(df):
    """
    Replace regionLevel2 codes equal to "0" by the code implied by regionLevel2Name.

    For every region name that appears with a "0" code, the other codes found
    for that name are collected; when exactly one such code exists it is
    written to all the records of that name. Names with no alternative code, or
    with several, are left untouched. The DataFrame is modified in place.

    Args:
        df (pd.DataFrame): Population with "regionLevel2" and "regionLevel2Name" columns.
    Returns:
        df (pd.DataFrame): The same DataFrame with the recoverable codes fixed.
    """
    # Boolean mask + .loc selects by row, whatever the index looks like.
    # (Feeding index labels to .iloc breaks as soon as the index is not 0..n-1.)
    # NOTE(review): only the string "0" is treated as missing; a numeric 0 is not matched.
    zero_code = df["regionLevel2"] == "0"
    problematic_regions = df.loc[zero_code, "regionLevel2Name"].unique().tolist()

    for rl2_name in problematic_regions:
        same_name = df["regionLevel2Name"] == rl2_name

        # Codes seen for this name, ignoring missing values and the "0" placeholder
        associated_codes = [
            code
            for code in df.loc[same_name, "regionLevel2"].dropna().unique().tolist()
            if code != "0"
        ]

        # Only fix the name when the replacement code is unambiguous
        if len(associated_codes) == 1:
            df.loc[same_name, "regionLevel2"] = associated_codes[0]

    return df


# Apply the regionLevel2 fix to both populations. fix_region_level2 writes into
# the DataFrame it receives and returns it, so each name keeps pointing to the same object.
original_data = fix_region_level2(original_data)
synthetic_data = fix_region_level2(synthetic_data)

#### Fix holderFamilyMembers datatype

In [19]:
# Store holderFamilyMembers as integers in both populations.
for population in (original_data, synthetic_data):
    family_members = population["holderFamilyMembers"]
    population["holderFamilyMembers"] = family_members.astype(int)

#### Fix taxes sign

In [20]:
# Keep the magnitude of taxes and drop the sign in both populations.
# Series.abs() is the vectorised equivalent of applying np.abs to every element.
original_data["taxes"] = original_data["taxes"].abs()
synthetic_data["taxes"] = synthetic_data["taxes"].abs()

#### Fix manureTotalSales

In [22]:
# Animal prefixes (the text before the first ".") of every synthetic column ending in manureTotalSales.
animals_with_manure_sales = [
    column.split(".")[0]
    for column in synthetic_data.columns
    if column.endswith("manureTotalSales")
]

for animal in animals_with_manure_sales:
    print(animal)

    # Summary statistics of both populations, side by side
    original_summary = original_data[f"{animal}.manureTotalSales"].describe().rename("Original")
    synthetic_summary = synthetic_data[f"{animal}.manureTotalSales"].describe().rename("Synthetic")
    display(pd.concat([original_summary, synthetic_summary], axis=1))

    # Only the synthetic column is overwritten; the original one is left as loaded.
    synthetic_data[f"{animal}.manureTotalSales"] = 0.0

### Plot results

In [23]:
import matplotlib.pyplot as plt
import seaborn as sns

In [24]:
# Compare the categorical variables of both populations.
# NOTE(review): plot_categoricals is presumably provided by the star import from
# src.report_functions and presumably writes its figures under REPORT_PATH — confirm.
plot_categoricals(original_data, synthetic_data, categoricals, REPORT_PATH, USE_CASE, YEAR)


## Computing statistics

In [25]:
# Statistics comparing the original and the synthetic population. The next cell
# reads the "KS result" column of this table and iterates over its index.
# NOTE(review): compute_statistics is presumably provided by the star import from src.report_functions.
result_ = compute_statistics(original_data, synthetic_data, categoricals, REPORT_PATH, USE_CASE, YEAR)

display(result_)

In [26]:
# Split the statistics table by the outcome stored in its "KS result" column.
ks_outcome = result_["KS result"]

similar_df = result_.loc[ks_outcome == "Similar"]
print(f"Number of similar variables: {len(similar_df.index)}")

different_df = result_.loc[ks_outcome == "Different"]
print(f"Number of different variables: {len(different_df.index)}")


In [27]:
# Summary statistics of the strictly positive CER.cropProduction values of both
# populations, side by side. Each column is labelled with the population it
# comes from (the synthetic one used to be labelled "Original" as well).
cer_production = "CER.cropProduction"

pd.concat([
    original_data.loc[original_data[cer_production] > 0, cer_production].describe().rename("Original").to_frame(),
    synthetic_data.loc[synthetic_data[cer_production] > 0, cer_production].describe().rename("Synthetic").to_frame(),
], axis=1)


In [28]:
# Grid of every sheet of plots: nrows x ncols variables per sheet.
nrows = 6
ncols = 2
plots_per_sheet = nrows*ncols

# Plot the variables flagged as "Different", one sheet per batch.
different_variables = list(different_df.index)

for sheet_, batch_ in enumerate(batch(different_variables, plots_per_sheet)):
    make_plots(original_data, synthetic_data, batch_, nrows, ncols, sheet_, REPORT_PATH, USE_CASE, YEAR, REPORT=True)
    

# Compute ratios

In [29]:
# Display the ratios table of each of these variables, one after another.
ratio_variables = ["cultivatedArea", "cropProduction", "quantitySold"]

for var in ratio_variables:
    ratios_table = compute_ratios(original_data, synthetic_data, var)
    display(ratios_table)
