In [75]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Alternative BASE_DIR, currently disabled.
#BASE_DIR = "./work/carlos/complete_execution_andalucia"
# Directory to switch into before importing the local `src` package:
# one level above the directory the kernel starts in.
BASE_DIR = "./.."

# NOTE(review): this chdir is relative, so re-running the cell moves up one
# more level each time; run it once per kernel session.
os.chdir(BASE_DIR)
# The imports below come after the chdir, presumably so that `src` resolves
# from the new working directory -- TODO confirm.
# compute_statistics, batch, make_plots and compute_ratios are used later in
# this notebook without being defined here; presumably they arrive via this
# star import.
from src.report_functions import *
from src.VariableNameConversion import VariableNameConversion


In [103]:
# Render every float shown by pandas with exactly two decimals.
pd.set_option("display.float_format", "{:.2f}".format)

In [104]:
# Data directory, relative to the current working directory.
# NOTE(review): not referenced by the cells visible below, which spell out
# "./data" directly in their own paths.
DATA_PATH = "./data"


In [105]:
# Year used to pick the microdata file and the synthetic population file.
YEAR = 2018

# Use case identifier; it is embedded in every data path of this notebook.
USE_CASE = "poland"

# Variable list handed to VariableNameConversion when the microdata is converted.
TOTALS_VARIABLES = ["cultivatedArea"]

# Per-use-case data directory and the figures folder inside it.
BASE_PATH = f"./data/use_case_{USE_CASE}"
FIGURES_DEST_PATH = os.path.join(BASE_PATH, "figures")

In [106]:
# Try to enter ./complete_execution_andalucia. If that fails, the notebook
# carries on in the current working directory.
try:
    os.chdir("./complete_execution_andalucia")
except OSError:
    # Only filesystem errors (missing directory, not a directory, permissions)
    # are treated as "nothing to do"; anything else, including a keyboard
    # interrupt, now propagates instead of being silently swallowed.
    print("Already in the correct directory")

In [109]:
# Folder that receives the generated report for the selected use case.
REPORT_PATH = f"./data/use_case_{USE_CASE}/report"

# Create the folder on first use; the parent use-case directory must already exist.
if "report" in os.listdir(f"./data/use_case_{USE_CASE}"):
    print("Report directory already exists")
else:
    os.mkdir(REPORT_PATH)
    print("Report directory created")

### 1 Load original data

In [111]:
# Folder holding the cached, already converted microdata for this use case.
MICRODATA_DIR = f"./data/use_case_{USE_CASE}/microdata_agricore_format"

try:
    # Fast path: reuse the files written by a previous run of this cell.
    original_ = pd.read_csv(f"{MICRODATA_DIR}/microdata_agricore_format_{YEAR}.csv")
    # The weights must be loaded here as well: the upsampling and resampling
    # cells further down read `weights`, and without this line it is undefined
    # on a fresh kernel whenever the cache already exists.
    weights = pd.read_csv(f"{MICRODATA_DIR}/weights_{YEAR}.csv")
    # categoricals.csv is written below from a plain list, so its only column is named "0".
    categoricals = pd.read_csv(f"{MICRODATA_DIR}/categoricals.csv")["0"].tolist()

    print("Data already loaded")

except (OSError, KeyError, pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
    # Cache missing or unreadable: rebuild it from the raw microdata.
    print(f"Cached microdata not available ({exc!r})")
    print("Converting microdata to agricore format")

    vnc = VariableNameConversion(BASE_PATH, USE_CASE, YEAR, TOTALS_VARIABLES)
    result = vnc.main()

    # main() returns (microdata, categorical variable names, weights), in this order.
    original_ = result[0]
    categoricals = result[1]
    weights = result[2]

    if not os.path.isdir(MICRODATA_DIR):
        os.makedirs(MICRODATA_DIR)
        print("Microdata directory created")

    # Persist all three pieces so the next run can take the fast path.
    original_.to_csv(f"{MICRODATA_DIR}/microdata_agricore_format_{YEAR}.csv", index=False)
    weights.to_csv(f"{MICRODATA_DIR}/weights_{YEAR}.csv", index=False)
    pd.DataFrame(categoricals).to_csv(f"{MICRODATA_DIR}/categoricals.csv", index=False)


### Upsample original data

In [83]:
# Seed NumPy's global generator. Nothing in this cell draws random numbers,
# but later calls to np.random.* start from this state.
np.random.seed(28)

# Repeat each row label as many times as its weight, then renumber the rows.
original_data = (
    original_
    .loc[original_.index.repeat(weights.to_numpy().ravel())]
    .reset_index(drop=True)
)

print(f"Shape of the original data {original_.shape}")
print(f"Shape of the upsampled data {original_data.shape}")

### Load synthetic data

In [84]:
from pathlib import Path

SYNTHETIC_DATA_PATH = os.path.join(BASE_PATH, "synthetic_population")
# Sorted by modification time, oldest first, so the last match is the newest.
paths = sorted(Path(SYNTHETIC_DATA_PATH).iterdir(), key=os.path.getmtime)

print("Available synthetic populations")
for path in paths:
    print(path)

# Keep only entries whose file name mentions the selected year.
candidates = [path.name for path in paths if str(YEAR) in path.name]
if not candidates:
    # Fail with an explicit message instead of an IndexError on an empty list.
    raise FileNotFoundError(
        f"No synthetic population for year {YEAR} found in {SYNTHETIC_DATA_PATH}"
    )
SYNTHETIC_DATA_FILE = candidates[-1]

print("\nSelected synthetic population")
print(SYNTHETIC_DATA_FILE)

synthetic_data = pd.read_csv(os.path.join(SYNTHETIC_DATA_PATH, SYNTHETIC_DATA_FILE))
print("\nSynthetic population loaded")

synthetic_data.head()

### Fix sellingPrice for synthetic population

In [85]:
# Recompute <crop>.sellingPrice for every crop that has a cultivatedArea column.
for crop in [c.split(".")[0] for c in synthetic_data.columns if c.endswith("cultivatedArea")]:
    value_sales = synthetic_data[f"{crop}.valueSales"]
    quantity_sold = synthetic_data[f"{crop}.quantitySold"]
    has_quantity = quantity_sold > 0

    # Price is valueSales / quantitySold where something was sold, 0 elsewhere.
    # Computed column-wise rather than with a per-row apply, which builds a
    # full row Series for every farm and every crop and breaks on an empty frame.
    selling_price = value_sales.div(quantity_sold.where(has_quantity)).where(has_quantity, 0.0)
    synthetic_data[f"{crop}.sellingPrice"] = selling_price

    # Average over the strictly positive prices only
    avg_sellingPrice = selling_price[selling_price > 0].mean()

    if np.isnan(avg_sellingPrice):
        # No positive price for this crop: fall back to 0.
        avg_sellingPrice = 0.0
    print(f"{crop} - Average selling price: {avg_sellingPrice}")

    # Rows reporting sales value but ending up with a zero price
    needs_average = (selling_price == 0) & (value_sales > 0)

    # Impute the average price on those rows
    synthetic_data.loc[needs_average, f"{crop}.sellingPrice"] = avg_sellingPrice


#### Resample original data according to the synthetic data size

In [87]:
# Total number of farms represented by the weights. Summing the underlying
# array gives a scalar whether `weights` is a Series or a one-column DataFrame
# (int() on a one-element Series is deprecated in recent pandas).
n_farms_original = int(weights.to_numpy().sum())
print(f"Number of farms original: {n_farms_original}")

n_farms_synthetic = synthetic_data.shape[0]
print(f"Number of farms synthetic: {n_farms_synthetic}")

sampling_ratio = n_farms_synthetic/n_farms_original
print(f"Sampling ratio: {sampling_ratio}")
# Format the percentage directly; round(x, 4)*100 prints float artefacts
# such as 12.340000000000002.
print(f"Sampling {sampling_ratio * 100:.2f} % of original farms")

# Set to True to shrink the upsampled original data to the synthetic size.
RESAMPLE = False

if sampling_ratio < 1 and RESAMPLE:
    # NOTE: this rebinds original_data, so re-running the cell samples again
    # from the already reduced frame.
    original_data = original_data.sample(frac=sampling_ratio, replace=False, random_state=28)
else:
    print("No need to resample")

In [88]:
# Row counts of the two populations as they stand after the optional resampling.
n_farms_original = len(original_data)
print(f"Number of farms original: {n_farms_original}")

n_farms_synthetic = len(synthetic_data)
print(f"Number of farms synthetic: {n_farms_synthetic}")

In [90]:
def fill_zero_records(df, var):
    """
    Replace the zero entries of a column with values drawn from its non-zero entries.

    Each zero is replaced by a random draw from the distinct non-zero values of
    the column, weighted by how often each value occurs. The draw uses NumPy's
    global random state, so seed it beforehand for reproducible results.
    The DataFrame is modified in place and also returned.

    Args:
        df (pd.DataFrame): DataFrame with the data.
        var (str): Variable (column) whose zeros are filled.
    Returns:
        df (pd.DataFrame): The same DataFrame object with the zeros filled. It is
            returned unchanged when the column has no zeros, or when it has no
            non-zero values to sample from.
    """
    zero_mask = df[var] == 0
    n_zeros = int(zero_mask.sum())

    # Occurrences of every non-zero value (value_counts ignores missing values)
    counts = df.loc[df[var] != 0, var].value_counts()

    # Nothing to fill, or nothing to sample from: np.random.choice would raise
    # on an empty candidate list, so leave the data as it is.
    if n_zeros == 0 or counts.empty:
        return df

    # Normalised probability of each non-zero value
    candidates = counts.index.to_numpy()
    probabilities = (counts / counts.sum()).to_numpy()

    # Draw one replacement per zero and write the draws back in place
    df.loc[zero_mask, var] = np.random.choice(candidates, size=n_zeros, p=probabilities)

    return df


#### Fix sellingPrice issues

In [93]:

# Recompute <crop>.sellingPrice for every crop that has a cultivatedArea column.
for crop in [c.split(".")[0] for c in original_data.columns if c.endswith("cultivatedArea")]:
    value_sales = original_data[f"{crop}.valueSales"]
    quantity_sold = original_data[f"{crop}.quantitySold"]
    has_quantity = quantity_sold > 0

    # Price is valueSales / quantitySold where something was sold, 0 elsewhere.
    # Computed column-wise rather than with a per-row apply, which builds a
    # full row Series for every farm and every crop and breaks on an empty frame.
    selling_price = value_sales.div(quantity_sold.where(has_quantity)).where(has_quantity, 0.0)
    original_data[f"{crop}.sellingPrice"] = selling_price

    # Average over the strictly positive prices only
    avg_sellingPrice = selling_price[selling_price > 0].mean()

    if np.isnan(avg_sellingPrice):
        # No positive price for this crop: fall back to 0.
        avg_sellingPrice = 0.0
    print(f"{crop} - Average selling price: {avg_sellingPrice}")

    # Rows reporting sales value but ending up with a zero price
    needs_average = (selling_price == 0) & (value_sales > 0)

    # Impute the average price on those rows
    original_data.loc[needs_average, f"{crop}.sellingPrice"] = avg_sellingPrice


### Plot results

In [95]:
import matplotlib.pyplot as plt
import seaborn as sns

## Computing statistics

In [96]:
# Compare the original and synthetic populations. compute_statistics is not
# defined in this notebook (presumably provided by src.report_functions).
# The cells below rely on the result having a "KS result" column and an index
# made of variable names.
result_ = compute_statistics(original_data, synthetic_data, categoricals, REPORT_PATH, USE_CASE, YEAR)

# Show the full statistics table.
display(result_)

In [97]:
# Split the statistics table by the label stored in the "KS result" column.
similar_df = result_.loc[result_["KS result"] == "Similar"]
print(f"Number of similar variables: {len(similar_df)}")

different_df = result_.loc[result_["KS result"] == "Different"]
print(f"Number of different variables: {len(different_df)}")


In [40]:
# Grid layout of each sheet of plots: nrows x ncols panels.
nrows, ncols = 6, 2

# Plot every variable of the statistics table except the sellingPrice ones and
# "B_UT_20_A", nrows*ncols variables per call to batch().
# batch and make_plots are not defined in this notebook (presumably they come
# from src.report_functions).
for sheet_, batch_ in enumerate(
    batch(
        [
            v
            for v in result_.index
            if not (v.endswith("sellingPrice") or v in ["B_UT_20_A"])
        ],
        nrows * ncols,
    )
):
    make_plots(
        original_data, synthetic_data, batch_, nrows, ncols, sheet_,
        REPORT_PATH, USE_CASE, YEAR, REPORT=True,
    )
    

# Compute ratios

In [102]:
# Display one ratio table per variable, original vs synthetic population.
for var in ("cultivatedArea", "cropProduction", "quantitySold"):
    display(
        compute_ratios(original_data, synthetic_data, var)
    )
