# 01 - First Exploration

First exploration of the data.

## Preliminaries

### System 

In [None]:
cd ../

In [None]:
pwd

### Imports

In [None]:
import os, sys, logging

from dataclasses import dataclass

In [None]:
from IPython.display import display, HTML

In [None]:
import numpy as np
import pandas as pd

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

import missingno as msno

In [None]:
from gbs.etl.extract import Extract

### Data

In [None]:
extract = Extract()
extract

In [None]:
# Build the data directory path from the extractor's base / folder /
# subfolder attributes.
_dir_parts = (extract.base, extract.folder, extract.subfolder)
data_dir = os.path.join(*_dir_parts)
data_dir

In [None]:
# # get data
# if (not os.path.exists(data_dir)) or  (not os.listdir(data_dir)):
#     extract.get_all(clean=True, include_production=True)

In [None]:
# extract.get_all(clean=True, production=True)

In [None]:
data_dir = "./data/source/"
os.listdir(data_dir)

In [None]:
!rm data/source/production.zip

In [None]:
!tree -L 1 ./

In [None]:
!tree -L 3 data/

In [None]:
crops = pd.read_csv(os.path.join(data_dir, "crops.csv"))
crops

In [None]:
country_specs = pd.read_csv(os.path.join(data_dir, "country_specs.csv"))
country_specs.head()

In [None]:
# Collect the path of every CSV file found directly in the production folder.
_path = "./data/source/production/"

fn_list = []
for _name in os.listdir(_path):
    if _name.endswith(".csv"):
        fn_list.append(os.path.join(_path, _name))
fn_list

In [None]:
# for fn in fn_list:
#     # read file
#     with open(fn, "r", encoding="latin-1") as f:
#         txt = f.read()

#     # clean file
#     txt.replace(",'", ",")

#     # write file
#     with open(fn, "w", encoding="utf8") as f:
#         f.write(txt)

In [None]:
_path = "./data/source/production/"

In [None]:
@dataclass
class Production:
    """Namespace exposing the production CSV tables as class attributes.

    Each table is loaded with ``pd.read_csv`` from the folder named by the
    module-level ``_path`` at the moment the class body is executed.
    """

    # NOTE(review): none of the attributes below has a type annotation, so
    # ``@dataclass`` does not register them as fields; they are plain class
    # attributes, read once and shared by the class and all its instances.

    # Contents of areacodes.csv.
    areacodes = pd.read_csv(os.path.join(_path, "areacodes.csv"))
    # Contents of data_normalized.csv.
    data_normalized = pd.read_csv(os.path.join(_path, "data_normalized.csv"))
    # Contents of flags.csv.
    flags = pd.read_csv(os.path.join(_path, "flags.csv"))
    # Contents of itemcodes.csv.
    itemcodes = pd.read_csv(os.path.join(_path, "itemcodes.csv"))

## Exploration 

### Country Specs 

#### Display

In [None]:
country_specs.head(10)

In [None]:
country_specs.tail(10)

In [None]:
country_specs.sample(10)

#### Structure

In [None]:
country_specs.shape

In [None]:
country_specs.columns

In [None]:
country_specs.dtypes

In [None]:
country_specs.info()

In [None]:
# Number of country_specs columns per dtype (this section inspects
# country_specs, not crops).
country_specs.dtypes.value_counts()

In [None]:
# Show the country_specs column labels, one dtype family at a time.
for dtype in ("object", "float", "int"):
    selected_dtype = country_specs.select_dtypes(include=dtype)
    _labels = selected_dtype.columns
    display(_labels)

In [None]:
_num = country_specs.select_dtypes(include=["number"])
_num

In [None]:
# Numeric column labels, then only those that are neither codes nor ids.
_num_cols = list(_num.columns)
[name for name in _num_cols if not ("code" in name or "id" in name)]

**Conclusion**

No data in the crop dataset.

#### Nan

In [None]:
# Number of missing values per country_specs column (the table this section
# analyses).
country_specs.isna().sum()

In [None]:
# Share of missing values per country_specs column, rounded to 2 decimals;
# display only the columns whose rounded share is above zero.
tmp = country_specs.isna().mean().round(2)
tmp[tmp > 0.00]

In [None]:
len(tmp[tmp > 0.00]) / len(tmp)

In [None]:
# Share of missing values per country_specs row, then the number of rows
# found at each (rounded) rate.
tmp = country_specs.isna().mean(axis=1).round(2)
tmp.value_counts().sort_index()

In [None]:
msno.matrix(country_specs)

**Conclusion**

- delete is_EU27 and is_south_america

In [None]:
# delete is_EU27 and is_south_america

In [None]:
# Remove the two columns singled out by the NaN analysis.
# errors="ignore" keeps the cell re-runnable once the columns are gone,
# matching the later drop that already passes it.
drop_cols = ["is_EU27", "is_south_america"]
country_specs = country_specs.drop(columns=drop_cols, errors="ignore")
msno.matrix(country_specs)

In [None]:
tmp = country_specs.isna().mean(axis=1).round(2)
tmp.value_counts().sort_index()

In [None]:
threshold = 0.2
tmp[tmp >= threshold]

In [None]:
# Index labels of the rows whose NaN rate is strictly above the threshold,
# followed by a display of those rows.
# NOTE(review): the preview cell filters with ``>=`` while this one uses
# ``>``; rows exactly at the threshold are previewed but not selected here.
# Confirm which comparison is intended.
drop_idxs = tmp[tmp > threshold].index
country_specs.loc[drop_idxs, :]

**Conclusion**

- Countries with a NaN rate > 0.3 are islands or have fewer than 100k inhabitants (e.g. Monaco, Antigua)

In [None]:
# Remove the selected rows, then (if still present) the discarded columns,
# and plot the missing-value matrix of what is left.
country_specs = country_specs.drop(index=drop_idxs, errors="ignore")
country_specs = country_specs.drop(columns=drop_cols, errors="ignore")
msno.matrix(country_specs)

In [None]:
# Float / int columns, leaving out every label that contains "code".
num_cols = [
    name
    for name in country_specs.select_dtypes(include=["float", "int"]).columns
    if "code" not in name
]

# Descriptive columns kept next to the numeric ones.
categ_cols = [
    "alpha_3_code",
    "FAO_country_name",
    "exiobase_region_name",
    "globio_country_code",
    "globio_country_name",
    "USS30_region_name",
]

# Restrict the table to those columns, numeric ones first.
_kept = num_cols + categ_cols
country_specs = country_specs.loc[:, _kept]

In [None]:
country_specs

**Conclusion**

- No relevant data in the crop dataset

#### Data Inspection

In [None]:
country_specs.FAO_country_name.value_counts()

In [None]:
country_specs.FAO_country_name.nunique()

In [None]:
country_specs.USS30_region_name.value_counts()

In [None]:
country_specs.FAO_country_name.value_counts().value_counts()

In [None]:
country_specs.groupby("exiobase_region_name").FAO_country_name.count()

#### DataSet Conclusion

- No relevant data in the crop dataset

### Crops 

#### Wheat Selection

In [None]:
# Restrict the study to a single crop.
# .copy() yields an independent frame, so the in-place column drops applied
# to `crops` afterwards do not act on a selection of the full table.
feature = "Wheat"
crops = crops.loc[crops.item_name == feature, :].copy()

In [None]:
item_cols = [i for i in crops.columns if "item" in i]
item_cols

In [None]:
# Drop the item columns by reassignment rather than in place: `crops` comes
# from a boolean filter, and mutating such a selection in place can raise
# SettingWithCopyWarning.
crops = crops.drop(columns=item_cols, errors="ignore")
crops

In [None]:
crops

#### Display

In [None]:
crops.head(10)

In [None]:
crops.tail(10)

In [None]:
crops.sample(10)

#### Structure

In [None]:
# Remove the "id" column if present (reassigned instead of mutated in place,
# since `crops` is a filtered selection).
crops = crops.drop(columns="id", errors="ignore")

In [None]:
crops.shape

In [None]:
crops.columns

In [None]:
crops.dtypes

In [None]:
crops.info()

In [None]:
crops.dtypes.value_counts()

In [None]:
# Show the crops column labels, one dtype family at a time.
for dtype in ("object", "float", "int"):
    selected_dtype = crops.select_dtypes(include=dtype)
    _labels = selected_dtype.columns
    display(_labels)

In [None]:
_num = crops.select_dtypes(include=["number"])
_num

In [None]:
_feat_cols = [i for i in _num.columns if "msa" in i]
_feat_cols

#### Separation Static/Dynamic

In [None]:
# Sort the crops column labels into three lists by the substring they carry:
# no "msa" -> categorical, "static" -> static, "dynamic" -> dynamic.
categ_cols, static_cols, dynamic_cols = [], [], []
for _label in crops.columns:
    if "msa" not in _label:
        categ_cols.append(_label)
    if "static" in _label:
        static_cols.append(_label)
    if "dynamic" in _label:
        dynamic_cols.append(_label)

for _group in (categ_cols, static_cols, dynamic_cols):
    display(_group)

In [None]:
crops_static = crops.loc[:, categ_cols + static_cols]
crops_static.head()

In [None]:
crops_dynamic = crops.loc[:, categ_cols + dynamic_cols]
crops_dynamic

# BE CAREFUL => in the report we have to distinguish between terrestrial (static and dynamic) and marine (static and dynamic)

#### Summarize

crops_static

In [None]:
crops_static

In [None]:
sum_static = crops_static.iloc[:, 1:].sum(axis=1)
sum_static

In [None]:
# Keep the first column only and attach the row-wise sum computed earlier.
# .copy() turns the positional slice into an independent frame, so adding a
# column does not raise SettingWithCopyWarning.
crops_static = crops_static.iloc[:, :1].copy()
crops_static["sum_static"] = sum_static.values

#### Data Inspection

In [None]:
crops_static.describe()

In [None]:
crops.globio_country_code.value_counts()

In [None]:
crops.globio_country_code.nunique()

**Conclusion**

????

#### Merge Country Specs and crops

In [None]:
country_specs

In [None]:
# merged = pd.merge(left=_country_specs, right=crops, on="globio_country_code",how="left")
# merged

In [None]:
# Lookup table: globio country code -> globio country name.
# Built directly from the two columns, so no intermediate selection has to be
# mutated in place.
tmp = country_specs.set_index("globio_country_code")[
    "globio_country_name"
].to_dict()
tmp

In [None]:
# Attach the country name to each row through the code -> name lookup.
# Series.map does the lookup in one call and yields NaN for codes that are
# absent from the table.
crops_static["globio_country_name"] = crops_static["globio_country_code"].map(
    tmp
)
crops_static

#### Nan

In [None]:
crops_static.isna().sum()

**Conclusion**

???

**Conclusion**
???

## Production

#### Table Analysis

In [None]:
Production.areacodes

In [None]:
Production.flags

In [None]:
# Dictionary pairing each value of the Flag column with the Description found
# on the same row.
_flag_table = Production.flags
flags = dict(
    zip(_flag_table["Flag"].values, _flag_table["Description"].values)
)
flags

In [None]:
Production.itemcodes

In [None]:
data = Production.data_normalized
data

In [None]:
# Translate each flag code into its description.
# Series.map leaves NaN where a code has no entry in `flags`, instead of
# aborting the whole cell on a KeyError as a per-row `flags[i]` lookup does.
data["Flag_value"] = data["Flag"].map(flags)
data

In [None]:
data.columns

#### Feature selection

In [None]:
cols = [
    # "Area Code",
    # "Area Code (M49)",
    "Area",
    # "Item Code",
    # "Item Code (CPC)",
    "Item",
    # "Element Code",
    "Element",
    # "Year Code",
    "Year",
    "Unit",
    "Value",
    # "Flag",
    # "Note",
    "Flag_value",
]

In [None]:
data = data.loc[:, cols]
data

In [None]:
data.Element.nunique()

In [None]:
data.Element.value_counts()

In [None]:
data.Item.value_counts()

In [None]:
# Keep the wheat rows only.
# - The word-boundary pattern matches "wheat" as a whole word, so an item that
#   merely contains those letters (e.g. "Buckwheat") is not selected.
# - case=False replaces the explicit lower-casing.
# - na=False treats a missing item name as "no match" instead of producing a
#   NaN in the boolean mask.
# - .copy() detaches the result from `data`.
data_weat = data.loc[
    data.Item.str.contains(r"\bwheat\b", case=False, na=False), :
].copy()
data_weat

In [None]:
data_weat_2019 = data_weat.loc[data_weat.Year == 2019, :]
data_weat_2019

In [None]:
data_weat_2019_h = data_weat_2019.loc[data_weat_2019.Unit == "ha", :]

In [None]:
data_weat_2019_h

In [None]:
data_weat_2019_h.columns

In [None]:
cols = [
    "Area",
    # "Item",
    # "Element",
    # "Year",
    # "Unit",
    "Value",
    "Flag_value",
]

data_weat_2019_h = data_weat_2019_h.loc[:, cols]
data_weat_2019_h

#### Display 

In [None]:
data_weat_2019_h.head(10)

In [None]:
data_weat_2019_h.tail(10)

In [None]:
data_weat_2019_h.sample(10)

#### Structure

In [None]:
data_weat_2019_h.shape

In [None]:
data_weat_2019_h.info()

#### Nan

In [None]:
data_weat_2019_h.isna().sum()

In [None]:
data_weat_2019_h

data_weat_2019_h.shape

## Final Merge

### Keys Analysis

In [None]:
# The rows were selected with Unit == "ha", so "Value" holds hectares.
# Convert to square kilometres (1 ha = 0.01 km2) while renaming the column,
# so the "km2" label is true to its content.
HA_TO_KM2 = 0.01

# sort_values returns a new frame: no in-place mutation of a selection.
data_weat_2019_h = data_weat_2019_h.sort_values("Area", ascending=True)

# Guarded so that re-running the cell does not scale the values twice.
if "Value" in data_weat_2019_h.columns:
    data_weat_2019_h = data_weat_2019_h.rename(columns={"Value": "km2"})
    data_weat_2019_h["km2"] = data_weat_2019_h["km2"] * HA_TO_KM2
data_weat_2019_h

###### BE CAREFUL: the km2 unit is assumed, not verified

In [None]:
crops_static.sort_values("globio_country_name", ascending=True, inplace=True)
crops_static

In [None]:
country_specs.sort_values("FAO_country_name", ascending=True, inplace=True)
country_specs

In [None]:
crops_static.shape

In [None]:
data_weat_2019_h.shape

In [None]:
country_specs.shape

### data vs country ON FAO_country_name

In [None]:
merge1 = pd.merge(
    left=country_specs,
    right=data_weat_2019_h,
    left_on="FAO_country_name",
    right_on="Area",
    how="outer",
    indicator=True,
)

merge1

In [None]:
merge1.rename(columns={"_merge": "_merge_1"}, inplace=True)
merge1

data_weat_2019_h.head()

### Merge1 v crops 

In [None]:
# Outer join of merge1 with crops_static on the shared globio country name;
# the indicator column records which side each row came from.
merge2 = merge1.merge(
    crops_static,
    on="globio_country_name",
    how="outer",
    indicator=True,
)

merge2

In [None]:
merge2.rename(columns={"_merge": "_merge_2"}, inplace=True)
merge2

In [None]:
merge2.loc[merge2.loc[:, "_merge_2"] == "both", :]

In [None]:
merge2.loc[merge2.loc[:, "_merge_2"] == "both", :]

In [None]:
final = merge2.loc[merge2.loc[:, "_merge_2"] == "both", :]

### Select Features 

In [None]:
final

In [None]:
final.columns

In [None]:
cols = [
    # "exiobase_region_id",
    "alpha_3_code",
    "FAO_country_name",
    # "exiobase_region_name",
    # "globio_country_code_x",
    "globio_country_name",
    "USS30_region_name",
    "Area",
    "km2",
    "Flag_value",
    # "_merge_1",
    # "globio_country_code_y",
    "sum_static",
    # "_merge_2",
]

In [None]:
final = final.loc[:, cols]
final

In [None]:
# Product of the area column and the summed static column.
# assign() returns a new frame, so nothing is written into a selection of
# merge2 (which would raise SettingWithCopyWarning).
final = final.assign(**{"ms.km2": final.km2 * final.sum_static})
final

## Feature Engineering

### population

In [None]:
fn = "./data/source/population/API_SP.POP.TOTL_DS2_en_csv_v2_6298256.csv"


# Read the raw file as a list of lines, decoded as UTF-8.
encoding = "utf8"
with open(fn, "r", encoding=encoding) as f:
    txt = f.readlines()

In [None]:
txt

In [None]:
txt[4:]

In [None]:
# Drop whatever precedes the CSV header row, but only while it is still
# there: cutting a fixed 4 lines on every run would remove the header and the
# first data rows the second time this cell is executed.
header_idx = next(
    (idx for idx, line in enumerate(txt) if "Country Name" in line), None
)
if header_idx:  # None: header row not found; 0: file already cleaned
    with open(fn, "w", encoding="utf8") as f:
        f.writelines(txt[header_idx:])

In [None]:
pop = pd.read_csv(fn)

In [None]:
pop

In [None]:
pop = pop.loc[:, pop.columns.tolist()[:2] + ["2019"]]
pop

In [None]:
pop.rename(columns={"2019": "population"}, inplace=True)

## GDP

In [None]:
fn = "./data/source/gpd/API_NY.GDP.PCAP.CD_DS2_en_csv_v2_6298251.csv"

In [None]:
with open(fn, "r", encoding="utf8") as f:
    txt = f.readlines()

In [None]:
txt

In [None]:
txt[4:]

In [None]:
# Drop whatever precedes the CSV header row, but only while it is still
# there: cutting a fixed 4 lines on every run would remove the header and the
# first data rows the second time this cell is executed.
header_idx = next(
    (idx for idx, line in enumerate(txt) if "Country Name" in line), None
)
if header_idx:  # None: header row not found; 0: file already cleaned
    with open(fn, "w", encoding="utf8") as f:
        f.writelines(txt[header_idx:])

In [None]:
gpd = pd.read_csv(fn)
gpd

In [None]:
gpd = gpd.loc[:, gpd.columns.tolist()[:2] + ["2019"]]
gpd

In [None]:
gpd.rename(columns={"2019": "gpd_per_capita"}, inplace=True)
gpd

### merge both

In [None]:
pop_gpd = pd.merge(
    left=pop, right=gpd, on=["Country Name", "Country Code"], how="outer"
)
pop_gpd

In [None]:
# Three-letter country codes kept for spot checks; each code is its own list
# element.
test_3_codes = ["AUT", "BEL", "BGR"]

# Rows whose country name contains "Bel".
pop_gpd.loc[pop_gpd.loc[:, "Country Name"].str.contains("Bel"), :]

In [None]:
pop_gpd.loc[pop_gpd.loc[:, "Country Name"].str.contains("Bul"), :]

In [None]:
final_gpd_pop = pd.merge(
    left=final,
    right=pop_gpd,
    left_on="alpha_3_code",
    right_on="Country Code",
    how="left",
)
final_gpd_pop

### Final Selection

In [None]:
final_gpd_pop.columns

In [None]:
cols = [
    # "exiobase_region_id",
    # "alpha_3_code",
    # "FAO_country_name",
    # "exiobase_region_name",
    # "globio_country_code_x",
    # "globio_country_name",
    "USS30_region_name",
    # "Area",
    "km2",
    # "Flag_value",
    # "_merge_1",
    # "globio_country_code_y",
    "sum_static",
    # "_merge_2",
    "ms.km2",
    "Country Name",
    "Country Code",
    "population",
    "gpd_per_capita",
]

In [None]:
final_gpd_pop = final_gpd_pop.loc[:, cols]
final_gpd_pop

In [None]:
final_gpd_pop.columns = [i.lower() for i in final_gpd_pop.columns]
final_gpd_pop

In [None]:
final_gpd_pop.columns

In [None]:
cols = [
    "uss30_region_name",
    "country name",
    "country code",
    "km2",
    "sum_static",
    "ms.km2",
    "population",
    "gpd_per_capita",
]

In [None]:
# Keep the selected columns in the chosen order.
final_gpd_pop = final_gpd_pop.loc[:, cols]

# rename() returns a new frame; it must be assigned back, otherwise the new
# labels never reach the frame that is exported afterwards.
final_gpd_pop = final_gpd_pop.rename(
    columns={
        "uss30_region_name": "region",
        "sum_static": "sum_msa_static",
        "ms.km2": "msa.km2",
    }
)
final_gpd_pop

In [None]:
final_gpd_pop.to_csv("./data/final.csv", index=False)