# 01 - First Exploration

## System 

In [None]:
cd ../

In [None]:
pwd

## Imports

In [None]:
import os, sys, logging

from dataclasses import dataclass

In [None]:
from IPython.display import display, HTML

In [None]:
import numpy as np
import pandas as pd

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

import missingno as msno

In [None]:
from gbs.etl.extract import Extract

## Data

In [None]:
extract = Extract()
extract

In [None]:
# Assemble the data directory from the path components exposed by `extract`.
data_dir = os.path.join(extract.base, extract.folder, extract.subfolder)
data_dir

In [None]:
# # get data
# if (not os.path.exists(data_dir)) or  (not os.listdir(data_dir)):
#     extract.get_all(clean=True, include_production=True)

In [None]:
extract.get_all(clean=True, include_production=True)

In [None]:
data_dir = "./data/source/"
os.listdir(data_dir)

In [None]:
!rm data/source/production.zip

In [None]:
!tree -L 1 ./

In [None]:
!tree -L 3 data/

In [None]:
crops = pd.read_csv(os.path.join(data_dir, "crops.csv"))
crops

In [None]:
country_specs = pd.read_csv(os.path.join(data_dir, "country_specs.csv"))
country_specs.head()

In [None]:
_path = "./data/source/production/"

# First keep the entries of the folder that end with ".csv", then turn each
# kept entry into a path prefixed with the folder.
_csv_names = [entry for entry in os.listdir(_path) if entry.endswith(".csv")]
fn_list = [os.path.join(_path, entry) for entry in _csv_names]
fn_list

In [None]:
# Rewrite every file of `fn_list` as UTF-8, removing the stray quote that
# follows a comma (",'" -> ",").
for fn in fn_list:
    # read file
    # The source files are read as latin-1, but a file already rewritten by a
    # previous run of this cell is UTF-8: decoding it as latin-1 again would
    # garble its non-ASCII characters. Try UTF-8 first so the cell can be
    # re-run safely, and fall back to latin-1 for the untouched files.
    try:
        with open(fn, "r", encoding="utf8") as f:
            txt = f.read()
    except UnicodeDecodeError:
        with open(fn, "r", encoding="latin-1") as f:
            txt = f.read()

    # clean file
    # str.replace returns a new string (strings are immutable), so the result
    # has to be assigned back; otherwise the file is written unchanged.
    txt = txt.replace(",'", ",")

    # write file
    with open(fn, "w", encoding="utf8") as f:
        f.write(txt)

In [None]:
_path = "./data/source/production/"

In [None]:
@dataclass
class Production:
    """Production data for a crop in a country in a year.

    Groups the four tables read from the CSV files located in ``_path``.
    """

    # NOTE(review): none of the attributes below is annotated, so `@dataclass`
    # registers no field for them: they are plain class attributes shared by
    # the class and its instances, and each CSV file is read once, when this
    # class body is executed (not when an instance is created).
    areacodes = pd.read_csv(os.path.join(_path, "areacodes.csv"))
    data_normalized = pd.read_csv(os.path.join(_path, "data_normalized.csv"))
    flags = pd.read_csv(os.path.join(_path, "flags.csv"))
    itemcodes = pd.read_csv(os.path.join(_path, "itemcodes.csv"))

## First exploration 

### Crops 

#### Display

In [None]:
crops.head(10)

In [None]:
crops.tail(10)

In [None]:
crops.sample(10)

#### Structure

In [None]:
crops.shape

In [None]:
crops.columns

In [None]:
crops.dtypes

In [None]:
crops.info()

In [None]:
crops.dtypes.value_counts()

In [None]:
# Display, one dtype family at a time, the columns of `crops` of that dtype.
for dtype in ("object", "float", "int"):
    display(crops.select_dtypes(include=[dtype]).columns)

In [None]:
_num = crops.select_dtypes(include=["number"])
_num

In [None]:
_num_cols = _num.columns.tolist()
[i for i in _num_cols if ("code" not in i) and ("id" not in i)]

**Conclusion**

No numeric data (other than code / id columns) in the crops dataset.

#### NaN

In [None]:
crops.isna().sum()

In [None]:
tmp = crops.isna().mean().round(2)
tmp[tmp > 0.00]

In [None]:
len(tmp[tmp > 0.00]) / len(tmp)

In [None]:
tmp = crops.isna().mean(axis=1).round(2)
tmp.value_counts().sort_index()

In [None]:
msno.matrix(crops)

**Conclusion**

- Drop the `is_EU27` and `is_south_america` columns.

In [None]:
# delete is_EU27 and is_south_america

In [None]:
drop_cols = ["is_EU27", "is_south_america"]
_crops = crops.drop(columns=drop_cols)
msno.matrix(_crops)

In [None]:
tmp = _crops.isna().mean(axis=1).round(2)
tmp.value_counts().sort_index()

In [None]:
tmp[tmp > 0.33]

In [None]:
drop_idxs = tmp[tmp > 0.33].index
crops.loc[drop_idxs, :]

**Conclusion**

- Countries with a NaN rate > 0.33 are islands or have fewer than 100k inhabitants (e.g. Monaco, Antigua).

In [None]:
_crops = crops.drop(index=drop_idxs, columns=drop_cols)
msno.matrix(_crops)

In [None]:
_crops

In [None]:
# Float / int columns of `_crops`, leaving out those whose name contains "code".
num_cols = [
    col
    for col in _crops.select_dtypes(include=["float", "int"]).columns
    if "code" not in col
]

# Descriptive columns shown next to the numeric ones.
categ_cols = ["alpha_3_code", "FAO_country_name", "exiobase_region_name"]

_crops[num_cols + categ_cols]

**Conclusion**

- No relevant data in the crop dataset

#### DataSet Conclusion

- No relevant data in the crop dataset