# 00-Get-Data

Locate the source datasets (crops, country specs and QCL production data) and download them into `data/source/`.

## System

In [1]:
# move the working directory up one level (explicit line magic rather than automagic)
%cd ..

/home/alex/tmp/Global-Biodiversity-Score


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
# print the current working directory to confirm where the notebook is running
!pwd

/home/alex/tmp/Global-Biodiversity-Score


## Imports 

In [3]:
# built in modules
import os
import json

In [4]:
# third party modules
import pandas as pd

In [5]:
# custom modules
from gbs.etl.urls import Urls
from gbs.helpers import runcmd
from gbs.etl.extract import Extract

## Build Data Directories

In [None]:
# Create the top-level data/ directory.
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race of testing os.path.exists() first.
os.makedirs("data", exist_ok=True)

In [None]:
# Create data/source/ (and data/ itself if it is missing) for the downloaded files.
# exist_ok=True keeps the cell idempotent on re-runs.
os.makedirs("data/source", exist_ok=True)

## Download Data

In [None]:
# fetch the crops table into data/source/ with wget
crops_cmd = f"wget -O data/source/crops.csv {Urls.crops_url}"
runcmd(crops_cmd)

In [None]:
# fetch the country specifications table into data/source/ with wget
# NOTE(review): the target file is named country_specs_url.csv (with a "_url" suffix) — confirm this is intended
country_specs_cmd = f"wget -O data/source/country_specs_url.csv {Urls.country_specs_url}"
runcmd(country_specs_cmd)

In [None]:
# download the QCL metadata JSON into data/source/ with wget
production_json_cmd = f"wget -O data/source/production.json {Urls.production_json}"
runcmd(production_json_cmd)

## Manage Production

In [None]:
# Load the QCL metadata JSON downloaded into data/source/.
# A context manager closes the file handle deterministically; the previous
# json.load(open(...)) left it open until garbage collection.
with open("data/source/production.json") as fh:
    qcl = json.load(fh)
qcl

In [None]:
# The metadata JSON nests the list of datasets under Datasets -> Dataset.
# Index with [] rather than chained .get(): a missing key then raises a
# KeyError naming the key instead of an AttributeError on None.
dataset = qcl["Datasets"]["Dataset"]
dataset[:3]

In [None]:
# build a DataFrame from the dataset records and preview the first rows
df = pd.DataFrame(data=dataset)
df.head(n=5)

In [None]:
# Look up the download location of the QCL dataset.
# Fail with an explicit message when no row matches, instead of the bare
# IndexError that .values[0] raises on an empty selection.
qcl_locations = df.loc[df["DatasetCode"] == "QCL", "FileLocation"]
assert not qcl_locations.empty, "no dataset with DatasetCode == 'QCL' in the metadata"
production_url = qcl_locations.iloc[0]
production_url

In [None]:
# health check: the URL found in the metadata must match the one stored in Urls;
# the message shows both values so a mismatch can be diagnosed from the traceback
assert production_url == Urls.production_url, (
    f"QCL url mismatch: metadata has {production_url!r}, "
    f"Urls.production_url is {Urls.production_url!r}"
)

In [None]:
# Download the QCL archive to data/source/production.zip with curl.
# --fail: exit non-zero on an HTTP error instead of saving the error page as the zip.
# --location: follow redirects.
runcmd(
    f"curl --fail --location --output ./data/source/production.zip '{Urls.production_url}'"
)

In [None]:
# Extract the archive into data/source/.
# -o overwrites existing files without asking, so a re-run never blocks on an
# interactive overwrite prompt. Plain string: the f-prefix had no placeholders.
runcmd("unzip -o ./data/source/production.zip -d ./data/source/")

In [None]:
# Create data/source/production/ (including missing parents).
# exist_ok=True keeps the cell idempotent and avoids the check-then-create race.
os.makedirs("data/source/production", exist_ok=True)

In [None]:
# relocate the extracted Pr* files into the production/ sub-folder
# NOTE(review): the Pr* glob needs shell expansion — confirm runcmd runs the command through a shell
mv_cmd = "mv ./data/source/Pr* ./data/source/production/"
runcmd(mv_cmd)

In [None]:
# normalise the production filenames: drop the common prefix, lowercase, strip parentheses
path = "data/source/production/"
pattern = "Production_Crops_Livestock_E_All_"

# translation table that deletes "(" and ")"
strip_parens = str.maketrans("", "", "()")

for raw_name in os.listdir(path):
    tidy_name = raw_name.replace(pattern, "").lower().translate(strip_parens)

    # rename in place inside the production folder
    os.rename(os.path.join(path, raw_name), os.path.join(path, tidy_name))

In [None]:
# show the renamed files; sorted() because os.listdir returns entries in arbitrary order
sorted(os.listdir("data/source/production/"))

In [None]:
# delete the downloaded archive; -f alone is enough for a single file (no recursive delete needed)
!rm -f data/source/production.zip

In [None]:
# delete the metadata JSON; -f alone is enough for a single file (no recursive delete needed)
!rm -f data/source/production.json

## Check Data Integrity

In [None]:
# Integrity check: every CSV directly under data/source/ must load into a
# DataFrame with at least one row and one column.
ext = ".csv"
path = "data/source"

# sorted() gives a reproducible order; os.listdir alone is arbitrary
for fn in sorted(os.listdir(path)):
    print(fn)

    # skip anything that is not a CSV
    if not fn.endswith(ext):
        continue

    # dedicated name so the metadata `df` built earlier is not overwritten;
    # no isinstance check: read_csv without chunking always returns a DataFrame
    csv_df = pd.read_csv(os.path.join(path, fn))
    n_rows, n_cols = csv_df.shape
    assert n_rows > 0, f"{fn}: no rows"
    assert n_cols > 0, f"{fn}: no columns"

In [None]:
# Integrity check: every CSV under data/source/production/ must load into a
# DataFrame with at least one row and one column.
ext = ".csv"
path = "data/source/production/"

# sorted() gives a reproducible order; os.listdir alone is arbitrary
for fn in sorted(os.listdir(path)):
    print(fn)

    # skip anything that is not a CSV
    if not fn.endswith(ext):
        continue

    # dedicated name so an existing `df` in the notebook is not overwritten;
    # no isinstance check: read_csv without chunking always returns a DataFrame
    production_df = pd.read_csv(os.path.join(path, fn))
    n_rows, n_cols = production_df.shape
    assert n_rows > 0, f"{fn}: no rows"
    assert n_cols > 0, f"{fn}: no columns"

## ... or just use the `Extract` class

### One by One

In [None]:
# create an Extract instance (class imported from gbs.etl.extract)
extract = Extract()

In [None]:
# inspect the attributes stored on the instance
vars(extract)

In [None]:
# NOTE(review): presumably removes previously downloaded data — confirm in gbs.etl.extract
extract.clean()

In [None]:
# NOTE(review): presumably creates the data directories built by hand earlier in this notebook — confirm
extract.make_folders()

In [None]:
# NOTE(review): presumably the class-based equivalent of the manual crops download — confirm
extract.get_crops()

In [None]:
# NOTE(review): presumably the class-based equivalent of the manual country specs download — confirm
extract.get_country_specs()

In [None]:
# NOTE(review): presumably the class-based equivalent of the manual production download/unzip/rename steps — confirm
extract.get_production()

In [None]:
# NOTE(review): presumably the class-based equivalent of the manual integrity checks — confirm
extract.check_files()

### All at Once

In [6]:
# run every extraction step through a single call on a fresh Extract instance
# NOTE(review): the output recorded for this cell shows curl exiting with error 23
# ("Failure writing output to destination") and an empty list — re-run and confirm the download completes
extract = Extract()
extract.get_all(clean=True)

                                 Dload  Upload   Total   Spent    Left  Speed


  0 31.7M    0  4098    0     0  33881      0  0:16:23 --:--:--  0:16:23 34150
curl: (23) Failure writing output to destination


[]