# 00-Get-Data

Just find the data and download it.

## System

In [1]:
# step up one directory (IPython automagic for %cd) so the relative paths
# used below (data/..., the gbs package) resolve from the new working directory
cd ..

/home/alex/tmp/Global-Biodiversity-Score


In [2]:
# print the current working directory to confirm where the notebook now runs
!pwd

/home/alex/tmp/Global-Biodiversity-Score


## Imports 

In [3]:
# built in modules
import os
import json

In [4]:
# third party modules
import pandas as pd

In [5]:
# custom modules
from gbs.etl.urls import Urls
from gbs.helpers import runcmd

## Build Data Directories

In [None]:
# make data/ directory; exist_ok=True keeps the cell safe to re-run and avoids
# the gap between a separate os.path.exists() check and the creation itself
os.makedirs("data", exist_ok=True)

In [None]:
# make data/source/ directory (parents included); exist_ok=True keeps the cell
# safe to re-run and removes the separate check-then-create step
os.makedirs("data/source/", exist_ok=True)

## Download Data

In [None]:
# fetch the crops table with wget and store it as data/source/crops.csv
runcmd(f"wget -O data/source/crops.csv {Urls.crops_url}")

In [None]:
# download countries specs
# NOTE: saved as country_specs.csv (not country_specs_url.csv) so the manual
# download produces the same filename as the Extract class used further down
runcmd(f"wget -O data/source/country_specs.csv {Urls.country_specs_url}")

In [None]:
# download the qcl metadata file with wget into data/source/production.json
runcmd(f"wget -O data/source/production.json {Urls.production_json}")

## Manage Production

In [None]:
# load qcl metadata; the context manager closes the file handle as soon as
# the JSON has been parsed instead of leaving it open until garbage collection
with open("data/source/production.json") as fh:
    qcl = json.load(fh)
qcl

In [None]:
# the dataset records sit under the "Datasets" -> "Dataset" keys of the metadata
datasets_node = qcl.get("Datasets")
dataset = datasets_node.get("Dataset")
dataset[:3]

In [None]:
# build a pandas dataframe from the dataset records and preview the first rows
df = pd.DataFrame(data=dataset)
df.head(n=5)

In [None]:
# look up the FileLocation of the row whose DatasetCode is "QCL"
is_qcl = df["DatasetCode"] == "QCL"
production_url = df.loc[is_qcl, "FileLocation"].values[0]
production_url

In [None]:
# sanity check: the url found in the metadata must equal the one stored in Urls
assert Urls.production_url == production_url

In [None]:
# download the qcl archive with curl into data/source/production.zip
archive_url = Urls.production_url
runcmd(f"curl --output ./data/source/production.zip '{archive_url}'")

In [None]:
# extract the archive into data/source/ (plain string: nothing to interpolate)
runcmd("unzip ./data/source/production.zip -d ./data/source/")

In [None]:
# make data/source/production (parents included); exist_ok=True keeps the cell
# safe to re-run and removes the separate check-then-create step
os.makedirs("data/source/production", exist_ok=True)

In [None]:
# gather everything in data/source/ whose name starts with "Pr" under production/
move_cmd = "mv ./data/source/Pr* ./data/source/production/"
runcmd(move_cmd)

In [None]:
# normalise the filenames in the production folder:
# drop the common prefix, lowercase the rest, strip parentheses
path = "data/source/production/"
pattern = "Production_Crops_Livestock_E_All_"
parens = str.maketrans("", "", "()")  # translation table deleting "(" and ")"

for fn in os.listdir(path):
    # shortened, lowercase name without "(" or ")"
    cleaned_fn = fn.replace(pattern, "").lower().translate(parens)

    # rename in place inside the production folder
    os.rename(os.path.join(path, fn), os.path.join(path, cleaned_fn))


In [None]:
# display the content of the production folder to check the renamed files
os.listdir("data/source/production/")

In [None]:
# delete the downloaded archive (shell command; -f means no error if it is missing)
!rm -rf data/source/production.zip

In [None]:
# delete the metadata file (shell command; -f means no error if it is missing)
!rm -rf data/source/production.json

## Check Data Integrity

In [None]:
# read every csv sitting directly in data/source (sub-folders are not visited)
# and check that it parses to a table with at least one row and one column
ext = ".csv"
path = "data/source"

for fn in os.listdir(path):

    print(fn)

    if not fn.endswith(ext):
        continue

    # dedicated name so the `df` built in the Manage Production section is not
    # overwritten; pd.read_csv always returns a DataFrame, so no isinstance check
    checked = pd.read_csv(os.path.join(path, fn))
    assert checked.shape[0] > 0, f"{fn}: no rows"
    assert checked.shape[1] > 0, f"{fn}: no columns"

## ... or Just Use the Relevant Classes

In [6]:
from gbs.etl.extract import Extract

In [7]:
# instantiate the extractor with its default arguments
extract = Extract()

In [1]:
# show the instance attributes of the extractor
vars(extract)

NameError: name 'extract' is not defined

In [None]:
# NOTE(review): presumably removes previously downloaded data — confirm in gbs.etl.extract
extract.clean()

In [9]:
# NOTE(review): presumably creates the data/ folder tree built by hand earlier — confirm in gbs.etl.extract
extract.make_folders()

In [10]:
# download the crops table; the captured log shows it saved as ./data/source/crops.csv
extract.get_crops()

Resolving gist.githubusercontent.com (gist.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to gist.githubusercontent.com (gist.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 41236 (40K) [text/plain]
Saving to: ‘./data/source/crops.csv’

     0K .......... .......... .......... ..........           100% 3,27M=0,01s

2024-01-16 02:01:12 (3,27 MB/s) - ‘./data/source/crops.csv’ saved [41236/41236]


In [11]:
# download the country specs; the captured log shows it saved as ./data/source/country_specs.csv
# NOTE(review): the log reports the same size (41236 bytes) as crops.csv — check
# that this method does not fetch the crops url by mistake
extract.get_county_specs()

Resolving gist.githubusercontent.com (gist.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to gist.githubusercontent.com (gist.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 41236 (40K) [text/plain]
Saving to: ‘./data/source/country_specs.csv’

     0K .......... .......... .......... ..........           100% 3,58M=0,01s

2024-01-16 02:01:14 (3,58 MB/s) - ‘./data/source/country_specs.csv’ saved [41236/41236]


In [12]:
# download the production data (about 31.7M according to the captured progress log)
# NOTE(review): presumably also unzips and renames the files like the manual steps — confirm
extract.get_production()

                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  1 31.7M    1  367k    0     0  1118k      0  0:00:29 --:--:--  0:00:29 1119k
 11 31.7M   11 3870k    0     0  3029k      0  0:00:10  0:00:01  0:00:09 3028k
 22 31.7M   22 7178k    0     0  3152k      0  0:00:10  0:00:02  0:00:08 3152k
 32 31.7M   32 10.2M    0     0  3203k      0  0:00:10  0:00:03  0:00:07 3203k
 42 31.7M   42 13.4M    0     0  3228k      0  0:00:10  0:00:04  0:00:06 3228k
 52 31.7M   52 16.7M    0     0  3244k      0  0:00:10  0:00:05  0:00:05 3385k
 62 31.7M   62 19.9M    0     0  3254k      0  0:00:09  0:00:06  0:00:03 3312k
 72 31.7M   72 23.1M    0     0  3263k      0  0:00:09  0:00:07  0:00:02 3313k
 83 31.7M   83 26.4M    0     0  3268k      0  0:00:09  0:00:08  0:00:01 3312k
 93 31.7M   93 29.6M    0     0  3273k      0  0:00:09  0:00:09 --:--:-- 3311k
100 31.7M  100 31.7M    0     0  3277k      0  0:00: