# 00-Get-Data

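Download the raw source files (crops, country specs, production, GDP, population) into `data/source/`, unpack the archives, and check that every CSV loads.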

## System

In [1]:
cd ../

/home/alex/tmp/Global-Biodiversity-Score


In [2]:
# show the current working directory after the `cd` above
!pwd

/home/alex/tmp/Global-Biodiversity-Score


## Imports 

In [3]:
# built in modules
import os
import json

In [4]:
# third party modules
import pandas as pd

In [5]:
# custom modules
from gbs.etl.urls import Urls
from gbs.helpers import runcmd
# from gbs.etl.extract import Extract

## Build Data Directories

In [6]:
# create the top-level data/ folder on the first run; nothing happens when
# the path is already present
data_dir = "data"
data_dir_missing = not os.path.exists(data_dir)
if data_dir_missing:
    os.makedirs(data_dir)

In [7]:
# one folder per stage under data/, each created only when it is absent
folder_list = ["source", "tmp", "final"]
for folder in folder_list:
    stage_dir = f"data/{folder}"
    if os.path.exists(stage_dir):
        continue
    os.makedirs(stage_dir)

In [8]:
# one folder per dataset under data/source/, each created only when absent
sub_folders = ["crops", "country_specs", "gdp", "population", "production"]
for folder in sub_folders:
    dataset_dir = f"data/source/{folder}"
    if os.path.exists(dataset_dir):
        continue
    os.makedirs(dataset_dir)

## Download Data

### Crops

In [9]:
# fetch the crops table with wget and save it under data/source/crops/
crops_csv = "data/source/crops/crops.csv"
runcmd(f"wget -O {crops_csv} {Urls.crops}")

Resolving gist.githubusercontent.com (gist.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to gist.githubusercontent.com (gist.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 794666 (776K) [text/plain]
Saving to: ‘data/source/crops/crops.csv’

     0K .......... .......... .......... .......... ..........  6% 1,23M 1s
    50K .......... .......... .......... .......... .......... 12% 3,59M 0s
   100K .......... .......... .......... .......... .......... 19% 3,39M 0s
   150K .......... .......... .......... .......... .......... 25%  919K 0s
   200K .......... .......... .......... .......... .......... 32%  199M 0s
   250K .......... .......... .......... .......... .......... 38% 2,51M 0s
   300K .......... .......... .......... .......... .......... 45%  193M 0s
   350K .......... .......... .......... .......... .......... 51% 3,44M 0s
   400K .......... .......... .........

('',
 '--2024-01-19 03:02:21--  https://gist.githubusercontent.com/AlexandreGazagnes/e47b986ad139b70d03735bd0ebb9e295/raw/c88ff0a9cc114e60b81deb1578389a1599c67a4b/crops.csv\nResolving gist.githubusercontent.com (gist.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...\nConnecting to gist.githubusercontent.com (gist.githubusercontent.com)|185.199.109.133|:443... connected.\nHTTP request sent, awaiting response... 200 OK\nLength: 794666 (776K) [text/plain]\nSaving to: ‘data/source/crops/crops.csv’\n\n     0K .......... .......... .......... .......... ..........  6% 1,23M 1s\n    50K .......... .......... .......... .......... .......... 12% 3,59M 0s\n   100K .......... .......... .......... .......... .......... 19% 3,39M 0s\n   150K .......... .......... .......... .......... .......... 25%  919K 0s\n   200K .......... .......... .......... .......... .......... 32%  199M 0s\n   250K .......... .......... .......... .......... .......... 38% 2,51M 0s\n   30

### Country Specs

In [10]:
# fetch the country specs table with wget and save it under
# data/source/country_specs/
country_specs_csv = "data/source/country_specs/country_specs.csv"
runcmd(f"wget -O {country_specs_csv} {Urls.country_specs}")

Resolving gist.githubusercontent.com (gist.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to gist.githubusercontent.com (gist.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 41236 (40K) [text/plain]
Saving to: ‘data/source/country_specs/country_specs.csv’

     0K .......... .......... .......... ..........           100% 3,90M=0,01s

2024-01-19 03:02:22 (3,90 MB/s) - ‘data/source/country_specs/country_specs.csv’ saved [41236/41236]


('',
 '--2024-01-19 03:02:21--  https://gist.githubusercontent.com/AlexandreGazagnes/57eedb7f88d249f2d5bb85e525e55260/raw/1e559190503cc05f0e298ddddf74934acffb0efe/country_specs.csv\nResolving gist.githubusercontent.com (gist.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...\nConnecting to gist.githubusercontent.com (gist.githubusercontent.com)|185.199.109.133|:443... connected.\nHTTP request sent, awaiting response... 200 OK\nLength: 41236 (40K) [text/plain]\nSaving to: ‘data/source/country_specs/country_specs.csv’\n\n     0K .......... .......... .......... ..........           100% 3,90M=0,01s\n\n2024-01-19 03:02:22 (3,90 MB/s) - ‘data/source/country_specs/country_specs.csv’ saved [41236/41236]\n\n')

### Production

In [11]:
# download the production archive with curl; the URL is single-quoted inside
# the command line
production_zip = "./data/source/production/production.zip"
runcmd(f"curl --output {production_zip} '{Urls.production}'")

                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  1 31.7M    1  559k    0     0  1514k      0  0:00:21 --:--:--  0:00:21 1513k
 12 31.7M   12 4038k    0     0  2996k      0  0:00:10  0:00:01  0:00:09 2995k
 22 31.7M   22 7430k    0     0  3164k      0  0:00:10  0:00:02  0:00:08 3164k
 33 31.7M   33 10.5M    0     0  3209k      0  0:00:10  0:00:03  0:00:07 3209k
 43 31.7M   43 13.7M    0     0  3232k      0  0:00:10  0:00:04  0:00:06 3231k
 51 31.7M   51 16.5M    0     0  3157k      0  0:00:10  0:00:05  0:00:05 3278k
 62 31.7M   62 19.7M    0     0  3192k      0  0:00:10  0:00:06  0:00:04 3244k
 72 31.7M   72 23.1M    0     0  3219k      0  0:00:10  0:00:07  0:00:03 3245k
 83 31.7M   83 26.4M    0     0  3240k      0  0:00:10  0:00:08  0:00:02 3261k
 93 31.7M   93 29.7M    0     0  3256k      0  0:00:09  0:00:09 --:--:-- 3276k
 99 31.7M   99 31.6M    0     0  3131k      0  0:00:

('',
 '  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n                                 Dload  Upload   Total   Spent    Left  Speed\n\n  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0\n  1 31.7M    1  559k    0     0  1514k      0  0:00:21 --:--:--  0:00:21 1513k\n 12 31.7M   12 4038k    0     0  2996k      0  0:00:10  0:00:01  0:00:09 2995k\n 22 31.7M   22 7430k    0     0  3164k      0  0:00:10  0:00:02  0:00:08 3164k\n 33 31.7M   33 10.5M    0     0  3209k      0  0:00:10  0:00:03  0:00:07 3209k\n 43 31.7M   43 13.7M    0     0  3232k      0  0:00:10  0:00:04  0:00:06 3231k\n 51 31.7M   51 16.5M    0     0  3157k      0  0:00:10  0:00:05  0:00:05 3278k\n 62 31.7M   62 19.7M    0     0  3192k      0  0:00:10  0:00:06  0:00:04 3244k\n 72 31.7M   72 23.1M    0     0  3219k      0  0:00:10  0:00:07  0:00:03 3245k\n 83 31.7M   83 26.4M    0     0  3240k      0  0:00:10  0:00:08  0:00:02 3261k\n 93 31.7M   93 29.7M    0     0

### GDP

In [12]:
# download the GDP archive with curl; the URL is single-quoted inside the
# command line
gdp_zip = "./data/source/gdp/gdp.zip"
runcmd(f"curl --output {gdp_zip} '{Urls.gdp}'")

                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
 73  129k   73 97864    0     0  99123      0  0:00:01 --:--:--  0:00:01 99052
100  129k  100  129k    0     0   130k      0 --:--:-- --:--:-- --:--:--  130k


('',
 '  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n                                 Dload  Upload   Total   Spent    Left  Speed\n\n  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0\n 73  129k   73 97864    0     0  99123      0  0:00:01 --:--:--  0:00:01 99052\n100  129k  100  129k    0     0   130k      0 --:--:-- --:--:-- --:--:--  130k\n')

### Population

In [13]:
# download the population archive with curl; the URL is single-quoted inside
# the command line
population_zip = "./data/source/population/population.zip"
runcmd(f"curl --output {population_zip} '{Urls.population}'")

                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
 56 85964   56 48709    0     0  50439      0  0:00:01 --:--:--  0:00:01 50423
100 85964  100 85964    0     0  80737      0  0:00:01  0:00:01 --:--:-- 80793


('',
 '  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n                                 Dload  Upload   Total   Spent    Left  Speed\n\n  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0\n 56 85964   56 48709    0     0  50439      0  0:00:01 --:--:--  0:00:01 50423\n100 85964  100 85964    0     0  80737      0  0:00:01  0:00:01 --:--:-- 80793\n')

## Manage Files

### Production

In [14]:
# unpack the production archive into its own folder
# -o overwrites files left by an earlier extraction without prompting, so the
# cell behaves the same on a first run and on a re-run
runcmd(
    "unzip -o ./data/source/production/production.zip -d ./data/source/production"
)

  inflating: ./data/source/production/Production_Crops_Livestock_E_All_Data_(Normalized).csv  
  inflating: ./data/source/production/Production_Crops_Livestock_E_AreaCodes.csv  
  inflating: ./data/source/production/Production_Crops_Livestock_E_Flags.csv  
  inflating: ./data/source/production/Production_Crops_Livestock_E_ItemCodes.csv


('Archive:  ./data/source/production/production.zip\n  inflating: ./data/source/production/Production_Crops_Livestock_E_All_Data_(Normalized).csv  \n  inflating: ./data/source/production/Production_Crops_Livestock_E_AreaCodes.csv  \n  inflating: ./data/source/production/Production_Crops_Livestock_E_Flags.csv  \n  inflating: ./data/source/production/Production_Crops_Livestock_E_ItemCodes.csv  \n',
 '')

In [15]:
# normalise the extracted production filenames: drop the "..._E_All_" prefix,
# lowercase everything and remove parentheses
path = "data/source/production/"
pattern = "Production_Crops_Livestock_E_All_"

for fn in os.listdir(path):
    # strip the prefix as extracted, then lowercase the remainder
    new_name = fn.replace(pattern, "").lower()

    # remove the opening and closing parenthesis
    for char in "()":
        new_name = new_name.replace(char, "")

    # second pass: also drop the prefix when it appears in lowercase
    new_name = new_name.replace(pattern.lower(), "")

    # rename inside the same folder
    os.rename(os.path.join(path, fn), os.path.join(path, new_name))

In [16]:
# list the production folder to inspect the filenames after the rename
production_dir = "data/source/production/"
os.listdir(production_dir)

['.gitkeep',
 'production_crops_livestock_e_flags.csv',
 'production_crops_livestock_e_itemcodes.csv',
 'data_normalized.csv',
 'production_crops_livestock_e_areacodes.csv',
 'production.zip']

In [17]:
# optional cleanup: uncomment to delete the archive once it has been extracted
# !rm -rf data/source/production/production.zip

### GDP & Population

In [18]:
# unpack the population archive into its own folder
# -o overwrites files left by an earlier extraction without prompting: a
# re-run then always starts again from the pristine CSVs instead of stalling
# on (or skipping at) unzip's "replace?" question and leaving the previously
# edited files in place
runcmd(
    "unzip -o ./data/source/population/population.zip -d ./data/source/population"
)

  inflating: ./data/source/population/Metadata_Indicator_API_SP.POP.TOTL_DS2_en_csv_v2_6298256.csv  
  inflating: ./data/source/population/API_SP.POP.TOTL_DS2_en_csv_v2_6298256.csv  
  inflating: ./data/source/population/Metadata_Country_API_SP.POP.TOTL_DS2_en_csv_v2_6298256.csv


('Archive:  ./data/source/population/population.zip\n  inflating: ./data/source/population/Metadata_Indicator_API_SP.POP.TOTL_DS2_en_csv_v2_6298256.csv  \n  inflating: ./data/source/population/API_SP.POP.TOTL_DS2_en_csv_v2_6298256.csv  \n  inflating: ./data/source/population/Metadata_Country_API_SP.POP.TOTL_DS2_en_csv_v2_6298256.csv  \n',
 '')

In [19]:
# unpack the GDP archive into its own folder
# -o overwrites files left by an earlier extraction without prompting: a
# re-run then always starts again from the pristine CSVs instead of stalling
# on (or skipping at) unzip's "replace?" question and leaving the previously
# edited files in place
runcmd("unzip -o ./data/source/gdp/gdp.zip -d ./data/source/gdp")

  inflating: ./data/source/gdp/Metadata_Indicator_API_NY.GDP.PCAP.CD_DS2_en_csv_v2_6298251.csv  
  inflating: ./data/source/gdp/API_NY.GDP.PCAP.CD_DS2_en_csv_v2_6298251.csv  
  inflating: ./data/source/gdp/Metadata_Country_API_NY.GDP.PCAP.CD_DS2_en_csv_v2_6298251.csv


('Archive:  ./data/source/gdp/gdp.zip\n  inflating: ./data/source/gdp/Metadata_Indicator_API_NY.GDP.PCAP.CD_DS2_en_csv_v2_6298251.csv  \n  inflating: ./data/source/gdp/API_NY.GDP.PCAP.CD_DS2_en_csv_v2_6298251.csv  \n  inflating: ./data/source/gdp/Metadata_Country_API_NY.GDP.PCAP.CD_DS2_en_csv_v2_6298251.csv  \n',
 '')

## Check Data Integrity

### Crops and country specs

In [20]:
# sanity check: every CSV in these source folders must load as a table with
# at least one row and one column
ext = ".csv"
base = "data/source/"

folders = ["crops", "country_specs"]

for folder in folders:
    path = os.path.join(base, folder)

    for fn in os.listdir(path):
        # only CSV files are checked; anything else in the folder is ignored
        if fn.endswith(ext):
            print(fn)

            df = pd.read_csv(os.path.join(path, fn))
            assert isinstance(df, pd.DataFrame)

            n_rows, n_cols = df.shape
            assert n_rows > 0
            assert n_cols > 0

crops.csv
country_specs.csv


### Population and GDP

In [22]:
# Drop the lines that sit above the real CSV header in the downloaded "API*"
# files, so that pandas can parse them afterwards.
# The header row is located by its first column name rather than by a fixed
# line count. This makes the cell safe to re-run: a file that was already
# trimmed is left untouched instead of losing its header and three data rows.
ext = ".csv"
base = "data/source/"

folders = ["population", "gdp"]

# first column name of the real header row
# NOTE(review): assumes the World Bank layout ("Country Name" first) - confirm
header_start = "Country Name"

for folder in folders:
    path = os.path.join(base, folder)

    # the data file is the one whose name starts with "API"
    fn = [i for i in os.listdir(path) if i.startswith("API")][0]
    print(fn)

    with open(os.path.join(path, fn)) as f:
        txt = f.readlines()

    # index of the header row; a BOM or an opening quote may precede the name
    header_idx = next(
        (
            idx
            for idx, line in enumerate(txt)
            if line.lstrip('\ufeff"').startswith(header_start)
        ),
        None,
    )

    # fail loudly rather than delete lines from a file of unknown layout
    if header_idx is None:
        raise ValueError(
            f"{fn}: no line starting with {header_start!r} found, "
            "cannot locate the CSV header"
        )

    # header already on the first line: the file is clean, do not rewrite it
    if header_idx == 0:
        continue

    txt = txt[header_idx:]

    with open(os.path.join(path, fn), "w") as f:
        f.writelines(txt)


API_SP.POP.TOTL_DS2_en_csv_v2_6298256.csv
API_NY.GDP.PCAP.CD_DS2_en_csv_v2_6298251.csv


In [25]:
# Check that the population and GDP tables load as non-empty DataFrames, then
# write them back in place as UTF-8.
ext = ".csv"
base = "data/source/"

folders = ["population", "gdp"]

for folder in folders:
    path = os.path.join(base, folder)

    # the data file is the one whose name starts with "API"
    fn = [i for i in os.listdir(path) if i.startswith("API")][0]

    print(fn)

    csv_path = os.path.join(path, fn)

    # Try UTF-8 first and fall back to latin-1 only when the bytes are not
    # valid UTF-8. latin-1 never fails to decode, so reading a UTF-8 file with
    # it silently garbles every accented character, and the to_csv() below
    # would make that permanent (and worse on each re-run of this cell).
    try:
        df = pd.read_csv(csv_path, encoding="utf-8")
    except UnicodeDecodeError:
        df = pd.read_csv(csv_path, encoding="latin-1")

    assert isinstance(df, pd.DataFrame)
    assert df.shape[0] > 0
    assert df.shape[1] > 0
    df.to_csv(csv_path, index=False, encoding="utf-8")

API_SP.POP.TOTL_DS2_en_csv_v2_6298256.csv
API_NY.GDP.PCAP.CD_DS2_en_csv_v2_6298251.csv


### Production

In [28]:
# Check that every production CSV loads as a non-empty DataFrame, then write
# it back in place as UTF-8.
ext = ".csv"
base = "data/source/production/"

for fn in os.listdir(base):
    # skip everything that is not a CSV (the archive, .gitkeep, ...)
    if not fn.endswith(ext):
        continue

    print(fn)

    csv_path = os.path.join(base, fn)

    # Try UTF-8 first and fall back to latin-1 only when the bytes are not
    # valid UTF-8. The file is rewritten as UTF-8 below, so on a second run a
    # hard-coded latin-1 read would decode this cell's own output with the
    # wrong codec and garble every accented character a bit more each time.
    try:
        df = pd.read_csv(csv_path, encoding="utf-8")
    except UnicodeDecodeError:
        df = pd.read_csv(csv_path, encoding="latin-1")

    assert isinstance(df, pd.DataFrame)
    assert df.shape[0] > 0
    assert df.shape[1] > 0
    df.to_csv(csv_path, encoding="utf-8", index=False)

production_crops_livestock_e_flags.csv
production_crops_livestock_e_itemcodes.csv
data_normalized.csv


  df = pd.read_csv(os.path.join(base, fn), encoding="latin-1")


production_crops_livestock_e_areacodes.csv
