
# Dev Notes

Using pandas directly. There shouldn't be any consistency issues, as these Excel files are machine generated, 
and databaker would take forever given the excessive depth of the data (databaker has diminishing returns on speed of lookups). 


## Usage

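1.) Put the imports and exports `.xlsx` files in the notebook's working directory. The cell below sets the variables `importsInFile` and `exportsInFile` to the first file whose name contains 'import' / 'export' respectively - CHANGE those variables in the below cell if you need to point at different files.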

2.) Use `Cell->Run All` from the above ribbon.
<br>
<br>
NOTE - this is not quick, expect it to take 5-10 mins and just leave it to run, it's fine.




In [None]:

import pandas as pd
from databakerUtils.v4Functions import v4Integers
import requests
import glob

# ###########################
# CHANGE INPUT FILENAMES HERE
# ###########################

# The input workbooks are looked up in the current working directory: the first .xlsx whose
# name contains 'import' is used for imports, the first containing 'export' for exports.
location = '*.xlsx'
files = glob.glob(location)


def _firstMatching(paths, keyword):
    """
    Returns the first entry of 'paths' whose name contains 'keyword'.

    Raises an IndexError when nothing matches. Same exception type a bare [0] on an empty
    list would give, but with a message that says which workbook is missing.
    """
    for path in paths:
        if keyword in path:
            return path
    raise IndexError(
        "No file matching '{loc}' has '{kw}' in its name. Files found: {found}".format(
            loc=location, kw=keyword, found=paths
        )
    )


importsInFile = _firstMatching(files, 'import')
exportsInFile = _firstMatching(files, 'export')

dfI = pd.read_excel(importsInFile, 'Country by Commodity Import')   # create imports dataframe
dfE = pd.read_excel(exportsInFile, 'Country by Commodity Export')   # create exports dataframe

# Sanity check imports dataframe - literally just so we can eyeball the first three lines.
dfI[:3]

In [None]:
# Eyeball the top three rows of the exports dataframe - a visual sanity check, nothing more.
dfE.head(3)

# Functions we'll reuse a few times.

def fixTime(cell):
    """
    Takes the horrible date i.e '1998JAN' and returns something cmd friendly. i.e 1998JAN becomes 'Jan-98'.

    cell -- the source date label as a string: a 4 digit year followed by a 3 letter month.
            Any single quotes in it are stripped before validation.

    Returns the month (title-cased) and the last two digits of the year, joined by "-".

    Raises AssertionError if the label is not 7 characters long once the quotes are removed.
    Raises ValueError if the first 4 characters cannot be read as an integer year.
    """

    # get rid of pointless quotes
    cell = cell.replace("'", "")

    # Some validation, as this is most likely place to encounter 'fun'.
    # Raised explicitly (not via an assert statement) so the check still runs under `python -O`.
    if len(cell) != 7:
        raise AssertionError(
            "Aborting. Expecting 'date' to be 7 characters long (eg 1998JAN). We got: " + cell
        )

    year = cell[:4]
    try:
        int(year)  # only checking that it parses; the value itself is not needed
    except ValueError:
        raise ValueError("First 4 characters of 'date' should be a year, we got: " + year)

    return cell[-3:].title() + "-" + year[2:]

def _fetchCodeLabels(url):
    """
    Downloads a code list from 'url' and returns it as a {code: label} dict.

    The response is expected to be JSON with an 'items' list whose entries each carry a
    'code' and a 'label'.

    Raises requests.HTTPError on a non-2xx response, so a failed call is reported as such
    rather than as a confusing JSON/KeyError further down. Times out after 60 seconds
    instead of hanging the notebook indefinitely.
    """
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    # NOTE(review): only the 'items' of this single response are read. If the endpoint
    # paginates, later pages would be missed - confirm against the API.
    return {item['code']: item['label'] for item in response.json()['items']}


# Lookup tables built once up front; the label functions below just index into them.
commodityDict = _fetchCodeLabels(
    'https://api.beta.ons.gov.uk/v1/code-lists/sitc/editions/one-off/codes'
)
countryDict = _fetchCodeLabels(
    'https://api.beta.ons.gov.uk/v1/code-lists/countries-and-territories/editions/one-off/codes'
)


def CommodityLabels(value):
    """
    Returns the sitc label fetched from the api for the code 'value'.
    Raises KeyError if the code is not in the code list.
    """
    return commodityDict[value]


def CountryLabels(value):
    """
    Returns the countries-and-territories label fetched from the api for the code 'value'.
    Raises KeyError if the code is not in the code list.
    """
    return countryDict[value]


# Transformation Script

At this point the dataframes and functions are loaded (by executing the previous cells), so executing the following cell creates the 
final v4 file.

## Explanation
We're just gonna pivot the data a column at a time. Creating a list containing dataframes - then concatenating them into 
one "master" dataframe which is written to csv.

If you think of the source data (see sanity checks above), each of the sub-datasets we're creating is made from
the first-3-columns + cmd bumf and 1 * time column.



In [None]:

allFrames = [] # list for holding each sub-dataframe

# TODO - no real need to have both sources in memory at the same time
for num, source in enumerate([dfI, dfE], start=1):  # num is a simple counter for feedback.

    dateCols = source.columns.values[3:]  # everything after the first three columns is a date column
    if len(dateCols) == 0:
        continue  # nothing to pivot for this source

    # For the three topic dimensions, they appear to have put the code and label together.
    # just need to split them out.
    #
    # These splits do not depend on the date column, so they are done once per source here
    # rather than being recomputed for every date column.
    #
    # TODO - messy. Less lambda more func
    # NOTE - replacing "/" with "-" as "/" has syntactical meaning in Cypher and breaks dimension importer
    sitcCodes = source["COMMODITY"].map(lambda x: x.split(" ")[0]).str.replace("/", "-")
    sitcLabels = source["COMMODITY"].map(lambda x: " ".join(x.split(" ")[1:]))

    countryCodes = source["COUNTRY"].map(lambda x: x.split(" ")[0])
    countryLabels = source["COUNTRY"].map(lambda x: " ".join(x.split(" ")[1:]))

    directionCodes = source["DIRECTION"].map(lambda x: x.split(" ")[0])
    directionLabels = source["DIRECTION"].map(lambda x: x.split(" ")[1])

    # For each date column:
    for dateCol in dateCols:

        timeLabel = fixTime(dateCol)  # same value is used for both the time code and label

        # Columns are assigned one at a time, in this order, as that is the column order of the output.
        df = pd.DataFrame()

        df["v4_0"] = source[dateCol]

        df["mmm-yy"] = timeLabel
        df["Time"] = timeLabel

        df["uk-only"] = "K02000001"
        df["Geography"] = "United Kingdom"

        df["sitc"] = sitcCodes
        df["StandardIndustrialTradeClassification"] = sitcLabels

        df["countries-and-territories"] = countryCodes
        df["CountriesAndTerritories"] = countryLabels

        df["trade-direction"] = directionCodes
        df["Direction"] = directionLabels

        allFrames.append(df)
        print("Generated sub-dataframe for {dc} from source {n} of 2.".format(dc=dateCol, n=num))

allDf = pd.concat(allFrames)

allDf['v4_0'] = allDf['v4_0'].apply(v4Integers) #changes floats to string-integers

# The labels split out of the source above are replaced with the labels from the api code lists.
allDf['CountriesAndTerritories'] = allDf['countries-and-territories'].apply(CountryLabels) #change country labels
allDf['StandardIndustrialTradeClassification'] = allDf['sitc'].apply(CommodityLabels) #change commodity labels

allDf.to_csv("v4_Trade.csv", index=False) # output to csv
print("v4 File successfully generated.")

# Sanity check output, 5 lines only
allDf[:5]