In [1]:
# =============================================================================
# Examples using methods in the COGS package dm_tools
# =============================================================================
# Packages needed:
#    gssutils 
#    os
#    pandas
#    re
#    Levenshtein
#    fuzzywuzzy
#    IPython.display
# =============================================================================
#  pip install --upgrade git+https://github.com/GSS-Cogs/dm_tools
#  pip install git+https://github.com/GSS-Cogs/dm_tools
# =============================================================================

# This is a nerd-free zone: do not change things just because you can. Keep it simple (KISS).
import dmtools as dm
import pandas as pd
from gssutils import *
import numpy as np

In [2]:
# Load the first example dataset from CSV and preview its first five rows
example_data_one = pd.read_csv(filepath_or_buffer='example_data_one.csv')
example_data_one.head(n=5)

Unnamed: 0,Value,Period,CDID,Estimate Type,Aggregate
0,1919641,2015,YBHA,current-price,gross-domestic-product-at-market-prices
1,207569,2015,NTAP,current-price,less-basic-price-adjustment
2,1712072,2015,ABML,current-price,gross-value-added-at-basic-prices
3,2043909,2015,ABMI,current-price,gross-domestic-product-at-market-prices
4,221355,2015,NTAO,chained-volume-measure,less-basic-price-adjustment


In [3]:
# Load the second example dataset from CSV and preview its first five rows
example_data_two = pd.read_csv(filepath_or_buffer='example_data_two.csv')
example_data_two.head(n=5)

Unnamed: 0,Value,Period,CDID,Weights 2018,Sector,Industry
0,103,2015,L2KL,6.0,agriculture,agriculture-forestry-fishing
1,96,2015,L2KR,11.0,production,mining-quarrying-including-oil-and-gas-extraction
2,96,2015,L2KX,101.0,production,manufacturing
3,99,2015,L2MW,14.0,production,electricity-gas-steam-and-air
4,93,2015,L2N2,13.0,production,water-supply-sewerage-etc


In [4]:
# METHOD ONE
#help('dmtools.display_dataset_unique_values')

In [5]:
# Summarise the dataset: as the recorded output shows, this prints the row count,
# the column count and the column names, then lists the unique values found in
# each column (dimension) except the Value column.
dm.display_dataset_unique_values(example_data_one)

Number of rows: 40
Number of columns: 5
Column names: 
Index(['Value', 'Period', 'CDID', 'Estimate Type', 'Aggregate'], dtype='object')




'Period'

['2015', '2015 Q1', '2015 Q2', '2015 Q3', '2015 Q4']



'CDID'

['ABMI', 'ABML', 'ABMM', 'KLS2', 'NTAO', 'NTAP', 'YBHA']



'Estimate Type'

['basic-prices', 'chained-volume-measure', 'current-price']



'Aggregate'

['gross-domestic-product-at-market-prices',
 'gross-value-added-at-basic-prices',
 'gross-value-added-excluding-oil-gas',
 'less-basic-price-adjustment']



In [6]:
# METHOD TWO
#help('dmtools.search_codelists_for_codes')

In [7]:
# Take the list of codes found in one column and check whether they appear in
# any of the codelist CSV files inside the codelist folder passed in.

# Column (dimension) of the dataset you want to look at
dimension = 'Aggregate'
# Column of each codelist that the codes are compared to
colnme = 'Notation'
# Path to the codelist folder, eg: 'users/leigh/development/family-trade/reference/codelists/'
fldrpth = 'codelists/'
# The unique set of values within the chosen column
codes = example_data_one[dimension].unique()

flenme = dm.search_codelists_for_codes(
    codes,
    fldrpth,
    colnme,
    dimension,
)
print('Best matched file: ' + flenme)

Search Directory: codelists/

Dimension: Aggregate
Outputting File: aggregate-codelist-folder-search.csv with 7 rows
In Folder: aggregate-codelist-analysis
Outputting File: aggregate-codelist-folder-search-percentage-split.csv with 7 rows
In Folder: aggregate-codelist-analysis
Best matched file: national-accounts-aggregate.csv


In [8]:
# METHOD THREE
#help('dmtools.check_all_codes_in_codelist')

In [12]:
# Take the unique list of codes from one column and check whether they appear in
# one particular codelist CSV file. Pass the path to the actual CSV file, not its folder.
# If a results file is written, the call returns its path and name so that it can
# be used by the next method (add_missing_codes_to_codelist).

# Column (dimension) of the dataset you want to look at
dimension = 'Estimate Type'
# Path to the codelist file
filepth = 'codelists/national-accounts-estimate-type.csv'
# Column of the codelist that the codes are compared to
colnme = 'Notation'
# Per the original note: output all results (True) or just found codes (False)
# NOTE(review): this reads oddly against the variable name - confirm with help('dmtools.check_all_codes_in_codelist')
outputfoundcodes = True
# The unique set of values within the chosen column
codes = example_data_one[dimension].unique()

filename = dm.check_all_codes_in_codelist(
    codes,
    filepth,
    colnme,
    dimension,
    outputfoundcodes,
)

Search File: codelists/national-accounts-estimate-type.csv

Outputting File: estimatetype-codelist-search.csv with 3 rows
In Folder: estimatetype-codelist-analysis


In [13]:
# METHOD FOUR
#help('dmtools.add_missing_codes_to_codelist')

In [17]:
# If the previous method (check_all_codes_in_codelist) found missing codes, this
# method adds them to the codelist. `filename` is the results file returned by
# that earlier call, so its cell must have been run first.
# Per the author's notes: Sort Priority is reconfigured, everything else is left
# as is, and it checks whether a code has already been added.
# NOTE: this modifies the codelist file at `filepth` (see the recorded output below).
filepth = 'codelists/national-accounts-estimate-type.csv'
dm.add_missing_codes_to_codelist(filename, filepth)

Missing codes: 3
New codes have been added to file: 
codelists/national-accounts-estimate-type.csv


Unnamed: 0,Label,Notation,Parent Notation,Sort Priority
0,Current price,current-price,,1
1,Chained Volume Measure,chained-volume-measure,,2
2,Deflator,deflator,,3
3,People,people,,4
4,Basic prices,basic-prices,,5


In [None]:
# METHOD FIVE
#help('dmtools.search_for_codes_using_levenshtein_and_fuzzywuzzy')

In [None]:
# Search the codelist folder for codes resembling the values of one column.
# The method name indicates Levenshtein and fuzzywuzzy matching is used.

# Column (dimension) of the second example dataset to look at
dimension = 'Sector'
# Path to the codelist folder
pth = 'codelists/'
# Column of each codelist that the codes are compared to
colnme = 'Notation'
# Presumably the maximum Levenshtein edit distance accepted as a match - TODO confirm
setDistance = 3
# Presumably the minimum similarity ratio accepted as a match - TODO confirm
# NOTE(review): fuzzywuzzy scores run 0-100; verify that dmtools expects a 0-1 fraction here
setRatio = 0.8
# The unique set of values within the chosen column
codes = example_data_two[dimension].unique()

dm.search_for_codes_using_levenshtein_and_fuzzywuzzy(
    codes,
    pth,
    colnme,
    dimension,
    setDistance,
    setRatio,
)

In [None]:
# METHOD SIX
#help('dmtools.search_codes_in_codelists_and_then_search_highest_scoring_codelist_file')

In [None]:
# Combines 2 other methods in a single call:
# search_codelists_for_codes & check_all_codes_in_codelist

# Column (dimension) of the dataset you want to look at
dimension = 'Estimate Type'
# Path to the codelist folder (a folder here, not a single codelist file)
filepth = 'codelists/'
# Column of each codelist that the codes are compared to
colnme = 'Notation'
# Passed straight through to the method; see help('dmtools.check_all_codes_in_codelist') for its meaning
outputfoundcodes = True
# The unique set of values within the chosen column
codes = example_data_one[dimension].unique()

dm.search_codes_in_codelists_and_then_search_highest_scoring_codelist_file(
    codes,
    filepth,
    colnme,
    dimension,
    outputfoundcodes,
)

In [None]:
pip install --upgrade git+https://github.com/GSS-Cogs/dm_tools

In [None]:
pip uninstall dmtools --yes 