In [1]:
# =============================================================================
# Examples using methods in the COGS package dm_tools
# =============================================================================
# Packages needed:
#    gssutils 
#    os
#    pandas
#    re
#    Levenshtein
#    fuzzywuzzy
#    IPython.display
# =============================================================================
#  pip install --upgrade git+https://github.com/GSS-Cogs/dm_tools
#  pip install git+https://github.com/GSS-Cogs/dm_tools
# =============================================================================

# This is a nerd-free zone: do not change things just because you can. Keep it simple (KISS).
import dmtools as dm
import pandas as pd

In [2]:
# Load the first example dataset (path is relative to the notebook's working directory)
# and preview its first five rows.
# NOTE(review): the preview below includes an 'Unnamed: 0' column, which looks like a saved
# row index being re-read as data; index_col=0 would likely remove it -- confirm that the
# dm_tools calls further down do not depend on that column before changing this.
example_data_one = pd.read_csv('example_data_one.csv')
example_data_one.head(5)

Unnamed: 0,Value,Period,CDID,Estimate Type,Aggregate
0,1919641,2015,YBHA,current-price,gross-domestic-product-at-market-prices
1,207569,2015,NTAP,current-price,less-basic-price-adjustment
2,1712072,2015,ABML,current-price,gross-value-added-at-basic-prices
3,2043909,2015,ABMI,current-price,gross-domestic-product-at-market-prices
4,221355,2015,NTAO,chained-volume-measure,less-basic-price-adjustment


In [3]:
# Load the second example dataset (path is relative to the notebook's working directory)
# and preview its first five rows.
# NOTE(review): the preview below includes an 'Unnamed: 0' column, which looks like a saved
# row index being re-read as data; index_col=0 would likely remove it -- confirm that the
# dm_tools calls further down do not depend on that column before changing this.
example_data_two = pd.read_csv('example_data_two.csv')
example_data_two.head(5)

Unnamed: 0,Value,Period,CDID,Weights 2018,Sector,Industry
0,103,2015,L2KL,6.0,agriculture,agriculture-forestry-fishing
1,96,2015,L2KR,11.0,production,mining-quarrying-including-oil-and-gas-extraction
2,96,2015,L2KX,101.0,production,manufacturing
3,99,2015,L2MW,14.0,production,electricity-gas-steam-and-air
4,93,2015,L2N2,13.0,production,water-supply-sewerage-etc


In [19]:
# METHOD ONE
#help('dmtools.display_dataset_unique_values')

In [5]:
# METHOD ONE example: display the unique values within each column (dimension) of the
# dataset, except the Value column. In the recorded output each column name is followed
# by a pandas Index holding that column's distinct values.
dm.display_dataset_unique_values(example_data_one)

'Period'

Index(['2015', '2015 Q1', '2015 Q2', '2015 Q3', '2015 Q4'], dtype='object')

'CDID'

Index(['ABMI', 'ABML', 'ABMM', 'KLS2', 'NTAO', 'NTAP', 'YBHA'], dtype='object')

'Estimate Type'

Index(['basic-prices', 'chained-volume-measure', 'current-price'], dtype='object')

'Aggregate'

Index(['gross-domestic-product-at-market-prices',
       'gross-value-added-at-basic-prices',
       'gross-value-added-excluding-oil-gas', 'less-basic-price-adjustment'],
      dtype='object')

In [None]:
# METHOD TWO
#help('dmtools.search_codelists_for_codes')

In [6]:
# METHOD TWO example: take the unique codes from one column of the dataset and check whether
# they appear in any of the codelist csv files inside the folder whose path is passed in.
# The recorded output reports result csv files being written to an
# 'aggregate-codelist-analysis' folder.
dimension = 'Aggregate'                                           # Column you want to look at
codes = example_data_one[dimension].unique()                      # The unique set of values within the column
fldrpth = 'codelists/'                                            # Path to the codelist folder, eg: 'users/leigh/development/family-trade/reference/codelists/'
colnme = 'Notation'                                               # Which column of each codelist to compare to
dm.search_codelists_for_codes(codes, fldrpth, colnme, dimension)

Search Directory: codelists/

------------------------------------------------------------------
Outputting File: aggregate-codelist-folder-search.csv with 7 rows
In Folder: aggregate-codelist-analysis
------------------------------------------------------------------
------------------------------------------------------------------
Outputting File: aggregate-codelist-folder-search-percentage-split.csv with 7 rows
In Folder: aggregate-codelist-analysis
------------------------------------------------------------------


'national-accounts-aggregate.csv'

In [7]:
# METHOD THREE
#help('dmtools.check_all_codes_in_codelist')

In [8]:
# METHOD THREE example: take the unique codes from one column of the dataset and check
# whether they are in one particular codelist csv file. Make sure to pass the path to the
# actual csv file, not to its folder.
dimension = 'Estimate Type'                                       # Column you want to look at
codes = example_data_one[dimension].unique()                      # The unique set of values within the column
filepth = 'codelists/national-accounts-estimate-type.csv'         # Path to the codelist file
colnme = 'Notation'                                               # Which column of each codelist to compare to
outputfoundcodes = True                                           # Presumably controls whether found codes are included in the output -- TODO confirm via help()
dm.check_all_codes_in_codelist(codes, filepth, colnme, dimension, outputfoundcodes)

Search File: codelists/national-accounts-estimate-type.csv

------------------------------------------------------------------
Outputting File: estimatetype-codelist-search.csv with 3 rows
In Folder: estimatetype-codelist-analysis
------------------------------------------------------------------


In [9]:
# METHOD FOUR
#help('dmtools.search_for_codes_using_levenshtein_and_fuzzywuzzy')

In [11]:
# METHOD FOUR example: fuzzy-match the unique codes from one column of the second dataset
# against the codelists in a folder. The recorded output echoes the settings below as
# "Levenshtein Distance", "Levenshtein Ratio" and "FuzzyWuzzy Ratio" (the latter shown as 80.0).
dimension = 'Sector'                                              # Column you want to look at
codes = example_data_two[dimension].unique()                      # The unique set of values within the column
pth = 'codelists/'                                                # Path to the codelist folder
colnme = 'Notation'                                               # Which column of each codelist to compare to
setDistance = 3                                                   # Reported in the output as the Levenshtein distance setting
setRatio = 0.8                                                    # Reported in the output as the Levenshtein ratio (and as 80.0 for FuzzyWuzzy)
dm.search_for_codes_using_levenshtein_and_fuzzywuzzy(codes, pth, colnme, dimension, setDistance, setRatio)

------------------------------------------------------------------
Searching in Codelist Directory: codelists/
in Column: Notation
Levenshtein Distance set to : 3
Levenshtein Ratio set to : 0.8
FuzzyWuzzy Ratio set to : 80.0
------------------------------------------------------------------
------------------------------------------------------------------
Outputting File: sector-dimension-levenshtein.csv with 20 rows
In Folder: sector-codelist-analysis
------------------------------------------------------------------


In [None]:
# METHOD FIVE
#help('dmtools.search_codes_in_codelists_and_then_search_highest_scoring_codelist_file')

In [12]:
# METHOD FIVE example: combines 2 other methods, search_codelists_for_codes and
# check_all_codes_in_codelist. The recorded output shows a folder-wide search first,
# followed by a search against a single codelist file (national-accounts-estimate-type.csv).
dimension = 'Estimate Type'                                       # Column you want to look at
codes = example_data_one[dimension].unique()                      # The unique set of values within the column
filepth = 'codelists/'                                            # Path to the codelist folder (despite the variable name, this is a directory, not a file)
colnme = 'Notation'                                               # Which column of each codelist to compare to
outputfoundcodes = True                                           # Presumably controls whether found codes are included in the output -- TODO confirm via help()
dm.search_codes_in_codelists_and_then_search_highest_scoring_codelist_file(codes, filepth, colnme, dimension, outputfoundcodes)

Seaching codelist folder for codes
************************************************************************
Search Directory: codelists/

------------------------------------------------------------------
Outputting File: estimatetype-codelist-folder-search.csv with 2 rows
In Folder: estimatetype-codelist-analysis
------------------------------------------------------------------
------------------------------------------------------------------
Outputting File: estimatetype-codelist-folder-search-percentage-split.csv with 2 rows
In Folder: estimatetype-codelist-analysis
------------------------------------------------------------------
************************************************************************
Seaching codes against codelist file: national-accounts-estimate-type.csv
Search File: codelists/national-accounts-estimate-type.csv

------------------------------------------------------------------
Outputting File: estimatetype-codelist-search.csv with 3 rows
In Folder: estimate

In [17]:
# Print the built-in help text for a single dm_tools method.
# Uncomment the next line instead to request help for the whole dmtools package.
#help('dmtools')
help('dmtools.check_all_codes_in_codelist')

Help on function check_all_codes_in_codelist in dmtools:

dmtools.check_all_codes_in_codelist = check_all_codes_in_codelist(codes, pth, colnme, dimension, outputfoundcodes)
    CHECK IF ALL YOUR DIMENSION VALUES (CODES) ARE IN A SPECIFIC CODELIST
    This methods takes a unique list of values (codes) and checks to see if they are in a specific csv codelist file (pth), column from file is sleected with colnme.
    the dimension variable is used to name the resulting file, which lists if the code has been found or not, it also looks for any Nan values.
    Once the search has completed it creates a folder called {dimension}-codelist-analysis within your current directory and saves a file called {dimension}-code-search.csv,
    that lists the results of the search (Columns = Dataset Codes, Codelist Codes, Result: {Found, NOT FOUND, ITS A NAN})
    This methods takes as its arguments:
        codes: A list of unique values taken from a datasets column within a transform
        pth: This i

In [None]:
#pip install --upgrade git+https://github.com/GSS-Cogs/dm_tools

In [None]:
#pip uninstall dmtools --yes 