## Usage

In [1]:
from moodys_datahub.tools import *
from moodys_datahub.utils import *

### Connect to SFTP server using the provided SSH key file (.pem)

In [2]:
# Connect to the default CBS SFTP server using the provided SSH key file (.pem).
SFTP = Sftp(privatekey="user_provided-ssh-key.pem")

# Alternative: connect to a custom SFTP server instead of the default one.
# Kept commented out on purpose: running it in this cell would rebind `SFTP`
# to the placeholder host below and discard the default-server connection
# that the remote paths used later in this notebook are validated against.
# Uncomment (and replace the placeholder values) only if you need a custom server.
# SFTP = Sftp(hostname="example.com", username="username", port=22, privatekey="user_provided-ssh-key.pem")

In [4]:
# Delete the downloaded files after curation (True/False); prevents very
# large amounts of data from being stored locally.
SFTP.delete_files = False

# Concatenate the curated data product sub files into a single output file (True/False).
SFTP.concat_files = True

# Output format(s) to produce.
SFTP.output_format = [".csv"]

# Cut-off size (in MB, going by the attribute name) at which output files
# are split into multiple files.
SFTP.file_size_mb = 300

## company name -> bvd_numbers (using fuzzy match algorithm)

In [13]:
# Point the client at the remote data product and at the local working folder.
SFTP.remote_path = "IvdS14LwRxucymVszEBE3Q/unscheduled/giin"
SFTP.local_path = "company_names"

# Process every remote file, loading only the columns listed in select_cols.
df_names = SFTP.process_all(
    SFTP.remote_files,
    destination='company_name',
    select_cols=['bvd_id_number', 'name_as_in_the_fatca_register_'],
)

Remote path is valid:'IvdS14LwRxucymVszEBE3Q/unscheduled/giin'
Folder 'company_names' created.


In [None]:
# Example usage: fuzzy-match a list of company names against the names in df_names
# and return the corresponding values from the bvd_id_number column.
input_strings = ["Bank of America", "AXA2", "JPMorgan Chase"]
extended_list = input_strings * 100  # repeat the three names 100 times (300 inputs)
match_col = 'name_as_in_the_fatca_register_'  # column in df_names to match against
return_col = 'bvd_id_number'                  # column in df_names to return

# Substrings handed to fuzzy_match via remove_str; each one is listed exactly once.
str_remove = ["GMBH", " - Branch", "CO.", "LTD", "Ltd", "Limited", "A/S"]

# NOTE(review): num_workers=-1 presumably means "use all available workers" and
# cut_off=50 a minimum match score -- confirm against the fuzzy_match documentation.
result_df = fuzzy_match(
    input_strings=extended_list,
    df=df_names,
    num_workers=-1,
    match_column=match_col,
    return_column=return_col,
    cut_off=50,
    remove_str=str_remove,
)

result_df.head()

## national id numbers -> bvd_numbers 

In [5]:
SFTP.local_path = "Identifiers"
SFTP.remote_path = "IvdS14LwRxucymVszEBE3Q/unscheduled/identifiers"

# The steps below only exist to collect 1000 random 'national_id_number' values
# to use as example query input.
# NOTE(review): output_format = None presumably turns off writing output files
# for this helper step -- confirm against the Sftp documentation.
SFTP.output_format = None
df_id_number = SFTP.process_all(
    files=SFTP.remote_files[0:4],  # only the first four remote files
    num_workers=-1,
    destination='identifiers',
    select_cols=['bvd_id_number', 'national_id_number'],
)

# Fixed random_state keeps the sample reproducible; both names refer to the same Series.
random_sample = df_id_number['national_id_number'].sample(n=1000, random_state=42)
national_id_number = random_sample


Folder 'Identifiers' already exists.
Remote path is valid:'IvdS14LwRxucymVszEBE3Q/unscheduled/identifiers'


In [6]:
# Set the output format to CSV.
SFTP.output_format = [".csv"]

# Build the query statement from the sampled identifiers, skipping missing values.
query_args = list(national_id_number.dropna())
query_str = f"national_id_number in {query_args}"

# Execute the query over the first four remote files.
query_id_numbers = SFTP.process_all(
    SFTP.remote_files[0:4],
    num_workers=-1,
    destination="query_2",
    select_cols=['bvd_id_number', 'national_id_number'],
    query=query_str,
    query_args=query_args,
)

# Sanity check: every returned national_id_number must be one of the queried values.
query_id_numbers['national_id_number'].isin(query_args).all()

True