In [2]:
import pandas as pd
from tqdm import tqdm
from datetime import datetime as dt
from pytz import timezone


In [3]:
# Load in the merged dataframe

ET = 'Canada/Eastern'
start_time = dt.now(timezone(ET))
today = str(start_time)[:10]
today = '2022-07-04'

mergedFilePathName = f'/home/jovyan/ODBiz/3-Merging/output/ODBiz_merged_{today}.csv'
total_lines = 1354503
chunksize = 1000
# df = pd.read_csv(mergedFilePathName, low_memory=False)
df = pd.concat([chunk for chunk in tqdm(pd.read_csv(mergedFilePathName, chunksize=chunksize), desc='Loading data', total=total_lines//chunksize)])
num_of_rows = df.shape[0]

print(f'Successfully loaded {mergedFilePathName}')

Loading data: 100%|██████████| 1354/1354 [00:12<00:00, 105.46it/s]
Successfully loaded /home/jovyan/ODBiz/3-Merging/output/ODBiz_merged_2022-07-04.csv


In [4]:
# Verify that all source files are present in the merged file and extract a list of non-Canadian 'provinces'

print(f'Number of unique data sources: {len(pd.unique(df.localfile))}')
print('--------------------------------------------------------------')
print('Unique Non-Canadian Province Entries:')
can_prov_lst = ['AB', 'BC', 'MB', 'NB', 'NL', 'NS', 'NT', 'NU', 'ON', 'PE', 'QC', 'SK', 'YT']
provinces = pd.unique(df.province)
non_cad_provs = []
for i in provinces:
    if not(i in can_prov_lst):
        non_cad_provs.append(i)
        print(i)




Number of unique data sources: 60
--------------------------------------------------------------
Unique Non-Canadian Province Entries:
nan
78
LAKEVIEW ESTATES


In [5]:
# Extract a list of indices corresponding to non-Canadian entries and obtain the subset of the df that contains these entries
non_cad_idx= []
for i, row in tqdm(df.iterrows(), total = num_of_rows):
    if row.province in non_cad_provs:
        non_cad_idx.append(i)
print(f'Number of non-Canadian entries found: {len(non_cad_idx)}')

non_cad_df = df.loc[non_cad_idx]
nonCanadianEntriesFileName = '/home/jovyan/ODBiz/3-Merging/output/nonCanadianEntries.csv'
non_cad_df.to_csv(nonCanadianEntriesFileName)
print(f'{nonCanadianEntriesFileName} saved')

100%|██████████| 1353709/1353709 [01:03<00:00, 21293.66it/s]
Number of non-Canadian entries found: 3433
/home/jovyan/ODBiz/3-Merging/output/nonCanadianEntries.csv saved


In [6]:
pd.unique(non_cad_df.localfile)

array(['ON_Pickering_Business_Directory.csv',
       'BC_Vancouver_Business_Licences.csv',
       'BC_Nanaimo_Business_Licences.csv',
       'ON_York_Region_Business_Directory.csv',
       'ON_Durham_Business_Directory.csv'], dtype=object)

In [9]:
USA_states = [ 'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',
           'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',
           'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM',
           'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
           'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'ARIZONA']

foreign_prov_codes = list(set(non_cad_provs) - set(USA_states))
# for i in foreign_prov_codes:
#     print(i)

# Extract a list of indices corresponding to foreign code entries and obtain the subset of the df that contains these entries
foreign_codes_idx = []
for i, row in tqdm(non_cad_df.iterrows(), total = 3433 ):
    if row.province in foreign_prov_codes:
        foreign_codes_idx.append(i)
print(f'Number of foreign entries found: {len(foreign_codes_idx)}')

foreign_codes_df = df.loc[foreign_codes_idx]
pd.options.display.max_columns = None
foreign_codes_df.head()

100%|██████████| 3433/3433 [00:00<00:00, 20916.65it/s]Number of foreign entries found: 3433



Unnamed: 0,idx,localfile,business_name,business_sector,business_subsector,business_description,business_id_no,licence_number,licence_type,primary_NAICS,secondary_NAICS,NAICS_descr,alt_econ_act_code,alt_econ_act_descrip,latitude,longitude,full_address,full_address_2,mailing_address,unit,street_no,street_name,street_direction,street_type,city,province,postal_code,country,business_website,email,telephone,telephone_extension,toll_free_telephone,fax,total_no_employees,no_full_time,no_part_time,no_seasonal,date_established,indigenous,status,provider,duplicated,idx_basic,temp,geo_source
72031,0039a977c31c57aeb420,ON_Pickering_Business_Directory.csv,Scugog Council for the Arts,Council for the arts. Arts resource centre and...,,,99931,,,813410.0,,Civic and social organizations,,,44.105208,-78.944945,,,,g-1,1266,Townline,West,Road,Scugog,,L9L1A7,,www.scugogarts.ca,,9059852121.0,,,,,,,,,no,,Regional Municipality of Durham,False,72032,SCUGOGCOUNCILFORTHEARTS-NULL-99931-8134100-NUL...,Source
72037,c0df9e756fd7b628fc2d,ON_Pickering_Business_Directory.csv,Forget-Me-Not Alpacas,"Raise Alpacas, Sell Clothing made of Alpaca fu...",,,552799,,,112990.0,,All Other Animal Production,,,44.406632,-79.098971,,,,,1595,Brock Concession Rd 3,,,Beaverton,,L0K1A0,,www.forgetmenotalpacas.ca,,2892216102.0,,,,,,,,,no,,Regional Municipality of Durham,False,72038,FORGETMENOTALPACAS-NULL-552799-1129900-NULL-NU...,Source
72230,01a278895da164ece77c,ON_Pickering_Business_Directory.csv,Elwill Acres,OFA Farm,,,239024,,,111999.0,,All other miscellaneous crop farming,,,44.252557,-79.010867,,,,,S13325,Brock Concession Rd 4,,,Sunderland,,L0C1H0,,,,7053572677.0,,,,,,,,,no,,Regional Municipality of Durham,False,72231,ELWILLACRES-NULL-239024-1119990-NULL-NULL-NULL,Source
72237,b2375d963df4c7b4a362,ON_Pickering_Business_Directory.csv,OFA Farm - B33245,OFA Farm,,,238726,,,111999.0,,All other miscellaneous crop farming,,,44.495986,-79.112838,,,,,B33245,Thorah Concession Rd 10,,,Beaverton,,L0K1A0,,,,,,,,,,,,,no,,Regional Municipality of Durham,False,72238,OFAFARMB33245-NULL-238726-1119990-NULL-NULL-NULL,Source
72251,51a293bf2cf2f33b9c9d,ON_Pickering_Business_Directory.csv,Lai & Associates DMC,Digital media production and consultancy.,,,569845,,,541890.0,,Other Services Related to Advertising,,,43.838946,-79.08391,,,,,606,The Esplanade,North,,Pickering,,L1V6V2,,www.laiandassociates.ca,,6476463386.0,,,,,,,,,no,,Regional Municipality of Durham,False,72252,LAI&ASSOCIATESDMC-NULL-569845-5418900-NULL-NUL...,Source


In [19]:
# Print the unique countries present
print('--------------------------------------------------------------')
print('Unique Country Entries:')
countries = pd.unique(df['country'])
for i in countries:
        print(i)

--------------------------------------------------------------
Unique Country Entries:
nan
CANADA


In [20]:
sub_df = df[df.province == 'WA']
for i,row in sub_df.iterrows():
    print(row)
    break
# df.loc['idx', '8f6bd3c96413edd33d9d']

In [21]:
# Write the non-cad prov codes to a file
filename = '/home/jovyan/ODBiz/3-Merging/output/nonCadProvCodes.txt'
with open(filename, 'w') as f:
    for i in non_cad_provs:
        f.write(f'{i}\n')
print(f'List saved to {filename}')

List saved to /home/jovyan/ODBiz/3-Merging/output/nonCadProvCodes.txt


In [22]:
foreign_prov_codes

[nan, '78', 'LAKEVIEW ESTATES']

In [23]:
status_lst = pd.unique(df.status)
for i in status_lst:
    print(i)

nan
1.0
OPEN
Issued
Inactive
Gone Out of Business
Cancelled
Pending
Invalid Status Code
APPROVED
Renewal Licensed
Pending Renewal
Licensed
Move in Progress
Renewal Notification Sent
Close in Progress
ISSUED
RENEWAL NOTICE


In [24]:
provinces

array(['BC', 'AB', 'NB', 'ON', nan, 'QC', 'NS', 'MB', 'SK', '78', 'YT',
       'NL', 'LAKEVIEW ESTATES', 'NT', 'NU', 'PE'], dtype=object)