In [1]:
import pandas as pd
from tqdm import tqdm
from datetime import datetime as dt
from pytz import timezone


In [2]:
# Load in the merged dataframe

ET = 'Canada/Eastern'
start_time = dt.now(timezone(ET))
today = str(start_time)[:10]

mergedFilePathName = f'/home/jovyan/ODBiz/3-Merging/output/ODBiz_merged_{today}.csv'
total_lines = 1354503
chunksize = 1000
# df = pd.read_csv(mergedFilePathName, low_memory=False)
df = pd.concat([chunk for chunk in tqdm(pd.read_csv(mergedFilePathName, chunksize=chunksize), desc='Loading data', total=total_lines//chunksize+1)])
num_of_rows = df.shape[0]

print(f'Successfully loaded {mergedFilePathName}')

Loading data: 100%|██████████| 1355/1355 [00:14<00:00, 92.00it/s]
Successfully loaded /home/jovyan/ODBiz/3-Merging/output/ODBiz_merged_2022-06-24.csv


In [3]:
# Verify that all source files are present in the merged file and extract a list of non-Canadian 'provinces'

print(f'Number of unique data sources: {len(pd.unique(df.localfile))}')
print('--------------------------------------------------------------')
print('Unique Non-Canadian Province Entries:')
can_prov_lst = ['AB', 'BC', 'MB', 'NB', 'NL', 'NS', 'NT', 'NU', 'ON', 'PE', 'QC', 'SK', 'YT']
provinces = pd.unique(df.province)
non_cad_provs = []
for i in provinces:
    if not(i in can_prov_lst):
        non_cad_provs.append(i)
        print(i)




Number of unique data sources: 60
--------------------------------------------------------------
Unique Non-Canadian Province Entries:
WA
nan
NC
IL
CA
TX
NY
PQ
CO
NV
AL
GA
AZ
NJ
FL
US
MA
PA
OH
KS
WI
VA
UT
DE
OR
78
`
SC
MI
MD
CT
QB
NF
ID
MN
MO
SP
RJ
IN
HB
TN
LAKEVIEW ESTATES
TEHRAN
MT
OK
CHESHITE
ARIZONA


In [21]:
# Extract a list of indices corresponding to non-Canadian entries and obtain the subset of the df that contains these entries
non_cad_idx= []
for i, row in tqdm(df.iterrows(), total = num_of_rows):
    if row.province in non_cad_provs:
        non_cad_idx.append(i)
print(f'Number of non-Canadian entries found: {len(non_cad_idx)}')

non_cad_df = df.loc[non_cad_idx]
nonCanadianEntriesFileName = '/home/jovyan/ODBiz/3-Merging/output/nonCanadianEntries.csv'
non_cad_df.to_csv(nonCanadianEntriesFileName)
print(f'{nonCanadianEntriesFileName} saved')

100%|██████████| 1354503/1354503 [01:14<00:00, 18242.20it/s]
Number of non-Canadian entries found: 4268
/home/jovyan/ODBiz/3-Merging/output/nonCanadianEntries.csv saved


In [7]:
pd.unique(non_cad_df.localfile)

array(['BC_Vancouver_Business_Licences.csv',
       'ON_Durham_Business_Directory.csv',
       'BC_Nanaimo_Business_Licences.csv',
       'ON_York_Region_Business_Directory.csv',
       'ON_Pickering_Business_Directory.csv'], dtype=object)

In [19]:
USA_states = [ 'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',
           'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',
           'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM',
           'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
           'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY', 'ARIZONA']

foreign_prov_codes = list(set(non_cad_provs) - set(USA_states))
# for i in foreign_prov_codes:
#     print(i)

# Extract a list of indices corresponding to foreign code entries and obtain the subset of the df that contains these entries
foreign_codes_idx = []
for i, row in tqdm(non_cad_df.iterrows(), total = 4268 ):
    if row.province in foreign_prov_codes:
        foreign_codes_idx.append(i)
print(f'Number of foreign entries found: {len(foreign_codes_idx)}')

foreign_codes_df = df.loc[foreign_codes_idx]
foreign_codes_df

100%|██████████| 4268/4268 [00:00<00:00, 17463.33it/s]Number of foreign entries found: 3485



Unnamed: 0,idx,localfile,business_name,business_sector,business_subsector,business_description,business_id_no,licence_number,licence_type,primary_NAICS,...,no_full_time,no_part_time,no_seasonal,date_established,indigenous,provider,duplicated,idx_basic,temp,geo_source
99261,c103f9f66001544d655d,BC_Vancouver_Business_Licences.csv,Ipsos Reid Corporation,Administration,Office,,,13-184744,,,...,,,,,no,City of Vancouver,False,99262,IPSOSREIDCORPORATION-13184744-NULL-NULL-NULL-N...,
104860,a2d286aba44fbc7cf0aa,BC_Vancouver_Business_Licences.csv,Creekside Architects Ltd,Consultant,Office,,,13-181328,,,...,,,,,no,City of Vancouver,False,104861,CREEKSIDEARCHITECTSLTD-13181328-NULL-NULL-NULL...,
104904,5a375baae54e8bda2f7f,BC_Vancouver_Business_Licences.csv,Golden Properties Ltd,Property Management,Office,,,13-181458,,,...,,,,,no,City of Vancouver,False,104905,GOLDENPROPERTIESLTD-13181458-NULL-NULL-NULL-NU...,
104993,1ea63da9455ce22e0bcf,BC_Vancouver_Business_Licences.csv,Oceanic Business Centre Inc,Secretarial Services,Office,,,13-181682,,,...,,,,,no,City of Vancouver,False,104994,OCEANICBUSINESSCENTREINC-13181682-NULL-NULL-NU...,
105015,5aab43277f19558e9fdb,BC_Vancouver_Business_Licences.csv,North Arm Transportation Ltd,Shipping Agent,Office,,,13-181741,,,...,,,,,no,City of Vancouver,False,105016,NORTHARMTRANSPORTATIONLTD-13181741-NULL-NULL-N...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1348794,153132a72dcb33408ddb,ON_Pickering_Business_Directory.csv,Stanmore Equipment Ltd,Supplier of construction equipment to businesses.,,,608063,,,532410.0,...,,,,,no,Regional Municipality of Durham,True,1348795,STANMOREEQUIPMENTLTD-NULL-608063-5324100-NULL-...,
1351281,18431bf5e63851cd8ac0,ON_Pickering_Business_Directory.csv,Berger Heating Ltd.,"Heating, ventilation, & air conditioning (HVAC...",,,238870,,,238220.0,...,,,,,no,Regional Municipality of Durham,True,1351282,BERGERHEATINGLTD-NULL-238870-2382200-NULL-NULL...,
1352011,48a6f5c3fa4441ffddbd,ON_Pickering_Business_Directory.csv,Brimacombe,Ski hill and chalet.,,,4210,,,713920.0,...,,,,,no,Regional Municipality of Durham,True,1352012,BRIMACOMBE-NULL-4210-7139200-NULL-NULL-NULL,
1352085,db5be65a27c6c5602e00,ON_Pickering_Business_Directory.csv,Ground Up Renovations,Renovation Company,,,552600,,,236118.0,...,,,,,no,Regional Municipality of Durham,True,1352086,GROUNDUPRENOVATIONS-NULL-552600-2361180-NULL-N...,


In [5]:
# Print the unique countries present
print('--------------------------------------------------------------')
print('Unique Country Entries:')
countries = pd.unique(df['country'])
for i in countries:
        print(i)

--------------------------------------------------------------
Unique Country Entries:
nan
CANADA


In [6]:
sub_df = df[df.province == 'WA']
for i,row in sub_df.iterrows():
    print(row)
    break
# df.loc['idx', '8f6bd3c96413edd33d9d']

idx                                                  8f6bd3c96413edd33d9d
localfile                              BC_Vancouver_Business_Licences.csv
business_name                                        IMATS Vancouver 2013
business_sector                                              Show/Contest
business_subsector                             Exhibitions/Shows/Concerts
business_description                                                  NaN
business_id_no                                                        NaN
licence_number                                                  13-389885
licence_type                                                          NaN
primary_NAICS                                                         NaN
secondary_NAICS                                                       NaN
NAICS_descr                                                           NaN
alt_econ_act_code                                                     NaN
alt_econ_act_descrip                  