In [1]:
import pandas as pd
import numpy as np
import altair as alt

In [2]:
def tab_reader(filepathpattern, tabnum, skiprow, low_memory=False):
    """Read a numbered series of CSV "tabs" into a list of DataFrames.

    Parameters
    ----------
    filepathpattern : str
        Path template containing one ``{}`` placeholder; it is filled with
        the 1-based tab number through ``str.format``.
    tabnum : int
        How many tabs to read. Tabs ``1..tabnum`` are loaded in order.
    skiprow : int or list-like
        Forwarded to ``pandas.read_csv`` as ``skiprows``.
    low_memory : bool, default False
        Forwarded to ``pandas.read_csv`` as ``low_memory`` for both the
        first attempt and the latin-1 retry.

    Returns
    -------
    list of pandas.DataFrame
        One frame per tab, in tab order (empty list when ``tabnum`` is 0).

    Raises
    ------
    Any error raised by ``pandas.read_csv`` other than a decoding failure
    of the first attempt (for example ``FileNotFoundError``).
    """
    tabs = []
    for cur_index in range(1, tabnum + 1):
        cur_filepath = filepathpattern.format(cur_index)
        try:
            # First attempt uses read_csv's default encoding.
            tab = pd.read_csv(cur_filepath, skiprows=skiprow, low_memory=low_memory)
        except UnicodeDecodeError:
            # Only a decoding failure justifies the latin-1 retry; other
            # errors (missing file, malformed CSV) surface unchanged instead
            # of being masked by a second read.
            tab = pd.read_csv(cur_filepath, skiprows=skiprow, encoding='latin-1', low_memory=low_memory)
        tabs.append(tab)
    return tabs


In [3]:
# 2017 data
# Medium businesses: tabs 1-5, skipping the first 5 lines of every CSV.
rcbp_2017_mb_revenue_Tabs = tab_reader(
    filepathpattern="../../../data/2017_csv_eng/2017_Medium businesses_Total revenue based_csv/_2017_Eng_Medium_Revenue_Tab{}.csv",
    tabnum=5,
    skiprow=5,
)
# Small businesses: tabs 1-7, skipping the first 5 lines of every CSV.
rcbp_2017_sb_revenue_Tabs = tab_reader(
    filepathpattern="../../../data/2017_csv_eng/2017_Small businesses_Total revenue based_csv/_2017_Small businesses_Total revenue based_Tab{}.csv",
    tabnum=7,
    skiprow=5,
)
# Tab 1 is read a second time with 6 skipped lines and replaces the first entry.
# NOTE(review): presumably its header sits one line lower than in the other tabs - confirm.
rcbp_2017_sb_revenue_Tabs[0] = tab_reader(
    filepathpattern="../../../data/2017_csv_eng/2017_Small businesses_Total revenue based_csv/_2017_Small businesses_Total revenue based_Tab{}.csv",
    tabnum=1,
    skiprow=6,
)[0]

In [4]:
# 2018 data
# Medium businesses: tabs 1-5, skipping the first 5 lines of every CSV.
rcbp_2018_mb_revenue_Tabs = tab_reader(
    filepathpattern="../../../data/2018_csv_eng/2018_Medium businesses_Total revenue based_csv/_2018_Medium businesses_Total revenue_Tab{}.csv",
    tabnum=5,
    skiprow=5,
)
# Small businesses: tabs 1-7, skipping the first 5 lines of every CSV.
rcbp_2018_sb_revenue_Tabs = tab_reader(
    filepathpattern="../../../data/2018_csv_eng/2018_Small businesses_Total revenue based_csv/_2018_Small businesses_Total revenue_Tab{}.csv",
    tabnum=7,
    skiprow=5,
)
# Tab 1 is read a second time with 6 skipped lines and replaces the first entry.
# NOTE(review): presumably its header sits one line lower than in the other tabs - confirm.
rcbp_2018_sb_revenue_Tabs[0] = tab_reader(
    filepathpattern="../../../data/2018_csv_eng/2018_Small businesses_Total revenue based_csv/_2018_Small businesses_Total revenue_Tab{}.csv",
    tabnum=1,
    skiprow=6,
)[0]

In [5]:
# 2019 data
# Medium businesses: tabs 1-5, skipping the first 5 lines of every CSV.
rcbp_2019_mb_revenue_Tabs = tab_reader(
    filepathpattern="../../../data/2019_csv_eng/2019_Medium businesses_Total revenue_csv/_2019_Medium businesses_Total revenue_Tab{}.csv",
    tabnum=5,
    skiprow=5,
)
# Small businesses: tabs 1-7, skipping the first 5 lines of every CSV.
rcbp_2019_sb_revenue_Tabs = tab_reader(
    filepathpattern="../../../data/2019_csv_eng/2019_Small businesses_Total revenue_csv/_2019_Small businesses_Total revenue_Tab{}.csv",
    tabnum=7,
    skiprow=5,
)
# Tab 1 is read a second time with 6 skipped lines and replaces the first entry.
# NOTE(review): presumably its header sits one line lower than in the other tabs - confirm.
rcbp_2019_sb_revenue_Tabs[0] = tab_reader(
    filepathpattern="../../../data/2019_csv_eng/2019_Small businesses_Total revenue_csv/_2019_Small businesses_Total revenue_Tab{}.csv",
    tabnum=1,
    skiprow=6,
)[0]

In [6]:
# Drop the first (NaN) row of each table, drop the quartile and reporting columns, then concatenate the tables from different years
def selectData(tabs):
    """Trim each table and stack the trimmed tables into one DataFrame.

    For every frame in ``tabs`` the row labelled ``0`` is dropped and every
    column whose name contains "quartile" or "reporting" (case-insensitive)
    is removed. The trimmed frames are then concatenated row-wise in the
    order given; original index labels are kept.

    Parameters
    ----------
    tabs : iterable of pandas.DataFrame
        Tables to trim and combine. The inputs are not modified.

    Returns
    -------
    pandas.DataFrame
    """
    def _trim(frame):
        # drop() returns a new frame, so the caller's table stays intact.
        without_first = frame.drop(0)
        kept = [
            name for name in without_first.columns
            if not ('quartile' in name.lower() or 'reporting' in name.lower())
        ]
        return without_first[kept]

    return pd.concat([_trim(frame) for frame in tabs])

In [7]:
## medium tab1-tab5 3year contain All industry(code=0)
# zip() groups the same tab position across the 2017, 2018 and 2019 lists;
# each group is trimmed and stacked by selectData, then re-indexed from 0.
(
    mb_revenue_tab1_3y_pca,
    mb_revenue_tab2_3y_pca,
    mb_revenue_tab3_3y_pca,
    mb_revenue_tab4_3y_pca,
    mb_revenue_tab5_3y_pca,
) = [
    selectData(list(same_tab_by_year)).reset_index(drop=True)
    for same_tab_by_year in zip(
        rcbp_2017_mb_revenue_Tabs[:5],
        rcbp_2018_mb_revenue_Tabs[:5],
        rcbp_2019_mb_revenue_Tabs[:5],
    )
]

## small tab1-tab7 3year contain All industry
(
    sb_revenue_tab1_3y_pca,
    sb_revenue_tab2_3y_pca,
    sb_revenue_tab3_3y_pca,
    sb_revenue_tab4_3y_pca,
    sb_revenue_tab5_3y_pca,
    sb_revenue_tab6_3y_pca,
    sb_revenue_tab7_3y_pca,
) = [
    selectData(list(same_tab_by_year)).reset_index(drop=True)
    for same_tab_by_year in zip(
        rcbp_2017_sb_revenue_Tabs[:7],
        rcbp_2018_sb_revenue_Tabs[:7],
        rcbp_2019_sb_revenue_Tabs[:7],
    )
]

### 2. Select Columns, Drop irrelevant columns

**sb_tab1: 'Total number of businesses', 'Total revenue', 'Sales of goods and services ( percent of total revenue)'**

In [8]:
# Columns kept from small-business Tab 1: the nine identifier columns
# followed by the Tab 1 value columns.
sb_tab1_pca_left_col = [
    'North American Industry Classification System, NAICS - code',
    'North American Industry Classification System, NAICS',
    'Geography - code',
    'Geography',
    'Location indicator - code',
    'Location indicator',
    'Incorporation status - code',
    'Incorporation status',
    'Reference year',
] + [
    'Total number of businesses',
    'Total revenue',
    'Sales of goods and services* ( percent of total revenue)',
]
sb_revenue_tab1_3y_pca = sb_revenue_tab1_3y_pca.loc[:, sb_tab1_pca_left_col]

In [9]:
# Columns kept from small-business Tab 2: the nine identifier columns
# followed by the two percentage expense columns.
sb_tab2_pca_left_col = [
    'North American Industry Classification System, NAICS - code',
    'North American Industry Classification System, NAICS',
    'Geography - code',
    'Geography',
    'Location indicator - code',
    'Location indicator',
    'Incorporation status - code',
    'Incorporation status',
    'Reference year',
] + [
    'Cost of sales (direct expenses) (%)',
    'Operating expenses (indirect expenses) (%)',
]
sb_revenue_tab2_3y_pca = sb_revenue_tab2_3y_pca.loc[:, sb_tab2_pca_left_col]

In [10]:
# Columns kept from small-business Tab 3: the nine identifier columns
# followed by the expense and profit/loss columns.
sb_tab3_pca_left_col = [
    'North American Industry Classification System, NAICS - code',
    'North American Industry Classification System, NAICS',
    'Geography - code',
    'Geography',
    'Location indicator - code',
    'Location indicator',
    'Incorporation status - code',
    'Incorporation status',
    'Reference year',
] + [
    'Total expenses',
    'Cost of sales (direct expenses)',
    'Operating expenses (indirect expenses)',
    'Net Profit/Loss',
]
sb_revenue_tab3_3y_pca = sb_revenue_tab3_3y_pca.loc[:, sb_tab3_pca_left_col]

In [11]:
# Columns kept from small-business Tab 4: the nine identifier columns
# followed by the asset, liability and equity columns.
sb_tab4_pca_left_col = [
    'North American Industry Classification System, NAICS - code',
    'North American Industry Classification System, NAICS',
    'Geography - code',
    'Geography',
    'Location indicator - code',
    'Location indicator',
    'Incorporation status - code',
    'Incorporation status',
    'Reference year',
] + [
    'Total assets',
    'Total current assets',
    'Total liabilities',
    'Total current liabilities',
    'Total equity',
]
sb_revenue_tab4_3y_pca = sb_revenue_tab4_3y_pca.loc[:, sb_tab4_pca_left_col]

In [12]:
# First slice of small-business Tab 5: the nine identifier columns followed
# by the equity-ratio columns. The result gets its own name (tab5a), so the
# full Tab 5 frame stays available for the next selection.
sb_tab5a_pca_left_col = [
    'North American Industry Classification System, NAICS - code',
    'North American Industry Classification System, NAICS',
    'Geography - code',
    'Geography',
    'Location indicator - code',
    'Location indicator',
    'Incorporation status - code',
    'Incorporation status',
    'Reference year',
] + [
    'Debt to equity ratio (times)',
    'Current debt to equity (%)',
    'Revenue to equity ratio (times)',
    'Net profit to equity (%)',
]
sb_revenue_tab5a_3y_pca = sb_revenue_tab5_3y_pca.loc[:, sb_tab5a_pca_left_col]

In [13]:
# Second slice of small-business Tab 5: the nine identifier columns plus
# 'Gross margin (%)'.
sb_tab5b_pca_left_col = [
    'North American Industry Classification System, NAICS - code',
    'North American Industry Classification System, NAICS',
    'Geography - code',
    'Geography',
    'Location indicator - code',
    'Location indicator',
    'Incorporation status - code',
    'Incorporation status',
    'Reference year',
] + [
    'Gross margin (%)',
]
sb_revenue_tab5b_3y_pca = sb_revenue_tab5_3y_pca.loc[:, sb_tab5b_pca_left_col]

In [14]:
# Columns kept from small-business Tab 6: the nine identifier columns plus
# 'Gross margin (%)'.
sb_tab6_pca_left_col = [
    'North American Industry Classification System, NAICS - code',
    'North American Industry Classification System, NAICS',
    'Geography - code',
    'Geography',
    'Location indicator - code',
    'Location indicator',
    'Incorporation status - code',
    'Incorporation status',
    'Reference year',
] + [
    'Gross margin (%)',
]
sb_revenue_tab6_3y_pca = sb_revenue_tab6_3y_pca.loc[:, sb_tab6_pca_left_col]

In [15]:
# Stack the two 'Gross margin (%)' selections (Tab 5 slice and Tab 6) row-wise
# into one frame. Index labels from both inputs are kept as they are.
frames = [
    sb_revenue_tab5b_3y_pca,
    sb_revenue_tab6_3y_pca,
]
sb_revenue_tab6n_3y_pca = pd.concat(objs=frames, axis=0)

In [16]:
# Columns kept from small-business Tab 7: the nine identifier columns plus
# 'Percent of profitable businesses'.
sb_tab7_pca_left_col = [
    'North American Industry Classification System, NAICS - code',
    'North American Industry Classification System, NAICS',
    'Geography - code',
    'Geography',
    'Location indicator - code',
    'Location indicator',
    'Incorporation status - code',
    'Incorporation status',
    'Reference year',
] + [
    'Percent of profitable businesses',
]
sb_revenue_tab7_3y_pca = sb_revenue_tab7_3y_pca.loc[:, sb_tab7_pca_left_col]

**Join the small business tab1-tab7.**

In [17]:
# Inner-join Tab 1 with Tab 2 on the nine shared identifier columns.
# The same names were given for both sides, so a single `on=` list is used.
a = sb_revenue_tab1_3y_pca.merge(
    sb_revenue_tab2_3y_pca,
    how='inner',
    on=[
        'North American Industry Classification System, NAICS - code',
        'North American Industry Classification System, NAICS',
        'Geography - code',
        'Geography',
        'Location indicator - code',
        'Location indicator',
        'Incorporation status - code',
        'Incorporation status',
        'Reference year',
    ],
)

In [18]:
# Inner-join the Tab 1 + Tab 2 result with Tab 3 on the nine shared identifier columns.
b = a.merge(
    sb_revenue_tab3_3y_pca,
    how='inner',
    on=[
        'North American Industry Classification System, NAICS - code',
        'North American Industry Classification System, NAICS',
        'Geography - code',
        'Geography',
        'Location indicator - code',
        'Location indicator',
        'Incorporation status - code',
        'Incorporation status',
        'Reference year',
    ],
)


In [19]:

# Left-join Tab 4 onto the running result: every row of `b` is kept and rows
# without a Tab 4 match get NaN in the Tab 4 columns.
c = b.merge(
    sb_revenue_tab4_3y_pca,
    how='left',
    on=[
        'North American Industry Classification System, NAICS - code',
        'North American Industry Classification System, NAICS',
        'Geography - code',
        'Geography',
        'Location indicator - code',
        'Location indicator',
        'Incorporation status - code',
        'Incorporation status',
        'Reference year',
    ],
)


In [20]:
# Left-join the Tab 5 equity-ratio slice onto the running result: every row
# of `c` is kept and unmatched rows get NaN in the added columns.
d = c.merge(
    sb_revenue_tab5a_3y_pca,
    how='left',
    on=[
        'North American Industry Classification System, NAICS - code',
        'North American Industry Classification System, NAICS',
        'Geography - code',
        'Geography',
        'Location indicator - code',
        'Location indicator',
        'Incorporation status - code',
        'Incorporation status',
        'Reference year',
    ],
)

In [21]:
# Left-join the stacked 'Gross margin (%)' frame onto the running result.
# NOTE(review): the right-hand frame is a concat of two selections; if both
# contain the same key combination this join would duplicate rows - confirm.
e = d.merge(
    sb_revenue_tab6n_3y_pca,
    how='left',
    on=[
        'North American Industry Classification System, NAICS - code',
        'North American Industry Classification System, NAICS',
        'Geography - code',
        'Geography',
        'Location indicator - code',
        'Location indicator',
        'Incorporation status - code',
        'Incorporation status',
        'Reference year',
    ],
)

In [22]:
# Left-join Tab 7 ('Percent of profitable businesses') to finish the combined
# small-business table; every row of `e` is kept.
final_sb = e.merge(
    sb_revenue_tab7_3y_pca,
    how='left',
    on=[
        'North American Industry Classification System, NAICS - code',
        'North American Industry Classification System, NAICS',
        'Geography - code',
        'Geography',
        'Location indicator - code',
        'Location indicator',
        'Incorporation status - code',
        'Incorporation status',
        'Reference year',
    ],
)

In [23]:
# Preview the first 10 rows of the merged small-business table.
final_sb.head(10)

Unnamed: 0,"North American Industry Classification System, NAICS - code","North American Industry Classification System, NAICS",Geography - code,Geography,Location indicator - code,Location indicator,Incorporation status - code,Incorporation status,Reference year,Total number of businesses,...,Total current assets,Total liabilities,Total current liabilities,Total equity,Debt to equity ratio (times),Current debt to equity (%),Revenue to equity ratio (times),Net profit to equity (%),Gross margin (%),Percent of profitable businesses
0,0,All industries,0.0,Canada,1.0,Rural,1.0,Unincorporated,2017.0,112086,...,,,,,,,,,71.0,
1,0,All industries,0.0,Canada,1.0,Rural,2.0,Incorporated,2017.0,193262,...,360.4,576.8,217.1,488.5,1.2,44.4,1.3,6.1,64.3,
2,0,All industries,0.0,Canada,1.0,Rural,3.0,All businesses,2017.0,305348,...,,,,,,,,,67.4,76.3
3,0,All industries,0.0,Canada,2.0,Urban,1.0,Unincorporated,2017.0,596808,...,,,,,,,,,82.2,
4,0,All industries,0.0,Canada,2.0,Urban,2.0,Incorporated,2017.0,1002456,...,421.7,605.1,255.7,504.1,1.2,50.7,1.1,13.1,58.1,
5,0,All industries,0.0,Canada,2.0,Urban,3.0,All businesses,2017.0,1599264,...,,,,,,,,,65.0,81.7
6,0,All industries,15.0,Altantic,1.0,Rural,1.0,Unincorporated,2017.0,17112,...,,,,,,,,,77.2,
7,0,All industries,15.0,Altantic,1.0,Rural,2.0,Incorporated,2017.0,20348,...,286.7,464.6,177.7,364.1,1.3,48.8,1.8,13.7,56.8,
8,0,All industries,15.0,Altantic,1.0,Rural,3.0,All businesses,2017.0,37460,...,,,,,,,,,62.0,80.4
9,0,All industries,15.0,Altantic,2.0,Urban,1.0,Unincorporated,2017.0,19314,...,,,,,,,,,79.8,


In [24]:
# Turn placeholder cells ('X', '..', '...') into NaN so the value columns can
# be converted to numbers afterwards.
# The pattern is a raw string, which avoids the invalid '\.' escape sequences,
# and it is anchored to the whole cell (surrounding whitespace allowed).
# DataFrame.replace with a regex and a non-string value swaps the ENTIRE cell
# whenever the pattern is found anywhere in it, so an unanchored 'X' would
# also blank out any text label that merely contains a capital X.
# NOTE(review): presumably these symbols mark suppressed / unavailable figures
# in the source tables - confirm against the data documentation.
new_sb = final_sb.replace(regex=[r'^\s*(?:X|\.{2,3})\s*$'], value=np.nan)

In [25]:
# Row count after placeholder replacement (replace does not drop rows).
len(new_sb)

32967

In [26]:
# Preview the first 10 rows after placeholder replacement.
new_sb.head(10)

Unnamed: 0,"North American Industry Classification System, NAICS - code","North American Industry Classification System, NAICS",Geography - code,Geography,Location indicator - code,Location indicator,Incorporation status - code,Incorporation status,Reference year,Total number of businesses,...,Total current assets,Total liabilities,Total current liabilities,Total equity,Debt to equity ratio (times),Current debt to equity (%),Revenue to equity ratio (times),Net profit to equity (%),Gross margin (%),Percent of profitable businesses
0,0,All industries,0.0,Canada,1.0,Rural,1.0,Unincorporated,2017.0,112086,...,,,,,,,,,71.0,
1,0,All industries,0.0,Canada,1.0,Rural,2.0,Incorporated,2017.0,193262,...,360.4,576.8,217.1,488.5,1.2,44.4,1.3,6.1,64.3,
2,0,All industries,0.0,Canada,1.0,Rural,3.0,All businesses,2017.0,305348,...,,,,,,,,,67.4,76.3
3,0,All industries,0.0,Canada,2.0,Urban,1.0,Unincorporated,2017.0,596808,...,,,,,,,,,82.2,
4,0,All industries,0.0,Canada,2.0,Urban,2.0,Incorporated,2017.0,1002456,...,421.7,605.1,255.7,504.1,1.2,50.7,1.1,13.1,58.1,
5,0,All industries,0.0,Canada,2.0,Urban,3.0,All businesses,2017.0,1599264,...,,,,,,,,,65.0,81.7
6,0,All industries,15.0,Altantic,1.0,Rural,1.0,Unincorporated,2017.0,17112,...,,,,,,,,,77.2,
7,0,All industries,15.0,Altantic,1.0,Rural,2.0,Incorporated,2017.0,20348,...,286.7,464.6,177.7,364.1,1.3,48.8,1.8,13.7,56.8,
8,0,All industries,15.0,Altantic,1.0,Rural,3.0,All businesses,2017.0,37460,...,,,,,,,,,62.0,80.4
9,0,All industries,15.0,Altantic,2.0,Urban,1.0,Unincorporated,2017.0,19314,...,,,,,,,,,79.8,


In [27]:
# Inspect column dtypes before numeric conversion.
new_sb.dtypes

North American Industry Classification System, NAICS - code     object
North American Industry Classification System, NAICS            object
Geography - code                                               float64
Geography                                                       object
Location indicator - code                                      float64
Location indicator                                              object
Incorporation status - code                                    float64
Incorporation status                                            object
Reference year                                                 float64
Total number of businesses                                      object
Total revenue                                                   object
Sales of goods and services* ( percent of total revenue)        object
Cost of sales (direct expenses) (%)                             object
Operating expenses (indirect expenses) (%)                      object
Total 

In [28]:
# List the column names of the combined table.
new_sb.columns

Index(['North American Industry Classification System, NAICS - code',
       'North American Industry Classification System, NAICS',
       'Geography - code', 'Geography', 'Location indicator - code',
       'Location indicator', 'Incorporation status - code',
       'Incorporation status', 'Reference year', 'Total number of businesses',
       'Total revenue',
       'Sales of goods and services* ( percent of total revenue)',
       'Cost of sales (direct expenses) (%)',
       'Operating expenses (indirect expenses) (%)', 'Total expenses',
       'Cost of sales (direct expenses)',
       'Operating expenses (indirect expenses)', 'Net Profit/Loss',
       'Total assets', 'Total current assets', 'Total liabilities',
       'Total current liabilities', 'Total equity',
       'Debt to equity ratio (times)', 'Current debt to equity (%)',
       'Revenue to equity ratio (times)', 'Net profit to equity (%)',
       'Gross margin (%)', 'Percent of profitable businesses'],
      dtype='objec

In [29]:
# Value columns to convert to numeric dtype. The list is written once and
# used on both sides of the assignment below.
sb_numeric_cols = [
    'Total number of businesses',
    'Total revenue',
    'Sales of goods and services* ( percent of total revenue)',
    'Cost of sales (direct expenses) (%)',
    'Operating expenses (indirect expenses) (%)',
    'Total expenses',
    'Cost of sales (direct expenses)',
    'Operating expenses (indirect expenses)',
    'Net Profit/Loss',
    'Total assets',
    'Total current assets',
    'Total liabilities',
    'Total current liabilities',
    'Total equity',
    'Debt to equity ratio (times)',
    'Current debt to equity (%)',
    'Revenue to equity ratio (times)',
    'Net profit to equity (%)',
    'Gross margin (%)',
    'Percent of profitable businesses',
]
# pd.to_numeric is applied column by column; no `errors=` argument is passed,
# so pandas' default error handling applies.
new_sb[sb_numeric_cols] = new_sb[sb_numeric_cols].apply(pd.to_numeric)
new_sb.dtypes

North American Industry Classification System, NAICS - code     object
North American Industry Classification System, NAICS            object
Geography - code                                               float64
Geography                                                       object
Location indicator - code                                      float64
Location indicator                                              object
Incorporation status - code                                    float64
Incorporation status                                            object
Reference year                                                 float64
Total number of businesses                                     float64
Total revenue                                                  float64
Sales of goods and services* ( percent of total revenue)       float64
Cost of sales (direct expenses) (%)                            float64
Operating expenses (indirect expenses) (%)                     float64
Total 

In [30]:
# Save the full cleaned table to the current working directory.
# No index=False is passed, so the row index is written as well.
new_sb.to_csv("new_sb.csv")

In [31]:
# (rows, columns) of the cleaned table.
new_sb.shape

(32967, 29)

In [32]:
# Preview the first 20 rows after numeric conversion.
new_sb.head(20)

Unnamed: 0,"North American Industry Classification System, NAICS - code","North American Industry Classification System, NAICS",Geography - code,Geography,Location indicator - code,Location indicator,Incorporation status - code,Incorporation status,Reference year,Total number of businesses,...,Total current assets,Total liabilities,Total current liabilities,Total equity,Debt to equity ratio (times),Current debt to equity (%),Revenue to equity ratio (times),Net profit to equity (%),Gross margin (%),Percent of profitable businesses
0,0,All industries,0.0,Canada,1.0,Rural,1.0,Unincorporated,2017.0,112086.0,...,,,,,,,,,71.0,
1,0,All industries,0.0,Canada,1.0,Rural,2.0,Incorporated,2017.0,193262.0,...,360.4,576.8,217.1,488.5,1.2,44.4,1.3,6.1,64.3,
2,0,All industries,0.0,Canada,1.0,Rural,3.0,All businesses,2017.0,305348.0,...,,,,,,,,,67.4,76.3
3,0,All industries,0.0,Canada,2.0,Urban,1.0,Unincorporated,2017.0,596808.0,...,,,,,,,,,82.2,
4,0,All industries,0.0,Canada,2.0,Urban,2.0,Incorporated,2017.0,1002456.0,...,421.7,605.1,255.7,504.1,1.2,50.7,1.1,13.1,58.1,
5,0,All industries,0.0,Canada,2.0,Urban,3.0,All businesses,2017.0,1599264.0,...,,,,,,,,,65.0,81.7
6,0,All industries,15.0,Altantic,1.0,Rural,1.0,Unincorporated,2017.0,17112.0,...,,,,,,,,,77.2,
7,0,All industries,15.0,Altantic,1.0,Rural,2.0,Incorporated,2017.0,20348.0,...,286.7,464.6,177.7,364.1,1.3,48.8,1.8,13.7,56.8,
8,0,All industries,15.0,Altantic,1.0,Rural,3.0,All businesses,2017.0,37460.0,...,,,,,,,,,62.0,80.4
9,0,All industries,15.0,Altantic,2.0,Urban,1.0,Unincorporated,2017.0,19314.0,...,,,,,,,,,79.8,


In [33]:
# List the column names again before splitting by incorporation status.
new_sb.columns

Index(['North American Industry Classification System, NAICS - code',
       'North American Industry Classification System, NAICS',
       'Geography - code', 'Geography', 'Location indicator - code',
       'Location indicator', 'Incorporation status - code',
       'Incorporation status', 'Reference year', 'Total number of businesses',
       'Total revenue',
       'Sales of goods and services* ( percent of total revenue)',
       'Cost of sales (direct expenses) (%)',
       'Operating expenses (indirect expenses) (%)', 'Total expenses',
       'Cost of sales (direct expenses)',
       'Operating expenses (indirect expenses)', 'Net Profit/Loss',
       'Total assets', 'Total current assets', 'Total liabilities',
       'Total current liabilities', 'Total equity',
       'Debt to equity ratio (times)', 'Current debt to equity (%)',
       'Revenue to equity ratio (times)', 'Net profit to equity (%)',
       'Gross margin (%)', 'Percent of profitable businesses'],
      dtype='objec

In [34]:
# Keep every row whose incorporation status is anything other than "Unincorporated".
sb_no_uninco = new_sb.loc[new_sb["Incorporation status"].ne("Unincorporated")]

In [35]:
# Preview the first 10 rows without the "Unincorporated" entries.
sb_no_uninco.head(10)

Unnamed: 0,"North American Industry Classification System, NAICS - code","North American Industry Classification System, NAICS",Geography - code,Geography,Location indicator - code,Location indicator,Incorporation status - code,Incorporation status,Reference year,Total number of businesses,...,Total current assets,Total liabilities,Total current liabilities,Total equity,Debt to equity ratio (times),Current debt to equity (%),Revenue to equity ratio (times),Net profit to equity (%),Gross margin (%),Percent of profitable businesses
1,0,All industries,0.0,Canada,1.0,Rural,2.0,Incorporated,2017.0,193262.0,...,360.4,576.8,217.1,488.5,1.2,44.4,1.3,6.1,64.3,
2,0,All industries,0.0,Canada,1.0,Rural,3.0,All businesses,2017.0,305348.0,...,,,,,,,,,67.4,76.3
4,0,All industries,0.0,Canada,2.0,Urban,2.0,Incorporated,2017.0,1002456.0,...,421.7,605.1,255.7,504.1,1.2,50.7,1.1,13.1,58.1,
5,0,All industries,0.0,Canada,2.0,Urban,3.0,All businesses,2017.0,1599264.0,...,,,,,,,,,65.0,81.7
7,0,All industries,15.0,Altantic,1.0,Rural,2.0,Incorporated,2017.0,20348.0,...,286.7,464.6,177.7,364.1,1.3,48.8,1.8,13.7,56.8,
8,0,All industries,15.0,Altantic,1.0,Rural,3.0,All businesses,2017.0,37460.0,...,,,,,,,,,62.0,80.4
10,0,All industries,15.0,Altantic,2.0,Urban,2.0,Incorporated,2017.0,36953.0,...,338.1,650.1,253.0,530.6,1.2,47.7,1.2,12.5,58.5,
11,0,All industries,15.0,Altantic,2.0,Urban,3.0,All businesses,2017.0,56267.0,...,,,,,,,,,64.4,79.9
13,0,All industries,10.0,Newfoundland and Labrador,1.0,Rural,2.0,Incorporated,2017.0,4407.0,...,323.5,485.5,180.3,376.0,1.3,47.9,1.8,9.0,50.1,
14,0,All industries,10.0,Newfoundland and Labrador,1.0,Rural,3.0,All businesses,2017.0,9082.0,...,,,,,,,,,56.6,78.0


In [36]:
# Row count once the "Unincorporated" rows are excluded.
len(sb_no_uninco)

21978

In [37]:
# Save the subset without "Unincorporated" rows to the current working directory.
# No index=False is passed, so the row index is written as well.
sb_no_uninco.to_csv("sb_no_unincorporate.csv")

In [38]:
# Keep only the rows whose incorporation status is "Incorporated".
sb_inco = new_sb.loc[new_sb["Incorporation status"].eq("Incorporated")]

In [39]:
# Remove the 'Percent of profitable businesses' column from the incorporated subset.
# errors='ignore' keeps this cell re-runnable: the cell reassigns sb_inco from
# itself, so on a second run the column is already gone and a plain drop would
# raise KeyError.
sb_inco = sb_inco.drop(columns=['Percent of profitable businesses'], errors='ignore')

In [40]:
# Preview the first 10 rows of the incorporated subset.
sb_inco.head(10)

Unnamed: 0,"North American Industry Classification System, NAICS - code","North American Industry Classification System, NAICS",Geography - code,Geography,Location indicator - code,Location indicator,Incorporation status - code,Incorporation status,Reference year,Total number of businesses,...,Total assets,Total current assets,Total liabilities,Total current liabilities,Total equity,Debt to equity ratio (times),Current debt to equity (%),Revenue to equity ratio (times),Net profit to equity (%),Gross margin (%)
1,0,All industries,0.0,Canada,1.0,Rural,2.0,Incorporated,2017.0,193262.0,...,1065.3,360.4,576.8,217.1,488.5,1.2,44.4,1.3,6.1,64.3
4,0,All industries,0.0,Canada,2.0,Urban,2.0,Incorporated,2017.0,1002456.0,...,1109.2,421.7,605.1,255.7,504.1,1.2,50.7,1.1,13.1,58.1
7,0,All industries,15.0,Altantic,1.0,Rural,2.0,Incorporated,2017.0,20348.0,...,828.8,286.7,464.6,177.7,364.1,1.3,48.8,1.8,13.7,56.8
10,0,All industries,15.0,Altantic,2.0,Urban,2.0,Incorporated,2017.0,36953.0,...,1180.7,338.1,650.1,253.0,530.6,1.2,47.7,1.2,12.5,58.5
13,0,All industries,10.0,Newfoundland and Labrador,1.0,Rural,2.0,Incorporated,2017.0,4407.0,...,861.5,323.5,485.5,180.3,376.0,1.3,47.9,1.8,9.0,50.1
16,0,All industries,10.0,Newfoundland and Labrador,2.0,Urban,2.0,Incorporated,2017.0,7334.0,...,1224.1,418.2,650.4,251.4,573.7,1.1,43.8,1.1,8.2,57.4
19,0,All industries,11.0,Prince Edward Island,1.0,Rural,2.0,Incorporated,2017.0,1614.0,...,1158.2,380.0,661.4,276.0,496.8,1.3,55.6,1.5,2.7,63.7
22,0,All industries,11.0,Prince Edward Island,2.0,Urban,2.0,Incorporated,2017.0,2437.0,...,997.6,306.0,633.7,214.7,363.9,1.7,59.0,1.7,13.6,57.2
25,0,All industries,12.0,Nova Scotia,1.0,Rural,2.0,Incorporated,2017.0,7507.0,...,729.7,246.2,417.8,158.3,311.9,1.3,50.7,1.9,19.7,59.2
28,0,All industries,12.0,Nova Scotia,2.0,Urban,2.0,Incorporated,2017.0,15410.0,...,1191.8,327.7,722.2,304.7,469.6,1.5,64.9,1.3,15.8,60.2


In [41]:
# Save the incorporated subset to the current working directory.
# No index=False is passed, so the row index is written as well.
sb_inco.to_csv("sb_incorporate.csv")

In [42]:
# Keep only the rows whose incorporation status is "All businesses".
sb_all_industry = new_sb.loc[new_sb["Incorporation status"].eq("All businesses")]

In [43]:
# Discard columns that contain no values at all for this subset.
sb_all_industry = sb_all_industry.dropna(axis="columns", how="all")

In [44]:
# Preview the first 10 rows of the "All businesses" subset.
sb_all_industry.head(10)

Unnamed: 0,"North American Industry Classification System, NAICS - code","North American Industry Classification System, NAICS",Geography - code,Geography,Location indicator - code,Location indicator,Incorporation status - code,Incorporation status,Reference year,Total number of businesses,Total revenue,Cost of sales (direct expenses) (%),Operating expenses (indirect expenses) (%),Total expenses,Cost of sales (direct expenses),Operating expenses (indirect expenses),Net Profit/Loss,Gross margin (%),Percent of profitable businesses
2,0,All industries,0.0,Canada,1.0,Rural,3.0,All businesses,2017.0,305348.0,441.6,32.6,59.9,408.7,144.1,264.6,32.9,67.4,76.3
5,0,All industries,0.0,Canada,2.0,Urban,3.0,All businesses,2017.0,1599264.0,398.4,35.0,50.3,340.1,139.6,200.5,58.3,65.0,81.7
8,0,All industries,15.0,Altantic,1.0,Rural,3.0,All businesses,2017.0,37460.0,412.1,38.0,50.7,365.5,156.5,209.0,46.6,62.0,80.4
11,0,All industries,15.0,Altantic,2.0,Urban,3.0,All businesses,2017.0,56267.0,446.1,35.6,51.3,387.6,158.7,228.8,58.5,64.4,79.9
14,0,All industries,10.0,Newfoundland and Labrador,1.0,Rural,3.0,All businesses,2017.0,9082.0,381.0,43.4,47.3,345.5,165.2,180.2,35.5,56.6,78.0
17,0,All industries,10.0,Newfoundland and Labrador,2.0,Urban,3.0,All businesses,2017.0,10917.0,478.6,36.4,53.1,428.3,174.1,254.2,50.3,63.6,74.1
20,0,All industries,11.0,Prince Edward Island,1.0,Rural,3.0,All businesses,2017.0,3589.0,423.5,30.1,60.8,385.1,127.6,257.5,38.4,69.9,81.2
23,0,All industries,11.0,Prince Edward Island,2.0,Urban,3.0,All businesses,2017.0,3902.0,436.7,35.6,53.8,390.1,155.3,234.9,46.6,64.4,80.7
26,0,All industries,12.0,Nova Scotia,1.0,Rural,3.0,All businesses,2017.0,13626.0,391.6,36.3,50.1,338.1,142.1,196.1,53.5,63.7,83.8
29,0,All industries,12.0,Nova Scotia,2.0,Urban,3.0,All businesses,2017.0,24094.0,426.9,34.3,51.1,364.3,146.3,218.0,62.6,65.7,82.8


In [45]:
# Save the "All businesses" subset to the current working directory.
# No index=False is passed, so the row index is written as well.
sb_all_industry.to_csv("sb_all_industry.csv")

In [46]:
# Keep only the rows whose incorporation status is "Unincorporated".
sb_unincorporate = new_sb.loc[new_sb["Incorporation status"].eq("Unincorporated")]

In [47]:
# Discard columns that contain no values at all for this subset.
sb_unincorporate = sb_unincorporate.dropna(axis="columns", how="all")

In [48]:
# Preview the first 10 rows of the unincorporated subset.
sb_unincorporate.head(10)

Unnamed: 0,"North American Industry Classification System, NAICS - code","North American Industry Classification System, NAICS",Geography - code,Geography,Location indicator - code,Location indicator,Incorporation status - code,Incorporation status,Reference year,Total number of businesses,Total revenue,Cost of sales (direct expenses) (%),Operating expenses (indirect expenses) (%),Total expenses,Cost of sales (direct expenses),Operating expenses (indirect expenses),Net Profit/Loss,Gross margin (%)
0,0,All industries,0.0,Canada,1.0,Rural,1.0,Unincorporated,2017.0,112086.0,130.5,29.0,41.5,92.0,37.9,54.1,38.5,71.0
3,0,All industries,0.0,Canada,2.0,Urban,1.0,Unincorporated,2017.0,596808.0,115.4,17.8,43.1,70.3,20.6,49.7,45.1,82.2
6,0,All industries,15.0,Altantic,1.0,Rural,1.0,Unincorporated,2017.0,17112.0,134.5,22.8,45.3,91.7,30.7,61.0,42.8,77.2
9,0,All industries,15.0,Altantic,2.0,Urban,1.0,Unincorporated,2017.0,19314.0,122.2,20.2,43.8,78.1,24.7,53.5,44.0,79.8
12,0,All industries,10.0,Newfoundland and Labrador,1.0,Rural,1.0,Unincorporated,2017.0,4675.0,108.8,22.0,43.8,71.7,24.0,47.7,37.2,78.0
15,0,All industries,10.0,Newfoundland and Labrador,2.0,Urban,1.0,Unincorporated,2017.0,3583.0,129.5,14.7,41.4,72.7,19.1,53.7,56.7,85.3
18,0,All industries,11.0,Prince Edward Island,1.0,Rural,1.0,Unincorporated,2017.0,1975.0,153.2,13.2,48.5,94.5,20.2,74.3,58.7,86.8
21,0,All industries,11.0,Prince Edward Island,2.0,Urban,1.0,Unincorporated,2017.0,1465.0,125.0,20.7,45.9,83.3,25.9,57.4,41.7,79.3
24,0,All industries,12.0,Nova Scotia,1.0,Rural,1.0,Unincorporated,2017.0,6119.0,140.2,25.2,43.7,96.7,35.4,61.3,43.5,74.8
27,0,All industries,12.0,Nova Scotia,2.0,Urban,1.0,Unincorporated,2017.0,8684.0,120.1,22.4,42.4,77.8,26.9,50.9,42.3,77.6


In [49]:
# Save the unincorporated subset to the current working directory.
# No index=False is passed, so the row index is written as well.
sb_unincorporate.to_csv("sb_unincorporate.csv")