In [1]:
import pandas as pd
import numpy as np
import altair as alt

In [2]:
def tab_reader(filepathpattern,tabnum,skiprow,low_memory=False):
    tabs = []
    for i in range (tabnum):
        cur_index = i+1
        cur_filepath = filepathpattern.format(cur_index)
        try:
            tab = pd.read_csv(cur_filepath, skiprows=skiprow, low_memory=False)
        except:
            tab = pd.read_csv(cur_filepath, skiprows=skiprow, encoding='latin-1', low_memory=low_memory)
        tabs.append(tab)
    return tabs


In [3]:
# 2017 data
rcbp_2017_mb_profitMargin_Tabs = tab_reader("../../../data/2017_csv_eng/2017_Medium businesses_Profit margin based_csv/_2017_Medium businesses_Profit margin_Tab{}.csv",5,5)
rcbp_2017_mb_revenue_Tabs = tab_reader("../../../data/2017_csv_eng/2017_Medium businesses_Total revenue based_csv/_2017_Eng_Medium_Revenue_Tab{}.csv",5,5)
rcbp_2017_sb_profitMargin_Tabs = tab_reader("../../../data/2017_csv_eng/2017_Small businesses_Profit margin based_csv/_2017_Small businesses_Profit margin based_Tab{}.csv",7,5)
rcbp_2017_sb_profitMargin_Tabs[0] = tab_reader("../../../data/2017_csv_eng/2017_Small businesses_Profit margin based_csv/_2017_Small businesses_Profit margin based_Tab{}.csv",1,6)[0]
rcbp_2017_sb_revenue_Tabs = tab_reader("../../../data/2017_csv_eng/2017_Small businesses_Total revenue based_csv/_2017_Small businesses_Total revenue based_Tab{}.csv",7,5)
rcbp_2017_sb_revenue_Tabs[0] = tab_reader("../../../data/2017_csv_eng/2017_Small businesses_Total revenue based_csv/_2017_Small businesses_Total revenue based_Tab{}.csv",1,6)[0]

In [4]:
# 2018 data
rcbp_2018_mb_profitMargin_Tabs = tab_reader("../../../data/2018_csv_eng/2018_Medium businesses_Profit margin based_csv/_2018_Medium businesses_Profit margin_Tab{}.csv",5,5)
rcbp_2018_mb_revenue_Tabs = tab_reader("../../../data/2018_csv_eng/2018_Medium businesses_Total revenue based_csv/_2018_Medium businesses_Total revenue_Tab{}.csv",5,5)
rcbp_2018_sb_profitMargin_Tabs = tab_reader("../../../data/2018_csv_eng/2018_Small businesses_Profit margin based_csv/_2018_Small businesses_Profit margin_Tab{}.csv",7,5)
rcbp_2018_sb_profitMargin_Tabs[0] = tab_reader("../../../data/2018_csv_eng/2018_Small businesses_Profit margin based_csv/_2018_Small businesses_Profit margin_Tab{}.csv",1,6)[0]
rcbp_2018_sb_revenue_Tabs = tab_reader("../../../data/2018_csv_eng/2018_Small businesses_Total revenue based_csv/_2018_Small businesses_Total revenue_Tab{}.csv",7,5)
rcbp_2018_sb_revenue_Tabs[0] = tab_reader("../../../data/2018_csv_eng/2018_Small businesses_Total revenue based_csv/_2018_Small businesses_Total revenue_Tab{}.csv",1,6)[0]

In [5]:
# 2019 data
rcbp_2019_mb_profitMargin_Tabs = tab_reader("../../../data/2019_csv_eng/2019_Medium businesses_Profit margin_csv/_2019_Medium businesses_Profit margin_Tab{}.csv",5,5)
rcbp_2019_mb_revenue_Tabs = tab_reader("../../../data/2019_csv_eng/2019_Medium businesses_Total revenue_csv/_2019_Medium businesses_Total revenue_Tab{}.csv",5,5)
rcbp_2019_sb_profitMargin_Tabs = tab_reader("../../../data/2019_csv_eng/2019_Small businesses_Profit margin_csv/_2019_Small businesses_Profit margin_Tab{}.csv",7,5)
rcbp_2019_sb_profitMargin_Tabs[0] = tab_reader("../../../data/2019_csv_eng/2019_Small businesses_Profit margin_csv/_2019_Small businesses_Profit margin_Tab{}.csv",1,6)[0]
rcbp_2019_sb_revenue_Tabs = tab_reader("../../../data/2019_csv_eng/2019_Small businesses_Total revenue_csv/_2019_Small businesses_Total revenue_Tab{}.csv",7,5)
rcbp_2019_sb_revenue_Tabs[0] = tab_reader("../../../data/2019_csv_eng/2019_Small businesses_Total revenue_csv/_2019_Small businesses_Total revenue_Tab{}.csv",1,6)[0]

## Data Wrangling

In [6]:
# Drop the first NaN row, select industry code level, drop quartile columns and concat different years
def selectData(tabs):
    concatTabs = []
    for tab in tabs:
        tab = tab.drop(0)
        cols_without_quartile = [x for x in tab.columns if 'quartile' not in x.lower()]
        tab = tab[cols_without_quartile]
        tab = tab.loc[(tab["North American Industry Classification System, NAICS - code"].str.len() < 3) | (tab["North American Industry Classification System, NAICS - code"].str.contains('-'))]
        concatTabs.append(tab)
    return pd.concat(concatTabs)

In [7]:
## medium tab1 3year contain All industry(code=0)
mb_revenue_tab1_3y = selectData([rcbp_2017_mb_revenue_Tabs[0], rcbp_2018_mb_revenue_Tabs[0], rcbp_2019_mb_revenue_Tabs[0]]).reset_index(drop=True)
## small tab1 3year contain All industry
sb_revenue_tab1_3y = selectData([rcbp_2017_sb_revenue_Tabs[0], rcbp_2018_sb_revenue_Tabs[0], rcbp_2019_sb_revenue_Tabs[0]]).reset_index(drop=True)
## small tab7 3year contain All industry
sb_revenue_tab7_3y = selectData([rcbp_2017_sb_revenue_Tabs[6], rcbp_2018_sb_revenue_Tabs[6], rcbp_2019_sb_revenue_Tabs[6]]).reset_index(drop=True)

### 2. Clean the column names

In [8]:
# add dash to the column names

mb_revenue_tab1_3y.columns = [c.replace(' ', '_') for c in mb_revenue_tab1_3y.columns]
sb_revenue_tab1_3y.columns = [c.replace(' ', '_') for c in sb_revenue_tab1_3y.columns]
sb_revenue_tab7_3y.columns = [c.replace(' ', '_') for c in sb_revenue_tab7_3y.columns]

mb_revenue_tab1_3y.columns = [c.replace(',', '') for c in mb_revenue_tab1_3y.columns]
sb_revenue_tab1_3y.columns = [c.replace(',', '') for c in sb_revenue_tab1_3y.columns]
sb_revenue_tab7_3y.columns = [c.replace(',', '') for c in sb_revenue_tab7_3y.columns]

### 3. Change the suppressed data (X) and the small data (..) and (...) to be NA

In [9]:
mb_revenue_tab1_3y = mb_revenue_tab1_3y.replace(regex=['X', '\.\.', '\.\.\.'], value=np.nan)
sb_revenue_tab1_3y = sb_revenue_tab1_3y.replace(regex=['X', '\.\.', '\.\.\.'], value=np.nan)
sb_revenue_tab7_3y = sb_revenue_tab7_3y.replace(regex=['X', '\.\.', '\.\.\.'], value=np.nan)

In [10]:
mb_revenue_tab1_3y['North_American_Industry_Classification_System_NAICS'].unique().tolist()

['All industries',
 'Agriculture, forestry, fishing and hunting',
 'Mining, quarrying, and oil and gas extraction',
 'Utilities',
 'Construction',
 'Manufacturing',
 'Wholesale trade',
 'Retail trade',
 'Transportation and warehousing',
 'Information and cultural industries',
 'Real estate and rental and leasing',
 'Professional, scientific and technical services',
 'Management of companies and enterprises',
 'Administrative and support, waste management and remediation services',
 'Educational services',
 'Health care and social assistance',
 'Arts, entertainment and recreation',
 'Accommodation and food services',
 'Other services (except public administration)']

### 4. Change columns data type

#### (1) Medium businesses --- Tab1

In [11]:
mb_revenue_tab1_3y['North_American_Industry_Classification_System_NAICS'] = mb_revenue_tab1_3y['North_American_Industry_Classification_System_NAICS'].astype('string') 
mb_revenue_tab1_3y['Location_indicator'] = mb_revenue_tab1_3y['Location_indicator'].astype('string') 
mb_revenue_tab1_3y['Reference_year'] = mb_revenue_tab1_3y['Reference_year'].astype('int') 
mb_revenue_tab1_3y['Total_number_of_businesses'] = mb_revenue_tab1_3y['Total_number_of_businesses'].astype('int') 
mb_revenue_tab1_3y['Total_revenue'] = mb_revenue_tab1_3y['Total_revenue'].astype('float64') 
mb_revenue_tab1_3y['Sales_of_goods_and_services'] = mb_revenue_tab1_3y['Sales_of_goods_and_services'].astype('float64') 
mb_revenue_tab1_3y['All_other_revenues'] = mb_revenue_tab1_3y['All_other_revenues'].astype('float64') 
mb_revenue_tab1_3y['All_other_revenues_(_percent_of_total_revenue)'] = mb_revenue_tab1_3y['All_other_revenues_(_percent_of_total_revenue)'].astype('float64') 
mb_revenue_tab1_3y['Sales_of_goods_and_services_(_percent_of_total_revenue)'] = mb_revenue_tab1_3y['Sales_of_goods_and_services_(_percent_of_total_revenue)'].astype('float64')

#### (2) Small businesses --- Tab1

In [12]:
sb_revenue_tab1_3y['North_American_Industry_Classification_System_NAICS'] = sb_revenue_tab1_3y['North_American_Industry_Classification_System_NAICS'].astype('string') 
sb_revenue_tab1_3y['Geography'] = sb_revenue_tab1_3y['Geography'].astype('string') 
sb_revenue_tab1_3y['Location_indicator'] = sb_revenue_tab1_3y['Location_indicator'].astype('string') 
sb_revenue_tab1_3y['Incorporation_status'] = sb_revenue_tab1_3y['Incorporation_status'].astype('string') 
sb_revenue_tab1_3y['Reference_year'] = sb_revenue_tab1_3y['Reference_year'].astype('int') 
sb_revenue_tab1_3y['Total_number_of_businesses'] = sb_revenue_tab1_3y['Total_number_of_businesses'].astype('float64') 
sb_revenue_tab1_3y['Total_revenue'] = sb_revenue_tab1_3y['Total_revenue'].astype('float64') 
sb_revenue_tab1_3y['Sales_of_goods_and_services*'] = sb_revenue_tab1_3y['Sales_of_goods_and_services*'].astype('float64') 
sb_revenue_tab1_3y['All_other_revenues*'] = sb_revenue_tab1_3y['All_other_revenues*'].astype('float64') 
sb_revenue_tab1_3y['Sales_of_goods_and_services*_(_percent_of_total_revenue)'] = sb_revenue_tab1_3y['Sales_of_goods_and_services*_(_percent_of_total_revenue)'].astype('float64')
sb_revenue_tab1_3y['All_other_revenues*_(_percent_of_total_revenue)'] = sb_revenue_tab1_3y['All_other_revenues*_(_percent_of_total_revenue)'].astype('float64') 


#### (3) Small businesses --- Tab7

In [13]:
sb_revenue_tab7_3y['North_American_Industry_Classification_System_NAICS'] = sb_revenue_tab7_3y['North_American_Industry_Classification_System_NAICS'].astype('string') 
sb_revenue_tab7_3y['Geography'] = sb_revenue_tab7_3y['Geography'].astype('string') 
sb_revenue_tab7_3y['Location_indicator'] = sb_revenue_tab7_3y['Location_indicator'].astype('string') 
sb_revenue_tab1_3y['Incorporation_status'] = sb_revenue_tab1_3y['Incorporation_status'].astype('string') 
sb_revenue_tab7_3y['Reference_year'] = sb_revenue_tab7_3y['Reference_year'].astype('int') 
sb_revenue_tab7_3y['Percent_of_profitable_businesses'] = sb_revenue_tab7_3y['Percent_of_profitable_businesses'].astype('float64') 
sb_revenue_tab7_3y['Profitable_:_Total_Revenue'] = sb_revenue_tab7_3y['Profitable_:_Total_Revenue'].astype('float64') 
sb_revenue_tab7_3y['Profitable_:_Total_Expenses'] = sb_revenue_tab7_3y['Profitable_:_Total_Expenses'].astype('float64') 
sb_revenue_tab7_3y['Profitable_:_Net_Profit'] = sb_revenue_tab7_3y['Profitable_:_Net_Profit'].astype('float64') 
sb_revenue_tab7_3y['Non-profitable_:_Total_Revenue'] = sb_revenue_tab7_3y['Non-profitable_:_Total_Revenue'].astype('float64') 
sb_revenue_tab7_3y['Non-profitable_:_Total_expenses'] = sb_revenue_tab7_3y['Non-profitable_:_Total_expenses'].astype('float64') 
sb_revenue_tab7_3y['Non-profitable_:_Net_loss'] = sb_revenue_tab7_3y['Non-profitable_:_Net_loss'].astype('float64') 


### 5. Drop irrelevant columns, Add industry overall revenue which is Total_number_of_businesses * Total_revenue, Change "Nordwest Territories" to "Northwest Territorries"

#### 1. mb_tab1

In [14]:
mb_tab1_drop_col = ['Geography_-_code', 'Geography', 
                    'Incorporation_status_-_code', 'Incorporation_status', 'All_businesses_minimum_revenue_value', 
                    'All_businesses_maximum_revenue_value', 'Sales_of_goods_and_services_percent_of_businesses_reporting', 
                    'All_other_revenues__percent_of_businesses_reporting']

In [15]:
mb_revenue_tab1_3y = mb_revenue_tab1_3y.drop(columns = mb_tab1_drop_col, axis=1)

In [16]:
mb_revenue_tab1_3y['Ind_total_revenue'] = mb_revenue_tab1_3y['Total_number_of_businesses'] * mb_revenue_tab1_3y['Total_revenue']

#### 2. sb_tab1

In [17]:
sb_tab1_drop_col = ['All_businesses_minimum_revenue_value', 'All_businesses_maximum_revenue_value', 
                    'Sales_of_goods_and_services*___percent_of_businesses_reporting', 
                    'All_other_revenues*__percent_of_businesses_reporting']

In [18]:
sb_revenue_tab1_3y = sb_revenue_tab1_3y.drop(columns = sb_tab1_drop_col, axis=1)

In [19]:
sb_revenue_tab1_3y['Ind_total_revenue'] = sb_revenue_tab1_3y['Total_number_of_businesses'] * sb_revenue_tab1_3y['Total_revenue']

In [20]:
#sb_revenue_tab1_3y.loc(sb_revenue_tab1_3y['Geography'] == 'Nordwest Territories') = 'Northwest Territories'
sb_revenue_tab1_3y.loc[sb_revenue_tab1_3y.Geography=='Nordwest Territories','Geography'] = 'Northwest Territories'


#### 3. sb_tab7

In [21]:
sb_tab7_drop_col = ['Incorporation_status_-_code', 'Incorporation_status']

In [22]:
sb_revenue_tab7_3y = sb_revenue_tab7_3y.drop(columns = sb_tab7_drop_col, axis=1)

In [23]:
sb_revenue_tab7_3y.loc[sb_revenue_tab7_3y.Geography=='Nordwest Territories','Geography'] = 'Northwest Territories'


### 6. select 2019 data 

#### (1) Medium business Tab1

In [24]:
mb_2019_revenue_tab1 = mb_revenue_tab1_3y[mb_revenue_tab1_3y['Reference_year']==2019]

#### (2) Small business Tab1

In [25]:
sb_2019_revenue_tab1 = sb_revenue_tab1_3y[sb_revenue_tab1_3y['Reference_year']==2019]

#### (3) Small business Tab7

In [26]:
sb_revenue_tab7_3y['Geography'].unique()

<StringArray>
[                   'Canada',                  'Altantic',
 'Newfoundland and Labrador',      'Prince Edward Island',
               'Nova Scotia',             'New Brunswick',
                    'Québec',                   'Ontario',
                  'Prairies',                  'Manitoba',
              'Saskatchewan',                   'Alberta',
          'British Columbia',               'Territories',
                     'Yukon',     'Northwest Territories',
                   'Nunavut']
Length: 17, dtype: string

In [27]:
sb_2019_revenue_tab7 = sb_revenue_tab7_3y[sb_revenue_tab7_3y['Reference_year']==2019]

### 7. Output excels for the use of Tabluea

In [28]:
mb_revenue_tab1_3y.to_excel("mb_revenue_tab1_3y.xlsx") # with NaN

In [29]:
sb_revenue_tab1_3y.to_excel("sb_revenue_tab1_3y.xlsx")

In [30]:
sb_revenue_tab7_3y.to_excel("sb_revenue_tab7_3y.xlsx")

## Data Visualization

### 1. mb_revenue_tab1_3y

In [31]:
# drop all industry, code!=0
mb_2019_revenue_tab1_ind = mb_2019_revenue_tab1[mb_2019_revenue_tab1['North_American_Industry_Classification_System_NAICS_-_code'] != '0']


In [32]:
mb_2019_revenue_tab1_ind.shape

(36, 18)

In [33]:
mb_revenue_tab1_3y_TTnum = mb_revenue_tab1_3y[['North_American_Industry_Classification_System_NAICS_-_code', 'North_American_Industry_Classification_System_NAICS', 
                    'Location_indicator_-_code', 'Location_indicator', 'Reference_year', 'Total_number_of_businesses']]

In [34]:
mb_revenue_tab1_3y_ind = mb_revenue_tab1_3y[mb_revenue_tab1_3y['North_American_Industry_Classification_System_NAICS_-_code'] != '0']

In [35]:
sub = mb_revenue_tab1_3y_ind[mb_revenue_tab1_3y_ind["Reference_year"]==2019]
sub['North_American_Industry_Classification_System_NAICS']

78            Agriculture, forestry, fishing and hunting
79            Agriculture, forestry, fishing and hunting
80         Mining, quarrying, and oil and gas extraction
81         Mining, quarrying, and oil and gas extraction
82                                             Utilities
83                                             Utilities
84                                          Construction
85                                          Construction
86                                         Manufacturing
87                                         Manufacturing
88                                       Wholesale trade
89                                       Wholesale trade
90                                          Retail trade
91                                          Retail trade
92                        Transportation and warehousing
93                        Transportation and warehousing
94                   Information and cultural industries
95                   Informatio

In [36]:
sub['North_American_Industry_Classification_System_NAICS'] = np.where(sub['North_American_Industry_Classification_System_NAICS'].str.contains("Administrative and support"), "Administrative and support", sub['North_American_Industry_Classification_System_NAICS'])




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub['North_American_Industry_Classification_System_NAICS'] = np.where(sub['North_American_Industry_Classification_System_NAICS'].str.contains("Administrative and support"), "Administrative and support", sub['North_American_Industry_Classification_System_NAICS'])


In [37]:

a = (alt.Chart(
    mb_revenue_tab1_3y_ind)
  .mark_bar()
  .encode(
      x=alt.X('Total_number_of_businesses', title='Total number of businesses'),
      y=alt.Y('North_American_Industry_Classification_System_NAICS', title='', sort='-x'),
      color=alt.Color('Location_indicator', title = None)
  ).facet(column = alt.Column('Reference_year:O', title=None))
)


b = (alt.Chart(
    mb_revenue_tab1_3y_ind)
  .mark_bar()
  .encode(
      x=alt.X('Ind_total_revenue',title='Total revenue(dollars x 1,000)'),
      y=alt.Y('North_American_Industry_Classification_System_NAICS', title = '', sort='-x'),
      color='Location_indicator'
  ).facet(column = alt.Column('Reference_year:O', title=None))
 )

c = (alt.Chart(sub)
  .mark_bar()
  .encode(
      x=alt.X('Total_revenue',title='Average revenue(dollars x 1,000)'),
      y=alt.Y('North_American_Industry_Classification_System_NAICS', title= '', sort='-x'),
      color=alt.Color('Location_indicator', title = None)
  ).configure_axis(
    labelFontSize=18,
    labelLimit=1000
)
 )


d = (alt.Chart(
    mb_revenue_tab1_3y_ind)
  .mark_bar()
  .encode(
      x=alt.X('Sales_of_goods_and_services_(_percent_of_total_revenue)', title='Sales of goods and services (percent of total revenue)'),
      y=alt.Y('North_American_Industry_Classification_System_NAICS', title='', sort='-x'),
      color=alt.Color('Location_indicator', title = None)
  ).facet(column = alt.Column('Reference_year:O', title=None))
 )

In [38]:
#alt.vconcat(a, b, c).configure_axis(labelLimit=1000)
#.properties(
#    title='Medium businesses, Canada, by industry and rural and urban areas, 2017-2019'
#)

c



In [39]:
d.configure_axis(labelLimit=1000).properties()

In [40]:
# drop all industry, industry code!=0, select all Canada, Geographycode=0
rm_geo = list(['Canada', 'Altantic', 'Prairies', 'Territories'])

sb_2019_revenue_tab1_ind = sb_2019_revenue_tab1[sb_2019_revenue_tab1['North_American_Industry_Classification_System_NAICS_-_code'] != '0']
sb_2019_revenue_tab1_ind_CA = sb_2019_revenue_tab1_ind[sb_2019_revenue_tab1_ind['Geography_-_code']==0]

In [41]:
# select all industry, industry code=0
# drop all Canada, Geography_code!=0, drop 3 aggregations of the geography
sb_2019_revenue_tab1_allind = sb_2019_revenue_tab1[sb_2019_revenue_tab1['North_American_Industry_Classification_System_NAICS_-_code'] == '0']
sb_2019_revenue_tab1_allind_Geo = sb_2019_revenue_tab1_allind.loc[~sb_2019_revenue_tab1_allind.Geography.isin(rm_geo)]


- Prairies has the most small businesses in rural, then Quebec, Ontario, Alberta. 
- Therein, Prairies has the most incorporated businesses, then Quebec, Alberta, and Ontario;
- Ontario has the most unincorporated businesses, then Quebec and Prairies.
- Ontario has the most small businesses in urban, then Quebec, BC and Prairies. 
- Therein, Ontario has the most incorporated businesses, then Prairies, Quebec and BC; 
- Ontario has the most unincorporated businesses, then Quebec  and BC.

##### (2) Geogrophy---Provinces and Territories vs Industries

In [42]:
# drop all industry, industry code!=0, 
# drop all Canada, Geographycode!=0,drop 3 geography aggregation 15--Atlantic, 49--Priaries, 63--Territories
# sb_2019_revenue_tab1_ind = sb_2019_revenue_tab1[sb_2019_revenue_tab1['North_American_Industry_Classification_System_NAICS_-_code'] != '0']
sb_2019_revenue_tab1_ind_Geo = sb_2019_revenue_tab1_ind.loc[~sb_2019_revenue_tab1_ind.Geography.isin(rm_geo)]

In [43]:
top_ind_sb_num_Geo_rural= (alt.Chart(
    sb_2019_revenue_tab1_ind_Geo[sb_2019_revenue_tab1_ind_Geo['Location_indicator_-_code']==1])
.mark_rect().encode(
 x=alt.X('Geography:O', title=None),
 y=alt.Y('North_American_Industry_Classification_System_NAICS', title='Rural'),
 color=alt.Color('Total_number_of_businesses:Q', legend=alt.Legend(title="Total number of businesses"))
)
)

top_ind_sb_num_Geo_urban= (alt.Chart(
    sb_2019_revenue_tab1_ind_Geo[sb_2019_revenue_tab1_ind_Geo['Location_indicator_-_code']==2])
.mark_rect().encode(
 x=alt.X('Geography:O', title=None),
 y=alt.Y('North_American_Industry_Classification_System_NAICS', title='Urban'),
 color=alt.Color('Total_number_of_businesses:Q', legend=alt.Legend(title="Total number of businesses"))
)
)



In [44]:
top_ind_sb_avgrev_Geo_rural= (alt.Chart(
    sb_2019_revenue_tab1_ind_Geo[sb_2019_revenue_tab1_ind_Geo['Location_indicator_-_code']==1])
.mark_rect().encode(
 x=alt.X('Geography:O', title=None),
 y=alt.Y('North_American_Industry_Classification_System_NAICS', title='Rural'),
 color=alt.Color('Total_revenue:Q', legend=alt.Legend(title="Average revenue"))
)
                             )

top_ind_sb_avgrev_Geo_urban= (alt.Chart(
    sb_2019_revenue_tab1_ind_Geo[sb_2019_revenue_tab1_ind_Geo['Location_indicator_-_code']==2])
.mark_rect().encode(
 x=alt.X('Geography:O', title=None),
 y=alt.Y('North_American_Industry_Classification_System_NAICS', title='Urban'),
 color=alt.Color('Total_revenue:Q', legend=alt.Legend(title="Average revenue"))

)
)



In [45]:
top_ind_sb_TTrev_Geo_rural= (alt.Chart(
    sb_2019_revenue_tab1_ind_Geo[sb_2019_revenue_tab1_ind_Geo['Location_indicator_-_code']==1])
.mark_rect().encode(
 x=alt.X('Geography:O', title=None),
 y=alt.Y('North_American_Industry_Classification_System_NAICS', title='Rural'),
 color=alt.Color('Ind_total_revenue:Q', legend=alt.Legend(title="Total revenue"))
)
)

top_ind_sb_TTrev_Geo_urban= (alt.Chart(
    sb_2019_revenue_tab1_ind_Geo[sb_2019_revenue_tab1_ind_Geo['Location_indicator_-_code']==2])
.mark_rect().encode(
 x=alt.X('Geography:O', title=None),
 y=alt.Y('North_American_Industry_Classification_System_NAICS', title='Urban'),
 color=alt.Color('Ind_total_revenue:Q', legend=alt.Legend(title="Total revenue"))
)
)

###### **(i) Canada, All industry, 2017-2019 rural and urban, percent of profitable businesses**

In [46]:
sb_revenue_tab7_3y_allind = sb_revenue_tab7_3y[sb_revenue_tab7_3y['North_American_Industry_Classification_System_NAICS_-_code']=='0']

In [47]:
sb_revenue_tab7_3y_allind_CA = sb_revenue_tab7_3y_allind[sb_revenue_tab7_3y_allind['Geography_-_code']==0]

In [48]:
sb_revenue_tab7_prfpct_3y_allind_CA = alt.Chart(sb_revenue_tab7_3y_allind_CA).mark_line(point = True).encode(
    x=alt.X('Reference_year:O', title=None),
    y=alt.Y('Percent_of_profitable_businesses:Q', title='Percent of profitable businesses', scale=alt.Scale(domain=[60,100])),
    color=alt.Color('Location_indicator', title=None)
).properties(width=400, height=300)

#.facet(column = alt.Column("Location_indicator:O", title=None))





###### **(ii) Canada, different industries, 2017-2019 rural and urban, percent of profitable businesses**

In [49]:
# 2017-2019, drop all industry, industry code!=0, select all Canada, Geographycode=0
sb_revenue_tab7_3y_ind = sb_revenue_tab7_3y[sb_revenue_tab7_3y['North_American_Industry_Classification_System_NAICS_-_code'] != '0']
sb_revenue_tab7_3y_ind_CA = sb_revenue_tab7_3y_ind[sb_revenue_tab7_3y_ind['Geography_-_code']==0]

In [50]:
sb_revenue_tab7_3y_ind_CA.Reference_year.unique()

array([2017, 2018, 2019])

In [51]:
# drop all industry, industry code!=0, select all Canada, Geographycode=0
sb_2019_revenue_tab7_ind = sb_2019_revenue_tab7[sb_2019_revenue_tab7['North_American_Industry_Classification_System_NAICS_-_code'] != '0']
sb_2019_revenue_tab7_ind_CA = sb_2019_revenue_tab7_ind[sb_2019_revenue_tab7_ind['Geography_-_code']==0]

In [52]:
# drop all industry, industry code!=0, drop all Canada, Geographycode!=0
# sb_2019_revenue_tab7_ind = sb_2019_revenue_tab7[sb_2019_revenue_tab7['North_American_Industry_Classification_System_NAICS_-_code'] != '0']
sb_2019_revenue_tab7_ind_Geo = sb_2019_revenue_tab7_ind.loc[~sb_2019_revenue_tab7_ind.Geography.isin(rm_geo)]


In [53]:
sb_2019_revenue_tab7_ind_Geo.shape

(450, 21)

In [54]:
top_ind_sb_profpct_Geo_rural= (alt.Chart(
    sb_2019_revenue_tab7_ind_Geo[sb_2019_revenue_tab7_ind_Geo['Location_indicator_-_code']==1])
.mark_rect().encode(
 x='Geography:O',
 y=alt.Y('North_American_Industry_Classification_System_NAICS', title='Rural', sort='-x'),
 color='Percent_of_profitable_businesses:Q',size='Percent_of_profitable_businesses'
)
)

top_ind_sb_profpct_Geo_urban= (alt.Chart(
    sb_2019_revenue_tab7_ind_Geo[sb_2019_revenue_tab7_ind_Geo['Location_indicator_-_code']==2])
.mark_rect().encode(
 x='Geography:O',
 y=alt.Y('North_American_Industry_Classification_System_NAICS', title='Urban', sort='-x'),
 color='Percent_of_profitable_businesses:Q', size='Percent_of_profitable_businesses'
)
)

top_ind_sb_profpct_Geo_rural.configure_axis(labelLimit=1000) 

In [55]:
top_ind_sb_profpct_Geo_urban.configure_axis(labelLimit=1000) 

- In rural, Nunavut, Territories, Newfoundland and Labrador have higher percent of profitable small businesses in Agricalture; while Prairies, Saskatchewan, Manitoba, Alberta and Ontario have lower percent of profitable small businesses.
- In urban, Manitoba's Agriculture has the lowest percent of profitable small business.

#### 2. Profitable and non-profitable businesses' average revenue, expense and profit/loss

##### (1) CA

In [56]:
# 2017-2019, drop all industry, industry code!=0, select all Canada, Geographycode=0
sb_revenue_tab7_3y_ind = sb_revenue_tab7_3y[sb_revenue_tab7_3y['North_American_Industry_Classification_System_NAICS_-_code'] != '0']
sb_revenue_tab7_3y_ind_CA = sb_revenue_tab7_3y_ind[sb_revenue_tab7_3y_ind['Geography_-_code']==0]

In [57]:
sb_revenue_prof_tab7_3y_ind_CA_melt = sb_revenue_tab7_3y_ind_CA[['North_American_Industry_Classification_System_NAICS', 'Reference_year', 'Location_indicator', 'Profitable_:_Total_Expenses', 'Profitable_:_Net_Profit']]

In [58]:
sb_revenue_nonprof_tab7_3y_ind_CA_melt = sb_revenue_tab7_3y_ind_CA[['North_American_Industry_Classification_System_NAICS', 'Reference_year', 'Location_indicator', 'Non-profitable_:_Total_expenses', 'Non-profitable_:_Net_loss']]

In [59]:
sb_revenue_prof_tab7_3y_ind_CA_melt.Reference_year.unique()

array([2017, 2018, 2019])

In [60]:
sb_revenue_prof_tab7_3y_ind_CA_melt = sb_revenue_prof_tab7_3y_ind_CA_melt.melt(id_vars=['North_American_Industry_Classification_System_NAICS', 'Reference_year', 'Location_indicator'], var_name='type',
             value_name='value')

In [61]:
sb_revenue_nonprof_tab7_3y_ind_CA_melt = sb_revenue_nonprof_tab7_3y_ind_CA_melt.melt(id_vars=['North_American_Industry_Classification_System_NAICS', 'Reference_year', 'Location_indicator'], var_name='type',
             value_name='value')

In [62]:
sb_revenue_prof_tab7_3y_ind_CA_melt.head()

Unnamed: 0,North_American_Industry_Classification_System_NAICS,Reference_year,Location_indicator,type,value
0,"Agriculture, forestry, fishing and hunting",2017,Rural,Profitable_:_Total_Expenses,349.0
1,"Agriculture, forestry, fishing and hunting",2017,Urban,Profitable_:_Total_Expenses,351.9
2,"Mining, quarrying, and oil and gas extraction",2017,Rural,Profitable_:_Total_Expenses,290.6
3,"Mining, quarrying, and oil and gas extraction",2017,Urban,Profitable_:_Total_Expenses,290.0
4,Utilities,2017,Rural,Profitable_:_Total_Expenses,472.2


In [63]:
sb_revenue_nonprof_tab7_3y_ind_CA_melt.head()

Unnamed: 0,North_American_Industry_Classification_System_NAICS,Reference_year,Location_indicator,type,value
0,"Agriculture, forestry, fishing and hunting",2017,Rural,Non-profitable_:_Total_expenses,1148.7
1,"Agriculture, forestry, fishing and hunting",2017,Urban,Non-profitable_:_Total_expenses,1112.4
2,"Mining, quarrying, and oil and gas extraction",2017,Rural,Non-profitable_:_Total_expenses,537.1
3,"Mining, quarrying, and oil and gas extraction",2017,Urban,Non-profitable_:_Total_expenses,1920.7
4,Utilities,2017,Rural,Non-profitable_:_Total_expenses,569.9


In [64]:
s1= sb_revenue_prof_tab7_3y_ind_CA_melt[sb_revenue_prof_tab7_3y_ind_CA_melt["Location_indicator"]=="Rural"]
s2 = sb_revenue_nonprof_tab7_3y_ind_CA_melt[sb_revenue_nonprof_tab7_3y_ind_CA_melt["Location_indicator"]=="Rural"]
# s2['North_American_Industry_Classification_System_NAICS'] = np.where(s2['North_American_Industry_Classification_System_NAICS'].str.contains("Administrative and support"), "Administrative and support", s2['North_American_Industry_Classification_System_NAICS'])

# s1['North_American_Industry_Classification_System_NAICS'] = np.where(s1['North_American_Industry_Classification_System_NAICS'].str.contains("Administrative and support"), "Administrative and support", s1['North_American_Industry_Classification_System_NAICS'])


In [65]:
top_ind_sb_prof_avgrev_melt = (alt.Chart(s1[s1['Reference_year']==2019],title=alt.TitleParams(text = "", 
                                           subtitle = "")).mark_bar().encode(
    x=alt.X('value:Q', title='Profitable:Average revenue(dollar x 1,000)', scale=alt.Scale(domain=[0,1000])),
    y=alt.Y('North_American_Industry_Classification_System_NAICS', title=None),
    color=alt.Color('type', legend=alt.Legend(title=None))
)
)

top_ind_sb_pnonrof_avgrev_melt = (alt.Chart(s2[s2['Reference_year']==2019],title=alt.TitleParams(text = "", 
                                           subtitle = "")).mark_bar().encode(
    x=alt.X('value:Q', title='Non-profitable:Average revenue(dollar x 1,000)', scale=alt.Scale(domain=[-2000,2500])),
    y=alt.Y('North_American_Industry_Classification_System_NAICS', title=None),
    color=alt.Color('type', legend=alt.Legend(title=None, orient='bottom'))
)
)

In [66]:
alt.hconcat(top_ind_sb_prof_avgrev_melt,top_ind_sb_pnonrof_avgrev_melt).configure_axis(
    labelFontSize=18,
    labelLimit=1000
)

In [67]:
top_ind_sb_prof_avgrev_melt = (alt.Chart(sb_revenue_prof_tab7_3y_ind_CA_melt,title=alt.TitleParams(text = "", 
                                           subtitle = "")).mark_bar().encode(
    x=alt.X('value:Q', title='Profitable:Average revenue', scale=alt.Scale(domain=[0,1000])),
    y=alt.Y('North_American_Industry_Classification_System_NAICS', title='Industry', sort='-x'),
    color=alt.Color('type'),
    row=alt.Row('Reference_year', title='')
).facet('Location_indicator')
)

top_ind_sb_pnonrof_avgrev_melt = (alt.Chart(sb_revenue_nonprof_tab7_3y_ind_CA_melt,title=alt.TitleParams(text = "", 
                                           subtitle = "")).mark_bar().encode(
    x=alt.X('value:Q', title='Non-profitable:Average revenue', scale=alt.Scale(domain=[-2000,2000])),
    y=alt.Y('North_American_Industry_Classification_System_NAICS', title=None, sort='-x'),
    color=alt.Color('type'),
    row=alt.Row('Reference_year', title='')
).facet('Location_indicator')
)

In [68]:
d = (alt.Chart(
    mb_revenue_tab1_3y_ind)
  .mark_bar()
  .encode(
      x=alt.X('Sales_of_goods_and_services_(_percent_of_total_revenue)', title='Sales of goods and services (percent of total revenue)'),
      y=alt.Y('North_American_Industry_Classification_System_NAICS', title='', sort='-x'),
      color=alt.Color('Location_indicator', title = None)
  ).facet(row = alt.Row('Reference_year:O', title=None))
 )




alt.vconcat(top_ind_sb_prof_avgrev_melt, top_ind_sb_pnonrof_avgrev_melt).configure_axis(labelLimit=1000).properties(
    title='Medium businesses, Canada, by industry and rural and urban areas, 2017-2019'
)

##### (2) Geogrophy---Provinces and Territories


In [69]:
# 2017-2019, drop all industry, industry code!=0, select all Canada, Geographycode=0
sb_revenue_tab7_3y_ind = sb_revenue_tab7_3y[sb_revenue_tab7_3y['North_American_Industry_Classification_System_NAICS_-_code'] != '0']
sb_revenue_tab7_3y_ind_CA = sb_revenue_tab7_3y_ind[sb_revenue_tab7_3y_ind['Geography_-_code']==0]

In [70]:
sb_revenue_prof_tab7_3y_ind_CA_melt = sb_revenue_tab7_3y_ind_CA[['North_American_Industry_Classification_System_NAICS', 'Reference_year', 'Location_indicator', 'Profitable_:_Total_Expenses', 'Profitable_:_Net_Profit']]

In [71]:
sb_revenue_nonprof_tab7_3y_ind_CA_melt = sb_revenue_tab7_3y_ind_CA[['North_American_Industry_Classification_System_NAICS', 'Reference_year', 'Location_indicator', 'Non-profitable_:_Total_expenses', 'Non-profitable_:_Net_loss']]

In [72]:
sb_revenue_prof_tab7_3y_ind_CA_melt.Reference_year.unique()

array([2017, 2018, 2019])

In [73]:
sb_revenue_prof_tab7_3y_ind_CA_melt = sb_revenue_prof_tab7_3y_ind_CA_melt.melt(id_vars=['North_American_Industry_Classification_System_NAICS', 'Reference_year', 'Location_indicator'], var_name='type',
             value_name='value')

In [74]:
sb_revenue_nonprof_tab7_3y_ind_CA_melt = sb_revenue_nonprof_tab7_3y_ind_CA_melt.melt(id_vars=['North_American_Industry_Classification_System_NAICS', 'Reference_year', 'Location_indicator'], var_name='type',
             value_name='value')

In [75]:
sb_revenue_prof_tab7_3y_ind_CA_melt.head()

Unnamed: 0,North_American_Industry_Classification_System_NAICS,Reference_year,Location_indicator,type,value
0,"Agriculture, forestry, fishing and hunting",2017,Rural,Profitable_:_Total_Expenses,349.0
1,"Agriculture, forestry, fishing and hunting",2017,Urban,Profitable_:_Total_Expenses,351.9
2,"Mining, quarrying, and oil and gas extraction",2017,Rural,Profitable_:_Total_Expenses,290.6
3,"Mining, quarrying, and oil and gas extraction",2017,Urban,Profitable_:_Total_Expenses,290.0
4,Utilities,2017,Rural,Profitable_:_Total_Expenses,472.2


In [76]:
sb_revenue_nonprof_tab7_3y_ind_CA_melt.head()

Unnamed: 0,North_American_Industry_Classification_System_NAICS,Reference_year,Location_indicator,type,value
0,"Agriculture, forestry, fishing and hunting",2017,Rural,Non-profitable_:_Total_expenses,1148.7
1,"Agriculture, forestry, fishing and hunting",2017,Urban,Non-profitable_:_Total_expenses,1112.4
2,"Mining, quarrying, and oil and gas extraction",2017,Rural,Non-profitable_:_Total_expenses,537.1
3,"Mining, quarrying, and oil and gas extraction",2017,Urban,Non-profitable_:_Total_expenses,1920.7
4,Utilities,2017,Rural,Non-profitable_:_Total_expenses,569.9


##### (3) Geogrophy---Provinces and Territories vs Industries


In [77]:
top_ind_sb_prof_Geo_avgrev= (alt.Chart(
    sb_2019_revenue_tab7_ind_Geo)
.mark_rect().encode(
 x='Geography:O',
 y=alt.Y('North_American_Industry_Classification_System_NAICS', title='Industry (Urban)', sort='-x'),
 color='Profitable_:_Total_Revenue:Q', size='Profitable_:_Total_Revenue'
).facet('Location_indicator')
)                      

top_ind_sb_prof_Geo_avgexp= (alt.Chart(
    sb_2019_revenue_tab7_ind_Geo)
.mark_rect().encode(
 x='Geography:O',
 y=alt.Y('North_American_Industry_Classification_System_NAICS', title='Industry (Urban)', sort='-x'),
 color='Profitable_:_Total_Expenses:Q', size='Profitable_:_Total_Expenses'
).facet('Location_indicator')
)   

top_ind_sb_prof_Geo_avgprofit= (alt.Chart(
    sb_2019_revenue_tab7_ind_Geo)
.mark_rect().encode(
 x='Geography:O',
 y=alt.Y('North_American_Industry_Classification_System_NAICS', title='Industry (Urban)', sort='-x'),
 color='Profitable_:_Net_Profit:Q', size='Profitable_:_Net_Profit'
).facet('Location_indicator')
)

##############

top_ind_sb_nonprof_Geo_avgrev= (alt.Chart(
    sb_2019_revenue_tab7_ind_Geo)
.mark_rect().encode(
 x='Geography:O',
 y=alt.Y('North_American_Industry_Classification_System_NAICS', title='Industry (Urban)', sort='-x'),
 color='Non-profitable_:_Total_Revenue:Q', size='Non-profitable_:_Total_Revenue'
).facet('Location_indicator')
)                      

top_ind_sb_nonprof_Geo_avgexp= (alt.Chart(
    sb_2019_revenue_tab7_ind_Geo)
.mark_rect().encode(
 x='Geography:O',
 y=alt.Y('North_American_Industry_Classification_System_NAICS', title='Industry (Urban)', sort='-x'),
 color='Non-profitable_:_Total_expenses:Q', size='Non-profitable_:_Total_expenses'
).facet('Location_indicator')
)   

top_ind_sb_nonprof_Geo_avgprofit= (alt.Chart(
    sb_2019_revenue_tab7_ind_Geo)
.mark_rect().encode(
 x='Geography:O',
 y=alt.Y('North_American_Industry_Classification_System_NAICS', title='Industry (Urban)', sort='-x'),
 color='Non-profitable_:_Net_loss:Q', size='Non-profitable_:_Net_loss'
).facet('Location_indicator')
)   

In [78]:
top_ind_sb_prof_Geo_avgrev | top_ind_sb_nonprof_Geo_avgrev

In [79]:
top_ind_sb_prof_Geo_avgexp | top_ind_sb_nonprof_Geo_avgexp

In [80]:
top_ind_sb_prof_Geo_avgprofit

In [81]:
top_ind_sb_nonprof_Geo_avgprofit