In [1]:
import pandas as pd
import numpy as np
import altair as alt

In [2]:
mb_2dig_ind = pd.read_csv("mb_2dig_ind.csv")
sb_2dig_ind = pd.read_csv("sb_2dig_ind.csv")

## Data processing

In [3]:
# Change column names to Average revenue and Average expenses
mb_2dig_ind = mb_2dig_ind.rename(columns={'Total revenue':'Average revenue', 'Total expenses':'Average expenses'})
sb_2dig_ind = sb_2dig_ind.rename(columns={'Total revenue':'Average revenue', 'Total expenses':'Average expenses',
                                         'Sales of goods and services* ( percent of total revenue)':'Sales of goods and services ( percent of total revenue)'})

In [4]:
# Small businesses: Canada, Rural, All industry
sb_2dig_ind_CA = sb_2dig_ind[sb_2dig_ind['Geography - code']==0]
sb_2dig_ind_CA_rural = sb_2dig_ind_CA[sb_2dig_ind_CA['Location indicator - code']==1]
sb_2dig_ind_CA_rural_allcorp = sb_2dig_ind_CA_rural[sb_2dig_ind_CA_rural['Incorporation status - code']==3]
sb_2dig_ind_CA_rural_allcorp.shape

(54, 30)

In [5]:
# Medium businesses: Canada, Rural, All is incorporated 
mb_2dig_ind_CA_rural = mb_2dig_ind[mb_2dig_ind['Location indicator - code']==1]
mb_2dig_ind_CA_rural.shape

(54, 25)

In [6]:
# Columns used for Agriculture Analysis
AgriPart_col = ['North American Industry Classification System, NAICS - code', 'North American Industry Classification System, NAICS',
                'Reference year', 'Total number of businesses', 'Average revenue', 'Average expenses', 'Net Profit/Loss', 
                'Sales of goods and services ( percent of total revenue)', 'Gross margin (%)', 
                'Cost of sales (direct expenses) (%)', 'Operating expenses (indirect expenses) (%)']

# Columns used for Agriculture small businesses percent of profitable businesses
sm_PctProfit_col = ['North American Industry Classification System, NAICS - code', 'North American Industry Classification System, NAICS',
                'Reference year', 'Net Profit/Loss', 'Percent of profitable businesses']

In [7]:
# Extract same columns for small and medium businesses
sb_2dig_ind_CA_rural_allcorp_drop = sb_2dig_ind_CA_rural_allcorp[AgriPart_col]
mb_2dig_ind_CA_rural_drop = mb_2dig_ind_CA_rural[AgriPart_col]
# Extract columns of percent of profitable businesses for small businesses
sb_2dig_ind_CA_rural_allcorp_PctProfit = sb_2dig_ind_CA_rural_allcorp[sm_PctProfit_col]

In [8]:
# Insert businesses type for medium and small in order to concat
sb_2dig_ind_CA_rural_allcorp_drop.insert(sb_2dig_ind_CA_rural_allcorp_drop.shape[1], 'Business type', 'Small Business')
mb_2dig_ind_CA_rural_drop.insert(mb_2dig_ind_CA_rural_drop.shape[1], 'Business type', 'Medium Business')

In [9]:
# Add Total revenue and total expenses columns for small and medium businesses
sb_2dig_ind_CA_rural_allcorp_drop['Total revenue'] = sb_2dig_ind_CA_rural_allcorp_drop['Average revenue'] * sb_2dig_ind_CA_rural_allcorp_drop['Total number of businesses']
sb_2dig_ind_CA_rural_allcorp_drop['Total expenses'] = sb_2dig_ind_CA_rural_allcorp_drop['Average expenses'] * sb_2dig_ind_CA_rural_allcorp_drop['Total number of businesses']
mb_2dig_ind_CA_rural_drop['Total revenue'] = mb_2dig_ind_CA_rural_drop['Average revenue'] * mb_2dig_ind_CA_rural_drop['Total number of businesses']
mb_2dig_ind_CA_rural_drop['Total expenses'] = mb_2dig_ind_CA_rural_drop['Average expenses'] * mb_2dig_ind_CA_rural_drop['Total number of businesses']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sb_2dig_ind_CA_rural_allcorp_drop['Total revenue'] = sb_2dig_ind_CA_rural_allcorp_drop['Average revenue'] * sb_2dig_ind_CA_rural_allcorp_drop['Total number of businesses']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sb_2dig_ind_CA_rural_allcorp_drop['Total expenses'] = sb_2dig_ind_CA_rural_allcorp_drop['Average expenses'] * sb_2dig_ind_CA_rural_allcorp_drop['Total number of businesses']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead



In [10]:
# Create the data used to do the analysis of Agriculture in rural areas
AgriPart_data = pd.concat([sb_2dig_ind_CA_rural_allcorp_drop, mb_2dig_ind_CA_rural_drop])

In [11]:
AgriPart_data.to_csv("AgriPart_data.csv") 
sb_2dig_ind_CA_rural_allcorp_PctProfit.to_csv("sb_2dig_ind_CA_rural_allcorp_PctProfit.csv") 

In [12]:
AgriPart_data.columns

Index(['North American Industry Classification System, NAICS - code',
       'North American Industry Classification System, NAICS',
       'Reference year', 'Total number of businesses', 'Average revenue',
       'Average expenses', 'Net Profit/Loss',
       'Sales of goods and services ( percent of total revenue)',
       'Gross margin (%)', 'Cost of sales (direct expenses) (%)',
       'Operating expenses (indirect expenses) (%)', 'Business type',
       'Total revenue', 'Total expenses'],
      dtype='object')

In [13]:
sb_2dig_ind_CA_rural_allcorp_PctProfit.columns

Index(['North American Industry Classification System, NAICS - code',
       'North American Industry Classification System, NAICS',
       'Reference year', 'Net Profit/Loss',
       'Percent of profitable businesses'],
      dtype='object')

## Data visualization

In [14]:
count = (alt.Chart(
    AgriPart_data)
  .mark_bar()
  .encode(
      x=alt.X('Total number of businesses', title='Total number of businesses'),
      y=alt.Y('North American Industry Classification System, NAICS', title='', sort='-x'),
      color=alt.Color('Business type', scale=alt.Scale(scheme='viridis'), title = None) # color=viridis
  ).facet(column = alt.Column('Reference year:O', title=None))
)


totalRev = (alt.Chart(
    AgriPart_data)
  .mark_bar()
  .encode(
      x=alt.X('Total revenue',title='Total revenue(dollars x 1,000)'),
      y=alt.Y('North American Industry Classification System, NAICS', title = '', sort='-x'),
      color='Business type'
  ).facet(column = alt.Column('Reference year:O', title=None))
 )

avgRve = (alt.Chart(
    AgriPart_data)
  .mark_bar()
  .encode(
      x=alt.X('Average revenue',title='Average revenue(dollars x 1,000)'),
      y=alt.Y('North American Industry Classification System, NAICS', title = '', sort='-x'),
      color='Business type'
  ).facet(column = alt.Column('Reference year:O', title=None))
 )

netProfit = (alt.Chart(
    AgriPart_data)
  .mark_bar()
  .encode(
      x=alt.X('Net Profit/Loss',title='Net Profit/Loss(dollars x 1,000)'),
      y=alt.Y('North American Industry Classification System, NAICS', title = '', sort='-x'),
      color='Business type'
  ).facet(column = alt.Column('Reference year:O', title=None))
 )

grossMargin = (alt.Chart(
    AgriPart_data)
  .mark_bar()
  .encode(
      x=alt.X('Gross margin (%)',title='Gross margin (%)'),
      y=alt.Y('North American Industry Classification System, NAICS', title = '', sort='-x'),
      color='Business type'
  ).facet(column = alt.Column('Reference year:O', title=None))
 )

totalExp = (alt.Chart(
    AgriPart_data)
  .mark_bar()
  .encode(
      x=alt.X('Total expenses',title='Total expenses(dollars x 1,000)'),
      y=alt.Y('North American Industry Classification System, NAICS', title = '', sort='-x'),
      color='Business type'
  ).facet(column = alt.Column('Reference year:O', title=None))
 )

directExp = (alt.Chart(
    AgriPart_data)
  .mark_bar()
  .encode(
      x=alt.X('Cost of sales (direct expenses) (%)',title='Cost of sales (direct expenses) (%)'),
      y=alt.Y('North American Industry Classification System, NAICS', title = '', sort='-x'),
      color='Business type'
  ).facet(column = alt.Column('Reference year:O', title=None))
 )

indirectExp = (alt.Chart(
    AgriPart_data)
  .mark_bar()
  .encode(
      x=alt.X('Operating expenses (indirect expenses) (%)',title='Operating expenses (indirect expenses) (%)'),
      y=alt.Y('North American Industry Classification System, NAICS', title = '', sort='-x'),
      color='Business type'
  ).facet(column = alt.Column('Reference year:O', title=None))
 )

pctProfit = (alt.Chart(
    sb_2dig_ind_CA_rural_allcorp_PctProfit)
  .mark_bar()
  .encode(
      x=alt.X('Percent of profitable businesses',title='Percent of profitable businesses (%)'),
      y=alt.Y('North American Industry Classification System, NAICS', title = '', sort='-x'),
  ).facet(column = alt.Column('Reference year:O', title=None))
 )


In [61]:
avgRve = (alt.Chart(
    AgriPart_data)
  .mark_bar()
  .encode(
      x=alt.X('Average revenue',title='Average revenue(dollars x 1,000)'),
      y=alt.Y('North American Industry Classification System, NAICS', title = '', sort='-x'),
      color=alt.Color('Business type', scale=alt.Scale(scheme='viridis'), legend=alt.Legend(title=None, orient='bottom')) # color=viridis
  ).facet(column = alt.Column('Reference year:O', title=None))
 )

In [62]:
avgRve.configure_axis(labelLimit=360, labelFontSize=20, titleFontSize=20).configure_legend(
titleFontSize=20,
labelFontSize=20
) 

In [15]:
alt.vconcat(count, totalRev, avgRve, netProfit, grossMargin, totalExp, directExp, indirectExp, pctProfit).configure_axis(labelLimit=1000, labelFontSize=17, titleFontSize=20)


In [16]:
AgriPart_data2019 = AgriPart_data[AgriPart_data['Reference year']==2019]

In [28]:
count2019 = (alt.Chart(
    AgriPart_data2019)
  .mark_bar()
  .encode(
      x=alt.X('Total number of businesses:Q', title='Total number of businesses'),
      y=alt.Y('North American Industry Classification System, NAICS:O', title='', sort='-x'),
      color=alt.Color('Business type', scale=alt.Scale(scheme='viridis'), legend=alt.Legend(title=None, orient='bottom')), # color=viridis
      opacity=alt.condition(
        alt.datum['North American Industry Classification System, NAICS'] == 'Agriculture, forestry, fishing and hunting',  # If the country is "US" this test returns True,
        #alt.value('red'),     # highlight a bar with red.
        #alt.value('lightgrey')   # And grey for the rest of the bars
        alt.value(1.0), alt.value(0.3)
     )  )
)


In [29]:
totalRev2019 = (alt.Chart(
    AgriPart_data2019)
  .mark_bar()
  .encode(
      x=alt.X('Total revenue',title='Total revenue(dollars x 1,000)'),
      y=alt.Y('North American Industry Classification System, NAICS', title = '', sort='-x'),
      color=alt.Color('Business type', scale=alt.Scale(scheme='inferno'), legend=alt.Legend(title=None, orient='bottom')), # color=viridis
      opacity=alt.condition(
        alt.datum['North American Industry Classification System, NAICS'] == 'Agriculture, forestry, fishing and hunting',  # If the country is "US" this test returns True,
        #alt.value('red'),     # highlight a bar with red.
        #alt.value('lightgrey')   # And grey for the rest of the bars
        alt.value(1.0), alt.value(0.3)
     )  )      
  )
 


 

In [30]:
alt.hconcat(count2019, totalRev2019).configure_axis(labelLimit=360, labelFontSize=20, titleFontSize=20).configure_legend(
titleFontSize=20,
labelFontSize=20
) 


In [39]:
count2019.configure_axis(labelLimit=1000, labelFontSize=20, titleFontSize=20).configure_legend(
titleFontSize=20,
labelFontSize=20
) 

In [40]:
totalRev2019.configure_axis(labelLimit=1000, labelFontSize=20, titleFontSize=20).configure_legend(
titleFontSize=20,
labelFontSize=20
) 

In [31]:
netProfit2019 = (alt.Chart(
    AgriPart_data2019)
  .mark_bar()
  .encode(
      x=alt.X('Net Profit/Loss',title='Net Profit/Loss(dollars x 1,000)'),
      y=alt.Y('North American Industry Classification System, NAICS', title = '', sort='-x'),
      color=alt.Color('Business type', scale=alt.Scale(scheme='sinebow'), legend=alt.Legend(title=None, orient='bottom')), # color=sinebow
      opacity=alt.condition(
        alt.datum['North American Industry Classification System, NAICS'] == 'Agriculture, forestry, fishing and hunting',  # If the country is "US" this test returns True,
        #alt.value('red'),     # highlight a bar with red.
        #alt.value('lightgrey')   # And grey for the rest of the bars
        alt.value(1.0), alt.value(0.3)
     )  )
  )

In [41]:
netProfit2019.configure_axis(labelLimit=1000, labelFontSize=20, titleFontSize=20).configure_legend(
titleFontSize=20,
labelFontSize=20
) 

In [33]:
grossMargin2019 = (alt.Chart(
    AgriPart_data2019)
  .mark_bar()
  .encode(
      x=alt.X('Gross margin (%)',title='Gross margin (%)'),
      y=alt.Y('North American Industry Classification System, NAICS', title = '', sort='-x'),
      color=alt.Color('Business type', scale=alt.Scale(scheme='reds'), legend=alt.Legend(title=None, orient='bottom')), # color=reds
      opacity=alt.condition(
        alt.datum['North American Industry Classification System, NAICS'] == 'Agriculture, forestry, fishing and hunting',  # If the country is "US" this test returns True,
        #alt.value('red'),     # highlight a bar with red.
        #alt.value('lightgrey')   # And grey for the rest of the bars
        alt.value(1.0), alt.value(0.3)
     )  )
  )

In [42]:
grossMargin2019.configure_axis(labelLimit=1000, labelFontSize=20, titleFontSize=20).configure_legend(
titleFontSize=20,
labelFontSize=20
) 

In [44]:
sb_2dig_ind_CA_rural_allcorp_PctProfit2019 = sb_2dig_ind_CA_rural_allcorp_PctProfit[sb_2dig_ind_CA_rural_allcorp_PctProfit['Reference year']==2019]

In [55]:
pctProfit2019 = (alt.Chart(
    sb_2dig_ind_CA_rural_allcorp_PctProfit2019)
  .mark_bar()
  .encode(
      x=alt.X('Percent of profitable businesses',title='Percent of profitable businesses (%)'),
      y=alt.Y('North American Industry Classification System, NAICS', title = '', sort='-x'),
      color=alt.condition(
        alt.datum['North American Industry Classification System, NAICS'] == 'Agriculture, forestry, fishing and hunting',  # If the country is "US" this test returns True,
        alt.value('red'),     # highlight a bar with red.
        alt.value('lightgreen')   # And grey for the rest of the bars
     )
  )
  )

In [57]:
pctProfit2019.configure_axis(labelLimit=1000, labelFontSize=20, titleFontSize=20).configure_legend(
titleFontSize=20,
labelFontSize=20
) 


In [35]:
directExp2019 = (alt.Chart(
    AgriPart_data2019)
  .mark_bar()
  .encode(
      x=alt.X('Cost of sales (direct expenses) (%)',title='Cost of sales (direct expenses) (%)'),
      y=alt.Y('North American Industry Classification System, NAICS', title = '', sort='-x'),
      color=alt.Color('Business type', scale=alt.Scale(scheme='purpleblue'), legend=alt.Legend(title=None, orient='bottom')), # color=reds
      opacity=alt.condition(
        alt.datum['North American Industry Classification System, NAICS'] == 'Agriculture, forestry, fishing and hunting',  # If the country is "US" this test returns True,
        #alt.value('red'),     # highlight a bar with red.
        #alt.value('lightgrey')   # And grey for the rest of the bars
        alt.value(1.0), alt.value(0.3)
     )  )
  )

indirectExp2019 = (alt.Chart(
    AgriPart_data2019)
  .mark_bar()
  .encode(
      x=alt.X('Operating expenses (indirect expenses) (%)',title='Operating expenses (indirect expenses) (%)'),
      y=alt.Y('North American Industry Classification System, NAICS', title = '', sort='-x'),
       color=alt.Color('Business type', scale=alt.Scale(scheme='reds'), legend=alt.Legend(title=None, orient='bottom')), # color=reds
      opacity=alt.condition(
        alt.datum['North American Industry Classification System, NAICS'] == 'Agriculture, forestry, fishing and hunting',  # If the country is "US" this test returns True,
        #alt.value('red'),     # highlight a bar with red.
        #alt.value('lightgrey')   # And grey for the rest of the bars
        alt.value(1.0), alt.value(0.3)
     )  )
  )

In [36]:
alt.hconcat(directExp2019, indirectExp2019).configure_axis(labelLimit=360, labelFontSize=20, titleFontSize=20).configure_legend(
titleFontSize=20,
labelFontSize=20
) 
