In [1]:
import pandas as pd
import numpy as np
import plotly.offline as plt
import plotly.graph_objs as go
import colorlover as cl
import requests
import zipfile
import io
# Switch plotly's offline module into notebook mode so figures can be shown inline.
# Note: in this notebook `plt` is the alias for plotly.offline, not matplotlib.pyplot.
plt.init_notebook_mode(connected=True)

In [2]:
# Notebook-wide settings.
# Relative path of the directory holding the input CSV files. File names are
# appended with plain string concatenation, so the trailing slash is required.
data_dir = "../data/"

# Initial Cleaning

## Energy

In [3]:
# Load the energy table and retain only the columns used further down.
energy_columns = ["Ref_Date", "GEO", "SUPPLY", "Value"]
energy_df = pd.read_csv(data_dir + "01280016_energy.csv")[energy_columns]

In [4]:
# Clean the energy values:
#   * parse "Value" as a number; suppressed and missing entries do not parse
#     and are coerced to NaN
#   * keep only rows whose parsed value is present and strictly positive
energy_df = energy_df.assign(
    energy_value=pd.to_numeric(energy_df["Value"], errors="coerce")
)
has_energy = energy_df["energy_value"].notnull()
positive_energy = energy_df["energy_value"] > 0
energy_df = energy_df[has_energy & positive_energy]
energy_df.head()

Unnamed: 0,Ref_Date,GEO,SUPPLY,Value,energy_value
0,1995,Canada,Exports,7484910.0,7484910.0
1,1996,Canada,Exports,7645044.0,7645044.0
2,1997,Canada,Exports,8081678.0,8081678.0
3,1998,Canada,Exports,8467485.0,8467485.0
4,1999,Canada,Exports,8485988.0,8485988.0


## Labor

In [5]:
def download_labor(data_dir,
                   url='http://www20.statcan.gc.ca/tables-tableaux/cansim/csv/03830031-eng.zip',
                   timeout=60):
    """Download the labour zip archive and extract it into ``data_dir``.

    Parameters
    ----------
    data_dir : str
        Directory the archive members are extracted into.
    url : str, optional
        Location of the zip archive. Defaults to the CANSIM 03830031 table.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. Checking this up front
        avoids trying to unzip an error page, which would surface as a
        confusing ``zipfile.BadZipFile``.
    zipfile.BadZipFile
        If the downloaded payload is not a valid zip archive.
    """
    response = requests.get(url, allow_redirects=True, timeout=timeout)
    # Fail loudly on 4xx/5xx before touching the payload.
    response.raise_for_status()
    # The context manager closes the archive handle once extraction is done.
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        archive.extractall(path=data_dir)

In [6]:
# Read the labour table, downloading and extracting it first when the CSV is
# not on disk yet.
labor_csv_path = data_dir+"03830031-eng.csv"
try:
    # low_memory=False makes pandas infer each column's dtype from the whole
    # file rather than chunk by chunk, which avoids the mixed-type
    # DtypeWarning this file otherwise triggers.
    labor_df = pd.read_csv(labor_csv_path, low_memory=False)
except FileNotFoundError:
    print("Labor data not found. Downloading it")
    download_labor(data_dir)
    labor_df = pd.read_csv(labor_csv_path, low_memory=False)


Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.



In [7]:
# Clean the labour values:
#   * parse "Value" as a number; suppressed and missing entries do not parse
#     and are coerced to NaN
#   * keep only rows whose parsed value is present and strictly positive
#   * keep only the statistic of interest (for now, hours worked)
labor_df = labor_df.assign(
    labor_value=pd.to_numeric(labor_df["Value"], errors="coerce")
)
has_labor = labor_df["labor_value"].notnull()
positive_labor = labor_df["labor_value"] > 0
is_hours_worked = labor_df["STATS"] == 'Hours worked for all jobs (hours in thousands)'
labor_df = labor_df[has_labor & positive_labor & is_hours_worked]
labor_df.head()

Unnamed: 0,Ref_Date,GEO,STATS,NAICS,Vector,Coordinate,Value,labor_value
17820,1997,Canada,Hours worked for all jobs (hours in thousands),All industries,v65522654,1.4.1,25063000.0,25062987.0
17821,1998,Canada,Hours worked for all jobs (hours in thousands),All industries,v65522654,1.4.1,25580400.0,25580404.0
17822,1999,Canada,Hours worked for all jobs (hours in thousands),All industries,v65522654,1.4.1,26250300.0,26250333.0
17823,2000,Canada,Hours worked for all jobs (hours in thousands),All industries,v65522654,1.4.1,26799100.0,26799116.0
17824,2001,Canada,Hours worked for all jobs (hours in thousands),All industries,v65522654,1.4.1,26931100.0,26931093.0


# Concord

In [8]:
# Load the two concordance tables (energy first, labour second).
energy_concordance, labor_concordance = (
    pd.read_csv(data_dir + file_name)
    for file_name in ("energy_concordance.csv", "labor_concordance.csv")
)

## Energy

In [9]:
# Attach the concordance to the energy rows by matching SUPPLY against the
# concordance's energy_label, then keep only the columns of interest.
energy_keep = ['Ref_Date', 'GEO', 'energy_value', 'our_label']
energy_concorded = pd.merge(
    energy_df,
    energy_concordance,
    left_on="SUPPLY",
    right_on="energy_label",
)[energy_keep]
energy_concorded.head()

Unnamed: 0,Ref_Date,GEO,energy_value,our_label
0,1995,Canada,2109278.0,Total industrial
1,1996,Canada,2157590.0,Total industrial
2,1997,Canada,2202317.0,Total industrial
3,1998,Canada,2144147.0,Total industrial
4,1999,Canada,2175238.0,Total industrial


In [10]:
# Sum the energy values within each (Ref_Date, GEO, our_label) combination.
energy_group_keys = ['Ref_Date', 'GEO', 'our_label']
energy_concorded = energy_concorded.groupby(energy_group_keys, as_index=False).sum()
energy_row_count = len(energy_concorded)
print("concorded energy data frame has {} observations".format(energy_row_count))
energy_concorded.head()

concorded energy data frame has 6017 observations


Unnamed: 0,Ref_Date,GEO,our_label,energy_value
0,1995,Alberta,Agriculture,129036.0
1,1995,Alberta,All other manufacturing,161671.0
2,1995,Alberta,Chemicals and fertilizers manufacturing,261299.0
3,1995,Alberta,Commercial and other institutional,343982.0
4,1995,Alberta,Construction,27030.0


## Labor

In [11]:
# Attach the concordance to the labour rows by matching NAICS against the
# concordance's labor_label.
labor_concorded = pd.merge(
    labor_df,
    labor_concordance,
    left_on="NAICS",
    right_on="labor_label",
)
# Some lower-level NAICS entries have to be subtracted rather than added, so
# each value is scaled by the concordance's "positive" column before summing.
# (presumably +1 / -1 -- confirm against the concordance file)
signed_hours = labor_concorded["labor_value"].mul(labor_concorded["positive"])
labor_concorded = labor_concorded.assign(labor_value=signed_hours)
# Keep only the columns of interest.
labor_concorded = labor_concorded[['Ref_Date', 'GEO', 'labor_value', 'our_label']]
labor_concorded.head()

Unnamed: 0,Ref_Date,GEO,labor_value,our_label
0,1997,Canada,340243.0,Agriculture
1,1998,Canada,328248.0,Agriculture
2,1999,Canada,298032.0,Agriculture
3,2000,Canada,274103.0,Agriculture
4,2001,Canada,247259.0,Agriculture


In [12]:
# Sum the labour values within each (Ref_Date, GEO, our_label) combination.
labor_group_keys = ['Ref_Date', 'GEO', 'our_label']
labor_concorded = labor_concorded.groupby(labor_group_keys, as_index=False).sum()
labor_row_count = len(labor_concorded)
print("concorded labour data frame has {} observations".format(labor_row_count))
labor_concorded.head()

concorded labour data frame has 5794 observations


Unnamed: 0,Ref_Date,GEO,our_label,labor_value
0,1997,Alberta,Agriculture,154801.0
1,1997,Alberta,All other manufacturing,205884.0
2,1997,Alberta,Cement manufacturing,5679.0
3,1997,Alberta,Chemicals and fertilizers manufacturing,14866.0
4,1997,Alberta,Commercial and other institutional,422620.0


# Final Cleaning

In [13]:
# Join the labour and energy frames on their shared key columns; only
# combinations present in both frames survive (default inner join).
merge_keys = ['Ref_Date', 'GEO', 'our_label']
concorded = pd.merge(labor_concorded, energy_concorded, on=merge_keys)

In [14]:
# Labour intensity: labour value divided by energy value, row by row.
labor_hours = concorded["labor_value"]
energy_use = concorded["energy_value"]
concorded["intensity"] = labor_hours / energy_use

In [15]:
# Look up each label's level from the energy concordance. The merge joins on
# whatever columns the two frames have in common.
label_levels = energy_concordance[['our_label', 'level']].drop_duplicates()
concorded = concorded.merge(label_levels)

In [16]:
# Preview the first rows of the merged labour/energy frame.
concorded.head()

Unnamed: 0,Ref_Date,GEO,our_label,labor_value,energy_value,intensity,level
0,1997,Alberta,Agriculture,154801.0,149777.0,1.033543,1
1,1997,British Columbia,Agriculture,56991.0,35920.0,1.586609,1
2,1997,Canada,Agriculture,765892.0,621071.0,1.233179,1
3,1997,Manitoba,Agriculture,75753.0,60788.0,1.246183,1
4,1997,New Brunswick,Agriculture,11763.0,8267.0,1.422886,1


# Plot

In [17]:
# Qualitative "Set2" colour scale from colorlover's 8-colour set, used to give
# each plotted series its own marker colour.
colors = cl.scales['8']['qual']['Set2']

In [18]:
# Restrict to the Canada rows at level 1, then group them by label so each
# label can be drawn as its own series.
is_canada = concorded["GEO"] == "Canada"
is_level_one = concorded["level"] == 1
for_plotting = concorded[is_canada & is_level_one].groupby("our_label")

In [19]:
# Marker-area scaling factor: twice the largest energy value divided by the
# square of the intended maximum marker size (40).
# NOTE(review): the maximum is taken over the whole `concorded` frame, not just
# the rows being plotted -- confirm that this is intended.
size_scaler = 2.*concorded.energy_value.max()/(40.**2)

# One scatter trace per label: x is the reference date, y the labour
# intensity, and the marker area encodes the energy value.
data = []
for i, (key, df) in enumerate(for_plotting):
    data.append(go.Scatter(
                mode = 'markers',
                x=df.Ref_Date,
                y=df['intensity'],
                name = key,
                marker = dict(
                    # The palette has a fixed number of entries; wrap around so
                    # having more labels than colours reuses colours instead of
                    # raising IndexError.
                    color = colors[i % len(colors)],
                    sizemode='area',
                    sizeref=size_scaler,
                    sizemin=2,
                    opacity = 0.6,
                    size = df.energy_value)
                )
        )

In [20]:
# Figure layout. Axis titles are set so the chart can be read on its own:
# x carries Ref_Date, y carries labour value divided by energy value.
layout = dict(
    title = "Labour per Energy Input - Canada Level",
    xaxis = dict(title = "Reference year"),
    yaxis = dict(title = "Labour hours (thousands) per unit of energy"),
)
fig = dict(data=data, layout=layout)
# Render the figure inline in the notebook.
plt.iplot(fig, filename = "Manually Set Range")