In [1]:
import pandas as pd
import numpy as np
import colorlover as cl
import plotly.offline as plt
import plotly.graph_objs as go
# Enable Plotly's offline notebook mode so figures render inline.
# Note: `plt` here is plotly.offline (see the imports above), not matplotlib.
# connected=True is passed through to Plotly; per the Plotly docs this loads
# plotly.js from the CDN instead of embedding it in the notebook.
plt.init_notebook_mode(connected=True)

In [2]:
# Directory holding the input CSV files. File names are concatenated directly
# onto this string, so the trailing slash is required.
data_dir = "../data/"

# Initial Cleaning

## Energy

In [3]:
# Load the raw energy table and keep only the columns used downstream.
energy_columns = ["Ref_Date", "GEO", "SUPPLY", "Value"]
energy_df = pd.read_csv(data_dir + "01280016_energy.csv")[energy_columns]

In [4]:
# Clean the energy values:
#  - parse `Value` as a number; suppressed or missing entries cannot be parsed
#    and are coerced to NaN
#  - keep only rows whose parsed value is present and strictly positive
energy_df = energy_df.assign(
    energy_value=pd.to_numeric(energy_df["Value"], errors="coerce")
)
energy_has_value = energy_df["energy_value"].notnull()
energy_is_positive = energy_df["energy_value"] > 0
energy_df = energy_df[energy_has_value & energy_is_positive]
energy_df.head()

Unnamed: 0,Ref_Date,GEO,SUPPLY,Value,energy_value
0,1995,Canada,Exports,7484910.0,7484910.0
1,1996,Canada,Exports,7645044.0,7645044.0
2,1997,Canada,Exports,8081678.0,8081678.0
3,1998,Canada,Exports,8467485.0,8467485.0
4,1999,Canada,Exports,8485988.0,8485988.0


## GDP

In [5]:
# Load the raw GDP-by-industry table and preview its first rows.
gdp_path = data_dir + "03790030-eng_gdp.csv"
gdp_df = pd.read_csv(gdp_path)
gdp_df.head()

Unnamed: 0,Ref_Date,GEO,VALUE,NAICS,Vector,Coordinate,Value
0,1997,Newfoundland and Labrador,Current dollars,"All industries (x 1,000,000)",v62460554,1.1.1,..
1,1998,Newfoundland and Labrador,Current dollars,"All industries (x 1,000,000)",v62460554,1.1.1,..
2,1999,Newfoundland and Labrador,Current dollars,"All industries (x 1,000,000)",v62460554,1.1.1,..
3,2000,Newfoundland and Labrador,Current dollars,"All industries (x 1,000,000)",v62460554,1.1.1,..
4,2001,Newfoundland and Labrador,Current dollars,"All industries (x 1,000,000)",v62460554,1.1.1,..


In [6]:
# Clean the GDP values:
#  - parse `Value` into a numeric `value` column; suppressed or missing
#    entries (e.g. "..") cannot be parsed and are coerced to NaN
#  - keep only rows with a present, strictly positive value
#  - keep only the "Current dollars" series
gdp_df = gdp_df.assign(value=pd.to_numeric(gdp_df["Value"], errors="coerce"))
gdp_keep = (
    gdp_df["value"].notnull()
    & (gdp_df["value"] > 0)
    & (gdp_df["VALUE"] == "Current dollars")
)
gdp_df = gdp_df[gdp_keep]

In [7]:
# Remove the trailing "(x 1,000,000)" unit suffix from the NAICS labels, then
# trim the whitespace left behind. The cleaned labels are later used as the
# join key against the GDP concordance.
#
# The suffix is matched as a literal string (regex=False). The previous
# pattern "\(x 1,000,000\)" only worked while `str.replace` defaulted to regex
# mode; with pandas >= 2.0 the default is literal matching, so the escaped
# pattern no longer matches anything and the suffix is silently kept.
gdp_df["NAICS"] = (
    gdp_df["NAICS"]
    .str.replace("(x 1,000,000)", "", regex=False)
    .str.strip()
)
gdp_df.head()

Unnamed: 0,Ref_Date,GEO,VALUE,NAICS,Vector,Coordinate,Value,value
10,2007,Newfoundland and Labrador,Current dollars,All industries,v62460554,1.1.1,27349.4,27349.4
11,2008,Newfoundland and Labrador,Current dollars,All industries,v62460554,1.1.1,29867.3,29867.3
12,2009,Newfoundland and Labrador,Current dollars,All industries,v62460554,1.1.1,23223.2,23223.2
13,2010,Newfoundland and Labrador,Current dollars,All industries,v62460554,1.1.1,27161.8,27161.8
14,2011,Newfoundland and Labrador,Current dollars,All industries,v62460554,1.1.1,31591.2,31591.2


# Concord

In [8]:
# Load the two concordance (label lookup) tables, one per data source.
energy_concordance, gdp_concordance = (
    pd.read_csv(data_dir + concordance_file)
    for concordance_file in ("energy_concordance.csv", "gdp_concordance.csv")
)

## Energy

In [9]:
# Attach our common labels to the energy rows by matching SUPPLY against the
# concordance's energy_label, then keep only the columns of interest.
energy_concorded = pd.merge(
    energy_df,
    energy_concordance,
    left_on="SUPPLY",
    right_on="energy_label",
).loc[:, ["Ref_Date", "GEO", "energy_value", "our_label"]]
energy_concorded.head()

Unnamed: 0,Ref_Date,GEO,energy_value,our_label
0,1995,Canada,2109278.0,Total industrial
1,1996,Canada,2157590.0,Total industrial
2,1997,Canada,2202317.0,Total industrial
3,1998,Canada,2144147.0,Total industrial
4,1999,Canada,2175238.0,Total industrial


In [10]:
# Sum energy values over every (year, region, common label) combination.
energy_group_keys = ["Ref_Date", "GEO", "our_label"]
energy_concorded = energy_concorded.groupby(energy_group_keys, as_index=False).sum()
print("concorded energy data frame has {} observations".format(len(energy_concorded)))
energy_concorded.head()

concorded energy data frame has 6017 observations


Unnamed: 0,Ref_Date,GEO,our_label,energy_value
0,1995,Alberta,Agriculture,129036.0
1,1995,Alberta,All other manufacturing,161671.0
2,1995,Alberta,Chemicals and fertilizers manufacturing,261299.0
3,1995,Alberta,Commercial and other institutional,343982.0
4,1995,Alberta,Construction,27030.0


## GDP

In [11]:
# Attach our common labels to the GDP rows by matching NAICS against the
# concordance's gdp_label.
# Some lower-level NAICS entries have to be subtracted rather than added, so
# each value is multiplied by the concordance's `positive` column
# (presumably a +1/-1 sign -- confirm against gdp_concordance.csv).
# Finally keep only the columns of interest.
gdp_concorded = (
    gdp_df
    .merge(gdp_concordance, left_on="NAICS", right_on="gdp_label")
    .assign(gdp_value=lambda merged: merged["value"] * merged["positive"])
    .loc[:, ["Ref_Date", "GEO", "gdp_value", "our_label"]]
)
print("Concorded, GDP data frame has {} observations".format(len(gdp_concorded)))
gdp_concorded.head()

Concorded, GDP data frame has 5986 observations


Unnamed: 0,Ref_Date,GEO,gdp_value,our_label
0,2007,Newfoundland and Labrador,11.0,Agriculture
1,2008,Newfoundland and Labrador,7.5,Agriculture
2,2009,Newfoundland and Labrador,7.7,Agriculture
3,2010,Newfoundland and Labrador,13.8,Agriculture
4,2011,Newfoundland and Labrador,12.4,Agriculture


In [12]:
# Sum GDP values over every (year, region, common label) combination.
gdp_group_keys = ["Ref_Date", "GEO", "our_label"]
gdp_concorded = gdp_concorded.groupby(gdp_group_keys, as_index=False).sum()
print("concorded, aggregated, gdp data frame has {} observations".format(len(gdp_concorded)))
gdp_concorded.head()

concorded, aggregated, gdp data frame has 2134 observations


Unnamed: 0,Ref_Date,GEO,our_label,gdp_value
0,2007,Alberta,Agriculture,2511.9
1,2007,Alberta,All other manufacturing,12374.9
2,2007,Alberta,Cement manufacturing,780.1
3,2007,Alberta,Chemicals and fertilizers manufacturing,2308.1
4,2007,Alberta,Commercial and other institutional,23642.9


# Final Cleaning

In [13]:
# Join GDP and energy on year, region and common label. This is pandas'
# default (inner) join, so only combinations present in both tables survive.
concorded_keys = ["Ref_Date", "GEO", "our_label"]
concorded = pd.merge(gdp_concorded, energy_concorded, on=concorded_keys)

In [14]:
# Convert energy from terajoules to tonnes of oil equivalent (toe).
# 1 toe = 41.868 GJ, hence 1 TJ = 1000 / 41.868 = 23.8845896627 toe.
# (The factor is NOT a barrels-of-oil-equivalent conversion, as an earlier
# comment here claimed.)
# NOTE: this overwrites energy_value in place, so re-running the cell without
# re-running the cells above applies the conversion twice.
TJ_TO_TOE = 23.8845896627
concorded["energy_value"] = concorded["energy_value"] * TJ_TO_TOE

In [15]:
# Rescale GDP from millions of dollars (the source labels carry an
# "(x 1,000,000)" suffix) to thousands of dollars.
# NOTE: this overwrites gdp_value in place, so re-running the cell without
# re-running the cells above applies the scaling twice.
THOUSANDS_PER_MILLION = 1000.
concorded["gdp_value"] = concorded["gdp_value"] * THOUSANDS_PER_MILLION

In [16]:
# Energy intensity: energy input per unit of GDP (energy_value / gdp_value).
# With the conversions applied in the preceding cells this is tonnes of oil
# equivalent per thousand dollars of GDP.
concorded["intensity"] = concorded["energy_value"].div(concorded["gdp_value"])

In [17]:
# Attach each label's hierarchy level and parent label from the GDP
# concordance. No join key is given, so pandas joins on the columns the two
# frames share (our_label on a first run).
label_hierarchy = gdp_concordance[["our_label", "level", "parent"]].drop_duplicates()
concorded = concorded.merge(label_hierarchy)

In [18]:
# Preview the combined table: GDP, energy, intensity, level and parent per row.
concorded.head()

Unnamed: 0,Ref_Date,GEO,our_label,gdp_value,energy_value,intensity,level,parent
0,2007,Alberta,Agriculture,2511900.0,3535612.0,1.407545,1,
1,2007,British Columbia,Agriculture,1212800.0,1019490.0,0.840608,1,
2,2007,Manitoba,Agriculture,1494500.0,1597951.0,1.069221,1,
3,2007,New Brunswick,Agriculture,225600.0,183194.8,0.812034,1,
4,2007,Newfoundland and Labrador,Agriculture,64500.0,25413.2,0.394003,1,


In [19]:
# Check: total 2014 GDP summed over all level-1 labels and all regions.
is_2014_level_1 = (concorded["Ref_Date"] == 2014) & (concorded["level"] == 1)
concorded.loc[is_2014_level_1, "gdp_value"].sum()

1691942700.0

In [20]:
# Check: which regions have level-1 rows for 2014.
level_1_rows_2014 = (concorded["level"] == 1) & (concorded["Ref_Date"] == 2014)
concorded.loc[level_1_rows_2014, "GEO"].unique()

array(['Alberta', 'British Columbia', 'Manitoba', 'New Brunswick',
       'Newfoundland and Labrador', 'Nova Scotia', 'Ontario',
       'Prince Edward Island', 'Quebec', 'Saskatchewan',
       'Northwest Territories', 'Nunavut', 'Yukon'], dtype=object)

In [21]:
# Dump the 2014 Ontario rows to check.csv (written to the current working
# directory) for manual inspection.
ontario_2014 = concorded[(concorded["GEO"] == "Ontario") & (concorded["Ref_Date"] == 2014)]
ontario_2014.to_csv("check.csv")

# Plot

In [22]:
# Eight-colour qualitative "Set2" palette from colorlover; the plotting loop
# below picks one entry per series by position.
colors = cl.scales['8']['qual']['Set2']

In [23]:
# Select Ontario's level-1 rows and group them by label; each group becomes
# one series in the chart.
is_ontario_level_1 = (concorded["GEO"] == "Ontario") & (concorded["level"] == 1)
for_plotting = concorded[is_ontario_level_1].groupby("our_label")

In [24]:
# Bubble-size scaling: sizeref = 2 * max(size) / (max_marker_px ** 2), with a
# 40 px target for the largest bubble. The maximum is taken over the whole
# `concorded` table rather than only the plotted subset, so bubble sizes are
# scaled relative to the global maximum.
size_scaler = 2.*concorded.energy_value.max()/(40.**2)

# One scatter trace per label: x = year, y = energy intensity,
# bubble area = energy used.
data = []
for i, (key, df) in enumerate(for_plotting):
    data.append(go.Scatter(
                mode = 'markers',
                x=df.Ref_Date,
                y=df['intensity'],
                name = key,
                marker = dict(
                    # Wrap around the palette so that having more groups than
                    # palette entries reuses colours instead of raising
                    # IndexError.
                    color = colors[i % len(colors)],
                    sizemode='area',
                    sizeref=size_scaler,
                    sizemin=2,
                    opacity = 0.6,
                    size = df.energy_value)
                )
        )

In [25]:
# Figure layout. The traces are built from Ontario's level-1 rows, so the
# title names Ontario (it previously read "Canada Level", which did not match
# the plotted data). Axis titles are added so the figure stands on its own.
layout = dict(
    title = "Energy Input per GDP - Ontario",
    xaxis = dict(title = "Year"),
    yaxis = dict(title = "Energy intensity (energy input / GDP)"),
)
fig = dict(data=data, layout=layout)
# NOTE(review): the filename looks like a leftover from a Plotly example;
# kept unchanged -- confirm whether a more descriptive name is wanted.
plt.iplot(fig, filename = "Manually Set Range")