In [1]:
import pandas as pd
import numpy as np

import os
os.environ['THEANO_FLAGS']='device=cpu'
import pymc3 as pm
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
%%time
root = './input/'
train_df = pd.read_csv(root + 'train.csv')
train_df["timestamp"] = pd.to_datetime(train_df["timestamp"], format='%Y-%m-%d %H:%M:%S')

weather_train_df = pd.read_csv(root + 'weather_train.csv')
# test_df = pd.read_csv(root + 'test.csv')
# weather_test_df = pd.read_csv(root + 'weather_test.csv')
building_meta_df = pd.read_csv(root + 'building_metadata.csv')
print('...loaded')

...loaded
CPU times: user 10.2 s, sys: 1.27 s, total: 11.5 s
Wall time: 11 s


In [3]:
print('Size of train_df data', train_df.shape)
print('Size of weather_train_df data', weather_train_df.shape)
# print('Size of weather_test_df data', weather_test_df.shape)
print('Size of building_meta_df data', building_meta_df.shape)

Size of train_df data (20216100, 4)
Size of weather_train_df data (139773, 9)
Size of building_meta_df data (1449, 6)


In [4]:
## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [5]:
train_df = reduce_mem_usage(train_df)
# test_df = reduce_mem_usage(test_df)

weather_train_df = reduce_mem_usage(weather_train_df)
# weather_test_df = reduce_mem_usage(weather_test_df)
building_meta_df = reduce_mem_usage(building_meta_df)

Mem. usage decreased to 289.19 Mb (53.1% reduction)
Mem. usage decreased to  3.07 Mb (68.1% reduction)
Mem. usage decreased to  0.03 Mb (60.3% reduction)


In [6]:
train_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading
0,0,0,2016-01-01,0.0
1,1,0,2016-01-01,0.0
2,2,0,2016-01-01,0.0
3,3,0,2016-01-01,0.0
4,4,0,2016-01-01,0.0


In [7]:
# test_df.head()

In [8]:
weather_train_df.head()

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,,1019.5,0.0,0.0
1,0,2016-01-01 01:00:00,24.40625,,21.09375,-1.0,1020.0,70.0,1.5
2,0,2016-01-01 02:00:00,22.796875,2.0,21.09375,0.0,1020.0,0.0,0.0
3,0,2016-01-01 03:00:00,21.09375,2.0,20.59375,0.0,1020.0,0.0,0.0
4,0,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.599609


In [9]:
# weather_test_df.head()

In [10]:
building_meta_df.head()

Unnamed: 0,site_id,building_id,primary_use,square_feet,year_built,floor_count
0,0,0,Education,7432,2008.0,
1,0,1,Education,2720,2004.0,
2,0,2,Education,5376,1991.0,
3,0,3,Education,23685,2002.0,
4,0,4,Education,116607,1975.0,


 ### Building DF merge through concat

In [11]:
train_df['timestamp'] = pd.to_datetime(train_df['timestamp'])
# test_df['timestamp'] = pd.to_datetime(test_df['timestamp'])
weather_train_df['timestamp'] = pd.to_datetime(weather_train_df['timestamp'])
# weather_test_df['timestamp'] = pd.to_datetime(weather_test_df['timestamp'])

In [12]:
temp_df = train_df[['building_id']]
temp_df = temp_df.merge(building_meta_df, on=['building_id'], how='left')
del temp_df['building_id']
train_df = pd.concat([train_df, temp_df], axis=1)

# temp_df = test_df[['building_id']]
# temp_df = temp_df.merge(building_meta_df, on=['building_id'], how='left')

# del temp_df['building_id']
# test_df = pd.concat([test_df, temp_df], axis=1)
del temp_df, building_meta_df

In [13]:
train_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count
0,0,0,2016-01-01,0.0,0,Education,7432,2008.0,
1,1,0,2016-01-01,0.0,0,Education,2720,2004.0,
2,2,0,2016-01-01,0.0,0,Education,5376,1991.0,
3,3,0,2016-01-01,0.0,0,Education,23685,2002.0,
4,4,0,2016-01-01,0.0,0,Education,116607,1975.0,


In [14]:
# test_df.head()

### Weather DF merge over concat

In [15]:
temp_df = train_df[['site_id','timestamp']]
temp_df = temp_df.merge(weather_train_df, on=['site_id','timestamp'], how='left')

del temp_df['site_id'], temp_df['timestamp']
train_df = pd.concat([train_df, temp_df], axis=1)

# temp_df = test_df[['site_id','timestamp']]
# temp_df = temp_df.merge(weather_test_df, on=['site_id','timestamp'], how='left')

# del temp_df['site_id'], temp_df['timestamp']
# test_df = pd.concat([test_df, temp_df], axis=1)

# del temp_df, weather_train_df, weather_test_df

In [16]:
train_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,2016-01-01,0.0,0,Education,7432,2008.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
1,1,0,2016-01-01,0.0,0,Education,2720,2004.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
2,2,0,2016-01-01,0.0,0,Education,5376,1991.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
3,3,0,2016-01-01,0.0,0,Education,23685,2002.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
4,4,0,2016-01-01,0.0,0,Education,116607,1975.0,,25.0,6.0,20.0,,1019.5,0.0,0.0


In [17]:
# test_df.head()

In [18]:
print(train_df.shape)
# print(test_df.shape)

(20216100, 16)


In [19]:
# find missing values
def find_missing_data(df):
    total = df.isnull().sum().sort_values(ascending = False)
    percent = (df.isnull().sum()/df.isnull().count()*100).sort_values(ascending = False)
    missing_train_data  = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return missing_train_data

In [20]:
# drop missing value columns
train_df = train_df.drop(columns=['floor_count'])

In [21]:
# drop missing value by rows
# train_df = train_df.dropna()

In [22]:
# check shape after drop
train_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,2016-01-01,0.0,0,Education,7432,2008.0,25.0,6.0,20.0,,1019.5,0.0,0.0
1,1,0,2016-01-01,0.0,0,Education,2720,2004.0,25.0,6.0,20.0,,1019.5,0.0,0.0
2,2,0,2016-01-01,0.0,0,Education,5376,1991.0,25.0,6.0,20.0,,1019.5,0.0,0.0
3,3,0,2016-01-01,0.0,0,Education,23685,2002.0,25.0,6.0,20.0,,1019.5,0.0,0.0
4,4,0,2016-01-01,0.0,0,Education,116607,1975.0,25.0,6.0,20.0,,1019.5,0.0,0.0


In [23]:
site_2 = train_df.loc[(train_df['site_id'] == 2) & (train_df['year_built'] > 2005) & (train_df['year_built'] < 2009)]

In [24]:
site_2.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
166,156,0,2016-01-01,114.709999,2,Public services,43681,2007.0,15.601562,6.0,-5.601562,,1015.5,270.0,3.599609
189,169,0,2016-01-01,468.709991,2,Education,179559,2006.0,15.601562,6.0,-5.601562,,1015.5,270.0,3.599609
190,169,1,2016-01-01,176.686005,2,Education,179559,2006.0,15.601562,6.0,-5.601562,,1015.5,270.0,3.599609
191,169,3,2016-01-01,296.002014,2,Education,179559,2006.0,15.601562,6.0,-5.601562,,1015.5,270.0,3.599609
263,202,0,2016-01-01,143.25,2,Lodging/residential,74682,2006.0,15.601562,6.0,-5.601562,,1015.5,270.0,3.599609


In [25]:
site_2['building_id'].value_counts()

209    26308
169    26299
203    26254
202    25896
156     8783
280     8781
Name: building_id, dtype: int64

In [26]:
builds = ['209','169','203']
builds_selected = train_df.loc[train_df['building_id'].isin(builds)]
#builds_selected.head(20)

In [27]:
find_missing_data(builds_selected)
builds_selected['year_built'].value_counts()
builds_selected['meter'].value_counts()
builds_selected['meter'].dtype

dtype('int8')

In [28]:
builds_selected['meter'] = builds_selected['meter'].astype('category')
builds_selected['primary_use'] = builds_selected['primary_use'].astype('category')
builds_selected.head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
189,169,0,2016-01-01 00:00:00,468.709991,2,Education,179559,2006.0,15.601562,6.0,-5.601562,,1015.5,270.0,3.599609
190,169,1,2016-01-01 00:00:00,176.686005,2,Education,179559,2006.0,15.601562,6.0,-5.601562,,1015.5,270.0,3.599609
191,169,3,2016-01-01 00:00:00,296.002014,2,Education,179559,2006.0,15.601562,6.0,-5.601562,,1015.5,270.0,3.599609
266,203,0,2016-01-01 00:00:00,241.350006,2,Lodging/residential,63348,2007.0,15.601562,6.0,-5.601562,,1015.5,270.0,3.599609
267,203,1,2016-01-01 00:00:00,181.785995,2,Lodging/residential,63348,2007.0,15.601562,6.0,-5.601562,,1015.5,270.0,3.599609
268,203,3,2016-01-01 00:00:00,99.644096,2,Lodging/residential,63348,2007.0,15.601562,6.0,-5.601562,,1015.5,270.0,3.599609
280,209,0,2016-01-01 00:00:00,893.280029,2,Education,193294,2006.0,15.601562,6.0,-5.601562,,1015.5,270.0,3.599609
281,209,1,2016-01-01 00:00:00,812.392029,2,Education,193294,2006.0,15.601562,6.0,-5.601562,,1015.5,270.0,3.599609
282,209,3,2016-01-01 00:00:00,363.40799,2,Education,193294,2006.0,15.601562,6.0,-5.601562,,1015.5,270.0,3.599609
2490,169,0,2016-01-01 01:00:00,467.160004,2,Education,179559,2006.0,13.898438,,-5.601562,0.0,1015.5,270.0,4.101562


In [29]:
#builds_selected.fillna(method='ffill')

In [30]:
df = builds_selected.drop(columns = ['building_id','site_id','cloud_coverage','year_built'])

In [31]:
find_missing_data(df)
#df['wind_direction'].astype('float32')
#df.isin(['wind_direction','precip_depth_1_hr','sea_level_pressure','wind_speed','dew_temperature','air_temperature']).astype('float32')

Unnamed: 0,Total,Percent
wind_direction,5323,6.749851
precip_depth_1_hr,603,0.764637
sea_level_pressure,405,0.513562
wind_speed,54,0.068475
dew_temperature,18,0.022825
air_temperature,18,0.022825
square_feet,0,0.0
primary_use,0,0.0
meter_reading,0,0.0
timestamp,0,0.0


In [32]:
missing = ['wind_direction','precip_depth_1_hr','sea_level_pressure','wind_speed','dew_temperature','air_temperature']
df1 = df.isin(missing).fillna('-999')

In [33]:
df.columns

Index(['meter', 'timestamp', 'meter_reading', 'primary_use', 'square_feet',
       'air_temperature', 'dew_temperature', 'precip_depth_1_hr',
       'sea_level_pressure', 'wind_direction', 'wind_speed'],
      dtype='object')

In [34]:
predictors =['meter', 'primary_use', 'square_feet','air_temperature', 'dew_temperature', 'precip_depth_1_hr',
       'sea_level_pressure', 'wind_direction', 'wind_speed']

In [35]:
multiple_model = pm.Model()
with multiple_model:
    intercept = pm.Normal('intercept')
    beta      = pm.Normal('beta', shape=len(predictors))
#    beta_dynamic = pm.Normal('beta_dynamic')
    variance  = pm.InverseGamma('variance', alpha=0.1, beta=0.1)
    sd        = pm.Deterministic('sd', variance**0.5)
    yhat1 = intercept + pm.math.dot(df1[predictors], beta)
#    yhat1 = intercept + pm.math.dot(df[predictors],beta) + pm.math.dot(df['timestamp'],beta_dynamic)
    y   = pm.Normal('y', mu=yhat1, sd=sd, observed=df1['meter_reading'])
    
    trace = pm.sample()

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [variance, beta, intercept]
Sampling 2 chains: 100%|██████████| 2000/2000 [02:09<00:00, 15.50draws/s] 
The acceptance probability does not match the target. It is 0.9963939623852184, but should be close to 0.8. Try to increase the number of tuning steps.
The acceptance probability does not match the target. It is 0.9527024630016041, but should be close to 0.8. Try to increase the number of tuning steps.


In [36]:
!conda list

# packages in environment at /opt/anaconda3/envs/myenv:
#
# Name                    Version                   Build  Channel
appnope                   0.1.0                    py37_0  
arviz                     0.5.1                      py_0    conda-forge
attrs                     19.3.0                     py_0  
backcall                  0.1.0                    py37_0  
blas                      1.0                         mkl  
bleach                    3.1.0                    py37_0  
bzip2                     1.0.8                h1de35cc_0  
ca-certificates           2019.10.16                    0  
cctools                   895                           1  
certifi                   2019.9.11                py37_0  
cftime                    1.0.4.2          py37h1d22016_0  
clang                     4.0.1                         1  
clang_osx-64              4.0.1               h1ce6c1d_18  
clangxx                   4.0.1                         1  
clang

In [37]:
pm.forestplot(trace,ylabels=['intercept'] + predictors + ['variance','sd']);

ImportError: ArviZ is not installed. In order to use `plot_forest`:
pip install arviz

In [None]:
pm.forestplot(trace,varnames=['beta','intercept','sd']);

In [None]:
with pm.Model() as correlated_error_model:
    intercept  = pm.Normal('intercept')
    beta      = pm.Normal('beta',shape=len(predictors))
    variance  = pm.InverseGamma('variance',alpha=0.1,beta=0.1)
    sd        = pm.Deterministic('sd',variance**0.5)
    mu        = intercept + pm.math.dot(df1[predictors],beta)
    
    tau      = pm.Gamma('tau',0.1,0.1)
    k        = pm.Uniform('k')
    error    = pm.AR1('error', k=k, tau_e=tau, observed=(df1['meter_reading']-mu).T)
    
    trace4 = pm.sample(tune=1000)

In [38]:
with pm.Model() as correlated_error_model:
    intercept = pm.Normal('intercept')
    beta      = pm.Normal('beta',shape=len(predictors))
    beta_dynamic = pm.Normal('beta_dynamic', mu=0, sigma=1.0)
    variance  = pm.InverseGamma('variance',alpha=0.1,beta=0.1)
    sd        = pm.Deterministic('sd',variance**0.5)
#    yhat1 = intercept + pm.math.dot(df1[predictors],beta)
    data    = pm.AR('y',beta_dynamic, sigma=1.0, observed=df1['meter_reading'])
    yhat1 = intercept + pm.math.dot(df1[predictors],beta) + data
#    yhat1 = intercept + pm.math.dot(df1[predictors],beta) + pm.math.dot(df1['meter_reading'],beta_dynamic)
    trace4 = pm.sample(tune=1000)

pm.traceplot(trace4)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [variance, beta_dynamic, beta, intercept]
Sampling 2 chains: 100%|██████████| 3000/3000 [00:08<00:00, 362.40draws/s]
There were 12 divergences after tuning. Increase `target_accept` or reparameterize.
There were 2 divergences after tuning. Increase `target_accept` or reparameterize.


ImportError: ArviZ is not installed. In order to use `plot_trace`:
pip install arviz

In [None]:
#pm.traceplot(trace4)

In [None]:
building_156 = train_df.loc[(train_df['building_id'] == 156) & (train_df['meter'] == 0)]

In [None]:
building_156_missing = find_missing_data(building_156)

In [None]:
building_156 = building_156.drop(columns=['cloud_coverage'])

In [None]:
building_203 = train_df.loc[(train_df['building_id'] == 203) & (train_df['meter'] == 0)]

In [None]:
building_203_missing = find_missing_data(building_203)

In [None]:
building_203 = building_203.drop(columns=['cloud_coverage'])

In [None]:
building_156

In [None]:
# y = train_df['meter_reading']
# X = train_df.drop(['meter_reading'], axis=1)

### Fitting a basic linear regression with no dynamic covariates

In [None]:
n_locations = 20
n_timesteps = 100
p_static    = 3
p_dynamic   = 4
error_sd    = 1.5
error_corr  = 0.3