In [1]:
import pandas as pd
import numpy as np

import os
os.environ['THEANO_FLAGS']='device=cpu'
import pymc3 as pm
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
%%time
root = './input/'
train_df = pd.read_csv(root + 'train.csv')
train_df["timestamp"] = pd.to_datetime(train_df["timestamp"], format='%Y-%m-%d %H:%M:%S')

weather_train_df = pd.read_csv(root + 'weather_train.csv')
# test_df = pd.read_csv(root + 'test.csv')
# weather_test_df = pd.read_csv(root + 'weather_test.csv')
building_meta_df = pd.read_csv(root + 'building_metadata.csv')
print('...loaded')

...loaded
CPU times: user 6.98 s, sys: 1.62 s, total: 8.6 s
Wall time: 9.75 s


In [3]:
print('Size of train_df data', train_df.shape)
print('Size of weather_train_df data', weather_train_df.shape)
# print('Size of weather_test_df data', weather_test_df.shape)
print('Size of building_meta_df data', building_meta_df.shape)

Size of train_df data (20216100, 4)
Size of weather_train_df data (139773, 9)
Size of building_meta_df data (1449, 6)


In [4]:
## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [5]:
train_df = reduce_mem_usage(train_df)
# test_df = reduce_mem_usage(test_df)

weather_train_df = reduce_mem_usage(weather_train_df)
# weather_test_df = reduce_mem_usage(weather_test_df)
building_meta_df = reduce_mem_usage(building_meta_df)

Mem. usage decreased to 289.19 Mb (53.1% reduction)
Mem. usage decreased to  4.93 Mb (48.6% reduction)
Mem. usage decreased to  0.03 Mb (52.0% reduction)


In [6]:
train_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading
0,0,0,2016-01-01,0.0
1,1,0,2016-01-01,0.0
2,2,0,2016-01-01,0.0
3,3,0,2016-01-01,0.0
4,4,0,2016-01-01,0.0


 ### Building DF merge through concat

In [7]:
train_df['timestamp'] = pd.to_datetime(train_df['timestamp'])
# test_df['timestamp'] = pd.to_datetime(test_df['timestamp'])
weather_train_df['timestamp'] = pd.to_datetime(weather_train_df['timestamp'])
# weather_test_df['timestamp'] = pd.to_datetime(weather_test_df['timestamp'])

In [8]:
temp_df = train_df[['building_id']]
temp_df = temp_df.merge(building_meta_df, on=['building_id'], how='left')
del temp_df['building_id']
train_df = pd.concat([train_df, temp_df], axis=1)
del temp_df, building_meta_df

In [9]:
train_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count
0,0,0,2016-01-01,0.0,0,Education,7432,2008.0,
1,1,0,2016-01-01,0.0,0,Education,2720,2004.0,
2,2,0,2016-01-01,0.0,0,Education,5376,1991.0,
3,3,0,2016-01-01,0.0,0,Education,23685,2002.0,
4,4,0,2016-01-01,0.0,0,Education,116607,1975.0,


### Weather DF merge over concat

In [10]:
temp_df = train_df[['site_id','timestamp']]
temp_df = temp_df.merge(weather_train_df, on=['site_id','timestamp'], how='left')

del temp_df['site_id'], temp_df['timestamp']
train_df = pd.concat([train_df, temp_df], axis=1)

In [11]:
train_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,2016-01-01,0.0,0,Education,7432,2008.0,,25.0,6.0,20.0,,1019.700012,0.0,0.0
1,1,0,2016-01-01,0.0,0,Education,2720,2004.0,,25.0,6.0,20.0,,1019.700012,0.0,0.0
2,2,0,2016-01-01,0.0,0,Education,5376,1991.0,,25.0,6.0,20.0,,1019.700012,0.0,0.0
3,3,0,2016-01-01,0.0,0,Education,23685,2002.0,,25.0,6.0,20.0,,1019.700012,0.0,0.0
4,4,0,2016-01-01,0.0,0,Education,116607,1975.0,,25.0,6.0,20.0,,1019.700012,0.0,0.0


In [12]:
print(train_df.shape)

(20216100, 16)


In [13]:
# find missing values
def find_missing_data(df):
    total = df.isnull().sum().sort_values(ascending = False)
    percent = (df.isnull().sum()/df.isnull().count()*100).sort_values(ascending = False)
    missing_train_data  = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return missing_train_data

In [14]:
# drop missing value columns
train_df = train_df.drop(columns=['floor_count'])

In [15]:
# check shape after drop
train_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,2016-01-01,0.0,0,Education,7432,2008.0,25.0,6.0,20.0,,1019.700012,0.0,0.0
1,1,0,2016-01-01,0.0,0,Education,2720,2004.0,25.0,6.0,20.0,,1019.700012,0.0,0.0
2,2,0,2016-01-01,0.0,0,Education,5376,1991.0,25.0,6.0,20.0,,1019.700012,0.0,0.0
3,3,0,2016-01-01,0.0,0,Education,23685,2002.0,25.0,6.0,20.0,,1019.700012,0.0,0.0
4,4,0,2016-01-01,0.0,0,Education,116607,1975.0,25.0,6.0,20.0,,1019.700012,0.0,0.0


In [16]:
site_2 = train_df.loc[(train_df['site_id'] == 2) & (train_df['year_built'] > 2005) & (train_df['year_built'] < 2009)]

In [17]:
site_2.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
166,156,0,2016-01-01,114.709999,2,Public services,43681,2007.0,15.6,6.0,-5.6,,1015.299988,270.0,3.6
189,169,0,2016-01-01,468.709991,2,Education,179559,2006.0,15.6,6.0,-5.6,,1015.299988,270.0,3.6
190,169,1,2016-01-01,176.686005,2,Education,179559,2006.0,15.6,6.0,-5.6,,1015.299988,270.0,3.6
191,169,3,2016-01-01,296.002014,2,Education,179559,2006.0,15.6,6.0,-5.6,,1015.299988,270.0,3.6
263,202,0,2016-01-01,143.25,2,Lodging/residential,74682,2006.0,15.6,6.0,-5.6,,1015.299988,270.0,3.6


In [18]:
site_2['building_id'].value_counts()

209    26308
169    26299
203    26254
202    25896
156     8783
280     8781
Name: building_id, dtype: int64

In [19]:
builds = ['209']
meter = ['0']
builds_selected = train_df.loc[train_df['building_id'].isin(builds)]
builds_selected = builds_selected.loc[builds_selected['meter'].isin(meter)]
#builds_selected.head(20)

In [20]:
builds_selected['year_built'].value_counts()
builds_selected['meter'].value_counts()
#builds_selected['meter'].dtype

0    8772
Name: meter, dtype: int64

In [21]:
find_missing_data(builds_selected)
#builds_selected[builds_selected['meter']==0]
#builds_selected['meter'] = builds_selected['meter'].astype('category')
#builds_selected['primary_use'] = builds_selected['primary_use'].astype('category')
#builds_selected[builds_selected['meter']=='0']
#builds_selected.head(20)

Unnamed: 0,Total,Percent
cloud_coverage,2355,26.846785
wind_direction,590,6.725946
precip_depth_1_hr,67,0.763794
sea_level_pressure,45,0.512996
wind_speed,6,0.068399
dew_temperature,2,0.0228
air_temperature,2,0.0228
year_built,0,0.0
square_feet,0,0.0
primary_use,0,0.0


In [22]:
#builds_selected.fillna(method='ffill')

In [23]:
df = builds_selected.drop(columns = ['meter','square_feet','timestamp','primary_use','building_id','site_id','cloud_coverage','year_built'])

In [24]:
#builds_selected['meter']

In [25]:
find_missing_data(df)
#df['wind_direction'].astype('float32')
#df.isin(['wind_direction','precip_depth_1_hr','sea_level_pressure','wind_speed','dew_temperature','air_temperature']).astype('float32')

Unnamed: 0,Total,Percent
wind_direction,590,6.725946
precip_depth_1_hr,67,0.763794
sea_level_pressure,45,0.512996
wind_speed,6,0.068399
dew_temperature,2,0.0228
air_temperature,2,0.0228
meter_reading,0,0.0


In [26]:
missing = ['wind_direction','precip_depth_1_hr','sea_level_pressure','wind_speed','dew_temperature','air_temperature']
df1 = df.copy()
df1[missing] = df1[missing].fillna(method = 'ffill')
df1[missing] = df1[missing].fillna(method = 'bfill')

In [27]:
find_missing_data(df1)

Unnamed: 0,Total,Percent
wind_speed,0,0.0
wind_direction,0,0.0
sea_level_pressure,0,0.0
precip_depth_1_hr,0,0.0
dew_temperature,0,0.0
air_temperature,0,0.0
meter_reading,0,0.0


In [28]:
df1

Unnamed: 0,meter_reading,air_temperature,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
280,893.280029,15.600000,-5.6,0.0,1015.299988,270.0,3.6
2580,920.669983,13.900000,-5.6,0.0,1015.599976,270.0,4.1
4874,921.760010,13.300000,-5.6,0.0,1016.000000,270.0,3.1
7173,920.429993,12.200000,-6.1,0.0,1016.599976,280.0,3.1
9468,922.640015,11.700000,-6.7,0.0,1017.000000,270.0,3.1
...,...,...,...,...,...,...,...
20204579,965.989990,15.000000,11.1,0.0,1012.400024,210.0,3.6
20206942,969.690002,16.100000,10.0,0.0,1011.700012,260.0,3.6
20209308,967.039978,16.700001,9.4,0.0,1011.200012,240.0,4.6
20211673,965.830017,16.700001,10.0,0.0,1011.000000,250.0,3.6


## Fitting a basic linear regression 

In [29]:
df1 = (df1-df1.mean())/df1.std()

In [30]:
predictors =['air_temperature', 'dew_temperature', 'precip_depth_1_hr','sea_level_pressure', 'wind_direction', 'wind_speed']

In [31]:
multiple_model = pm.Model()
with multiple_model:
    intercept = pm.Normal('intercept')
    beta      = pm.Normal('beta', shape=len(predictors))
#    beta_dynamic = pm.Normal('beta_dynamic')
    variance  = pm.InverseGamma('variance', alpha=0.1, beta=0.1)
    sd        = pm.Deterministic('sd', variance**0.5)
    yhat1  = pm.Deterministic('yhat1',intercept + pm.math.dot(df1[predictors].values, beta))
#    yhat1 = intercept + pm.math.dot(df[predictors],beta) + pm.math.dot(df['timestamp'],beta_dynamic)
    y   = pm.Normal('y', mu=yhat1, sd=sd, observed=df1['meter_reading'].values)
    
    trace = pm.sample()

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [variance, beta, intercept]
Sampling 2 chains: 100%|██████████| 2000/2000 [00:05<00:00, 385.31draws/s]


In [33]:
pm.forestplot(trace,ylabels=['intercept'] + predictors + ['variance','sd']);

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3326, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-33-2d18b0195917>", line 1, in <module>
    pm.forestplot(trace,ylabels=['intercept'] + predictors + ['variance','sd']);
  File "/usr/local/lib/python3.7/site-packages/pymc3/plots/forestplot.py", line 247, in forestplot
    plot_kwargs)
  File "/usr/local/lib/python3.7/site-packages/pymc3/plots/forestplot.py", line 53, in _plot_tree
    color=c)
  File "/usr/local/lib/python3.7/site-packages/matplotlib/__init__.py", line 1601, in inner
    return func(ax, *map(sanitize_sequence, args), **kwargs)
  File "/usr/local/lib/python3.7/site-packages/matplotlib/axes/_axes.py", line 3314, in errorbar
    self.add_line(data_line)
  File "/usr/local/lib/python3.7/site-packages/matplotlib/axes/_base.py", line 1902, in add_line
    self._update_line_limits(line)
  File "/usr/local/lib/py

KeyboardInterrupt: 

Error in callback <function flush_figures at 0x120c35378> (for post_execute):


KeyboardInterrupt: 

In [34]:
pm.stats.r2_score(df1['meter_reading'].values,trace['yhat1'])

r2_r(r2_median=0.07, r2_mean=0.07, r2_std=0.0)

In [35]:
pm.stats.waic(trace, model = multiple_model)

  """Entry point for launching an IPython kernel.
        log predictive densities exceeds 0.4. This could be indication of
        WAIC starting to fail see http://arxiv.org/abs/1507.04544 for details
        
  """)


WAIC_r(WAIC=24307.463017880003, WAIC_se=1001.5888425094446, p_WAIC=61.96366121682519, var_warn=1)

In [36]:
pm.traceplot(trace)

KeyboardInterrupt: 

Error in callback <function install_repl_displayhook.<locals>.post_execute at 0x120914840> (for post_execute):


KeyboardInterrupt: 

Error in callback <function flush_figures at 0x120c35378> (for post_execute):


KeyboardInterrupt: 

In [None]:
pm.plot_posterior(trace,['yhat1'] )

In [37]:
pm.summary(trace)

  axis=1, join_axes=[dforg.index])


Unnamed: 0,mean,sd,mc_error,hpd_2.5,hpd_97.5,n_eff,Rhat
intercept,0.000117,0.010028,0.000234,-0.017818,0.019492,2027.372513,0.999684
beta__0,-0.159815,0.013950,0.000380,-0.187969,-0.133059,1363.853016,0.999326
beta__1,0.166818,0.010899,0.000282,0.143716,0.185615,1200.966732,0.999095
beta__2,-0.013477,0.010414,0.000259,-0.032625,0.008047,1524.227234,0.999576
beta__3,0.069419,0.014309,0.000419,0.040869,0.095114,1101.663605,1.003256
...,...,...,...,...,...,...,...
yhat1__8767,0.284120,0.024030,0.000667,0.236537,0.327703,1438.272831,0.999099
yhat1__8768,0.175670,0.025049,0.000717,0.129016,0.227446,1350.826695,0.999145
yhat1__8769,0.181174,0.023420,0.000633,0.133529,0.224995,1546.427276,0.999337
yhat1__8770,0.166340,0.024190,0.000695,0.119993,0.214267,1311.120325,0.999313


## Model with AR1

In [None]:
with pm.Model() as correlated_error_model:
    intercept  = pm.Normal('intercept')
    beta      = pm.Normal('beta',shape=len(predictors))
    variance  = pm.InverseGamma('variance',alpha=0.1,beta=0.1)
    sd        = pm.Deterministic('sd',variance**0.5)
    
    tau      = pm.Gamma('tau',0.1,0.1)
    k        = pm.Uniform('k')
    data    = pm.AR1('data', k=k, tau_e=tau, observed=(df1['meter_reading']))
    mu      = pm.Deterministic('mu',intercept + pm.math.dot(df1[predictors],beta) + data)
    y   = pm.Normal('y', mu=mu, sd=sd, observed=df1['meter_reading'])
    
    trace4 = pm.sample(tune=1000)

In [None]:
pm.stats.r2_score(df1['meter_reading'].values,trace4['mu'])

In [None]:
pm.stats.waic(trace4, model = correlated_error_model)

## the exucation of rest part is not able to finish. It takes forever to run

In [None]:
pm.forestplot(trace4,ylabels=['intercept'] + predictors + ['variance','sd'] + ['beta_dynamic']);

In [None]:
pm.plot_posterior(trace4,['mu'] )

In [None]:
pm.traceplot(trace4)

## Draft

In [None]:
with pm.Model() as correlated_error_model:
    intercept = pm.Normal('intercept')
    beta      = pm.Normal('beta',shape=len(predictors))
    mu       = intercept + pm.math.dot(df1[predictors], beta)
    tau      = pm.Gamma('tau',0.1,0.1)
    k        = pm.Uniform('k')
    print(predictors)
    print(df1['meter_reading']-mu)
#    error    = pm.AR1('error', k=k, tau_e=tau, observed=(df1['meter_reading']-mu).T)
    
#    yhat1 = intercept + pm.math.dot(df1[predictors],beta) + pm.math.dot(df1['meter_reading'],beta_dynamic)
#     trace4 = pm.sample(tune=1000)