In [2]:
import pandas as pd
import numpy as np

import os
os.environ['THEANO_FLAGS']='device=cpu'
import pymc3 as pm
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
%%time
# Directory that holds the input CSV files.
root = './input/'

# Meter readings: the timestamp column is parsed immediately with an explicit format.
train_df = pd.read_csv(os.path.join(root, 'train.csv'))
train_df["timestamp"] = pd.to_datetime(train_df["timestamp"], format='%Y-%m-%d %H:%M:%S')
test_df = pd.read_csv(os.path.join(root, 'test.csv'))

# Weather tables for the train and test frames (timestamps are still raw here).
weather_train_df = pd.read_csv(os.path.join(root, 'weather_train.csv'))
weather_test_df = pd.read_csv(os.path.join(root, 'weather_test.csv'))

# Building metadata and the sample submission file.
building_meta_df = pd.read_csv(os.path.join(root, 'building_metadata.csv'))
sample_submission = pd.read_csv(os.path.join(root, 'sample_submission.csv'))

CPU times: user 30 s, sys: 6.18 s, total: 36.2 s
Wall time: 35.4 s


In [4]:
# Report the (rows, columns) shape of each loaded frame, one line per frame.
size_report = (
    ('Size of train_df data', train_df),
    ('Size of weather_train_df data', weather_train_df),
    ('Size of weather_test_df data', weather_test_df),
    ('Size of building_meta_df data', building_meta_df),
)
for message, frame in size_report:
    print(message, frame.shape)

Size of train_df data (20216100, 4)
Size of weather_train_df data (139773, 9)
Size of weather_test_df data (277243, 9)
Size of building_meta_df data (1449, 6)


In [5]:
## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered; every other column (object, datetime, int8, unsigned, ...)
    is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified IN PLACE (columns are reassigned).
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for call-chaining convenience.

    Notes
    -----
    Float columns are downcast based on their value RANGE only, so a column may
    end up as float16, which keeps roughly three significant decimal digits.
    Values are therefore rounded; do not use this on columns that need full
    precision.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype's min/max still
                # fits (e.g. 127 is a valid int8). For an empty column min/max
                # are NaN, every comparison is False, and the dtype is kept.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN: use full precision.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [6]:
# Shrink every loaded frame. reduce_mem_usage prints one summary line per call,
# so the call order below determines the order of the printed lines.
train_df, test_df = map(reduce_mem_usage, (train_df, test_df))

weather_train_df, weather_test_df, building_meta_df = map(
    reduce_mem_usage, (weather_train_df, weather_test_df, building_meta_df)
)

Mem. usage decreased to 289.19 Mb (53.1% reduction)
Mem. usage decreased to 596.49 Mb (53.1% reduction)
Mem. usage decreased to  3.07 Mb (68.1% reduction)
Mem. usage decreased to  6.08 Mb (68.1% reduction)
Mem. usage decreased to  0.03 Mb (60.3% reduction)


In [21]:
train_df.head(60)

Unnamed: 0,building_id,meter,timestamp,meter_reading
0,0,0,2016-01-01,0.0
1,1,0,2016-01-01,0.0
2,2,0,2016-01-01,0.0
3,3,0,2016-01-01,0.0
4,4,0,2016-01-01,0.0
5,5,0,2016-01-01,0.0
6,6,0,2016-01-01,0.0
7,7,0,2016-01-01,0.0
8,8,0,2016-01-01,0.0
9,9,0,2016-01-01,0.0


In [30]:
test_df.head()

Unnamed: 0,row_id,building_id,meter,timestamp
0,0,0,0,2017-01-01 00:00:00
1,1,1,0,2017-01-01 00:00:00
2,2,2,0,2017-01-01 00:00:00
3,3,3,0,2017-01-01 00:00:00
4,4,4,0,2017-01-01 00:00:00


In [49]:
# NOTE: this preview must run before the weather-merge cell further down, which
# ends by deleting weather_train_df; running it afterwards raises NameError.
weather_train_df.head()

NameError: name 'weather_train_df' is not defined

In [32]:
weather_test_df.head()

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2017-01-01 00:00:00,17.796875,4.0,11.703125,,1021.5,100.0,3.599609
1,0,2017-01-01 01:00:00,17.796875,2.0,12.796875,0.0,1022.0,130.0,3.099609
2,0,2017-01-01 02:00:00,16.09375,0.0,12.796875,0.0,1022.0,140.0,3.099609
3,0,2017-01-01 03:00:00,17.203125,0.0,13.296875,0.0,1022.0,140.0,3.099609
4,0,2017-01-01 04:00:00,16.703125,2.0,13.296875,0.0,1022.5,130.0,2.599609


In [33]:
building_meta_df.head()

Unnamed: 0,site_id,building_id,primary_use,square_feet,year_built,floor_count
0,0,0,Education,7432,2008.0,
1,0,1,Education,2720,2004.0,
2,0,2,Education,5376,1991.0,
3,0,3,Education,23685,2002.0,
4,0,4,Education,116607,1975.0,


### Merging building metadata into train/test (merge on keys, then concat)

In [34]:
# Make sure the 'timestamp' column of every frame is datetime-typed so the
# later merge on ['site_id', 'timestamp'] compares like with like.
for frame in (train_df, test_df, weather_train_df, weather_test_df):
    frame['timestamp'] = pd.to_datetime(frame['timestamp'])

In [35]:
# Attach building metadata to every row of train_df / test_df.
# Only the key column is merged and the result is then concatenated column-wise
# (presumably to avoid copying the full frame inside merge -- confirm).
# Column-wise concat is only correct if the merge result has exactly one row
# per input row, in the same order, with the same index:
#   * validate='many_to_one' raises MergeError if building_id is duplicated in
#     building_meta_df (which would multiply rows and shift everything);
#   * the index is copied over explicitly so concat never re-aligns by label.
temp_df = train_df[['building_id']]
temp_df = temp_df.merge(building_meta_df, on=['building_id'], how='left',
                        validate='many_to_one')
del temp_df['building_id']
temp_df.index = train_df.index
train_df = pd.concat([train_df, temp_df], axis=1)

temp_df = test_df[['building_id']]
temp_df = temp_df.merge(building_meta_df, on=['building_id'], how='left',
                        validate='many_to_one')

del temp_df['building_id']
temp_df.index = test_df.index
test_df = pd.concat([test_df, temp_df], axis=1)
# Free the intermediates; this cell cannot be re-run without reloading the data.
del temp_df, building_meta_df

In [36]:
train_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count
0,0,0,2016-01-01,0.0,0,Education,7432,2008.0,
1,1,0,2016-01-01,0.0,0,Education,2720,2004.0,
2,2,0,2016-01-01,0.0,0,Education,5376,1991.0,
3,3,0,2016-01-01,0.0,0,Education,23685,2002.0,
4,4,0,2016-01-01,0.0,0,Education,116607,1975.0,


In [37]:
test_df.head()

Unnamed: 0,row_id,building_id,meter,timestamp,site_id,primary_use,square_feet,year_built,floor_count
0,0,0,0,2017-01-01,0,Education,7432,2008.0,
1,1,1,0,2017-01-01,0,Education,2720,2004.0,
2,2,2,0,2017-01-01,0,Education,5376,1991.0,
3,3,3,0,2017-01-01,0,Education,23685,2002.0,
4,4,4,0,2017-01-01,0,Education,116607,1975.0,


### Merging weather data into train/test (merge on keys, then concat)

In [38]:
# Attach the weather columns to every row of train_df / test_df, matched on
# (site_id, timestamp). Only the key columns are merged and the result is then
# concatenated column-wise, which is only correct if the merge yields exactly
# one row per input row, in the same order, with the same index:
#   * validate='many_to_one' raises MergeError if a (site_id, timestamp) pair
#     is duplicated in the weather frame (which would multiply rows);
#   * the index is copied over explicitly so concat never re-aligns by label.
# Rows with no matching weather record keep NaN in the weather columns.
temp_df = train_df[['site_id','timestamp']]
temp_df = temp_df.merge(weather_train_df, on=['site_id','timestamp'], how='left',
                        validate='many_to_one')

del temp_df['site_id'], temp_df['timestamp']
temp_df.index = train_df.index
train_df = pd.concat([train_df, temp_df], axis=1)

temp_df = test_df[['site_id','timestamp']]
temp_df = temp_df.merge(weather_test_df, on=['site_id','timestamp'], how='left',
                        validate='many_to_one')

del temp_df['site_id'], temp_df['timestamp']
temp_df.index = test_df.index
test_df = pd.concat([test_df, temp_df], axis=1)

# Free the intermediates; this cell cannot be re-run without reloading the data.
del temp_df, weather_train_df, weather_test_df

In [39]:
train_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,2016-01-01,0.0,0,Education,7432,2008.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
1,1,0,2016-01-01,0.0,0,Education,2720,2004.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
2,2,0,2016-01-01,0.0,0,Education,5376,1991.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
3,3,0,2016-01-01,0.0,0,Education,23685,2002.0,,25.0,6.0,20.0,,1019.5,0.0,0.0
4,4,0,2016-01-01,0.0,0,Education,116607,1975.0,,25.0,6.0,20.0,,1019.5,0.0,0.0


In [40]:
test_df.head()

Unnamed: 0,row_id,building_id,meter,timestamp,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,0,2017-01-01,0,Education,7432,2008.0,,17.796875,4.0,11.703125,,1021.5,100.0,3.599609
1,1,1,0,2017-01-01,0,Education,2720,2004.0,,17.796875,4.0,11.703125,,1021.5,100.0,3.599609
2,2,2,0,2017-01-01,0,Education,5376,1991.0,,17.796875,4.0,11.703125,,1021.5,100.0,3.599609
3,3,3,0,2017-01-01,0,Education,23685,2002.0,,17.796875,4.0,11.703125,,1021.5,100.0,3.599609
4,4,4,0,2017-01-01,0,Education,116607,1975.0,,17.796875,4.0,11.703125,,1021.5,100.0,3.599609


In [41]:
# Shapes after both merges: train first, then test.
for frame in (train_df, test_df):
    print(frame.shape)

(20216100, 16)
(41697600, 16)


In [42]:
# Missing values per column of train_df, largest first.
# The null mask is computed once; the percentage is derived from the same
# counts, so both columns of the summary share one index and one ordering.
total = train_df.isnull().sum().sort_values(ascending = False)
percent = total / len(train_df) * 100
missing_train_data  = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_train_data

Unnamed: 0,Total,Percent
floor_count,16709167,82.652772
year_built,12127645,59.990033
cloud_coverage,8825365,43.655131
precip_depth_1_hr,3749023,18.544739
wind_direction,1449048,7.167792
sea_level_pressure,1231669,6.092515
wind_speed,143676,0.710701
dew_temperature,100140,0.495348
air_temperature,96658,0.478124
square_feet,0,0.0


In [43]:
# Drop the three columns with the most missing values (see the summary above).
# errors='ignore' makes the cell safe to re-run: once the columns are gone a
# second execution is a no-op instead of raising KeyError.
# NOTE(review): only train_df is changed here; test_df keeps these columns --
# confirm that is intended before fitting/predicting with both frames.
train_df = train_df.drop(columns=['floor_count', 'year_built', 'cloud_coverage'],
                         errors='ignore')

In [21]:
# drop missing value by rows
# train_df = train_df.dropna()

In [44]:
# check shape after drop
train_df.shape

(20216100, 13)

In [45]:
# check missing value again
train_df.isnull().sum().sort_values(ascending = False)

precip_depth_1_hr     3749023
wind_direction        1449048
sea_level_pressure    1231669
wind_speed             143676
dew_temperature        100140
air_temperature         96658
square_feet                 0
primary_use                 0
site_id                     0
meter_reading               0
timestamp                   0
meter                       0
building_id                 0
dtype: int64

In [50]:
train_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,air_temperature,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,2016-01-01,0.0,0,Education,7432,25.0,20.0,,1019.5,0.0,0.0
1,1,0,2016-01-01,0.0,0,Education,2720,25.0,20.0,,1019.5,0.0,0.0
2,2,0,2016-01-01,0.0,0,Education,5376,25.0,20.0,,1019.5,0.0,0.0
3,3,0,2016-01-01,0.0,0,Education,23685,25.0,20.0,,1019.5,0.0,0.0
4,4,0,2016-01-01,0.0,0,Education,116607,25.0,20.0,,1019.5,0.0,0.0


In [35]:
# y = train_df['meter_reading']
# X = train_df.drop(['meter_reading'], axis=1)

KeyError: 'meter_reading'

### Fitting a basic linear regression with no dynamic covariates

In [46]:
# Settings for the regression example below. The meanings are inferred from the
# names only (sizes, covariate counts, error parameters) -- TODO confirm against
# the cells that consume them.
n_locations, n_timesteps = 20, 100
p_static, p_dynamic = 3, 4
error_sd, error_corr = 1.5, 0.3