In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [3]:
buildings = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz")

In [4]:
buildings.columns

Index(['site_id', 'building_id', 'primary_use', 'square_feet', 'year_built',
       'floor_count'],
      dtype='object')

In [5]:
weather = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz")

In [6]:
weather.columns

Index(['site_id', 'timestamp', 'air_temperature', 'cloud_coverage',
       'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure',
       'wind_direction', 'wind_speed'],
      dtype='object')

In [7]:
energy = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/train.0.0.csv.gz")

In [8]:
energy.columns

Index(['building_id', 'meter', 'timestamp', 'meter_reading'], dtype='object')

In [9]:
energy = energy[energy['meter_reading'] > 0]

In [10]:
energy = pd.merge(left=energy,right=buildings,how='left',left_on='building_id',right_on='building_id')

In [11]:
energy.set_index(['timestamp','site_id'],inplace=True)

In [12]:
weather.set_index(['timestamp','site_id'],inplace=True)

In [13]:
energy = pd.merge(left=energy,right=weather,how='left',right_index=True,left_index=True)

In [14]:
energy.reset_index(inplace=True)

In [16]:
energy.head()

Unnamed: 0,timestamp,site_id,building_id,meter,meter_reading,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,2016-01-30 08:00:00,0,0,0,43.6839,Education,7432,2008.0,,8.3,,6.1,0.0,1019.0,220.0,2.1
1,2016-01-31 05:00:00,0,0,0,37.5408,Education,7432,2008.0,,12.8,,10.0,0.0,1021.9,0.0,0.0
2,2016-01-31 17:00:00,0,0,0,52.5571,Education,7432,2008.0,,20.6,,11.7,0.0,1020.9,110.0,1.5
3,2016-04-08 14:00:00,0,0,0,59.3827,Education,7432,2008.0,,21.7,2.0,14.4,0.0,1015.1,250.0,3.1
4,2016-05-01 19:00:00,0,0,0,448.0,Education,7432,2008.0,,31.1,,17.2,0.0,1016.1,100.0,4.1


In [30]:
energy.isnull().sum().sort_values(ascending=False)

floor_count           5411
wind_direction         175
wind_speed               0
sea_level_pressure       0
precip_depth_1_hr        0
dew_temperature          0
cloud_coverage           0
air_temperature          0
year_built               0
square_feet              0
primary_use              0
meter_reading            0
meter                    0
building_id              0
site_id                  0
timestamp                0
dtype: int64

In [24]:
energy[energy['cloud_coverage'] > 0]['cloud_coverage'].mean()

3.604128271286399

In [25]:
energy['cloud_coverage'].fillna(0,inplace=True)

In [28]:
sea_pres_mean = energy['sea_level_pressure'].mean()

In [29]:
energy['sea_level_pressure'].fillna(sea_pres_mean,inplace=True)

In [34]:
energy['wind_direction'].fillna(0,inplace=True)

In [38]:
energy[energy['floor_count'].notnull()]

Unnamed: 0,timestamp,site_id,building_id,meter,meter_reading,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed


In [41]:
energy = energy.drop('floor_count',axis=1)

In [42]:
energy['timestamp'] = pd.to_datetime(energy['timestamp'])

In [43]:
energy['hour'] = energy['timestamp'].dt.hour

In [44]:
X_train,X_test = train_test_split(energy,test_size=0.2)

In [45]:
X_train_lr = X_train[['air_temperature','dew_temperature','sea_level_pressure','wind_speed','cloud_coverage','meter_reading']]

In [46]:
X_train_hour_averages = X_train.groupby('hour').mean()['meter_reading']

In [47]:
X = X_train_lr.drop('meter_reading',axis=1)

In [48]:
y = X_train_lr['meter_reading']

In [49]:
model = LinearRegression()

In [50]:
model.fit(X,y)

LinearRegression()

In [53]:
X.columns

Index(['air_temperature', 'dew_temperature', 'sea_level_pressure',
       'wind_speed', 'cloud_coverage'],
      dtype='object')

In [51]:
model.coef_,model.intercept_

(array([ 2.66424172,  3.42188365, -0.82631246, -2.43982483, -0.52646591]),
 953.8952144231581)

In [62]:
def my_predict(x):
    meter_log = np.log(x.meter_reading + 1)
    meter_lr = np.log(x.air_temperature * model.coef_[0] + x.dew_temperature * model.coef_[1] + 
                      x.sea_level_pressure * model.coef_[2] + x.wind_speed * model.coef_[3] + x.cloud_coverage * model.coef_[4] + model.intercept_)
    x['meter_reading_LR_q'] = (meter_log - meter_lr) ** 2
    return x

In [63]:
X_test = X_test.apply(my_predict,axis=1,result_type='expand')

In [64]:
X_test_lr_rmsle = np.sqrt(X_test['meter_reading_LR_q'].sum() / len(X_test))

In [65]:
X_test_lr_rmsle

0.20168291962017953