### Постановка задачи
Построить модель линейной регрессии энергопотребления здания, используя температуру воздуха (air_temperature) и влажность (dew_temperature).

Рассчитать качество построенной модели по проверочным данным.

Данные:
* http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz
* http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz
* http://video.ittensive.com/machine-learning/ashrae/train.0.0.csv.gz
Соревнование: https://www.kaggle.com/c/ashrae-energy-prediction/

© ITtensive, 2020

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [3]:
buildings = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz")

In [4]:
weather = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz")

In [5]:
energy = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/train.0.0.csv.gz")

In [7]:
buildings.head()

Unnamed: 0,site_id,building_id,primary_use,square_feet,year_built,floor_count
0,0,0,Education,7432,2008.0,
1,0,1,Education,2720,2004.0,
2,0,2,Education,5376,1991.0,
3,0,3,Education,23685,2002.0,
4,0,4,Education,116607,1975.0,


In [8]:
energy.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading
0,0,0,2016-01-01 00:00:00,0.0
1,0,0,2016-01-01 01:00:00,0.0
2,0,0,2016-01-01 02:00:00,0.0
3,0,0,2016-01-01 03:00:00,0.0
4,0,0,2016-01-01 04:00:00,0.0


In [9]:
energy = pd.merge(left=energy,right=buildings,how='left',left_on='building_id',right_on='building_id')

In [10]:
energy.set_index(['timestamp','site_id'],inplace=True)

In [11]:
weather.set_index(['timestamp','site_id'],inplace=True)

In [12]:
energy = pd.merge(left=energy,right=weather,how='left',right_index=True,left_index=True)

In [13]:
energy.reset_index(inplace=True)

In [14]:
energy = energy[energy['meter_reading'] > 0]

In [15]:
energy.head()

Unnamed: 0,timestamp,site_id,building_id,meter,meter_reading,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
704,2016-01-30 08:00:00,0,0,0,43.6839,Education,7432,2008.0,,8.3,,6.1,0.0,1019.0,220.0,2.1
725,2016-01-31 05:00:00,0,0,0,37.5408,Education,7432,2008.0,,12.8,,10.0,0.0,1021.9,0.0,0.0
737,2016-01-31 17:00:00,0,0,0,52.5571,Education,7432,2008.0,,20.6,,11.7,0.0,1020.9,110.0,1.5
2366,2016-04-08 14:00:00,0,0,0,59.3827,Education,7432,2008.0,,21.7,2.0,14.4,0.0,1015.1,250.0,3.1
2923,2016-05-01 19:00:00,0,0,0,448.0,Education,7432,2008.0,,31.1,,17.2,0.0,1016.1,100.0,4.1


In [16]:
energy['timestamp'] = pd.to_datetime(energy['timestamp'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  energy['timestamp'] = pd.to_datetime(energy['timestamp'])


In [17]:
energy['hour'] = energy['timestamp'].dt.hour

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  energy['hour'] = energy['timestamp'].dt.hour


In [18]:
X_train,X_test = train_test_split(energy,test_size=0.2)

In [20]:
X_train_averages = X_train.groupby('hour').mean()['meter_reading']

X_train_lr = pd.DataFrame(X_train,columns=['meter_reading','air_temperature','dew_temperature'])

In [21]:
X = X_train_lr.drop('meter_reading',axis=1)

In [22]:
y = X_train_lr['meter_reading']

In [23]:
model = LinearRegression()

In [24]:
model.fit(X,y)

LinearRegression()

In [25]:
model.coef_

array([2.06692378, 4.26052992])

In [26]:
model.intercept_

102.93386674289593

In [29]:
def calculate_model(x):
    meter_reading_log = np.log(x.meter_reading + 1)
    meter_reading_mean = np.log(X_train_averages[x.hour] + 1)
    meter_reading_lr = np.log(x.air_temperature * model.coef_[0] + x.dew_temperature * model.coef_[1] + model.intercept_)
    
    x['meter_reading_lr_q'] = (meter_reading_log - meter_reading_lr) ** 2
    x['meter_reading_mean_q'] = (meter_reading_log - meter_reading_mean) ** 2
    
    return x
    

In [30]:
X_test = X_test.apply(calculate_model,axis=1,result_type='expand')

In [31]:
X_test_lr_rmsle = np.sqrt(X_test['meter_reading_lr_q'].sum() / len(X_test))

In [32]:
X_test_mean_rmsle = np.sqrt(X_test['meter_reading_mean_q'].sum() / len(X_test))

In [33]:
X_test_lr_rmsle

0.2166997728531764

In [34]:
X_test_mean_rmsle

0.25279498755566365