### Постановка задачи
Заполним отсутствующие значения по погоде интерполяционными данными.

Для точки росы вычтем температуру воздуха, направление ветра разложим на синус и косинус, для температуры воздуха вычислим  первую и вторую производные. Также введем параметры по праздничным дням, дням недели, месяцам и неделям года.

Посчитаем модель линейной регрессии по первым 20 зданиям и найдем ее точность. Проверим, какой набор параметров позволяет улучшить точность.

Данные:
* http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz
* http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz
* http://video.ittensive.com/machine-learning/ashrae/train.0.csv.gz
Соревнование: https://www.kaggle.com/c/ashrae-energy-prediction/

© ITtensive, 2020

In [1]:
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import numpy as np
from scipy.interpolate import interp1d
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
def reduce_memory(df):
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type)[:5] == 'float':
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.finfo('f2').min and c_max < np.finfo('f2').max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo('f4').min and c_max < np.finfo('f4').max():
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        elif str(col_type)[:3] == 'int':
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.iinfo('i1').min and c_max < np.iinfo('i1').max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo('i2').min and c_max < np.iinfo('i2').max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo('i4').min and c_max < np.iinfo('i4').max:
                df[col] = df[col].astype(np.int32)
            elif c_min > np.iinfo('i8').min and c_max < np.iinfo('i8').max:
                df[col] = df[col].astype(np.int64)
        elif col == 'timestamp':
            df[col] = pd.to_datetime(df[col])
        elif str(col_type)[:8] != "datetime":
            df[col] = df[col].astype("category")
    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Потребление памяти меньше на', round(start_mem - end_mem, 2), 'Мб (минус', round(100 * (start_mem - end_mem) / start_mem, 1), '%)')
    return df

In [13]:
buildings = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz")
weather = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz")
energy = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/train.0.csv.gz")

In [14]:
weather = weather[weather['site_id'] == 0]
energy = energy[energy['building_id'] < 20]
energy = pd.merge(left=energy,right=buildings,how='left',right_on='building_id',left_on='building_id')
del buildings


In [15]:
weather['precip_depth_1_hr'] = weather['precip_depth_1_hr'].apply(lambda x: x if x > 0 else 0)
interpolate_columns = ['air_temperature','dew_temperature','cloud_coverage','wind_speed',
                       'wind_direction','precip_depth_1_hr','sea_level_pressure']
for col in interpolate_columns:
    weather[col] = weather[col].interpolate(limit_direction='both',kind='cubic')

In [16]:
weather['wind_direction_rad'] = weather['wind_direction'] / np.pi
weather['wind_direction_sin'] = np.sin(weather['wind_direction_rad'])
weather['wind_direction_cos'] = np.cos(weather['wind_direction_rad'])
weather['air_temperature_diff1'] = weather['air_temperature'].diff()
weather.at[0,"air_temperature_diff1"] = weather.at[1,"air_temperature_diff1"]
weather['air_temperature_diff2'] = weather['air_temperature_diff1'].diff()
weather.at[0,"air_temperature_diff2"] = weather.at[1,"air_temperature_diff2"]

In [17]:
energy = energy.set_index(['timestamp','site_id'])
weather = weather.set_index(['timestamp','site_id'])
energy = pd.merge(left=energy,right=weather,how='left',left_index=True,right_index=True)
energy.reset_index(inplace=True)
energy = energy.drop(['meter','site_id','year_built','square_feet','floor_count'],axis=1)
del weather
energy = reduce_memory(energy)

Потребление памяти меньше на 15.41 Мб (минус 71.9 %)


In [18]:
energy['hour'] = energy['timestamp'].dt.hour.astype('int8')
energy['weekday'] = energy['timestamp'].dt.weekday.astype('int8')
energy['week'] = energy['timestamp'].dt.week.astype('int8')
energy['month'] = energy['timestamp'].dt.month.astype('int8')
energy['date'] = pd.to_datetime(energy['timestamp'].dt.date)
dates_range = pd.date_range(start='2015-12-31',end='2017-01-01')
us_holidays = calendar().holidays(start=dates_range.min(),end=dates_range.max())
energy['is_holiday'] = energy['date'].isin(us_holidays).astype('int8')

for weekday in range(0,7):
    energy['is_wday' + str(weekday)] = energy['weekday'].isin([weekday]).astype('int8')

for week in range(1,54):
    energy['is_w' + str(week)] = energy['week'].isin([week]).astype('int8')
    
for month in range(1,13):
    energy['is_m' + str(month)] = energy['month'].isin([month]).astype('int8')
    
    

In [22]:
energy.columns

Index(['timestamp', 'building_id', 'meter_reading', 'primary_use',
       'air_temperature', 'cloud_coverage', 'dew_temperature',
       'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
       'wind_speed', 'wind_direction_rad', 'wind_direction_sin',
       'wind_direction_cos', 'air_temperature_diff1', 'air_temperature_diff2',
       'hour', 'weekday', 'week', 'month', 'date', 'is_holiday', 'is_wday0',
       'is_wday1', 'is_wday2', 'is_wday3', 'is_wday4', 'is_wday5', 'is_wday6',
       'is_w1', 'is_w2', 'is_w3', 'is_w4', 'is_w5', 'is_w6', 'is_w7', 'is_w8',
       'is_w9', 'is_w10', 'is_w11', 'is_w12', 'is_w13', 'is_w14', 'is_w15',
       'is_w16', 'is_w17', 'is_w18', 'is_w19', 'is_w20', 'is_w21', 'is_w22',
       'is_w23', 'is_w24', 'is_w25', 'is_w26', 'is_w27', 'is_w28', 'is_w29',
       'is_w30', 'is_w31', 'is_w32', 'is_w33', 'is_w34', 'is_w35', 'is_w36',
       'is_w37', 'is_w38', 'is_w39', 'is_w40', 'is_w41', 'is_w42', 'is_w43',
       'is_w44', 'is_w45', 'is_w46', 'i

In [23]:
energy['meter_reading_log'] = np.log(energy['meter_reading'] + 1)

In [24]:
energy_train,energy_test = train_test_split(energy[energy['meter_reading'] > 0],test_size=0.2)

In [31]:
energy_train_lr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86858 entries, 135668 to 154642
Data columns (total 83 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   meter_reading_log      86858 non-null  float16
 1   hour                   86858 non-null  int8   
 2   building_id            86858 non-null  int8   
 3   air_temperature        86858 non-null  float16
 4   dew_temperature        86858 non-null  float16
 5   sea_level_pressure     86858 non-null  float16
 6   win_speed              0 non-null      float64
 7   cloud_coverage         86858 non-null  float16
 8   air_temperature_diff1  86858 non-null  float16
 9   air_temperature_diff2  86858 non-null  float16
 10  is_holiday             86858 non-null  int8   
 11  is_wday0               86858 non-null  int8   
 12  is_wday1               86858 non-null  int8   
 13  is_wday2               86858 non-null  int8   
 14  is_wday3               86858 non-null  int8   
 

In [40]:
hours = range(0,24)
buildings = range(0,energy_train['building_id'].max() + 1)
lr_columns = ['meter_reading_log','hour','building_id','air_temperature',
              'dew_temperature','sea_level_pressure','wind_speed','cloud_coverage',
             'air_temperature_diff1','air_temperature_diff2',
              'wind_direction_cos','wind_direction_sin',
              'is_holiday']
for wday in range(0,7):
    lr_columns.append('is_wday' + str(wday))
for week in range(1,54):
    lr_columns.append('is_w' + str(week))
for month in range(1,13):
    lr_columns.append('is_m' + str(month))
    
energy_train_lr = pd.DataFrame(energy_train,columns=lr_columns) 
energy_lr = [[]] * len(buildings)
for building in buildings:
    energy_lr[building] = [[]] * len(hours)
    energy_train_b = energy_train_lr[energy_train_lr['building_id'] == building]
    for hour in hours:
        energy_train_bh = energy_train_b[energy_train_b['hour'] == hour]
        x = energy_train_bh.drop(['meter_reading_log','hour','building_id'],axis=1)
        y = energy_train_bh['meter_reading_log']
        model = LinearRegression(fit_intercept=False).fit(x,y)
        energy_lr[building][hour] = model.coef_
        energy_lr[building][hour] = np.append(energy_lr[building][hour],model.intercept_)
print(energy_lr[0])
        

[array([ 3.68121779e-03,  1.71376765e-02, -9.04934853e-03,  7.47642666e-03,
       -2.04044729e-02, -9.57933441e-03, -2.93863378e-03, -3.13054770e-02,
        4.60723341e-02, -4.02529426e-02,  6.85562325e+00,  6.85794115e+00,
        6.84704494e+00,  6.83240557e+00,  6.86404276e+00,  6.82054949e+00,
        6.82175684e+00, -1.14679337e-04, -4.24385071e-05,  1.12175941e-04,
       -2.38418579e-05, -2.43186951e-05, -2.19345093e-05,  1.28746033e-05,
        1.26361847e-05, -1.23977661e-05,  1.23977661e-05,  6.67572021e-06,
        3.81469727e-06,  2.86102295e-06, -1.04904175e-05, -1.90734863e-06,
        5.24520874e-06, -3.33786011e-06,  4.76837158e-07, -4.76837158e-07,
        1.75670898e+00,  1.56272912e+00,  1.42902613e+00,  1.58432186e+00,
        1.69231439e+00,  1.56879675e+00,  1.52951384e+00,  1.48247445e+00,
        1.75396430e+00,  1.67050016e+00,  1.58487904e+00,  1.48866582e+00,
        1.49665737e+00,  1.54965007e+00,  1.32581711e+00,  1.28616536e+00,
        1.33269906e+00, 

In [41]:
def calculate_model(x):
    lr = -1
    model = energy_lr[x.building_id][x.hour]
    if len(model) > 0:
        lr = np.sum([x[col] * model[i] for i,col in enumerate(lr_columns[3:])])
        lr += model[len(lr_columns) - 3]
        lr = np.exp(lr)
    else:
        lr = 0
        
    
    x['meter_reading_lr_q'] = (np.log(x.meter_reading + 1) - np.log(lr + 1)) ** 2
    return x

In [42]:
energy_test = energy_test.apply(calculate_model,axis=1,result_type='expand')
np.sqrt(energy_test['meter_reading_lr_q'].sum() / len(energy_test))

0.19644320176167857

In [3]:
energy_lr[1][1]

NameError: name 'energy_lr' is not defined