### Постановка задачи
Рассмотрим несколько моделей линейной регрессии, чтобы выяснить более оптимальную для первых 20 зданий.

Данные:
* http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz
* http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz
* http://video.ittensive.com/machine-learning/ashrae/train.0.csv.gz
Соревнование: https://www.kaggle.com/c/ashrae-energy-prediction/

© ITtensive, 2020

In [1]:
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import numpy as np
from scipy.interpolate import interp1d
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, BayesianRidge

In [2]:
def reduce_mem_usage (df):
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type)[:5] == "float":
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.finfo("f2").min and c_max < np.finfo("f2").max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo("f4").min and c_max < np.finfo("f4").max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        elif str(col_type)[:3] == "int":
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.iinfo("i1").min and c_max < np.iinfo("i1").max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo("i2").min and c_max < np.iinfo("i2").max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo("i4").min and c_max < np.iinfo("i4").max:
                df[col] = df[col].astype(np.int32)
            elif c_min > np.iinfo("i8").min and c_max < np.iinfo("i8").max:
                df[col] = df[col].astype(np.int64)
        elif col == "timestamp":
            df[col] = pd.to_datetime(df[col])
        elif str(col_type)[:8] != "datetime":
            df[col] = df[col].astype("category")
    end_mem = df.memory_usage().sum() / 1024**2
    print('Потребление памяти меньше на', round(start_mem - end_mem, 2), 'Мб (минус', round(100 * (start_mem - end_mem) / start_mem, 1), '%)')
    return df

In [3]:
buildings = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz")
weather = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz")
energy = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/train.0.csv.gz")
energy = energy[(energy["building_id"]<20)]
energy = pd.merge(left=energy, right=buildings, how="left",
                   left_on="building_id", right_on="building_id")
energy = energy.set_index(["timestamp", "site_id"])
weather = weather.set_index(["timestamp", "site_id"])
energy = pd.merge(left=energy, right=weather, how="left",
                  left_index=True, right_index=True)
energy.reset_index(inplace=True)
energy = energy.drop(columns=["meter", "site_id", "year_built",
                              "square_feet", "floor_count"], axis=1)
del buildings
del weather
energy = reduce_mem_usage(energy)
print (energy.info())

Потребление памяти меньше на 10.39 Мб (минус 70.5 %)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175680 entries, 0 to 175679
Data columns (total 11 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   timestamp           175680 non-null  datetime64[ns]
 1   building_id         175680 non-null  int8          
 2   meter_reading       175680 non-null  float16       
 3   primary_use         175680 non-null  category      
 4   air_temperature     175620 non-null  float16       
 5   cloud_coverage      99080 non-null   float16       
 6   dew_temperature     175620 non-null  float16       
 7   precip_depth_1_hr   175660 non-null  float16       
 8   sea_level_pressure  173980 non-null  float16       
 9   wind_direction      170680 non-null  float16       
 10  wind_speed          175680 non-null  float16       
dtypes: category(1), datetime64[ns](1), float16(8), int8(1)
memory usage: 4.4 MB
None


Получите данные по энергопотреблению первых 20 зданий (building_id от 0 до 19).

Заполните отсутствующие значения по погоде интерполяционными данными.

Разделите данные на обучающие/проверочные в пропорции 80/20.

Постройте (1) первый набор моделей линейной регрессии по часам для каждого из первых 20 зданий по следующим параметрам: air_temperature, dew_temperature, cloud_coverage, wind_speed, sea_level_pressure.

Постройте для этих же 20 зданий (2) второй набор моделей линейной регрессии по часам по параметрам: дни недели и праздники (is_holiday). Требуется построить еще 480 моделей.

Используйте логарифм целевого показателя (meter_reading_log) для обоих наборов моделей.

In [4]:
energy_lr = []
energy_ridge = []
temp_columns = ['meter_reading_log','sea_level_pressure','air_temperature',
                'dew_temperature','cloud_coverage','wind_speed','building_id']
energy["hour"] = energy["timestamp"].dt.hour.astype("int8")
energy["weekday"] = energy["timestamp"].dt.weekday.astype("int8")
for weekday in range(0,7):
    energy['is_wday' + str(weekday)] = energy['weekday'].isin([weekday]).astype("int8")
energy["date"] = pd.to_datetime(energy["timestamp"].dt.date)
dates_range = pd.date_range(start='2015-12-31', end='2017-01-01')
us_holidays = calendar().holidays(start=dates_range.min(),
                                  end=dates_range.max())
energy['is_holiday'] = energy['date'].isin(us_holidays).astype("int8")
energy["meter_reading_log"] = np.log(energy["meter_reading"] + 1)
for col in temp_columns:
    energy[col] = energy[col].interpolate(limit_direction='both',kind='cubic')

In [5]:
energy_train,energy_test = train_test_split(energy[energy['meter_reading'] > 0],test_size=0.2)

In [6]:
models = {
    'LinearRegression':LinearRegression,
    'Ridge-0.1':Ridge,
    'Ridge-1.0':Ridge,
    'Ridge-0.01':Ridge
}
energy_train_lr = pd.DataFrame(energy_train,columns=['meter_reading_log','sea_level_pressure','air_temperature',
                'dew_temperature','cloud_coverage','wind_speed','building_id','hour'])

In [7]:
from sklearn.metrics import *

In [8]:
hours = range(0,24)
temp_models_scores = {}
lr_model = None
buildings = range(0,energy_train_lr['building_id'].max() + 1)
for model in models:
    energy_lr_scores = [[]]*len(buildings)
    for building in buildings:
        energy_lr_scores[building] = [0] * len(hours)
        energy_train_b = energy_train_lr[energy_train_lr['building_id'] == building]
        for hour in hours:
            energy_train_bh = energy_train_b[energy_train_b['hour'] == hour]
            y = energy_train_bh['meter_reading_log']
            x = energy_train_bh.drop(['meter_reading_log','building_id','hour'],axis=1)
            if model == 'LinearRegression':
                lr_model = LinearRegression(fit_intercept=False).fit(x,y)
            elif model == 'Ridge-0.1':
                lr_model = Ridge(alpha=0.1,fit_intercept=False).fit(x,y)
            elif model == 'Ridge-1.0':
                lr_model = Ridge(alpha=1,fit_intercept=False).fit(x,y)
            else:
                lr_model = Ridge(alpha=0.01,fit_intercept=False).fit(x,y)
            energy_lr_scores[building] = r2_score(y,lr_model.predict(x))
    temp_models_scores[model] = np.mean(energy_lr_scores)

In [9]:
temp_models_scores

{'LinearRegression': 0.25188455048951247,
 'Ridge-0.1': 0.2518845790846912,
 'Ridge-1.0': 0.2518844785576621,
 'Ridge-0.01': 0.25188458009377884}

In [None]:
energy["hour"] = energy["timestamp"].dt.hour.astype("int8")
energy["weekday"] = energy["timestamp"].dt.weekday.astype("int8")
for weekday in range(0,7):
    energy['is_wday' + str(weekday)] = energy['weekday'].isin([weekday]).astype("int8")
energy["date"] = pd.to_datetime(energy["timestamp"].dt.date)
dates_range = pd.date_range(start='2015-12-31', end='2017-01-01')
us_holidays = calendar().holidays(start=dates_range.min(),
                                  end=dates_range.max())
energy['is_holiday'] = energy['date'].isin(us_holidays).astype("int8")
energy["meter_reading_log"] = np.log(energy["meter_reading"] + 1)

In [None]:
energy_train,energy_test = train_test_split(energy[energy['meter_reading'] > 0],test_size=0.2)

In [None]:
hours = range(0,24)
buildings = range(0,energy_train['building_id'].max() + 1)
time_models_scores = {}
time_model = None
time_columns = ["meter_reading_log", "hour", "building_id", "is_holiday"]
for wday in range(0,7):
    time_columns.append("is_wday" + str(wday))
len(time_columns)
energy_train_time = pd.DataFrame(energy_train,columns=time_columns)

In [None]:
energy_train_time.head()

In [None]:
for model in models:
    energy_time_scores = [[]] * len(buildings)
    for building in buildings:
        energy_time_scores[building] = [0]*len(hours)
        energy_train_time_b = energy_train_time[energy_train_time['building_id'] == building]
        for hour in hours:
            energy_train_time_bh = energy_train_time_b[energy_train_time_b['hour'] == hour]
            y = energy_train_time_bh['meter_reading_log']
            x = energy_train_time_bh.drop(['meter_reading_log','hour','building_id'],axis=1)
            if model == 'LinearRegression':
                time_model = LinearRegression(fit_intercept=False).fit(x,y)
            elif model == 'Ridge-0.1':
                time_model = Ridge(alpha=0.1,fit_intercept=False).fit(x,y)
            elif model == 'Ridge-1.0':
                time_model = Ridge(alpha=1,fit_intercept=False).fit(x,y)
            else:
                time_model = Ridge(alpha=0.01,fit_intercept=False).fit(x,y)
            energy_time_scores[building][hour] = r2_score(y,time_model.predict(x))
    time_models_scores[model] = np.mean(energy_time_scores)

In [None]:
time_models_scores