In [1]:
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import numpy as np
from scipy.interpolate import interp1d
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
def reduce_memory(df):
    """Downcast the columns of ``df`` to narrower dtypes and report the saving.

    The frame is modified in place and also returned.

    Column handling, in order of precedence:

    * float columns are cast to the first of float16 / float32 whose finite
      range strictly contains the column's min and max, otherwise float64;
    * signed integer columns are cast to the first of int8 / int16 / int32 /
      int64 whose range strictly contains the column's min and max (left
      unchanged if none does);
    * a non-numeric column named ``'timestamp'`` is parsed with
      ``pd.to_datetime``;
    * any other column whose dtype name does not start with ``datetime`` is
      converted to ``category``.

    Note: the choice is driven by value range only, so a float column that
    fits float16 is rounded to float16 precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; mutated in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        type_name = str(df[col].dtypes)
        if type_name.startswith('float'):
            c_min = df[col].min()
            c_max = df[col].max()
            target = np.float64
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                # finfo.min / finfo.max are plain attributes, not methods.
                if c_min > limits.min and c_max < limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)
        elif type_name.startswith('int'):
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col == 'timestamp':
            df[col] = pd.to_datetime(df[col])
        elif not type_name.startswith('datetime'):
            df[col] = df[col].astype("category")
    end_mem = df.memory_usage().sum() / 1024 ** 2
    saved = start_mem - end_mem
    # Avoid a NaN percentage if the frame reports a zero starting footprint.
    saved_pct = 100 * saved / start_mem if start_mem else 0.0
    print('Потребление памяти меньше на', round(saved, 2), 'Мб (минус', round(saved_pct, 1), '%)')
    return df

In [3]:
# Download the three source tables (building metadata, weather, meter
# readings) in the same order as before and bind them to their usual names.
buildings, weather, energy = (
    pd.read_csv(source_url)
    for source_url in (
        "http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz",
        "http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz",
        "http://video.ittensive.com/machine-learning/ashrae/train.0.csv.gz",
    )
)

In [4]:
# Keep buildings 0..99, attach building metadata, then attach the weather
# observation for the same (timestamp, site_id) pair. Columns that are not
# used by the models are dropped before the frame is downcast.
energy = (
    energy.loc[energy['building_id'] < 100]
    .merge(buildings, how='left', on='building_id')
    .set_index(['timestamp', 'site_id'])
    .merge(
        weather.set_index(['timestamp', 'site_id']),
        how='left',
        left_index=True,
        right_index=True,
    )
    .reset_index()
    .drop(columns=['meter', 'year_built', 'square_feet', 'floor_count'])
)
energy = reduce_memory(energy)
# The source tables are no longer needed once merged.
del buildings, weather

Потребление памяти меньше на 56.89 Мб (минус 71.9 %)


In [5]:
# Calendar features derived from the reading timestamp.
timestamps = energy['timestamp']
energy['hour'] = timestamps.dt.hour.astype('int8')
energy['weekday'] = timestamps.dt.weekday.astype('int8')
# One 0/1 indicator column per day of the week (is_wday0 .. is_wday6).
for weekday in range(7):
    energy[f'is_wday{weekday}'] = (energy['weekday'] == weekday).astype('int8')
energy['date'] = pd.to_datetime(timestamps.dt.date)
# US federal holidays inside the window covered by dates_range.
dates_range = pd.date_range(start='2015-12-31', end='2017-01-01')
us_holidays = calendar().holidays(start=dates_range.min(), end=dates_range.max())
energy['is_holiday'] = energy['date'].isin(us_holidays).astype("int8")
# Regression target: log(meter_reading + 1).
energy['meter_reading_log'] = np.log(energy['meter_reading'] + 1)

In [6]:
# Hold out 20% of the rows with a positive meter reading for evaluation.
# random_state is fixed so that Restart & Run All reproduces the same split,
# the same fitted coefficients and the same final metric.
energy_train, energy_test = train_test_split(
    energy[energy['meter_reading'] > 0],
    test_size=0.2,
    random_state=42,
)

In [7]:
# Fit one linear regression per (building, hour) on the holiday and weekday
# indicator columns. energy_lr[building][hour] holds the fitted coefficients
# followed by the intercept, or an empty list when the training data has no
# rows for that pair (consumers test len(...) == 0 to detect this).
hours = range(0, 24)
buildings = range(0, energy_train['building_id'].max() + 1)
lr_columns = ['meter_reading_log', 'hour', 'building_id', 'is_holiday']
for wday in range(0, 7):
    lr_columns.append('is_wday' + str(wday))
# Everything after target / hour / building_id is a model feature.
lr_feature_columns = lr_columns[3:]
energy_train_lr = pd.DataFrame(energy_train, columns=lr_columns)
# Exactly len(hours) independent slots per building: no shared inner lists
# and no extra placeholder rows appended past index 23.
energy_lr = [[[] for _ in hours] for _ in buildings]
# groupby yields only the (building, hour) pairs that actually have rows.
for (building, hour), energy_train_bh in energy_train_lr.groupby(['building_id', 'hour']):
    # No intercept term: the seven weekday indicators already sum to 1 on
    # every row, so the appended intercept_ is 0.0.
    model = LinearRegression(fit_intercept=False)
    model.fit(energy_train_bh[lr_feature_columns], energy_train_bh['meter_reading_log'])
    energy_lr[building][hour] = np.append(model.coef_, model.intercept_)
print(energy_lr[0])


[array([-0.13402895,  5.45342722,  5.4603125 ,  5.47786458,  5.500625  ,
        5.42583414,  5.44401042,  5.43475116,  0.        ]), array([-0.06561396,  5.44684513,  5.49414062,  5.49567522,  5.46871831,
        5.4438538 ,  5.46657986,  5.44342672,  0.        ]), array([-0.03821639,  5.43418224,  5.47767857,  5.52421875,  5.46463713,
        5.45853365,  5.47086589,  5.43055556,  0.        ]), array([-0.1310587 ,  5.45859445,  5.45659722,  5.50240385,  5.46361341,
        5.44929847,  5.45501078,  5.47216797,  0.        ]), array([-0.03857242,  5.44283326,  5.4640625 ,  5.50585938,  5.479375  ,
        5.45668531,  5.4458912 ,  5.42919922,  0.        ]), array([-0.15542314,  5.47775565,  5.44867996,  5.49537037,  5.46327927,
        5.48981068,  5.46310764,  5.38151042,  0.        ]), array([-0.14588473,  5.48319285,  5.45868389,  5.48456101,  5.44671875,
        5.45036664,  5.48476562,  5.49112216,  0.        ]), array([-0.07730235,  5.45877463,  5.44229403,  5.47859375,  5.490623

In [8]:
# Fallback models: one regression per (primary_use, site, hour) and one per
# (primary_use, hour), fitted on the same holiday / weekday indicators.
sites = range(0, energy['site_id'].max() + 1)
primary_uses = energy['primary_use'].unique()
lr_columns_use = ['meter_reading_log', 'hour', 'building_id', 'is_holiday', 'primary_use', 'site_id']
lr_columns_use.extend('is_wday' + str(wday) for wday in range(0, 7))


def _fit_hourly_models(frame):
    """Return a list with one entry per hour: coefficients + intercept, or [] if no rows."""
    per_hour = [[]] * len(hours)
    for hr in hours:
        rows = frame[frame['hour'] == hr]
        target = rows['meter_reading_log']
        if len(target) > 0:
            features = rows.drop(
                ['meter_reading_log', 'hour', 'building_id', 'site_id', 'primary_use'],
                axis=1,
            )
            fitted = LinearRegression(fit_intercept=False)
            fitted.fit(features, target)
            per_hour[hr] = np.append(fitted.coef_, fitted.intercept_)
    return per_hour


energy_lr_use = {}
energy_lr_use_site = {}
energy_train_lr = pd.DataFrame(energy_train, columns=lr_columns_use)
for primary_use in primary_uses:
    energy_train_u = energy_train_lr[energy_train_lr['primary_use'] == primary_use]
    # Uses with no training rows get no entry in either table.
    if len(energy_train_u) == 0:
        continue
    energy_lr_use_site[primary_use] = [
        _fit_hourly_models(energy_train_u[energy_train_u['site_id'] == site])
        for site in sites
    ]
    energy_lr_use[primary_use] = _fit_hourly_models(energy_train_u)
print(energy_lr_use_site['Education'])


[[array([0.01041911, 5.61395315, 5.70179028, 5.68541603, 5.69121823,
       5.64781714, 5.6491358 , 5.64602292, 0.        ]), array([-0.04325704,  5.64160629,  5.63350602,  5.69670407,  5.65383024,
        5.65601829,  5.62203699,  5.60732345,  0.        ]), array([0.02053786, 5.57231738, 5.6461055 , 5.64654097, 5.63237207,
       5.63932975, 5.65886274, 5.54829738, 0.        ]), array([-0.12433813,  5.57422391,  5.59054447,  5.60593964,  5.6295499 ,
        5.61570536,  5.63442158,  5.58714947,  0.        ]), array([-0.08507817,  5.58022276,  5.61605588,  5.66376192,  5.63235131,
        5.63787574,  5.60232882,  5.59896641,  0.        ]), array([-0.05243391,  5.63621578,  5.68415731,  5.63014513,  5.64579099,
        5.61994683,  5.67066617,  5.60750427,  0.        ]), array([-0.08742314,  5.72298001,  5.76761038,  5.75473833,  5.72046877,
        5.72952171,  5.68603323,  5.6908535 ,  0.        ]), array([-0.11561226,  5.83152026,  5.8225845 ,  5.80012932,  5.83531026,
        5.827

In [10]:
def calculate_model(x):
    """Score one row with the most specific model available.

    Models are looked up in the module-level tables, most specific first:

    1. ``energy_lr[building_id][hour]``
    2. ``energy_lr_use_site[primary_use][site_id][hour]``
    3. ``energy_lr_use[primary_use][hour]``

    An empty entry, or a key/index missing from a table, moves on to the
    next level. Each model is the coefficients for ``lr_columns[3:]``
    followed by an intercept; the prediction is ``exp`` of the linear
    combination. If no model is found, or the prediction is negative, NaN
    or infinite, the prediction is taken to be 0.

    Parameters
    ----------
    x : pandas.Series
        A row providing ``building_id``, ``hour``, ``primary_use``,
        ``site_id``, ``meter_reading`` and every column in ``lr_columns[3:]``.

    Returns
    -------
    pandas.Series
        The same row with ``meter_reading_lr_q`` added: the squared
        difference between ``log(1 + meter_reading)`` and
        ``log(1 + prediction)``.
    """
    hour = int(x.hour)
    lookups = (
        lambda: energy_lr[int(x.building_id)][hour],
        lambda: energy_lr_use_site[x.primary_use][int(x.site_id)][hour],
        lambda: energy_lr_use[x.primary_use][hour],
    )
    model = []
    for lookup in lookups:
        try:
            model = lookup()
        except (KeyError, IndexError):
            # Building / use / site not present in the training tables.
            model = []
        if len(model) > 0:
            break

    lr = 0.0
    if len(model) > 0:
        feature_columns = lr_columns[3:]
        linear = np.sum([x[col] * model[i] for i, col in enumerate(feature_columns)])
        linear += model[len(feature_columns)]  # intercept is stored last
        lr = np.exp(linear)
    # Explicit finiteness check: rejects NaN and inf without also rejecting
    # a legitimate prediction of exactly 1.
    if not np.isfinite(lr) or lr < 0:
        lr = 0.0
    x['meter_reading_lr_q'] = (np.log(1 + x.meter_reading) - np.log(1 + lr)) ** 2
    return x

# Score every held-out row, then report the root of the mean squared log
# error over the whole test set.
energy_test = energy_test.apply(calculate_model, axis=1, result_type='expand')

squared_log_errors = energy_test['meter_reading_lr_q']
np.sqrt(squared_log_errors.sum() / len(energy_test))

0.3443734770234583