In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.linear_model import LinearRegression

In [9]:
# Download the three gzip-compressed CSV inputs in a fixed order:
# building metadata, weather observations, and the meter readings.
buildings, weather, energy = (
    pd.read_csv(url)
    for url in (
        "http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz",
        "http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz",
        "http://video.ittensive.com/machine-learning/ashrae/train.0.csv.gz",
    )
)

In [10]:
def _narrowest_dtype (c_min, c_max, candidates, info):
    """Return the first dtype in ``candidates`` whose range strictly contains
    both ``c_min`` and ``c_max``, or ``None`` when no candidate does.

    ``info`` is ``np.finfo`` or ``np.iinfo``. Comparisons against NaN are
    false, so the bounds of an all-NaN column never match a candidate.
    """
    for dtype in candidates:
        limits = info(dtype)
        if c_min > limits.min and c_max < limits.max:
            return dtype
    return None

def reduce_mem_usage (df, use_float16=True):
    """Downcast the columns of ``df`` in place and print the memory saved.

    * float columns become the narrowest of float16 / float32 whose range
      strictly contains the column's min and max, otherwise float64;
    * signed integer columns become the narrowest of int8 .. int64 in the
      same way (a column that fits none of them is left untouched);
    * a non-numeric column named ``"timestamp"`` is parsed with
      ``pd.to_datetime``;
    * remaining object / string columns become ``category``.

    Columns of any other dtype (bool, unsigned integers, datetimes,
    timedeltas, existing categoricals) are left as they are instead of
    being forced into ``category``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    use_float16 : bool, default True
        When False, float16 is skipped and float32 is the smallest float
        type used, trading memory for precision.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    for col in df.columns:
        col_type = df[col].dtype
        if str(col_type)[:5] == "float":
            target = _narrowest_dtype(df[col].min(), df[col].max(), float_candidates, np.finfo)
            # Fall back to float64 when nothing narrower is wide enough.
            df[col] = df[col].astype(np.float64 if target is None else target)
        elif str(col_type)[:3] == "int":
            target = _narrowest_dtype(df[col].min(), df[col].max(), int_candidates, np.iinfo)
            if target is not None:
                df[col] = df[col].astype(target)
        elif col == "timestamp":
            df[col] = pd.to_datetime(df[col])
        elif pd.api.types.is_object_dtype(col_type) or pd.api.types.is_string_dtype(col_type):
            # Only text-like columns are worth encoding as categories.
            df[col] = df[col].astype("category")
    end_mem = df.memory_usage().sum() / 1024**2
    # Message reads: "Memory usage reduced by <MB> MB (minus <percent> %)".
    print('Потребление памяти меньше на', round(start_mem - end_mem, 2), 'Мб (минус', round(100 * (start_mem - end_mem) / start_mem, 1), '%)')
    return df

def round_fillna (df, columns):
    """Fill missing values with 0, round, and store the columns as small ints.

    For every name in ``columns`` the column is replaced in ``df`` by its
    rounded values cast to ``int8``, or ``int16`` for ``wind_direction``,
    ``year_built`` and ``precip_depth_1_hr``. Negative values of
    ``precip_depth_1_hr`` are raised to 0 first. Values are not checked
    against the range of the target integer type.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns. It is modified in place.
    columns : iterable of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    int16_columns = ("wind_direction", "year_built", "precip_depth_1_hr")
    for col in columns:
        type_ = "int16" if col in int16_columns else "int8"
        values = df[col]
        if col == "precip_depth_1_hr":
            # Vectorised clamp; NaN passes through and is filled just below.
            values = values.clip(lower=0)
        df[col] = np.round(values.fillna(value=0)).astype(type_)
    return df

In [11]:
# Downcast the three frames in the same order as before: buildings, weather, energy.
buildings, weather, energy = map(reduce_mem_usage, (buildings, weather, energy))

Потребление памяти меньше на 0.05 Мб (минус 73.8 %)
Потребление памяти меньше на 6.53 Мб (минус 68.1 %)
Потребление памяти меньше на 195.54 Мб (минус 53.1 %)


In [12]:
# Restrict to the first 20 buildings *before* joining, so the building
# metadata is attached only to the rows that are kept. The resulting rows,
# their order and the columns are the same as when filtering after the merge.
energy = energy[energy['building_id'] < 20]
energy = pd.merge(left=energy,right=buildings,how='left',on='building_id')
# Attach the weather observations on the (site_id, timestamp) key.
energy = energy.set_index(['site_id','timestamp'])
weather = weather.set_index(['site_id','timestamp'])
energy = pd.merge(left=energy,right=weather,how='left',left_index=True,right_index=True)
energy = energy.reset_index()
# The source frames are no longer needed once merged; drop them to free memory.
del weather
del buildings

In [16]:
# Summarise column dtypes and non-null counts of the merged frame.
energy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175680 entries, 0 to 175679
Data columns (total 16 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   site_id             175680 non-null  int64         
 1   timestamp           175680 non-null  datetime64[ns]
 2   building_id         175680 non-null  int16         
 3   meter               175680 non-null  int8          
 4   meter_reading       175680 non-null  float32       
 5   primary_use         175680 non-null  category      
 6   square_feet         175680 non-null  int32         
 7   year_built          175680 non-null  float16       
 8   floor_count         0 non-null       float16       
 9   air_temperature     175620 non-null  float16       
 10  cloud_coverage      99080 non-null   float16       
 11  dew_temperature     175620 non-null  float16       
 12  precip_depth_1_hr   175660 non-null  float16       
 13  sea_level_pressure  173980 no

In [38]:
# NOTE(review): energy_train_lr is first assigned further down in this file
# (in the cell that fits the per-building, per-hour regressions), so on a
# fresh top-to-bottom run this cell hits a NameError.
# TODO: move this cell below that one.
energy_train_lr.columns

Index(['meter_reading', 'hour', 'building_id', 'air_temperature',
       'dew_temperature', 'sea_level_pressure', 'wind_speed', 'cloud_coverage',
       'is_holiday'],
      dtype='object')

In [14]:
# Number of rows in the merged frame.
len(energy)

175680

In [17]:
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

In [18]:
# Hour of day of every reading (0-23).
energy['hour'] = energy['timestamp'].dt.hour.astype('int8')
# US federal holidays that fall inside the date window below.
dates_range = pd.date_range(start='2015-12-31',end='2017-01-01')
us_holidays = calendar().holidays(start=dates_range.min(),end=dates_range.max())
# Midnight-truncated timestamp. normalize() stays in datetime64 and avoids
# building one Python date object per row as .dt.date does.
energy['date'] = energy['timestamp'].dt.normalize()
# 1 when the reading falls on a holiday, 0 otherwise.
energy['is_holiday'] = energy['date'].isin(us_holidays).astype('int8')

In [19]:
# Keep strictly positive precipitation depths and replace everything else
# with 0. The comparison is false for NaN, so missing values also become 0
# here, exactly as with the element-wise lambda this replaces. The
# vectorised form keeps the column's dtype.
energy['precip_depth_1_hr'] = energy['precip_depth_1_hr'].where(energy['precip_depth_1_hr'] > 0, 0)

In [21]:
# Fill gaps in the weather columns by linear interpolation along row order,
# extending to leading and trailing gaps (limit_direction='both').
# The previous call passed kind='cubic', which is not an interpolate()
# parameter and had no effect under the default linear method; the method
# is now spelled out so the code matches what is actually computed.
interp_cols = ['air_temperature','cloud_coverage','dew_temperature','precip_depth_1_hr','sea_level_pressure','wind_speed']
for col in interp_cols:
    energy[col] = energy[col].interpolate(method='linear',limit_direction='both')

In [22]:
# Meter reading followed by the weather and holiday columns.
regression_cols = [
    'meter_reading',
    'air_temperature',
    'cloud_coverage',
    'dew_temperature',
    'is_holiday',
    'sea_level_pressure',
    'wind_speed',
]

In [23]:
# Hold out 20 % of the rows with a positive meter reading for evaluation.
# A fixed random_state makes the split, and every metric derived from it,
# reproducible across kernel restarts.
energy_train,energy_test = train_test_split(energy[energy['meter_reading'] > 0],test_size=0.2,random_state=42)

In [24]:
# Separate the meter reading from the remaining columns of the training frame.
y = energy_train['meter_reading']
X = energy_train.drop(columns='meter_reading')

In [39]:
# Fit one linear regression per (building, hour) pair.
# energy_lr[building][hour] holds the model's coefficients (one per feature
# column, in lr_columns[3:] order) followed by its intercept.
hours = range(0,24)
buildings = range(0,energy_train['building_id'].max() + 1)
lr_columns = ['meter_reading','hour','building_id','air_temperature',
              'dew_temperature','sea_level_pressure','wind_speed','cloud_coverage','is_holiday']
energy_train_lr = pd.DataFrame(energy_train,columns=lr_columns)
# The first three entries of lr_columns are the target and the two grouping keys.
n_features = len(lr_columns) - 3
# Build an independent inner list per building (no shared references).
energy_lr = [[None] * len(hours) for _ in buildings]
for building in buildings:
    energy_train_b = energy_train_lr[energy_train_lr['building_id'] == building]
    for hour in hours:
        energy_train_bh = energy_train_b[energy_train_b['hour'] == hour]
        if energy_train_bh.empty:
            # No training rows for this pair: fitting on zero samples raises,
            # so store an all-zero model (it predicts 0) and move on.
            energy_lr[building][hour] = np.zeros(n_features + 1)
            continue
        y = energy_train_bh['meter_reading']
        X = energy_train_bh.drop(['meter_reading','hour','building_id'],axis=1)
        model = LinearRegression()
        model.fit(X,y)
        energy_lr[building][hour] = np.append(model.coef_,model.intercept_)


In [40]:
def calculate_model(x):
    """Add the squared log error of the linear-model prediction to row ``x``.

    The model for the row is looked up in the module-level ``energy_lr`` by
    the row's ``building_id`` and ``hour``; it stores one coefficient per
    feature in ``lr_columns[3:]`` followed by the intercept. The prediction
    is clamped at 0 before the logarithm, because a value below -1 would
    turn the log term into NaN and silently drop the row from any sum.

    Parameters
    ----------
    x : pandas.Series
        One row, with ``building_id``, ``hour``, ``meter_reading`` and the
        feature columns.

    Returns
    -------
    pandas.Series
        The same row with ``meter_reading_lr_q`` set to
        ``(log(1 + meter_reading) - log(1 + prediction)) ** 2``.
    """
    model = energy_lr[int(x['building_id'])][int(x['hour'])]
    features = lr_columns[3:]
    # Dot product of features and coefficients; intercept is the last entry.
    prediction = sum(float(x[col]) * model[i] for i, col in enumerate(features))
    prediction += model[len(features)]
    prediction = max(prediction, 0.0)
    x['meter_reading_lr_q'] = (np.log1p(x['meter_reading']) - np.log1p(prediction)) ** 2
    return x

In [41]:
# Score every held-out row, then take the root of the summed squared log
# errors divided by the number of test rows.
energy_test = energy_test.apply(calculate_model, axis=1, result_type='expand')
squared_log_error_total = energy_test['meter_reading_lr_q'].sum()
np.sqrt(squared_log_error_total / len(energy_test))

  """


0.23916610830812113