In [20]:
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import numpy as np
from scipy.interpolate import interp1d
from sklearn.linear_model import LinearRegression
from google.colab import drive
# Mount Google Drive at /content/drive (Colab-only API); the training CSV read
# in the next cell lives under this mount point.
drive.mount("/content/drive",force_remount=True)

Mounted at /content/drive


In [25]:
# Training frame used to fit the per-building/per-hour regressions further down.
# NOTE(review): presumably a pre-processed export that already carries
# `meter_reading_log`, `hour`, `building_id`, the weather columns and the
# is_holiday / is_wday* / is_w* / is_m* flags selected via `lr_columns` — confirm
# against the notebook that produced energy.csv.
energy = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/data/energy.csv")

In [29]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place and print the memory saved.

    Per column, by current dtype name:
      * ``float*``  -> smallest of float16 / float32 whose finite range contains
        the column's min and max, otherwise float64;
      * ``int*``    -> smallest of int8 / int16 / int32 / int64 whose range
        contains the column's min and max;
      * a column literally named ``"timestamp"`` -> ``pd.to_datetime``;
      * anything else that is not already ``datetime*`` -> ``category``.

    Range checks are inclusive, so a value equal to a type's limit (e.g. 127
    for int8) still fits that type.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after conversion.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = str(df[col].dtypes)
        if col_type.startswith("float"):
            c_min = df[col].min()
            c_max = df[col].max()
            # NOTE(review): the choice is made on range only; float16 keeps
            # roughly 3 significant digits, so values are rounded.
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf (comparisons False).
                df[col] = df[col].astype(np.float64)
        elif col_type.startswith("int"):
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col == "timestamp":
            df[col] = pd.to_datetime(df[col])
        elif not col_type.startswith("datetime"):
            df[col] = df[col].astype("category")
    end_mem = df.memory_usage().sum() / 1024**2
    saved = start_mem - end_mem
    # Guard the ratio so a frame that reports zero memory does not yield NaN.
    percent = 100 * saved / start_mem if start_mem else 0.0
    print('Потребление памяти меньше на', round(saved, 2), 'Мб (минус', round(percent, 1), '%)')
    return df

In [27]:
# Building metadata: only the building -> site mapping is needed for the join below.
buildings = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz",
                       usecols=["site_id", "building_id"])
# Test-period weather restricted to site 0. `.copy()` detaches the filtered rows so
# the column assignments made on `weather` in later cells write to an independent
# frame instead of a slice of the full table (avoids SettingWithCopyWarning).
weather = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/weather_test.csv.gz")
weather = weather[weather["site_id"] == 0].copy()
weather = weather.drop(columns=["wind_direction"])
# Rows to predict: meter 0 of the first 20 buildings, enriched with their site_id.
results = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/test.csv.gz")
results = results[(results["building_id"] < 20) & (results["meter"] == 0)]
results = pd.merge(left=results, right=buildings, how="left", on="building_id")
del buildings
results = results.drop(columns=["meter"])
# DataFrame.info() prints its report itself and returns None, so it is not wrapped
# in print() (that only appended a stray "None" to the output).
results.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 350400 entries, 0 to 350399
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   row_id       350400 non-null  int64 
 1   building_id  350400 non-null  int64 
 2   timestamp    350400 non-null  object
 3   site_id      350400 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 13.4+ MB
None


In [28]:
# Fill gaps in the weather series. The interpolation is linear: the original call
# passed `kind='cubic'`, but `kind` is not a Series.interpolate argument and is not
# used by the default linear method, so it is dropped and the method is spelled out.
# limit_direction='both' also fills leading and trailing gaps.
interpolate_columns = ['air_temperature','dew_temperature','cloud_coverage','wind_speed','sea_level_pressure']
for col in interpolate_columns:
  weather[col] = weather[col].interpolate(method='linear',limit_direction='both')
# First and second differences of air temperature. diff() leaves the first row NaN;
# it is back-filled from the following row. This matches the earlier fix-up through
# index labels 0 and 1 but does not depend on those labels being present.
weather['air_temperature_diff1'] = weather['air_temperature'].diff().bfill()
weather['air_temperature_diff2'] = weather['air_temperature_diff1'].diff().bfill()

In [30]:
# Attach the weather features to every row to predict, matching on the
# (timestamp, site_id) pair, then drop the site key and shrink the dtypes.
join_keys = ['timestamp','site_id']
results = results.set_index(join_keys)
weather = weather.set_index(join_keys)
results = results.merge(weather, how='left', left_index=True, right_index=True)
results = results.reset_index()
results = results.drop(columns=['site_id'])
results = reduce_mem_usage(results)

Потребление памяти меньше на 19.72 Мб (минус 67.0 %)


In [32]:
# Calendar features derived from the timestamp.
results['hour'] = results['timestamp'].dt.hour.astype('int8')
results['weekday'] = results['timestamp'].dt.weekday.astype('int8')
# Series.dt.week is deprecated (and removed in pandas 2); isocalendar().week
# yields the same ISO week number.
results['week'] = results['timestamp'].dt.isocalendar().week.astype('int8')
results['month'] = results['timestamp'].dt.month.astype('int8')
results['date'] = pd.to_datetime(results['timestamp'].dt.date)
# Build the holiday calendar over the dates actually present in `results`.
# The previous fixed window ended on 2018-06-01, so any later holiday in the data
# was never flagged.
dates_range = pd.date_range(start=results['date'].min(), end=results['date'].max())
us_holidays = calendar().holidays(start=dates_range.min(),end=dates_range.max())
results['is_holiday'] = results['date'].isin(us_holidays).astype('int8')
# One-hot flags for weekday (0-6), ISO week (1-53) and month (1-12); the column
# names must match the feature names listed in `lr_columns`.
for weekday in range(0,7):
  results['is_wday' + str(weekday)] = (results['weekday'] == weekday).astype('int8')
for week in range(1,54):
  results['is_w' + str(week)] = (results['week'] == week).astype('int8')
for month in range(1,13):
  results['is_m' + str(month)] = (results['month'] == month).astype('int8')

  This is separate from the ipykernel package so we can avoid doing imports until


In [36]:
# Fit one linear model per (building, hour) on the training frame.
hours = range(0,24)
buildings = range(0,energy['building_id'].max() + 1)
# Target, the two grouping keys, then the regressors. Everything from index 3
# onwards is a regressor, and calculate_model relies on that ordering.
lr_columns = ["meter_reading_log", "hour", "building_id",
             "air_temperature", "dew_temperature",
             "sea_level_pressure", "wind_speed", "cloud_coverage",
             "air_temperature_diff1", "air_temperature_diff2",
             "is_holiday"]
lr_columns += ['is_wday' + str(wday) for wday in range(0,7)]
lr_columns += ['is_w' + str(week) for week in range(1,54)]
lr_columns += ['is_m' + str(month) for month in range(1,13)]
energy_train_lr = pd.DataFrame(energy,columns=lr_columns)
# energy_lr[building][hour] holds the coefficient vector (regressor weights followed
# by the intercept). Each slot starts as its own empty list — `[[]] * n` would alias
# one list n times — and stays empty when there is no training data for the pair,
# which is the case calculate_model checks for with `len(model) > 0`.
energy_lr = [[[] for _ in hours] for _ in buildings]
for building in buildings:
  energy_train_b = energy_train_lr[energy_train_lr['building_id'] == building]
  for hour in hours:
    energy_train_bh = energy_train_b[energy_train_b['hour'] == hour]
    if energy_train_bh.empty:
      # LinearRegression.fit raises on zero samples; leave the slot empty instead.
      continue
    y = energy_train_bh['meter_reading_log']
    x = energy_train_bh.drop(['meter_reading_log','building_id','hour'],axis=1)
    model = LinearRegression(fit_intercept=False).fit(x,y)
    # With fit_intercept=False the intercept is 0.0; it is still appended so the
    # vector layout matches what calculate_model reads.
    energy_lr[building][hour] = np.append(model.coef_,model.intercept_)
print(energy_lr[0])

[array([ 1.82239162e-02, -5.00143110e-03, -5.45316549e-03, -2.27640469e-02,
       -1.65985523e-02,  2.10033054e-03,  7.35419668e-03,  1.33888005e-02,
        5.04305154e+00,  5.06148983e+00,  5.08243156e+00,  5.04754854e+00,
        5.05642120e+00,  5.15845083e+00,  5.14658520e+00,  1.12525041e-01,
        6.22993013e-02,  8.95214418e-02,  9.05743090e-02, -1.86501533e+00,
       -1.85536451e+00, -1.87203971e+00, -1.86267070e+00, -1.80207317e+00,
       -1.76371657e+00, -1.83533802e+00, -1.73106160e+00, -1.71557216e+00,
       -1.64898254e+00, -1.55243638e+00, -1.58896850e+00, -1.58738511e+00,
       -1.55519064e+00, -1.53406218e+00,  4.05963044e-02,  3.74777706e+00,
        3.67239132e+00,  3.80254713e+00,  3.84689259e+00,  3.71063910e+00,
        3.75822485e+00,  3.67944266e+00,  3.87623290e+00,  3.88733698e+00,
        3.81679414e+00,  1.05359968e+00,  1.16470292e+00,  1.15679872e+00,
        9.91946035e-01,  9.98933466e-01,  9.92720394e-01,  9.67654505e-01,
        9.38273524e-01, 

In [39]:
def calculate_model(x):
  """Predict ``meter_reading`` for one row of ``results``.

  Looks up the coefficient vector stored in ``energy_lr`` for the row's building
  and hour, forms the weighted sum of the regressors ``lr_columns[3:]``, adds the
  trailing intercept entry and exponentiates the result. The prediction is set to
  0 when it is negative (including the -1 used when no model exists), NaN, or
  equal to its own square (0, 1 or +inf).
  NOTE(review): zeroing an exact prediction of 1 looks deliberate — confirm.

  Returns the same row with a ``meter_reading`` entry added.
  """
  coefs = energy_lr[x.building_id][x.hour]
  prediction = -1
  if len(coefs) > 0:
    feature_names = lr_columns[3:]
    linear = np.sum([x[name] * coefs[pos] for pos, name in enumerate(feature_names)])
    linear += coefs[len(lr_columns) - 3]
    prediction = np.exp(linear)
  is_nan = prediction != prediction
  if prediction < 0 or is_nan or prediction * prediction == prediction:
    prediction = 0
  x['meter_reading'] = prediction
  return x
# Row-wise application: every row comes back with the extra meter_reading column.
results = results.apply(calculate_model, axis=1, result_type='expand')

In [40]:
# Keep only the row identifier and the prediction.
submission_columns = ['row_id','meter_reading']
results_ready = pd.DataFrame(results, columns=submission_columns)

In [41]:
# Reload every row_id of the test set and attach the predictions computed above;
# rows with no prediction in results_ready get 0.
all_rows = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/test.csv.gz",
                       usecols=["row_id"])
results = all_rows.merge(results_ready, how='left', on='row_id')
results = results.fillna(value=0)
del all_rows


In [45]:
# Preview the final prediction column; entries without a computed prediction were
# set to 0 by the fillna in the previous cell.
results['meter_reading']

0           3.616772
1           2.016745
2           0.812282
3           3.065807
4           4.009058
              ...   
41697595    0.000000
41697596    0.000000
41697597    0.000000
41697598    0.000000
41697599    0.000000
Name: meter_reading, Length: 41697600, dtype: float64