Загрузите данные и посчитайте модели линейной регрессии для 50 зданий по ансамблю регрессионных моделей: в первой модели весь оптимальный набор метеорологических данных, во второй - дни недели и праздники, в третьей - недели года, в четвертой - месяцы. Финальное значение показателя рассчитайте как взвешенное арифметическое показателей всех моделей, взяв веса для первой и второй модели как 3/8, а для третьей и четвертой - как 1/8.

Загрузите данные решения, посчитайте значение энергопотребления для требуемых дат для тех зданий, которые посчитаны в модели, и выгрузите результат в виде CSV-файла (submission.csv).

Данные:

http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz

http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz

http://video.ittensive.com/machine-learning/ashrae/train.0.csv.gz

http://video.ittensive.com/machine-learning/ashrae/test.csv.gz

http://video.ittensive.com/machine-learning/ashrae/weather_test.csv.gz
### Сколько строк в итоговом файле имеют ненулевое значение показателя meter_reading?

In [1]:
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import numpy as np
from scipy.interpolate import interp1d
from sklearn.linear_model import LinearRegression

In [2]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place and print the memory saved.

    Columns whose dtype name starts with "float" or "int" are cast to the
    narrowest dtype whose limits lie strictly outside the column's min and
    max. Floats that fit neither float16 nor float32 become float64; integers
    that fit none of the candidates are left untouched. Any other column
    named "timestamp" is parsed with ``pd.to_datetime``; every remaining
    column that is not already a datetime is converted to ``category``.

    Args:
        df: DataFrame to shrink. It is modified in place.

    Returns:
        The same DataFrame object, after conversion.
    """
    # Candidate dtypes, narrowest first; the string is the numpy type code
    # whose limits are checked.
    float_steps = (("f2", np.float16), ("f4", np.float32))
    int_steps = (("i1", np.int8), ("i2", np.int16),
                 ("i4", np.int32), ("i8", np.int64))

    def _narrowest(series, steps, limits_of, fallback):
        """Return the first dtype in ``steps`` whose open range holds the data."""
        low = series.min()
        high = series.max()
        for code, dtype in steps:
            limits = limits_of(code)
            if low > limits.min and high < limits.max:
                return dtype
        return fallback

    mb_before = df.memory_usage().sum() / 1024**2
    for name in df.columns:
        dtype_name = str(df[name].dtypes)
        if dtype_name.startswith("float"):
            # Floats are always re-cast; float64 is the catch-all.
            target = _narrowest(df[name], float_steps, np.finfo, np.float64)
            df[name] = df[name].astype(target)
        elif dtype_name.startswith("int"):
            target = _narrowest(df[name], int_steps, np.iinfo, None)
            if target is not None:
                df[name] = df[name].astype(target)
        elif name == "timestamp":
            df[name] = pd.to_datetime(df[name])
        elif not dtype_name.startswith("datetime"):
            df[name] = df[name].astype("category")
    mb_after = df.memory_usage().sum() / 1024**2
    mb_saved = mb_before - mb_after
    print('Потребление памяти меньше на', round(mb_saved, 2), 'Мб (минус', round(100 * mb_saved / mb_before, 1), '%)')
    return df

In [4]:
# Building metadata: only the columns needed to map a building to its site.
buildings = pd.read_csv(
    'http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz',
    usecols=["site_id", "building_id"],
)

# Weather for the prediction period, restricted to site 0; wind direction is
# not used further on.
weather = pd.read_csv('http://video.ittensive.com/machine-learning/ashrae/weather_test.csv.gz')
weather = weather.loc[weather['site_id'] == 0].drop(columns='wind_direction')

# Rows to predict: meter 0 of the buildings with building_id below 50.
results = pd.read_csv('http://video.ittensive.com/machine-learning/ashrae/test.csv.gz')
results = results.loc[(results['building_id'] < 50) & (results['meter'] == 0)]

# Training data is read from a local CSV file.
# NOTE(review): presumably produced by an earlier notebook from
# http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz
# and the train file - confirm where this file comes from.
energy = pd.read_csv('energy.0-20.ready.csv')

# Attach site_id to every row to predict, then drop what is no longer needed.
results = results.merge(buildings, how="left", on="building_id")
del buildings
results = results.drop(columns='meter')


In [5]:
# Print the column summary (dtypes, non-null counts, memory) of the rows to predict.
results.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 876000 entries, 0 to 875999
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   row_id       876000 non-null  int64 
 1   building_id  876000 non-null  int64 
 2   timestamp    876000 non-null  object
 3   site_id      876000 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 33.4+ MB


In [6]:
# Weather columns whose missing values are filled in before use.
interpolate_columns = ['air_temperature', 'dew_temperature', 'wind_speed',
                       'cloud_coverage', 'sea_level_pressure']
for col in interpolate_columns:
    # `kind` is an argument of scipy's interp1d, not of pandas' interpolate();
    # passed here it did not select a cubic method, so the fill was already
    # the default linear one. The method is now spelled out explicitly and
    # the results are unchanged. limit_direction='both' also fills gaps at
    # the start and end of the column.
    weather[col] = weather[col].interpolate(method='linear', limit_direction='both')

# First and second differences of air temperature. diff() leaves the first
# row empty, so it is filled from the second row. The first two rows are
# addressed through the frame's own index labels instead of assuming the
# labels are 0 and 1.
weather['air_temperature_diff1'] = weather['air_temperature'].diff()
weather.at[weather.index[0], 'air_temperature_diff1'] = \
    weather.at[weather.index[1], 'air_temperature_diff1']
weather['air_temperature_diff2'] = weather['air_temperature_diff1'].diff()
weather.at[weather.index[0], 'air_temperature_diff2'] = \
    weather.at[weather.index[1], 'air_temperature_diff2']

In [7]:
# Attach the weather readings to every row to predict: both frames are keyed
# on (site_id, timestamp) and merged index-to-index, keeping all rows of
# results.
weather = weather.set_index(['site_id', 'timestamp'])
results = results.set_index(['site_id', 'timestamp']).merge(
    weather, how='left', left_index=True, right_index=True)

# Back to ordinary columns; site_id was only needed as a merge key.
results = results.reset_index().drop(columns='site_id')

# Downcast the columns (this also parses the timestamp column to datetime).
results = reduce_mem_usage(results)

Потребление памяти меньше на 49.29 Мб (минус 67.0 %)


### results.info()

In [12]:
# Calendar features derived from the timestamp column.
results['hour'] = results['timestamp'].dt.hour.astype('int8')
results['weekday'] = results['timestamp'].dt.weekday.astype('int8')
# Series.dt.week was deprecated in pandas 1.1 and removed in pandas 2.0;
# isocalendar().week yields the same ISO week number.
results['week'] = results['timestamp'].dt.isocalendar().week.astype('int8')
results['month'] = results['timestamp'].dt.month.astype('int8')
# Midnight of each timestamp's day, kept as datetime for the holiday lookup.
results['date'] = results['timestamp'].dt.normalize()

# Build the holiday list over the dates actually present in results. A fixed
# 2015-12-31..2017-01-01 window only yields holidays inside that window, so
# rows dated outside it could never be flagged.
dates_range = pd.date_range(start=results['date'].min(), end=results['date'].max())
us_holidays = calendar().holidays(start=dates_range.min(), end=dates_range.max())
results['is_holiday'] = results['date'].isin(us_holidays).astype('int8')

# One 0/1 indicator column per weekday, ISO week and month.
for weekday in range(0, 7):
    results['is_wday' + str(weekday)] = (results['weekday'] == weekday).astype('int8')
for week in range(1, 54):
    results['is_w' + str(week)] = (results['week'] == week).astype('int8')
for month in range(1, 13):
    results['is_m' + str(month)] = (results['month'] == month).astype('int8')

In [13]:
# Print the column summary of the training frame read from the local CSV.
energy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175680 entries, 0 to 175679
Data columns (total 92 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   timestamp              175680 non-null  object 
 1   building_id            175680 non-null  int64  
 2   meter_reading          175680 non-null  float64
 3   primary_use            175680 non-null  object 
 4   air_temperature        175680 non-null  float64
 5   cloud_coverage         175680 non-null  float64
 6   dew_temperature        175680 non-null  float64
 7   precip_depth_1_hr      175680 non-null  float64
 8   sea_level_pressure     175680 non-null  float64
 9   wind_direction         175680 non-null  float64
 10  wind_speed             175680 non-null  float64
 11  air_temperature_diff1  175680 non-null  float64
 12  air_temperature_diff2  175680 non-null  float64
 13  hour                   175680 non-null  int64  
 14  weekday                175680 non-nu

In [14]:
# Columns for the weather-based regression.
# NOTE(review): calculate_model skips the first three entries of the column
# list it receives (lr_columns[3:]); here building_id / hour /
# meter_reading_log come last, not first - confirm this ordering is what the
# training code expects.
lr_weather_columns = [
    'air_temperature',
    'air_temperature_diff1',
    'air_temperature_diff2',
    'cloud_coverage',
    'dew_temperature',
    'wind_speed',
    'sea_level_pressure',
    'building_id',
    'hour',
    'meter_reading_log',
]

# Columns for the calendar-based regression: three bookkeeping columns, the
# holiday flag, then the weekday, week-of-year and month indicator columns.
lr_days_columns = ['hour', 'building_id', 'meter_reading_log', 'is_holiday']
for weekday in range(7):
    lr_days_columns.append(f'is_wday{weekday}')
for week in range(1, 54):
    lr_days_columns.append(f'is_w{week}')
for month in range(1, 13):
    lr_days_columns.append(f'is_m{month}')


In [16]:
from sklearn.model_selection import train_test_split

# Keep only rows with a positive meter reading, then split them 80/20 into a
# training part and a hold-out part. No random_state is passed, so the split
# differs from run to run.
energy_train, energy_test = train_test_split(
    energy.loc[energy['meter_reading'] > 0],
    test_size=0.2,
)

In [15]:
# Hours of the day, 0..23.
hours = range(0, 24)
# Building ids from 0 up to and including the largest id present in energy.
# range() excludes its stop value, so without the +1 the building with the
# highest id was left out.
buildings = range(0, energy['building_id'].max() + 1)
# Training rows restricted to the weather-model columns.
energy_train_weather = pd.DataFrame(energy_train, columns=lr_weather_columns)

19

In [17]:
def calculate_model(x,df_lr,lr_columns):
    """Score one row against its per-building, per-hour linear model.

    Args:
        x: Row (for example the Series handed over by
            ``DataFrame.apply(axis=1)``) providing ``building_id``, ``hour``,
            ``meter_reading`` and every column named in ``lr_columns[3:]``.
        df_lr: Nested container indexed as ``df_lr[building_id][hour]``. Each
            entry is a sequence holding one coefficient per feature followed
            by one additive constant (presumably the regression intercept),
            or an empty sequence when there is no model.
        lr_columns: Column list whose first three entries are skipped; the
            remaining entries are the feature names, in coefficient order.

    Returns:
        The same row ``x`` with ``meter_reading_lr_q`` set to the squared
        difference between ``log(meter_reading + 1)`` and
        ``log(prediction + 1)``.
    """
    feature_names = lr_columns[3:]
    lr = -1  # sentinel: stays negative when no model is available
    # Row-wise apply can upcast integer ids to float; cast them back so they
    # are valid positional indices.
    model = df_lr[int(x.building_id)][int(x.hour)]
    if len(model) > 0:
        lr = np.sum([x[c] * model[i] for i, c in enumerate(feature_names)])
        # The additive constant sits right after the last coefficient.
        lr += model[len(feature_names)]
        # The linear part is exponentiated to obtain the predicted reading.
        lr = np.exp(lr)
    # lr * lr == lr holds for 0, 1 and inf: a prediction of exactly 1 means
    # the linear part was exactly 0, and inf is an overflow. Both, like the
    # "no model" sentinel, are treated as a zero prediction.
    if lr < 0 or lr * lr == lr:
        lr = 0
    x['meter_reading_lr_q'] = (np.log(x.meter_reading + 1) - np.log(1 + lr)) ** 2
    return x

In [18]:
def train_model(df,columns):
    df_test_lr = pd.DataFrame(df,columns=columns)