### Постановка задачи
Заполним отсутствующие значения по погоде интерполяционными данными.

Посчитаем модель линейной регрессии по первому зданию и найдем ее точность.

Данные:
* http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz
* http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz
* http://video.ittensive.com/machine-learning/ashrae/train.0.csv.gz
Соревнование: https://www.kaggle.com/c/ashrae-energy-prediction/

© ITtensive, 2020

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [3]:
buildings = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz")
weather = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz")
energy = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/train.0.csv.gz")

In [4]:
energy = energy[(energy["building_id"]==0)]

In [5]:
energy = pd.merge(left=energy, right=buildings, how="left",
                   left_on="building_id", right_on="building_id")
energy = energy.set_index(["timestamp", "site_id"])
weather = weather.set_index(["timestamp", "site_id"])
energy = pd.merge(left=energy, right=weather, how="left",
                  left_index=True, right_index=True)
energy.reset_index(inplace=True)
energy = energy.drop(columns=["meter", "site_id", "floor_count"], axis=1)
del buildings
del weather
print (energy.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8784 entries, 0 to 8783
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   timestamp           8784 non-null   object 
 1   building_id         8784 non-null   int64  
 2   meter_reading       8784 non-null   float64
 3   primary_use         8784 non-null   object 
 4   square_feet         8784 non-null   int64  
 5   year_built          8784 non-null   float64
 6   air_temperature     8781 non-null   float64
 7   cloud_coverage      4954 non-null   float64
 8   dew_temperature     8781 non-null   float64
 9   precip_depth_1_hr   8783 non-null   float64
 10  sea_level_pressure  8699 non-null   float64
 11  wind_direction      8534 non-null   float64
 12  wind_speed          8784 non-null   float64
dtypes: float64(9), int64(2), object(2)
memory usage: 892.2+ KB
None


In [6]:
def reduce_mem_usage (df):
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type)[:5] == "float":
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.finfo("f2").min and c_max < np.finfo("f2").max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo("f4").min and c_max < np.finfo("f4").max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        elif str(col_type)[:3] == "int":
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.iinfo("i1").min and c_max < np.iinfo("i1").max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo("i2").min and c_max < np.iinfo("i2").max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo("i4").min and c_max < np.iinfo("i4").max:
                df[col] = df[col].astype(np.int32)
            elif c_min > np.iinfo("i8").min and c_max < np.iinfo("i8").max:
                df[col] = df[col].astype(np.int64)
        elif col == "timestamp":
            df[col] = pd.to_datetime(df[col])
        elif str(col_type)[:8] != "datetime":
            df[col] = df[col].astype("category")
    end_mem = df.memory_usage().sum() / 1024**2
    print('Потребление памяти меньше на', round(start_mem - end_mem, 2), 'Мб (минус', round(100 * (start_mem - end_mem) / start_mem, 1), '%)')
    return df

In [7]:
energy = reduce_mem_usage(energy)

Потребление памяти меньше на 0.62 Мб (минус 71.1 %)


In [8]:
energy['precip_depth_1_hr'] = energy['precip_depth_1_hr'].apply(lambda x: 0 if x < 0 else x)

In [9]:
energy.columns

Index(['timestamp', 'building_id', 'meter_reading', 'primary_use',
       'square_feet', 'year_built', 'air_temperature', 'cloud_coverage',
       'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure',
       'wind_direction', 'wind_speed'],
      dtype='object')

In [10]:
interpolate_cols = ['air_temperature','cloud_coverage','dew_temperature','wind_speed','precip_depth_1_hr','sea_level_pressure']

In [11]:
for col in interpolate_cols:
    energy[col] = energy[col].interpolate(limit_direction='both',kind='cubic')

In [12]:
pd.set_option('use_inf_as_na',True)

In [13]:
for col in interpolate_cols:
    print(col,energy[col].isnull().sum())

air_temperature 0
cloud_coverage 0
dew_temperature 0
wind_speed 0
precip_depth_1_hr 0
sea_level_pressure 0


In [41]:
X_train,X_test = train_test_split(energy[energy['meter_reading']>0],test_size=0.2)

In [15]:
reg_cols = ['air_temperature','cloud_coverage','dew_temperature','wind_speed','meter_reading','sea_level_pressure']

In [42]:
X_train_lr = X_train[reg_cols]

In [43]:
y = X_train_lr['meter_reading']
X = X_train_lr.drop('meter_reading',axis=1)

In [44]:
model = LinearRegression()

In [45]:
model.fit(X,y)

LinearRegression()

In [46]:
model.coef_,model.intercept_

(array([ 2.4893153, -2.2992911,  3.773    , -2.4098268, -1.0149957],
       dtype=float32),
 1152.3059)

In [47]:
def calculate_model(x):
    lr = np.sum([x[col] * model.coef_[i] for i,col in enumerate(reg_cols[1:])])
    lr += model.intercept_
    x["meter_reading_lr_q"] = (np.log(1 + x.meter_reading) -
                               np.log(1 + lr))**2
    return x

In [48]:
X_test = X_test.apply(calculate_model,axis=1,result_type='expand')

  np.log(1 + lr))**2


In [49]:
X_test_lr_rmsle = np.sqrt(X_test['meter_reading_lr_q'].sum() / len(X_test))

In [50]:
X_test_lr_rmsle

0.016403367132333577

In [51]:
X_test[['meter_reading','meter_reading_lr_q']].head()

Unnamed: 0,meter_reading,meter_reading_lr_q
7604,225.25,
5263,313.25,
5924,238.25,
7195,179.5,
4429,239.625,


In [54]:
X_test['meter_reading_lr_q'].isnull().sum()

1082