In [66]:
import pandas as pd
import numpy as np
from scipy.interpolate import interp1d
import matplotlib.pyplot as plt
from matplotlib.pyplot import rcParams
import seaborn as sns

In [67]:
buildings = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz")
weather = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz")
energy = pd.read_csv("http://video.ittensive.com/machine-learning/ashrae/train.0.csv.gz")

In [68]:
def reduce_mem_usage (df):
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type)[:5] == "float":
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.finfo("f2").min and c_max < np.finfo("f2").max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo("f4").min and c_max < np.finfo("f4").max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        elif str(col_type)[:3] == "int":
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.iinfo("i1").min and c_max < np.iinfo("i1").max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo("i2").min and c_max < np.iinfo("i2").max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo("i4").min and c_max < np.iinfo("i4").max:
                df[col] = df[col].astype(np.int32)
            elif c_min > np.iinfo("i8").min and c_max < np.iinfo("i8").max:
                df[col] = df[col].astype(np.int64)
        elif col == "timestamp":
            df[col] = pd.to_datetime(df[col])
        elif str(col_type)[:8] != "datetime":
            df[col] = df[col].astype("category")
    end_mem = df.memory_usage().sum() / 1024**2
    print('Потребление памяти меньше на', round(start_mem - end_mem, 2), 'Мб (минус', round(100 * (start_mem - end_mem) / start_mem, 1), '%)')
    return df

def round_fillna (df, columns):
    for col in columns:
        type_ = "int8"
        if col in ["wind_direction", "year_built", "precip_depth_1_hr"]:
            type_ = "int16"
        if col == "precip_depth_1_hr":
            df[col] = df[col].apply(lambda x:0 if x<0 else x)
        df[col] = np.round(df[col].fillna(value=0)).astype(type_)
    return df

In [69]:
buildings = reduce_mem_usage(buildings)
weather = reduce_mem_usage(weather)
energy = reduce_mem_usage(energy)

Потребление памяти меньше на 0.05 Мб (минус 73.8 %)
Потребление памяти меньше на 6.53 Мб (минус 68.1 %)
Потребление памяти меньше на 195.54 Мб (минус 53.1 %)


In [70]:
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

In [71]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [72]:
energy = pd.merge(left=energy, right=buildings, how="left",
                   left_on="building_id", right_on="building_id")
energy = energy.set_index(["timestamp", "site_id"])
weather = weather.set_index(["timestamp", "site_id"])
energy = pd.merge(left=energy, right=weather, how="left",
                  left_index=True, right_index=True)
energy.reset_index(inplace=True)
energy = energy.drop(columns=["meter", "site_id", "floor_count"], axis=1)
del buildings
del weather
print (energy.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12060910 entries, 0 to 12060909
Data columns (total 13 columns):
 #   Column              Dtype         
---  ------              -----         
 0   timestamp           datetime64[ns]
 1   building_id         int16         
 2   meter_reading       float32       
 3   primary_use         category      
 4   square_feet         int32         
 5   year_built          float16       
 6   air_temperature     float16       
 7   cloud_coverage      float16       
 8   dew_temperature     float16       
 9   precip_depth_1_hr   float16       
 10  sea_level_pressure  float16       
 11  wind_direction      float16       
 12  wind_speed          float16       
dtypes: category(1), datetime64[ns](1), float16(8), float32(1), int16(1), int32(1)
memory usage: 402.6 MB
None


In [73]:
interpolate_columns = ["air_temperature", "dew_temperature", "cloud_coverage", "wind_speed", "precip_depth_1_hr", "sea_level_pressure"]

In [74]:
for col in interpolate_columns:
    energy[col] = energy[col].interpolate(limit_direction='both',
                            kind='cubic')

In [76]:
regression_columns = ["meter_reading", "air_temperature",
        "dew_temperature", "cloud_coverage", "wind_speed",
        "precip_depth_1_hr", "sea_level_pressure","is_holiday"]

In [77]:
dates_range = pd.date_range(start='2015-12-31', end='2017-01-01')
us_holidays = calendar().holidays(start=dates_range.min(),
                                  end=dates_range.max())

energy["dayofweek"] = energy["timestamp"].dt.dayofweek.astype("int8")
energy["day"] = energy["timestamp"].dt.day.astype("int8")
energy["dayofyear"] = energy["timestamp"].dt.dayofyear.astype("int16")
energy["month"] = energy["timestamp"].dt.month.astype("int8")
energy["week"] = energy["timestamp"].dt.week.astype("int8")
energy["date"] = pd.to_datetime(energy["timestamp"].dt.date)
energy['is_holiday'] = (energy['date'].isin(us_holidays)).astype("int8")

In [81]:
energy['building_id'].min()

0

In [82]:
energy = energy[energy['building_id'] < 21]

In [83]:
energy_train = energy[regression_columns]

In [84]:
energy_train.isnull().sum()

meter_reading         0
air_temperature       0
dew_temperature       0
cloud_coverage        0
wind_speed            0
precip_depth_1_hr     0
sea_level_pressure    0
is_holiday            0
dtype: int64

In [85]:
X_train,X_test = train_test_split(energy_train,test_size=0.2)

In [86]:
X = X_train.drop('meter_reading',axis=1)

In [87]:
y = X_train['meter_reading']

In [88]:
model = LinearRegression()

In [89]:
model.fit(X,y)

LinearRegression()

In [90]:
model.coef_,model.intercept_

(array([ 8.335905  , 17.669596  , -4.3035636 , -4.0379324 , -0.14847988,
        11.589966  , 22.177511  ], dtype=float32),
 -11935.577)

In [91]:
from sklearn.metrics import mean_squared_log_error

In [61]:
def calculate_model1(x):
    lr = np.sum([x[col] * model.coef_[i] for i,col in enumerate(regression_columns[1:])])
    lr += model.intercept_
    
    x['meter_reading_lr_q'] = (np.log(1 + x.meter_reading) - np.log(1 + lr)) ** 2
    return x

In [92]:
def calculate_model (x):
    lr = np.sum([x[col] * model.coef_[i] for i,col in enumerate(regression_columns[1:])])
    lr += model.intercept_
    x["meter_reading_lr_q"] = (np.log(1 + x.meter_reading) -
                               np.log(1 + lr))**2
    return x

energy_test = X_test.apply(calculate_model,
                                    axis=1, result_type="expand")
energy_test_lr_rmsle = np.sqrt(energy_test["meter_reading_lr_q"].sum() / len(energy_test))
print ("Качество линейной регрессии:", energy_test_lr_rmsle, round(energy_test_lr_rmsle, 1))

  np.log(1 + lr))**2


Качество линейной регрессии: 3.349160736941303 3.3
