<a href="https://colab.research.google.com/github/AleksandrRevuka/Algo_data_science/blob/main/4_Practical_skills_1_2_linear_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Linear regression

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [8]:
# Download the three gzipped CSVs in a fixed order (building metadata, weather,
# meter readings from train.0.0) and bind them to the names used by later cells.
buildings, weather, energy_0 = (
    pd.read_csv(csv_url)
    for csv_url in (
        "http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz",
        "http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz",
        "http://video.ittensive.com/machine-learning/ashrae/train.0.0.csv.gz",
    )
)
# Print column dtypes and non-null counts of the meter readings.
energy_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8784 entries, 0 to 8783
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   building_id    8784 non-null   int64  
 1   meter          8784 non-null   int64  
 2   timestamp      8784 non-null   object 
 3   meter_reading  8784 non-null   float64
dtypes: float64(1), int64(2), object(1)
memory usage: 274.6+ KB


In [9]:
# Attach the building metadata to every meter reading (left join keeps all readings).
energy_0 = pd.merge(left=energy_0, right=buildings, how="left", left_on="building_id", right_on="building_id")

# Index both frames by (timestamp, site_id) so the weather can be joined on the index.
# Rebinding instead of inplace=True leaves the names in the same final state
# without mutating the frame objects behind the reader's back.
energy_0 = energy_0.set_index(["timestamp", "site_id"])
weather = weather.set_index(["timestamp", "site_id"])

energy_0 = pd.merge(left=energy_0, right=weather, how="left", left_index=True, right_index=True)
energy_0 = energy_0.reset_index()

# Keep only rows with a positive meter reading. The explicit .copy() makes the
# result an independent frame, so the column assignments below do not write to a
# slice of the merged frame (which triggers SettingWithCopyWarning).
energy_0 = energy_0[energy_0["meter_reading"] > 0].copy()
energy_0["timestamp"] = pd.to_datetime(energy_0["timestamp"])
# Hour of day is the key used later for the per-hour average baseline.
energy_0["hour"] = energy_0["timestamp"].dt.hour
energy_0

Unnamed: 0,timestamp,site_id,building_id,meter,meter_reading,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour
704,2016-01-30 08:00:00,0,0,0,43.6839,Education,7432,2008.0,,8.3,,6.1,0.0,1019.0,220.0,2.1,8
725,2016-01-31 05:00:00,0,0,0,37.5408,Education,7432,2008.0,,12.8,,10.0,0.0,1021.9,0.0,0.0,5
737,2016-01-31 17:00:00,0,0,0,52.5571,Education,7432,2008.0,,20.6,,11.7,0.0,1020.9,110.0,1.5,17
2366,2016-04-08 14:00:00,0,0,0,59.3827,Education,7432,2008.0,,21.7,2.0,14.4,0.0,1015.1,250.0,3.1,14
2923,2016-05-01 19:00:00,0,0,0,448.0000,Education,7432,2008.0,,31.1,,17.2,0.0,1016.1,100.0,4.1,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8779,2016-12-31 19:00:00,0,0,0,232.0710,Education,7432,2008.0,,22.8,,10.0,0.0,1021.7,140.0,5.7,19
8780,2016-12-31 20:00:00,0,0,0,189.0690,Education,7432,2008.0,,23.3,,8.9,0.0,1021.0,170.0,4.1,20
8781,2016-12-31 21:00:00,0,0,0,169.9580,Education,7432,2008.0,,23.3,,10.0,0.0,1021.1,150.0,4.1,21
8782,2016-12-31 22:00:00,0,0,0,169.9580,Education,7432,2008.0,,22.8,,10.0,0.0,1021.1,160.0,3.1,22


In [10]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# shuffle deterministic, so the fitted coefficients and the error figures
# further down are the same on every Restart & Run All.
energy_0_train, energy_0_test = train_test_split(energy_0, test_size=0.2, random_state=42)
energy_0_train.head()

Unnamed: 0,timestamp,site_id,building_id,meter,meter_reading,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour
6627,2016-10-03 03:00:00,0,0,0,251.865,Education,7432,2008.0,,25.6,,22.8,0.0,1015.3,280.0,1.5,3
6024,2016-09-08 00:00:00,0,0,0,259.373,Education,7432,2008.0,,27.2,2.0,19.4,0.0,1020.1,60.0,4.6,0
8500,2016-12-20 04:00:00,0,0,0,218.419,Education,7432,2008.0,,23.3,,21.7,0.0,1026.7,10.0,3.6,4
8218,2016-12-08 10:00:00,0,0,0,68.9386,Education,7432,2008.0,,13.9,0.0,12.8,0.0,1019.1,360.0,2.6,10
5707,2016-08-25 19:00:00,0,0,0,232.753,Education,7432,2008.0,,33.3,4.0,21.1,0.0,1016.3,60.0,7.2,19


In [13]:
# Baseline predictor: mean meter reading of the training rows for each hour of day.
energy_0_train_averages = energy_0_train["meter_reading"].groupby(energy_0_train["hour"]).mean()

# Regression data: the target plus the two temperature features.
energy_0_train_lr = energy_0_train[["meter_reading", "air_temperature", "dew_temperature"]]
y = energy_0_train_lr["meter_reading"]
x = energy_0_train_lr[["air_temperature", "dew_temperature"]]

# Ordinary least squares; coef_ follows the column order of x
# (air_temperature first, dew_temperature second).
model = LinearRegression()
model.fit(x, y)
print(model.coef_, model.intercept_)

[2.26930549 4.14538281] 100.29387180495476


In [15]:
def calculate_model(x):
  """Compute per-row squared log errors for the two predictors.

  The actual reading and both predictions are compared on the log(1 + value)
  scale, so each new field is one term of the sum used for RMSLE.

  Reads two module-level objects: `energy_0_train_averages` (looked up by the
  row's hour) and the fitted `model` (coef_[0] multiplies air_temperature,
  coef_[1] multiplies dew_temperature, plus intercept_).

  Args:
    x: one row (pandas Series) with the fields `meter_reading`, `hour`,
      `air_temperature` and `dew_temperature`.

  Returns:
    A copy of the row with `meter_reading_lr_q` (linear-regression error) and
    `meter_reading_mean_q` (hourly-mean error) added. The row passed in is
    not modified.
  """
  # DataFrame.apply does not support functions that mutate the object they
  # receive, so the results are written to a private copy of the row.
  scored = x.copy()

  meter_reading_log = np.log(scored.meter_reading + 1)
  meter_reading_mean = np.log(energy_0_train_averages[scored.hour] + 1)
  meter_reading_lr = np.log(1 + scored.air_temperature * model.coef_[0] +
                                scored.dew_temperature * model.coef_[1] +
                                model.intercept_)

  scored["meter_reading_lr_q"] = (meter_reading_log - meter_reading_lr) ** 2
  scored["meter_reading_mean_q"] = (meter_reading_log - meter_reading_mean) ** 2
  return scored

# Score every held-out row; each returned row becomes one row of the new frame,
# now carrying the two squared-log-error columns.
energy_0_test = energy_0_test.apply(calculate_model, axis=1, result_type="expand")

# RMSLE = sqrt(sum of squared log errors / number of test rows), per predictor.
energy_0_test_lr_rmsle = np.sqrt(
    energy_0_test["meter_reading_lr_q"].sum() / energy_0_test.shape[0]
)
energy_0_test_mean_rmsle = np.sqrt(
    energy_0_test["meter_reading_mean_q"].sum() / energy_0_test.shape[0]
)

# Lower is better: hourly-mean baseline first, then the linear regression.
print(f"Якість середнього: {energy_0_test_mean_rmsle}")
print(f"Якість лінійної регресії: {energy_0_test_lr_rmsle}")

Якість середнього: 0.2498711009262749
Якість лінійної регресії: 0.21777940789489802
