# Linear Regression Analysis

Author(s):
* Federico Maria Cruciani

In [188]:
import time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from math import sqrt
from sklearn.metrics import mean_squared_error, r2_score

In [189]:
df = pd.read_csv('../../dataset/KAG_energydata_complete.csv')

## Setup data

In [190]:
col_temp = ["T1", "T2", "T3", "T4", "T5", "T6", "T7", "T8", "T9"]
col_hum = ["RH_1", "RH_2", "RH_3", "RH_4", "RH_5", "RH_6", "RH_7", "RH_8", "RH_9"]
col_weather = ["T_out", "Tdewpoint", "RH_out", "Press_mm_hg", "Windspeed", "Visibility"]
col_light = ["lights"]
col_randoms = ["rv1", "rv2"]
col_target = ["Appliances"]

In [191]:
train: pd.DataFrame
test: pd.DataFrame
train, test = train_test_split(df, test_size=0.25, random_state=42)
feature_vars = train[col_temp + col_hum + col_weather + col_randoms]
target_vars = train[col_target]

In [192]:
feature_vars.describe()

Unnamed: 0,T1,T2,T3,T4,T5,T6,T7,T8,T9,RH_1,...,RH_8,RH_9,T_out,Tdewpoint,RH_out,Press_mm_hg,Windspeed,Visibility,rv1,rv2
count,14801.0,14801.0,14801.0,14801.0,14801.0,14801.0,14801.0,14801.0,14801.0,14801.0,...,14801.0,14801.0,14801.0,14801.0,14801.0,14801.0,14801.0,14801.0,14801.0,14801.0
mean,21.684882,20.340154,22.262874,20.847201,19.588907,7.902213,20.261409,22.026272,19.481392,40.258743,...,42.951792,41.564688,7.409235,3.770374,79.815677,755.535791,4.031558,38.336201,25.02683,25.02683
std,1.607712,2.194775,2.003559,2.045689,1.846895,6.100189,2.112339,1.955867,2.017567,3.964974,...,5.225716,4.157938,5.326024,4.19984,14.908459,7.380735,2.438925,11.825196,14.496293,14.496293
min,16.79,16.1,17.2,15.1,15.33,-6.03,15.39,16.306667,14.89,27.023333,...,29.6,29.166667,-5.0,-6.6,24.0,729.3,0.0,1.0,0.006033,0.006033
25%,20.76,18.79,20.79,19.5,18.275,3.59,18.7,20.79,18.0,37.363333,...,39.09,38.518,3.6,0.9,70.333333,750.933333,2.0,29.0,12.511717,12.511717
50%,21.6,20.0,22.1,20.6,19.39,7.3,20.0,22.1,19.39,39.633333,...,42.397143,40.9,6.9,3.433333,84.0,756.1,3.666667,40.0,24.9303,24.9303
75%,22.6,21.5,23.29,22.1,20.6,11.233333,21.6,23.39,20.6,43.06,...,46.555,44.363333,10.4,6.583333,91.666667,760.966667,5.333333,40.0,37.660334,37.660334
max,26.26,29.856667,29.236,26.2,25.795,28.29,25.963333,27.23,24.5,57.423333,...,58.78,53.326667,26.1,15.4,100.0,772.283333,14.0,66.0,49.99653,49.99653


In [None]:
_ = feature_vars.hist(bins=20, figsize=(12, 16))

In [193]:
train_X = train[feature_vars.columns]
train_y = train[target_vars.columns]

test_X = test[feature_vars.columns]
test_y = test[target_vars.columns]

In [194]:
train_X.drop(["rv1", "rv2", "Visibility", "T9", "Tdewpoint"], axis=1 , inplace=True)
test_X.drop(["rv1", "rv2", "Visibility", "T9", "Tdewpoint"], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_X.drop(["rv1", "rv2", "Visibility", "T9", "Tdewpoint"], axis=1 , inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_X.drop(["rv1", "rv2", "Visibility", "T9", "Tdewpoint"], axis=1, inplace=True)


### Perform scaling with Appliances column included

In [195]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler(with_mean=False)

In [196]:
# Create test and training set by including Appliances column
train = train[list(train_X.columns.values) + col_target]
test = test[list(test_X.columns.values) + col_target]

# Create dummy test and training set to hold scaled values
sc_train = pd.DataFrame(columns=train.columns, index=train.index)
sc_train[sc_train.columns] = sc.fit_transform(train)

sc_test = pd.DataFrame(columns=test.columns, index=test.index)
sc_test[sc_test.columns] = sc.fit_transform(test)

In [197]:
train_X = sc_train.drop(['Appliances'], axis=1)
train_y = sc_train['Appliances']

test_X = sc_test.drop(['Appliances'], axis=1)
test_y = sc_test['Appliances']

## Models testing

In [198]:
def print_metrics(model, X_train, y_train, X_test, y_test):
    print(f'Train R2 score: {r2_score(y_train, model.predict(X_train))}')
    print(f'Test R2 score: {r2_score(y_test, model.predict(X_test))}')
    
    mse = mean_squared_error(y_test, model.predict(X_test))
    print(f'Test RMSE score: {sqrt(mse)}')

### Gamma regressor

In [199]:
from sklearn.linear_model import GammaRegressor
from sklearn.utils import column_or_1d

gamma = GammaRegressor(max_iter=5000)

start = time.time()
gamma.fit(train_X, column_or_1d(train_y))
end = time.time()
print(f'Train time: {end - start}')

Train time: 0.24423766136169434


In [200]:
print_metrics(gamma, train_X, train_y, test_X, test_y)

Train R2 score: 0.055208994161055625
Test R2 score: 0.05240442930404876
Test RMSE score: 0.9734452068277657


### Polynomial regressor

In [201]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2)
train_X_poly = poly.fit_transform(train_X)
test_X_poly = poly.fit_transform(test_X)

In [202]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
start = time.time()
lr.fit(train_X_poly, train_y)
end = time.time()
print(f'Train time: {end - start}')

Train time: 0.30675601959228516


In [203]:
print_metrics(lr, train_X_poly, train_y, test_X_poly, test_y)

Train R2 score: 0.28757316261111976
Test R2 score: 0.19107140124694144
Test RMSE score: 0.8994045801267962


### Inverse Gaussian regressor

In [204]:
from sklearn.linear_model import TweedieRegressor

inv_gaussian = TweedieRegressor(power=3, max_iter=5000)

start = time.time()
inv_gaussian.fit(train_X, column_or_1d(train_y))
end = time.time()
print(f'Train time: {end - start}')

Train time: 0.2502450942993164


In [205]:
print_metrics(inv_gaussian, train_X, train_y, test_X, test_y)

Train R2 score: 0.05487534461571064
Test R2 score: 0.050562880576359714
Test RMSE score: 0.9743906400533825
