In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the raw energy readings from the CSV file in the working directory.
df = pd.read_csv('energydata_complete.csv')
# Preview three random rows. The fixed seed keeps this preview identical on
# every "Restart & Run All" instead of showing different rows each run.
df.sample(3, random_state=42)

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
9445,2016-03-17 07:10:00,60,0,20.6,36.223333,17.066667,40.56,22.0,36.0,20.0,...,19.6,39.4,-0.35,766.35,87.666667,1.166667,62.666667,-2.233333,5.216816,5.216816
4690,2016-02-13 06:40:00,50,0,20.29,38.29,18.29,39.59,21.0,40.06,18.7,...,18.6,43.0,1.5,738.933333,87.333333,2.0,23.666667,-0.433333,43.28419,43.28419
7702,2016-03-05 04:40:00,50,0,20.356667,37.126667,17.566667,40.23,20.89,37.663333,18.7,...,18.39,41.09,0.3,740.333333,99.0,1.0,41.333333,0.1,4.622052,4.622052


In [3]:
# Remove the `date` and `lights` columns before scaling and modelling.
# errors='ignore' keeps this cell re-runnable: the result is assigned back to
# `df`, so a second execution would otherwise raise KeyError because the
# columns are already gone.
df = df.drop(columns=['date', 'lights'], errors='ignore')

In [5]:
# Min-max scale every column of `df`, then wrap the scaled values in a
# DataFrame that carries the original column labels.
# NOTE(review): the scaler is fitted on all rows before the train/test splits
# further down, so the test rows' minima/maxima influence the scaling.
# Consider fitting on the training rows only.
sc = MinMaxScaler()
scaled_values = sc.fit_transform(df)
normal_df = pd.DataFrame(scaled_values, columns=df.columns)


In [14]:
# Preview the first rows of the scaled frame to confirm the rescaling.
normal_df.head()

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,0.046729,0.32735,0.566187,0.225345,0.684038,0.215188,0.746066,0.351351,0.764262,0.175506,...,0.223032,0.67729,0.37299,0.097674,0.894737,0.5,0.953846,0.538462,0.265449,0.265449
1,0.046729,0.32735,0.541326,0.225345,0.68214,0.215188,0.748871,0.351351,0.782437,0.175506,...,0.2265,0.678532,0.369239,0.1,0.894737,0.47619,0.894872,0.533937,0.372083,0.372083
2,0.037383,0.32735,0.530502,0.225345,0.679445,0.215188,0.755569,0.344745,0.778062,0.175506,...,0.219563,0.676049,0.365488,0.102326,0.894737,0.452381,0.835897,0.529412,0.572848,0.572848
3,0.037383,0.32735,0.52408,0.225345,0.678414,0.215188,0.758685,0.341441,0.770949,0.175506,...,0.219563,0.671909,0.361736,0.104651,0.894737,0.428571,0.776923,0.524887,0.908261,0.908261
4,0.046729,0.32735,0.531419,0.225345,0.676727,0.215188,0.758685,0.341441,0.762697,0.178691,...,0.219563,0.671909,0.357985,0.106977,0.894737,0.404762,0.717949,0.520362,0.201611,0.201611


In [19]:
# Single-feature regression setup: T2 is the predictor, T6 the response.
# The response stays one-dimensional; the predictor is reshaped into a
# single-column 2-D array.
Y = normal_df['T6'].to_numpy()
X = normal_df['T2'].to_numpy().reshape(-1, 1)

In [20]:
# Sanity check: the predictor should be 2-D with a single column.
X.shape

(19735, 1)

In [21]:
# Sanity check: the response should be 1-D with one entry per row of X.
Y.shape

(19735,)

In [22]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.3,
)

In [23]:
# Fit ordinary least squares on the training portion (fit() returns the
# estimator itself), then predict the held-out rows.
lin_reg = LinearRegression().fit(x_train, y_train)
y_pred = lin_reg.predict(x_test)

In [24]:
# NOTE(review): these metric helpers are imported mid-notebook; consider
# moving this line into the import cell at the top so every dependency is
# visible in one place.
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [25]:
# Coefficient of determination of the single-feature model on the held-out rows.
r_squared = r2_score(y_true=y_test, y_pred=y_pred)

In [33]:
# Report the score rounded to two decimals.
print('R Squared is %.2f' % (r_squared,))

R Squared is 0.64


In [27]:
# Multi-feature setup: Appliances is the response and every other scaled
# column is a predictor.
target = normal_df['Appliances']
features = normal_df.drop(columns=['Appliances'])

In [28]:
# Sanity check: number of rows and predictor columns.
features.shape

(19735, 26)

In [29]:
# Sanity check: the response must have one entry per feature row.
target.shape

(19735,)

In [30]:
# 70/30 train/test split of the full feature matrix, seeded for repeatability.
X_train, X_test, Y_train, Y_test = train_test_split(
    features,
    target,
    random_state=42,
    test_size=0.3,
)

In [31]:
# Train the all-features linear model (fit() returns the estimator itself)
# and predict the held-out rows.
linear_reg = LinearRegression().fit(X_train, Y_train)
y_prediction = linear_reg.predict(X_test)

In [35]:
# Mean absolute error of the all-features linear model on the test rows.
mae = mean_absolute_error(y_true=Y_test, y_pred=y_prediction)
print('Mean Absolute Error is %.2f ' % (mae,))

Mean Absolute Error is 0.05 


In [36]:
# Residual sum of squares: square each test-set residual and add them up.
residuals = Y_test - y_prediction
rss = np.square(residuals).sum()
print('Residual Sum of Squares is %.2f ' % (rss,))

Residual Sum of Squares is 45.35 


In [37]:
# Root mean squared error: square root of the test-set MSE.
mse = mean_squared_error(y_true=Y_test, y_pred=y_prediction)
rmse = np.sqrt(mse)
print('Root Mean Squared Error is %.3f ' % (rmse,))

Root Mean Squared Error is 0.088 


In [39]:
# R^2 of the all-features linear model on the held-out rows.
coef_of_det = r2_score(y_true=Y_test, y_pred=y_prediction)
print('Coefficient of Determination is %.2f ' % (coef_of_det,))

Coefficient of Determination is 0.15 


In [44]:
# Pair each predictor name with its fitted linear-regression weight and show
# the table ordered from the largest coefficient to the smallest.
general = dict(Features=features.columns, Coefficients=linear_reg.coef_)
coefficient_df = pd.DataFrame(general)
coefficient_df.sort_values('Coefficients', ascending=False)

Unnamed: 0,Features,Coefficients
1,RH_1,0.553547
4,T3,0.290627
10,T6,0.236425
23,Tdewpoint,0.117758
14,T8,0.101995
5,RH_3,0.096048
11,RH_6,0.038049
21,Windspeed,0.029183
6,T4,0.028981
7,RH_4,0.026386


In [45]:
# Ridge regression with alpha=0.4 on the same split (fit() returns the
# estimator itself), followed by predictions for the held-out rows.
ridge_reg = Ridge(alpha=0.4).fit(X_train, Y_train)
ridge_pred = ridge_reg.predict(X_test)

In [49]:
# Test-set RMSE of the ridge model.
mse_ridge = mean_squared_error(y_true=Y_test, y_pred=ridge_pred)
rmse_ridge = np.sqrt(mse_ridge)
print('Root Mean Squared Error for Ridge Regression is %.3f ' % (rmse_ridge,))

Root Mean Squared Error for Ridge Regression is 0.088 


In [47]:
# Lasso regression with alpha=0.001 on the training rows. The bare fit() call
# stays the last expression so the notebook echoes the fitted estimator.
lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(X=X_train, y=Y_train)


Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [48]:
# Tabulate the lasso weights per predictor, largest first.
general_lasso = dict(Features=features.columns, Coefficients=lasso_reg.coef_)
coeff_lasso_df = pd.DataFrame(general_lasso)
coeff_lasso_df.sort_values('Coefficients', ascending=False)

Unnamed: 0,Features,Coefficients
1,RH_1,0.01788
21,Windspeed,0.002912
0,T1,0.0
12,T7,-0.0
24,rv1,-0.0
23,Tdewpoint,0.0
22,Visibility,0.0
19,Press_mm_hg,-0.0
18,T_out,0.0
17,RH_9,-0.0


In [50]:
# Predict the held-out rows with the fitted lasso model.
lasso_pred = lasso_reg.predict(X=X_test)

In [51]:
# Test-set RMSE of the lasso model.
mse_lasso = mean_squared_error(y_true=Y_test, y_pred=lasso_pred)
rmse_lasso = np.sqrt(mse_lasso)
print('Root Mean Squared Error for Lasso Regression is %.3f ' % (rmse_lasso,))

Root Mean Squared Error for Lasso Regression is 0.094 
