In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib

Using matplotlib backend: Qt5Agg


In [3]:
df = pd.read_csv('energydata_complete.csv')
df.head(1)

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433


In [4]:
df = df.drop(['date', 'lights'], axis=1)

In [27]:
df.head(1)

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,60,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,17.166667,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433


### Normalising the Data

In [36]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
normalised_df = pd.DataFrame(scaler.fit_transform(df), columns = df.columns)
features_df = normalised_df.drop(columns=['Appliances'])
target_df = normalised_df['Appliances']

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,0.046729,0.32735,0.566187,0.225345,0.684038,0.215188,0.746066,0.351351,0.764262,0.175506,...,0.223032,0.67729,0.37299,0.097674,0.894737,0.5,0.953846,0.538462,0.265449,0.265449
1,0.046729,0.32735,0.541326,0.225345,0.68214,0.215188,0.748871,0.351351,0.782437,0.175506,...,0.2265,0.678532,0.369239,0.1,0.894737,0.47619,0.894872,0.533937,0.372083,0.372083
2,0.037383,0.32735,0.530502,0.225345,0.679445,0.215188,0.755569,0.344745,0.778062,0.175506,...,0.219563,0.676049,0.365488,0.102326,0.894737,0.452381,0.835897,0.529412,0.572848,0.572848
3,0.037383,0.32735,0.52408,0.225345,0.678414,0.215188,0.758685,0.341441,0.770949,0.175506,...,0.219563,0.671909,0.361736,0.104651,0.894737,0.428571,0.776923,0.524887,0.908261,0.908261
4,0.046729,0.32735,0.531419,0.225345,0.676727,0.215188,0.758685,0.341441,0.762697,0.178691,...,0.219563,0.671909,0.357985,0.106977,0.894737,0.404762,0.717949,0.520362,0.201611,0.201611


### Splitting the Data

In [35]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(features_df, target_df, test_size = 0.3, random_state = 42)

ValueError: Found input variables with inconsistent numbers of samples: [5, 19735]

In [7]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression()

reg.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [8]:
y_pred = reg.predict(x_test)

### R2_Score

In [9]:
from sklearn.metrics import r2_score

score = r2_score(y_test, y_pred)
score

0.14890246319303535

### Mean Squared Error

In [10]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, y_pred)
round(mse, 3)

0.008

### Root Mean Squared Error

In [15]:
rmse = np.sqrt(mse)
rmse

0.0875144494766171

### Residual Sum of Means

In [12]:
rss = np.sum(np.square(y_test - y_pred))
round(rss, 3)

45.348

### Mean Absolute Error

In [14]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_test, y_pred)
round(mae, 3)

0.05

### Ridge and Lasso Regression

In [80]:
from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha= 0.5 )
ridge_reg.fit(x_train, y_train)

Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [81]:
from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha= 0.001 )
lasso_reg.fit(x_train, y_train)

Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [86]:
def get_weights_df(reg, feat, col_name) :
#this function returns the weight of every feature
    weights = pd.Series(reg.coef_, feat.columns).sort_values()
    weights_df = pd.DataFrame(weights).reset_index()
    weights_df.columns = [ 'Features' , col_name]
    weights_df[col_name].round( 3 )
    return weights_df

In [87]:
linear_model_weights = get_weights_df(reg, x_train, 'Linear_Model_Weight' )
ridge_weights_df = get_weights_df(ridge_reg, x_train, 'Ridge_Weight' )
lasso_weights_df = get_weights_df(lasso_reg, x_train, 'Lasso_weight' )
final_weights = pd.merge(linear_model_weights, ridge_weights_df, on= 'Features' )
final_weights = pd.merge(final_weights, lasso_weights_df, on= 'Features' )

In [19]:
X = df.T2
Y = df.T6

In [20]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(df[['T2']], df.T6) 

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [22]:
ypred = reg.predict(df[['T6']])

In [26]:
from sklearn.metrics import r2_score
score = r2_score(df.T6, ypred)
round(score, 2)

-21.13

### Residual Sum of Squares