In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [6]:
# Load the energy dataset CSV; the path is relative to the notebook's working directory
data_path = 'energydata_complete.csv'
df = pd.read_csv(data_path)

In [8]:
# Preview the first five rows as a quick sanity check of the load
df.head(n=5)

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [9]:
# Simple one-feature regression: T2 is the single predictor, T6 the response.
# Selecting with a list (['T2']) keeps X two-dimensional (a DataFrame).
X = df.loc[:, ['T2']]
y = df.loc[:, 'T6']

In [10]:
# Build an ordinary least-squares estimator and fit it on the single-feature data.
# Kept as one assignment so the cell does not display the estimator repr.
reg = (
    LinearRegression()
    .fit(X, y)
)

In [11]:
# Coefficient of determination of the fitted line, scored on the same X and y passed here
r_squared = reg.score(X, y)

# Report the score to two decimal places
r_squared_rounded = round(r_squared, 2)
print('R-squared:', r_squared_rounded)

R-squared: 0.64


In [13]:
# Remove the "date" and "lights" columns from df.
# errors='ignore' keeps this cell idempotent: because the result is assigned
# back to df, re-running the cell would otherwise raise KeyError once the
# columns are already gone.
df = df.drop(columns=['date', 'lights'], errors='ignore')

In [18]:
# Rescale every column with MinMaxScaler and rebuild a DataFrame so the
# original column names are retained.
# NOTE(review): the scaler is fitted on the whole frame (response column
# included) before any train/test split, so held-out rows influence the
# scaling — confirm this is intended.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df)
df_2 = pd.DataFrame(scaled_values, columns=df.columns)

In [19]:
# Response is the scaled 'Appliances' column; every other scaled column is a predictor
y = df_2['Appliances']
X = df_2.drop(columns='Appliances')

In [20]:
# Hold out 30% of the rows for testing; random_state=42 makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit a multiple linear regression on the training portion only.
# Kept as one assignment so the cell does not display the estimator repr.
model = (
    LinearRegression()
    .fit(X_train, y_train)
)

In [21]:
# Predict on the held-out rows and measure the mean absolute error
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

# Report the MAE to two decimal places
mae_rounded = round(mae, 2)
print('Mean Absolute Error:', mae_rounded)

Mean Absolute Error: 0.05


In [22]:
# Residual sum of squares on the test set: the sum of squared prediction errors
residuals = y_test - y_pred
rss = (residuals ** 2).sum()

# Report the RSS to two decimal places
print('Residual Sum of Squares:', round(rss, 2))

Residual Sum of Squares: 45.35


In [24]:
# Root mean squared error of the linear model on the test set.
# Computed as sqrt(MSE) instead of passing `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, where it raises TypeError.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print the Root Mean Squared Error rounded to 3 decimal places
print('Root Mean Squared Error:', round(rmse, 3))

Root Mean Squared Error: 0.088


In [26]:
# Coefficient of determination of the multiple regression on the test split
r_squared = r2_score(y_true=y_test, y_pred=y_pred)

# Report R-squared to two decimal places
print('Coefficient of Determination (R-squared):', round(r_squared, 2))

Coefficient of Determination (R-squared): 0.15


In [28]:
# Label each fitted coefficient with its feature name, then list them in ascending order
weights_by_feature = pd.Series(model.coef_, index=X.columns)
coef = weights_by_feature.sort_values()
print(coef)

RH_2          -0.456698
T_out         -0.321860
T2            -0.236178
T9            -0.189941
RH_8          -0.157595
RH_out        -0.077671
RH_7          -0.044614
RH_9          -0.039800
T5            -0.015657
T1            -0.003281
rv2            0.000770
rv1            0.000770
Press_mm_hg    0.006839
T7             0.010319
Visibility     0.012307
RH_5           0.016006
RH_4           0.026386
T4             0.028981
Windspeed      0.029183
RH_6           0.038049
RH_3           0.096048
T8             0.101995
Tdewpoint      0.117758
T6             0.236425
T3             0.290627
RH_1           0.553547
dtype: float64


In [30]:
# Train a ridge regression model with an alpha value of 0.4
ridge = Ridge(alpha=0.4)
ridge.fit(X_train, y_train)

# Evaluate the model on the test set.
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, where it raises TypeError.
y_pred_2 = ridge.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_2))
print("RMSE using Ridge: {:.3f}".format(rmse))

RMSE using Ridge: 0.088


In [32]:
# Fit an L1-regularised (Lasso) regression on the training split.
# The fit call stays as the cell's last expression so the estimator repr is displayed.
lasso_alpha = 0.001
lasso = Lasso(alpha=lasso_alpha)
lasso.fit(X_train, y_train)

Lasso(alpha=0.001)

In [35]:
# Coefficients learned by the Lasso model, one per feature
feature_weights = lasso.coef_

# Count the coefficients that are not exactly zero
num_nonzero = int((feature_weights != 0).sum())
print("Number of features with non-zero weights:", num_nonzero)

Number of features with non-zero weights: 4


In [36]:
# Evaluate the Lasso model on the test set.
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, where it raises TypeError.
y_pred_3 = lasso.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_3))
print("RMSE using lasso: {:.3f}".format(rmse))

RMSE using lasso: 0.094
