In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Read the energy dataset from the CSV file in the working directory
data = pd.read_csv('energydata_complete.csv')

# Simple (single-feature) regression: column T2 is the only predictor,
# column T6 is the response. X stays a one-column DataFrame because
# scikit-learn estimators expect 2-D feature input.
X = data[['T2']]
y = data['T6']
lm = LinearRegression().fit(X, y)

# In-sample RMSE: the predictions are made on the same rows the model was
# fitted on, so this measures goodness of fit, not generalisation.
y_pred = lm.predict(X)
rmse = np.sqrt(mean_squared_error(y_true=y, y_pred=y_pred))
print(f"RMSE for the linear model: {rmse:.3f}")


RMSE for the linear model: 3.644


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Remove the date and lights columns before modelling.
# `data` is rebound to a new frame instead of being mutated with
# inplace=True, and errors='ignore' tolerates the columns already being
# absent, so re-running this cell no longer raises a KeyError.
data = data.drop(columns=['date', 'lights'], errors='ignore')

# Define features and target variable: every remaining column except
# 'Appliances' is used as a predictor.
X = data.drop(columns='Appliances')
y = data['Appliances']

# Split the data into training (70%) and test (30%) sets; the fixed
# random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Normalize the features using MinMaxScaler. The scaler is fitted on the
# training split only and then applied to the test split, so no information
# from the test set leaks into the scaling parameters.
scaler = MinMaxScaler()
X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.transform(X_test)

# Train a multiple linear regression model on the scaled training features
lm = LinearRegression()
lm.fit(X_train_norm, y_train)

# Predictions on the training set
y_train_pred = lm.predict(X_train_norm)

# Calculate Mean Absolute Error (MAE) for the training set
mae_train = mean_absolute_error(y_train, y_train_pred)
print(f"Mean Absolute Error (MAE) for the training set: {mae_train:.3f}")

# Calculate Root Mean Squared Error (RMSE) for the training set
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
print(f"Root Mean Squared Error (RMSE) for the training set: {rmse_train:.3f}")

# Predictions on the test set
y_test_pred = lm.predict(X_test_norm)

# Calculate Mean Absolute Error (MAE) for the test set
mae_test = mean_absolute_error(y_test, y_test_pred)
print(f"Mean Absolute Error (MAE) for the test set: {mae_test:.3f}")

# Calculate Root Mean Squared Error (RMSE) for the test set
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print(f"Root Mean Squared Error (RMSE) for the test set: {rmse_test:.3f}")

# Check for overfitting.
# NOTE: this is a crude heuristic - it answers "Yes" whenever the training
# RMSE is strictly lower than the test RMSE, regardless of how small the gap
# is. Interpret the answer together with the two RMSE values printed above.
overfitting = "Yes" if rmse_train < rmse_test else "No"
print(f"Did the model overfit to the training set? {overfitting}")

# Train a Ridge regression model (default hyperparameters)
ridge = Ridge()
ridge.fit(X_train_norm, y_train)

# Evaluate Ridge model on the test set
ridge_rmse_test = np.sqrt(mean_squared_error(y_test, ridge.predict(X_test_norm)))
print(f"RMSE with Ridge Regression on the test set: {ridge_rmse_test:.3f}")

# Train a Lasso regression model (default hyperparameters)
lasso = Lasso()
lasso.fit(X_train_norm, y_train)

# Evaluate Lasso model on the test set
lasso_rmse_test = np.sqrt(mean_squared_error(y_test, lasso.predict(X_test_norm)))
print(f"RMSE with Lasso Regression on the test set: {lasso_rmse_test:.3f}")

# Number of features with non-zero weights in Lasso regression, i.e. the
# coefficients that the L1 penalty did not shrink to exactly zero.
non_zero_features = np.sum(lasso.coef_ != 0)
print(f"Number of features with non-zero weights in Lasso Regression: {non_zero_features}")


Mean Absolute Error (MAE) for the training set: 53.742
Root Mean Squared Error (RMSE) for the training set: 95.216
Mean Absolute Error (MAE) for the test set: 53.643
Root Mean Squared Error (RMSE) for the test set: 93.640
Did the model overfit to the training set? No
RMSE with Ridge Regression on the test set: 93.709
RMSE with Lasso Regression on the test set: 99.424
Number of features with non-zero weights in Lasso Regression: 4


In [3]:

# Fit Ridge regression with its default hyperparameters on the scaled
# training features
ridge = Ridge().fit(X_train_norm, y_train)

# Score the fitted model on the held-out (scaled) test features
y_test_pred_ridge = ridge.predict(X_test_norm)
rmse_test_ridge = np.sqrt(
    mean_squared_error(y_true=y_test, y_pred=y_test_pred_ridge)
)
print(f"RMSE with Ridge Regression on the test set: {rmse_test_ridge:.3f}")

# Show the test RMSE of the unregularised multiple linear regression
# (computed in an earlier cell as rmse_test) next to it for comparison
print(f"RMSE of multiple linear regression on the test set: {rmse_test:.3f}")


RMSE with Ridge Regression on the test set: 93.709
RMSE of multiple linear regression on the test set: 93.640
