In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

In [18]:
def load_data(file_path):
    """Read the CSV at ``file_path`` and add derived consumption/calendar columns.

    The ``Datetime`` column is converted with ``pd.to_datetime``. Three columns
    are then appended, in this order:

    * ``Total_consumption`` -- element-wise sum of ``Sub_metering_1``,
      ``Sub_metering_2`` and ``Sub_metering_3``.
    * ``Month`` -- month number taken from ``Datetime``.
    * ``Year`` -- year taken from ``Datetime``.

    Parameters
    ----------
    file_path : str, path object or file-like
        Anything ``pd.read_csv`` accepts.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the converted ``Datetime`` and the derived columns.
    """
    frame = pd.read_csv(file_path)
    timestamps = pd.to_datetime(frame['Datetime'])
    # Plain ``+`` (not ``sum(axis=1)``): a missing value in any sub-meter
    # propagates as NaN into the total instead of being skipped.
    sub_meter_total = (
        frame['Sub_metering_1'] + frame['Sub_metering_2'] + frame['Sub_metering_3']
    )
    return frame.assign(
        Datetime=timestamps,
        Total_consumption=sub_meter_total,
        Month=timestamps.dt.month,
        Year=timestamps.dt.year,
    )

In [19]:
def preprocess_data(data):
    """Aggregate the readings to one row per (``Year``, ``Month``) pair.

    Within each group ``Total_consumption`` is summed, while ``Voltage``,
    ``Global_intensity`` and ``power_factor`` are averaged.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``Year``, ``Month``, ``Total_consumption``,
        ``Voltage``, ``Global_intensity`` and ``power_factor``.

    Returns
    -------
    pandas.DataFrame
        One row per group, with ``Year`` and ``Month`` kept as ordinary
        columns (``as_index=False``) followed by the four aggregated columns.
    """
    by_month = data.groupby(['Year', 'Month'], as_index=False)
    monthly = by_month.agg(
        Total_consumption=('Total_consumption', 'sum'),
        Voltage=('Voltage', 'mean'),
        Global_intensity=('Global_intensity', 'mean'),
        power_factor=('power_factor', 'mean'),
    )
    return monthly

In [20]:
def _fit_and_score(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and return ``(model, rmse)`` for the test split."""
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    return model, rmse


def train_model(data, test_size=0.2, random_state=42):
    """Train a linear regression and a decision tree on the aggregated data.

    The features are ``Voltage``, ``Global_intensity`` and ``power_factor``;
    the target is ``Total_consumption``. The data is split once with
    ``train_test_split`` and both models are fitted and scored on that same
    split, so their RMSE values are directly comparable.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the three feature columns and ``Total_consumption``.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``. The default matches the value that
        was previously hard-coded.
    random_state : int or None, default 42
        Seed used for both the split and the ``DecisionTreeRegressor``. The
        default matches the value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(lr, dt, lr_rmse, dt_rmse, feature_columns)``: the fitted
        ``LinearRegression``, the fitted ``DecisionTreeRegressor``, their RMSE
        on the held-out split, and the column index of the training features.
    """
    X = data[['Voltage', 'Global_intensity', 'power_factor']]
    y = data['Total_consumption']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Linear Regression
    lr, lr_rmse = _fit_and_score(LinearRegression(), X_train, X_test, y_train, y_test)

    # Decision Tree (seeded so repeated runs produce the same tree)
    dt, dt_rmse = _fit_and_score(
        DecisionTreeRegressor(random_state=random_state),
        X_train, X_test, y_train, y_test,
    )

    return lr, dt, lr_rmse, dt_rmse, X_train.columns