In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime


# Load the per-year charging-session exports (2018-2021).
# Forward slashes are used so the relative paths resolve on Windows, macOS and
# Linux alike (a backslash is only a path separator on Windows).
DATA_DIR = 'YEAR'
df1, df2, df3, df4 = (
    pd.read_csv(f'{DATA_DIR}/sessions_filtered_{year}.csv')
    for year in (2018, 2019, 2020, 2021)
)

# Stack the yearly frames and discard sessions missing any field used below.
df = pd.concat([df1, df2, df3, df4], ignore_index=True)
df.dropna(subset=['sessionID', 'connectionTime', 'disconnectTime', 'doneChargingTime', 'kWhDelivered'], inplace=True)

# Parse the timestamp columns; values that cannot be parsed become NaT.
for col in ['connectionTime', 'disconnectTime', 'doneChargingTime']:
    df[col] = pd.to_datetime(df[col], errors='coerce')

# Rows whose connection/disconnect time failed to parse are dropped here; rows
# with an unparseable doneChargingTime are removed by the duration filter below,
# because a NaN duration never satisfies `> 0`.
df.dropna(subset=['connectionTime', 'disconnectTime'], inplace=True)

# Time from plug-in until charging finished, in hours.
df['charge_duration'] = (df['doneChargingTime'] - df['connectionTime']).dt.total_seconds() / 3600.0

# `.copy()` makes the filtered frame independent, so adding `power_output` below
# does not assign into a slice of the unfiltered frame (SettingWithCopyWarning).
df = df.loc[df['charge_duration'] > 0].copy()

# Mean power drawn per session: delivered energy divided by hours of charging
# (kW, assuming kWhDelivered is expressed in kWh as its name suggests).
df['power_output'] = df['kWhDelivered'] / df['charge_duration']

# Turn every session into two events: +power when it connects and -power when
# it finishes charging. The running sum of these events is the total load.
df_start = df[['connectionTime', 'power_output']].rename(columns={'connectionTime': 'timestamp'})
df_end = (
    df[['doneChargingTime', 'power_output']]
    .rename(columns={'doneChargingTime': 'timestamp'})
    .assign(power_output=lambda ends: -ends['power_output'])
)

events_df = pd.concat([df_start, df_end]).sort_values('timestamp')
events_df['total_power'] = events_df['power_output'].cumsum()
events_df = events_df.set_index('timestamp')

# NOTE(review): this averages the running total sampled at event timestamps, so
# each day's value is weighted by the number of events rather than by elapsed
# time - confirm that this is the intended definition of "average demand".
# Days without any event come out as NaN.
daily_avg = events_df['total_power'].resample('D').mean().reset_index()

fig_demand = px.scatter(
    daily_avg,
    x='timestamp',
    y='total_power',
    labels={'timestamp': 'Date', 'total_power': 'Average System Demand (kW)'},
    title='Average kW Demand of the Garage Throughout the Year',
)
fig_demand.show()


In [46]:
# Keep Monday-Friday rows only (dayofweek counts Monday=0 ... Sunday=6).
weekday_rows = daily_avg['timestamp'].dt.dayofweek.lt(5)
daily_avg = daily_avg.loc[weekday_rows]

# Re-draw the daily averages with the weekend rows removed.
fig_demand = px.scatter(
    daily_avg,
    x='timestamp',
    y='total_power',
    title='Average kW Demand of the Garage Throughout the Year (Weekdays Only)',
    labels=dict(timestamp='Date', total_power='Average System Demand (kW)'),
)
fig_demand.show()


In [None]:
# `holidays` is a third-party package (`pip install holidays`). It is imported
# in this cell because no earlier cell brings it into scope; without the import
# the cell raises NameError after Restart Kernel -> Run All.
import holidays

# Generate and exclude US holidays for the years 2018-2021
years = [2018, 2019, 2020, 2021]
us_holidays = holidays.US(years=years)
# The holiday keys are converted to pandas timestamps so they can be matched
# against the `timestamp` column with `isin`.
holiday_dates = pd.to_datetime(list(us_holidays.keys()))
daily_avg = daily_avg[~daily_avg['timestamp'].isin(holiday_dates)]

# Also exclude any dates from December 24 to December 31 regardless
daily_avg = daily_avg[~((daily_avg['timestamp'].dt.month == 12) & (daily_avg['timestamp'].dt.day >= 24))]

fig_demand = px.scatter(
    daily_avg,
    x='timestamp',
    y='total_power',
    labels={'timestamp': 'Date', 'total_power': 'Average System Demand (kW)'},
    title='Average kW Demand of the Garage (Weekdays, Non-Holidays, and Dec 24-31 Excluded)'
)

fig_demand.show()


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score

def plot_regression_models(daily_avg, start_date, end_date, plot_start=None, plot_end=None, models=('linear', 'quadratic')):
    """Fit trend lines to a window of daily demand and plot them with the data.

    Parameters
    ----------
    daily_avg : pandas.DataFrame
        Must contain a datetime column ``timestamp`` and a numeric column
        ``total_power``. The frame is not modified.
    start_date, end_date : str or datetime-like
        Inclusive bounds of the window whose rows are used to fit the models.
    plot_start, plot_end : str or datetime-like, optional
        Date range over which the fitted curves are drawn. Default to
        ``start_date`` / ``end_date``. A wider range extrapolates the fit.
    models : iterable of str, or str
        Which models to fit: ``'linear'`` and/or ``'quadratic'``. A single name
        given as a plain string is accepted. Unrecognised names are ignored.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.

    Raises
    ------
    ValueError
        If a model is requested but the fitting window holds no rows with a
        non-null ``total_power``.

    Notes
    -----
    The R² and MSE shown in the legend are computed on the same rows the model
    was fitted on (in-sample), not on the extrapolated range.
    """
    # `('linear')` is just the string 'linear'; wrap bare strings so the
    # membership tests below compare whole names rather than substrings.
    if isinstance(models, str):
        models = (models,)

    start_dt = pd.to_datetime(start_date)
    end_dt = pd.to_datetime(end_date)

    # Rows inside the fitting window, without missing targets, oldest first.
    in_window = (daily_avg['timestamp'] >= start_dt) & (daily_avg['timestamp'] <= end_dt)
    df_filtered = (
        daily_avg.loc[in_window]
        .dropna(subset=['total_power'])
        .sort_values('timestamp')
        .copy()
    )

    wants_fit = ('linear' in models) or ('quadratic' in models)
    if wants_fit and df_filtered.empty:
        raise ValueError(
            f"No rows with a non-null 'total_power' between {start_dt.date()} and "
            f"{end_dt.date()}; cannot fit a regression."
        )

    # The regressor is the number of whole days elapsed since the window start.
    df_filtered['days'] = (df_filtered['timestamp'] - start_dt).dt.days
    X = df_filtered['days'].values.reshape(-1, 1)
    y = df_filtered['total_power'].values

    plot_start_dt = pd.to_datetime(plot_start) if plot_start is not None else start_dt
    plot_end_dt = pd.to_datetime(plot_end) if plot_end is not None else end_dt

    # Express the plotting range in the same "days since start_dt" units; the
    # values are negative when plot_start precedes start_date.
    plot_start_day = (plot_start_dt - start_dt).days
    plot_end_day = (plot_end_dt - start_dt).days

    x_range = np.linspace(plot_start_day, plot_end_day, 200).reshape(-1, 1)
    x_date_range = start_dt + pd.to_timedelta(x_range.flatten(), unit='D')

    def fit_and_predict(model, X, y, X_plot):
        """Fit `model` on (X, y); return the curve over X_plot plus in-sample R² and MSE."""
        model.fit(X, y)
        y_pred = model.predict(X)
        y_plot = model.predict(X_plot)
        r2 = r2_score(y, y_pred)
        mse = mean_squared_error(y, y_pred)
        return y_plot, r2, mse

    # Legend label -> (curve values, R², MSE); insertion order is the draw order.
    regressions = {}

    if 'linear' in models:
        lin_model = LinearRegression()
        regressions['Linear (Degree 1)'] = fit_and_predict(lin_model, X, y, x_range)

    if 'quadratic' in models:
        # Expand the day count into polynomial terms, then fit ordinary least
        # squares on the expanded features.
        poly2 = PolynomialFeatures(degree=2)
        X_poly2 = poly2.fit_transform(X)
        x_range_poly2 = poly2.transform(x_range)
        lin_poly2 = LinearRegression()
        regressions['Quadratic (Degree 2)'] = fit_and_predict(lin_poly2, X_poly2, y, x_range_poly2)

    fig = go.Figure()

    # Background layer: every available observation, whether or not it was
    # used for fitting.
    df_all = daily_avg.dropna(subset=['total_power']).sort_values('timestamp')
    fig.add_trace(go.Scatter(
        x=df_all['timestamp'],
        y=df_all['total_power'],
        mode='markers',
        name='All Data Points',
        # 'lightblue' is the CSS colour name; with a space it is not a valid
        # colour and the intended colour is not applied.
        marker=dict(color='lightblue'),
    ))

    # Highlight the observations that fed the regression.
    fig.add_trace(go.Scatter(
        x=df_filtered['timestamp'],
        y=y,
        mode='markers',
        name='Data Points used for estimation',
        marker=dict(size=8, color='blue'),
    ))

    for label, (y_pred_line, r2_val, mse_val) in regressions.items():
        fig.add_trace(go.Scatter(x=x_date_range, y=y_pred_line, mode='lines', name=f"{label}: R²={r2_val:.3f}, MSE={mse_val:.3f}"))

    fig.update_layout(title="Regression Analysis", xaxis_title="Date", yaxis_title="Average System Demand (kW)", legend_title="Legend")
    # Deliberately no `return fig`: a returned figure would be rendered a second
    # time whenever the call is the last expression of a notebook cell.
    fig.show()



In [65]:
# Fit on 2018-11-01 .. 2020-03-01 and draw the fitted curves over
# 2018-04-25 .. 2022-01-31 (extrapolating on both sides of the fit window).
plot_regression_models(
    daily_avg,
    start_date='2018-11-01',
    end_date='2020-03-01',
    plot_start='2018-04-25',
    plot_end='2022-01-31',
    models=('linear', 'quadratic'),
)

In [67]:
# Fit on 2018-04-24 .. 2018-10-30 and draw the fitted curves over
# 2018-04-25 .. 2022-01-31 (extrapolating well past the fit window).
plot_regression_models(
    daily_avg,
    start_date='2018-04-24',
    end_date='2018-10-30',
    plot_start='2018-04-25',
    plot_end='2022-01-31',
    models=('linear', 'quadratic'),
)

In [59]:
# Same fit window as the previous cell, linear model only.
# The trailing comma matters: ('linear') is the plain string 'linear', whereas
# ('linear',) is the one-element tuple that the `models` parameter expects.
plot_regression_models(daily_avg, '2018-04-24', '2018-10-30', plot_start='2018-04-25', plot_end='2022-01-31', models=('linear',))