# Downloading data
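Download the hourly energy consumption dataset from the [Kaggle download page](https://www.kaggle.com/datasets/robikscube/hourly-energy-consumption?resource=download). This notebook reads the `DAYTON_hourly.csv` file and expects it in the same directory as the notebook.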

## Import the libraries and load the data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm

In [None]:
# Read the hourly energy consumption data from DAYTON_hourly.csv
energy_df = pd.read_csv('DAYTON_hourly.csv')

# Index by timestamp so this frame can be joined with the temperature data
energy_df = energy_df.set_index('Datetime')

# Both sides of the join need the same index type (the temperature frame uses datetimes)
energy_df.index = pd.to_datetime(energy_df.index)

# Display the first few rows of the dataframe.
# energy_df.head(num) controls how many rows are shown; a negative num
# means "everything except the last |num| rows", which is not a preview.
energy_df.head()

In [None]:
dayton_temp_df = pd.read_csv('dayton-temp.csv')

# DATE carries no year (the format string below implies 'MM-DDTHH:MM:SS'),
# so every row is pinned to 2010 before parsing. As a consequence, an inner
# join on this index can only match rows of the other frame that fall in 2010.
dayton_temp_df['DATE'] = pd.to_datetime('2010-' + dayton_temp_df['DATE'], format='%Y-%m-%dT%H:%M:%S')

# Index by timestamp so the frame can be joined with the energy data
dayton_temp_df = dayton_temp_df.set_index('DATE')

# Display the first few rows to verify the parsing
dayton_temp_df.head()

In [None]:
# Index-on-index inner join: only timestamps present in both frames survive
energy_temp_df = energy_df.join(other=dayton_temp_df, how='inner')
energy_temp_df.head()

# Clean the data

In [None]:
# Keep only the temperature normal and the energy load; drop every other column
energy_temp_df = energy_temp_df.loc[:, ['HLY-TEMP-NORMAL', 'DAYTON_MW']]
energy_temp_df.head()

In [None]:
# Discard every row that has a missing value in any column
energy_temp_df = energy_temp_df.dropna()

# Visualizing the data

In [None]:
# Energy consumption at three time scales: the raw hourly readings
# plus their daily and monthly means.

plt.figure(figsize=(12, 6))
energy_temp_df['DAYTON_MW'].plot(label='Hourly', alpha=0.5)
# Select the column first so only the plotted series is resampled
energy_temp_df['DAYTON_MW'].resample('D').mean().plot(label='Daily Mean', alpha=0.8)
energy_temp_df['DAYTON_MW'].resample('ME').mean().plot(label='Monthly Mean', linewidth=3)
plt.xlabel('Date')
plt.ylabel('DAYTON_MW')
plt.title('DAYTON_MW: hourly, daily mean and monthly mean')
plt.legend()
plt.show()


In [None]:
# Temperature normal at three time scales: hourly, daily mean and monthly mean.
# The offset aliases match the energy plot ('ME' for month end) and use the
# lowercase 'h' for hourly; the older 'H' and 'M' spellings are deprecated.

plt.figure(figsize=(12, 6))
# Select the column first so only the plotted series is resampled
energy_temp_df['HLY-TEMP-NORMAL'].resample('h').mean().plot(label='Hourly', alpha=0.5)
energy_temp_df['HLY-TEMP-NORMAL'].resample('D').mean().plot(label='Daily Mean', alpha=0.8)
energy_temp_df['HLY-TEMP-NORMAL'].resample('ME').mean().plot(label='Monthly Mean', linewidth=3)
plt.xlabel('Date')
plt.ylabel('HLY-TEMP-NORMAL')
plt.title('HLY-TEMP-NORMAL: hourly, daily mean and monthly mean')
plt.legend()
plt.show()


In [None]:
def do_stats(df, x_col, y_col):
    """Fit and plot a simple linear regression of ``y_col`` on ``x_col``.

    Prints the statsmodels OLS summary, then shows a scatter plot of the
    data with the fitted line drawn over it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns. It is not modified.
    x_col : str
        Name of the independent-variable column.
    y_col : str
        Name of the dependent-variable column.

    Raises
    ------
    ValueError
        If no rows remain once missing values in the two columns are dropped.
    """
    # Work on a copy restricted to the two columns of interest: a row is only
    # dropped when x or y is missing, and the caller's frame stays untouched.
    # Sorting by x lets the fitted line be drawn once, left to right.
    data = df[[x_col, y_col]].dropna().sort_values(x_col)
    if data.empty:
        raise ValueError(f'No complete rows for columns {x_col!r} and {y_col!r}')

    # Add a constant (intercept) term to the independent variable
    X = sm.add_constant(data[x_col])

    # Ordinary least squares with x_col as the independent variable
    # and y_col as the dependent variable
    results = sm.OLS(data[y_col], X).fit()

    # Print out the statistics
    print(results.summary())

    # Plot the data that was actually fitted (not a notebook-level global)
    # together with the best-fit line; label the axes with the column names
    # so the figure stays correct whichever columns are passed in.
    plt.figure(figsize=(12, 6))
    plt.scatter(data[x_col], data[y_col], alpha=0.5)
    plt.plot(data[x_col], results.predict(), color='red', linewidth=3)
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title(f'{y_col} vs {x_col}')
    plt.show()

A straight-line fit of energy consumption against temperature doesn't yield great results: the scatter plot below shows this clearly, and the $R^2$ value in the regression summary confirms it.

In [None]:
# Regress energy consumption directly on the hourly temperature normal
do_stats(df=energy_temp_df, x_col='HLY-TEMP-NORMAL', y_col='DAYTON_MW')

In [None]:
# New column: absolute difference between the temperature normal and 50 degrees
# NOTE(review): 50 looks like a chosen reference temperature; confirm the rationale
energy_temp_df['TEMP_DIFF'] = energy_temp_df['HLY-TEMP-NORMAL'].sub(50).abs()

In [None]:
# Regress energy consumption on the distance of the temperature from 50 degrees
do_stats(df=energy_temp_df, x_col='TEMP_DIFF', y_col='DAYTON_MW')

In [None]:
# Plot the temperature difference (|HLY-TEMP-NORMAL - 50|) over time
plt.figure(figsize=(12, 6))
energy_temp_df['TEMP_DIFF'].plot()
plt.xlabel('Date')
plt.ylabel('TEMP_DIFF')
plt.title('TEMP_DIFF over time')
plt.show()

In [None]:
# Plot the energy consumption (DAYTON_MW) over time, for comparison with TEMP_DIFF
plt.figure(figsize=(12, 6))
energy_temp_df['DAYTON_MW'].plot()
plt.xlabel('Date')
plt.ylabel('DAYTON_MW')
plt.title('DAYTON_MW over time')
plt.show()