# This notebook shows how we implemented clustering using the whole dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
import matplotlib.pyplot as plt

In [None]:
# Load the pickled residential readings and narrow two column dtypes.
# NOTE: unpickling executes arbitrary code, so only load pickle files from a trusted source.
file_path3 = '/kaggle/input/all-data/residential_all.pkl'
df3 = pd.read_pickle(file_path3)

# Cast column by column so the frame is never copied as a whole.
column_dtypes = {"ID": "category", "time_code": "uint16"}
for column_name, target_dtype in column_dtypes.items():
    df3[column_name] = df3[column_name].astype(target_dtype)

In [None]:
# Index by (date_time, ID), run resample_building_data on each ID group
# (helper defined elsewhere in the notebook), then turn both index levels
# back into ordinary columns.
df3 = (
    df3.set_index(["date_time", "ID"])
    .groupby('ID', group_keys=False, observed=True)
    .apply(resample_building_data)
    .reset_index(level=['ID', "date_time"])
)

In [None]:
# Hourly timestamps from start_time to end_time (both endpoints included).
start_time = pd.Timestamp('2009-07-14 00:00:00')
end_time = pd.Timestamp('2011-01-01 00:00:00')
date_range = pd.date_range(start=start_time, end=end_time, freq='h')

# Long -> wide: one row per ID, one column per date_time, cells hold consumption.
df_pivoted = df3.pivot(index='ID', columns='date_time', values='consumption')
df_pivoted = df_pivoted.reset_index()

# Relabel the columns as 'ID' followed by the formatted timestamps of date_range.
# The labels are assigned by position, so the pivot must yield exactly
# len(date_range) timestamp columns or this assignment raises.
df_pivoted.columns.name = None
df_pivoted.columns = ['ID', *date_range.strftime('%Y-%m-%d %H:%M:%S')]

In [None]:
# Build one row of summary features per building from the pivoted table.
# Assumes df_pivoted has 'ID' as its first column and one hourly consumption
# column after it for every hour starting at 2009-07-14 00:00:00.

# The hourly index is the same for every building, so build it once instead
# of once per row.
hourly_index = pd.date_range(
    start='2009-07-14 00:00:00',
    periods=df_pivoted.shape[1] - 1,
    freq='h',
)

# All consumption columns (everything after 'ID') as a single float matrix;
# each matrix row is one building's hourly series.
consumption_matrix = df_pivoted.iloc[:, 1:].to_numpy(dtype=float)

# One feature vector per building, in the same row order as df_pivoted.
aggregation_results = []

for building_values in consumption_matrix:
    building_series = pd.Series(building_values, index=hourly_index)

    # Aggregation helpers are defined elsewhere in the notebook.
    weekday_avg = weekday_average(building_series)
    segment_avg = day_segment_average(building_series)
    total_energy = total_energy_used(building_series)
    avg_energy = average_energy_used(building_series)
    we_bd_avg = weekend_businessday_avg(building_series)

    # Flatten every aggregation into one vector; total_energy is wrapped in a
    # list so it concatenates like the other results.
    aggregation_results.append(
        np.concatenate([weekday_avg, segment_avg, [total_energy], avg_energy, we_bd_avg])
    )

# Column names, in the same order as the concatenation above.
aggregation_columns = [
    'avg_mon', 'avg_tue', 'avg_wed', 'avg_thu', 'avg_fri', 'avg_sat', 'avg_sun',
    'avg_early_morning', 'avg_morning', 'avg_early_afternoon', 'avg_late_afternoon', 'avg_night',
    'total_energy_used',
    'hourly_avg_energy', 'daily_avg_energy', 'weekly_avg_energy', 'monthly_avg_energy',
    'weekend_avg_energy', 'business_day_avg_energy',
]

# Convert the aggregation results to a DataFrame
df_aggregations = pd.DataFrame(aggregation_results, columns=aggregation_columns)

# Recompute the business-day / weekend features from the per-day averages.
# These columns are named "avg", so take the mean of the daily averages; the
# previous code stored their sum (5x / 2x the average). skipna=False keeps the
# NaN propagation that the chained additions had.
business_day_columns = ['avg_mon', 'avg_tue', 'avg_wed', 'avg_thu', 'avg_fri']
weekend_columns = ['avg_sat', 'avg_sun']
df_aggregations["business_day_avg_energy"] = (
    df_aggregations[business_day_columns].mean(axis=1, skipna=False)
)
df_aggregations["weekend_avg_energy"] = (
    df_aggregations[weekend_columns].mean(axis=1, skipna=False)
)

# Put the hourly columns and the aggregated features side by side; both
# frames are aligned on a fresh 0..n-1 index.
df_final = pd.concat([df_pivoted.reset_index(drop=True), df_aggregations], axis=1)

# Show the number of rows (buildings) in the final dataframe
len(df_final)

In [None]:
# Put every aggregated feature on a common scale before PCA.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(df_aggregations)

# Project onto a single principal component. The share of variance it
# explains is printed below; nothing here enforces a variance threshold.
pca = PCA(n_components=1)
data_pca = pca.fit_transform(data_scaled)

# Largest number of clusters handed to the elbow helpers.
max_k = 40

# Elbow method; calculate_wcss and plot_elbow_method are defined elsewhere
# in the notebook.
wcss = calculate_wcss(data_pca, max_k)
plot_elbow_method(wcss, max_k)
explained_variance_ratio = pca.explained_variance_ratio_
print(explained_variance_ratio)

# k is fixed by hand here; it is not derived from wcss in this cell.
best_k_elbow = 3
print(f'Best k according to elbow method: {best_k_elbow}')

# Fit the final KMeans model on the projected data and keep its labels.
final_kmeans = KMeans(n_clusters=best_k_elbow, random_state=42, n_init=90).fit(data_pca)
labels = final_kmeans.labels_

# Cluster-quality metrics for the final labelling.
silhouette = silhouette_score(data_pca, labels)
dbs = davies_bouldin_score(data_pca, labels)
print("Silhouette score for the KMeans model is ", silhouette)
print("Davies-Bouldin score for the KMeans model is ", dbs)
