# This notebook shows how we implemented clustering using the whole dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
import matplotlib.pyplot as plt

In [None]:
def resample_building_data(group):
    """Aggregate one building's readings into hourly consumption totals.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single building, indexed by a MultiIndex made of a datetime
        level and an ``'ID'`` level, with a ``'consumption'`` column.

    Returns
    -------
    pandas.DataFrame
        A single ``'consumption'`` column holding the per-hour sum, indexed by
        (hourly timestamp, ``'ID'``).
    """
    # Move 'ID' out of the index so only the datetime level is left to resample on.
    flat = group.reset_index(level='ID')
    # The ID of the first row is stamped on every resampled row.
    building_id = flat['ID'].iloc[0]

    hourly = flat.resample('h').agg({'consumption': 'sum'})
    hourly['ID'] = building_id
    return hourly.set_index('ID', append=True)

In [None]:
# Load the pickled residential dataset and compact two of its columns:
# 'ID' becomes a categorical and 'time_code' an unsigned 16-bit integer.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
file_path3 = '/kaggle/input/all-data/residential_all.pkl'
df3 = pd.read_pickle(file_path3).astype({"ID": "category", "time_code": "uint16"})

In [None]:
# Index by (date_time, ID), aggregate every building to hourly totals with
# resample_building_data, then turn both index levels back into columns.
df3 = (
    df3.set_index(["date_time", "ID"])
    .groupby('ID', group_keys=False, observed=True)
    .apply(resample_building_data)
    .reset_index(level=['ID', "date_time"])
)

In [None]:
# Hourly timestamps the wide table is expected to cover (both ends inclusive).
start_time = pd.Timestamp('2009-07-14 00:00:00')
end_time = pd.Timestamp('2011-01-01 00:00:00')
date_range = pd.date_range(start=start_time, end=end_time, freq='h')

# Pivot to one row per building and one column per hourly timestamp.
consumption_wide = df3.pivot(index='ID', columns='date_time', values='consumption')

# The column labels below are generated from date_range, so the pivoted
# columns must correspond to it exactly. Refuse timestamps that fall outside
# the expected range instead of dropping or mislabelling them.
unexpected = consumption_wide.columns.difference(date_range)
if len(unexpected) > 0:
    raise ValueError(
        f"{len(unexpected)} pivoted timestamps fall outside the expected range "
        f"{start_time} .. {end_time} (first: {unexpected[0]})"
    )

# Align the columns to date_range so every label matches the data it names.
# Hours with no reading for any building become all-NaN columns, in the same
# way pivot already leaves NaN where a single building has no reading.
df_pivoted = consumption_wide.reindex(columns=date_range).reset_index()

# Replace the Timestamp column labels with readable strings.
df_pivoted.columns.name = None
df_pivoted.columns = ['ID'] + [date.strftime('%Y-%m-%d %H:%M:%S') for date in date_range]


In [None]:


def weekday_average(data):
    """Return the mean reading for each weekday, multiplied by 24.

    Parameters
    ----------
    data : pandas.Series
        Readings indexed by a DatetimeIndex.

    Returns
    -------
    numpy.ndarray
        One value per weekday present in ``data``, ordered by weekday number
        (0 = Monday).
    """
    per_weekday_mean = data.groupby(data.index.dayofweek).mean()
    # Scale the mean hourly reading up to a 24-hour day.
    return (per_weekday_mean * 24).to_numpy().ravel()

def day_segment_average(data):
    """Return the average energy used in each of five segments of the day.

    Each segment is made of one or more hour windows. For every window the
    mean reading inside it is multiplied by the window's length in hours, and
    the window results are added up. The night segment (21:00-24:00 plus
    00:00-07:00) is therefore weighted 3 h + 7 h = 10 h in total, consistent
    with the other segments, instead of weighting each half by 10.

    Parameters
    ----------
    data : pandas.Series
        Readings indexed by a DatetimeIndex.

    Returns
    -------
    numpy.ndarray
        Values ordered as early_morning, morning, early_afternoon,
        late_afternoon, night.
    """
    # Segment name -> list of [start_hour, end_hour) windows.
    segments = {
        'early_morning': [(7, 9)],
        'morning': [(9, 13)],
        'early_afternoon': [(13, 17)],
        'late_afternoon': [(17, 21)],
        'night': [(21, 24), (0, 7)],
    }
    averages = []
    for windows in segments.values():
        # between_time includes both endpoints, so the window ends at
        # HH:59 of its last hour to keep the following hour out.
        energy = sum(
            data.between_time(f'{start:02d}:00', f'{end - 1:02d}:59').mean() * (end - start)
            for start, end in windows
        )
        averages.append(energy)
    return np.array(averages).flatten()

def total_energy_used(data):
    """Return the sum of all readings in ``data``."""
    total = data.sum()
    return total

# Function to calculate average energy used for different periods
def average_energy_used(data):
    hourly_avg = data.mean()
    daily_avg = data.resample('D').sum().mean()
    weekly_avg = data.resample('W').sum().mean()
    monthly_avg = data.resample('ME').sum().mean()
    return np.array([hourly_avg, daily_avg, weekly_avg, monthly_avg]).flatten()

def weekend_businessday_avg(data):
    """Return the average daily energy on weekend days and on business days.

    Daily totals are computed first and only then split by weekday. Filtering
    the readings by weekday before resampling would recreate the removed days
    as empty bins whose sum is 0, which drags both averages down.

    Parameters
    ----------
    data : pandas.Series
        Readings indexed by a DatetimeIndex.

    Returns
    -------
    numpy.ndarray
        ``[weekends_avg, business_days_avg]``.
    """
    daily_totals = data.resample('D').sum()
    # weekday numbers: 0 = Monday ... 5 = Saturday, 6 = Sunday
    is_weekend = daily_totals.index.weekday >= 5
    weekends_avg = daily_totals[is_weekend].mean()
    business_days_avg = daily_totals[~is_weekend].mean()
    return np.array([weekends_avg, business_days_avg]).flatten()

# Build one feature vector per building from its hourly consumption series.
# df_pivoted is expected to have 'ID' as its first column, followed by one
# column per hour labelled with a '%Y-%m-%d %H:%M:%S' timestamp.

# Parse the hourly timestamps once from the column labels, so the series index
# always matches the data instead of relying on a separately hard-coded start.
hourly_index = pd.to_datetime(df_pivoted.columns[1:], format='%Y-%m-%d %H:%M:%S')
# Consumption values as a float matrix: one row per building, 'ID' column skipped.
consumption_values = df_pivoted.iloc[:, 1:].to_numpy(dtype=float)

# Create a list to store the aggregation results
aggregation_results = []

for building_values in consumption_values:
    building_series = pd.Series(building_values, index=hourly_index)

    # Perform the aggregations
    weekday_avg = weekday_average(building_series)
    segment_avg = day_segment_average(building_series)
    total_energy = total_energy_used(building_series)
    avg_energy = average_energy_used(building_series)
    we_bd_avg = weekend_businessday_avg(building_series)

    # Combine all the aggregation results for the current building
    aggregation_result = np.concatenate([weekday_avg, segment_avg, [total_energy], avg_energy, we_bd_avg])
    aggregation_results.append(aggregation_result)

# Define the columns for the aggregation results (same order as the concatenation above)
aggregation_columns = [
    'avg_mon', 'avg_tue', 'avg_wed', 'avg_thu', 'avg_fri', 'avg_sat', 'avg_sun',
    'avg_early_morning', 'avg_morning', 'avg_early_afternoon', 'avg_late_afternoon', 'avg_night',
    'total_energy_used',
    'hourly_avg_energy', 'daily_avg_energy', 'weekly_avg_energy', 'monthly_avg_energy',
    'weekend_avg_energy', 'business_day_avg_energy'
]

# Convert the aggregation results to a DataFrame
df_aggregations = pd.DataFrame(aggregation_results, columns=aggregation_columns)
# Overwrite the two values returned by weekend_businessday_avg with sums of the
# per-weekday averages (Mon-Fri and Sat-Sun respectively).
df_aggregations["business_day_avg_energy"] = df_aggregations["avg_mon"]+df_aggregations["avg_tue"]+df_aggregations["avg_wed"]+df_aggregations["avg_thu"]+df_aggregations["avg_fri"]
df_aggregations["weekend_avg_energy"] = df_aggregations["avg_sat"]+df_aggregations["avg_sun"]

# Concatenate the original pivoted dataframe with the aggregations dataframe
df_final = pd.concat([df_pivoted.reset_index(drop=True), df_aggregations], axis=1)

# Show the number of rows (buildings) in the final dataframe
len(df_final)


In [None]:
# Standardise every aggregated feature, then project the scaled features onto
# a single principal component (n_components=1).
# NOTE(review): a float such as n_components=0.95 would instead keep as many
# components as needed to explain 95% of the variance — confirm which is intended.
scaler = StandardScaler()
pca = PCA(n_components=1)

data_scaled = scaler.fit_transform(df_aggregations)
data_pca = pca.fit_transform(data_scaled)

def calculate_wcss(data, max_k):
    """Return the KMeans inertia (WCSS) for every k from 1 to ``max_k``.

    Parameters
    ----------
    data : array-like
        Samples to cluster.
    max_k : int
        Largest number of clusters to try (inclusive).

    Returns
    -------
    list
        ``inertia_`` of a KMeans model fitted with ``random_state=42`` and
        ``n_init=10`` for k = 1, 2, ..., ``max_k``, in that order.
    """
    return [
        KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(data).inertia_
        for n_clusters in range(1, max_k + 1)
    ]

def plot_elbow_method(wcss, max_k):
    """Plot WCSS against k = 1..``max_k`` on a new 10x6 figure and show it.

    Parameters
    ----------
    wcss : sequence of float
        One WCSS value per k, starting at k = 1.
    max_k : int
        Largest k on the x-axis; ``wcss`` must contain ``max_k`` values.
    """
    k_values = range(1, max_k + 1)
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(k_values, wcss, marker='o')
    ax.set_title('Elbow Method for Optimal k')
    ax.set_xlabel('Number of clusters (k)')
    ax.set_ylabel('WCSS')
    ax.grid(True)
    plt.show()

# Largest number of clusters evaluated for the elbow curve
max_k = 40

# Elbow curve on the PCA-reduced data
wcss = calculate_wcss(data_pca, max_k)
plot_elbow_method(wcss, max_k)

# Share of variance captured by the retained principal component(s)
explained_variance_ratio = pca.explained_variance_ratio_
print(explained_variance_ratio)

# k is set by hand here; it is not derived programmatically from `wcss`.
best_k_elbow = 3
print(f'Best k according to elbow method: {best_k_elbow}')

# Final clustering with the chosen k
final_kmeans = KMeans(n_clusters=best_k_elbow, random_state=42, n_init=90).fit(data_pca)
labels = final_kmeans.labels_

# Cluster-quality metrics, both computed on the PCA-reduced data
silhouette = silhouette_score(data_pca, labels)
dbs = davies_bouldin_score(data_pca, labels)
print("Silhouette score for the KMeans model is ", silhouette)
print("Davies-Bouldin score for the KMeans model is ", dbs)


In [None]:
from sklearn.manifold import TSNE

# Embed the standardised features in 2-D with t-SNE (visualisation only; the
# cluster labels come from the KMeans model fitted earlier).
# NOTE(review): newer scikit-learn releases rename TSNE's `n_iter` to
# `max_iter` — confirm against the installed version.
tsne = TSNE(n_components=2, perplexity=30, n_iter=300, random_state=42)
tsne_results = tsne.fit_transform(data_scaled)

# Create a DataFrame with t-SNE results and cluster labels
tsne_df = pd.DataFrame({
    't-SNE1': tsne_results[:, 0],
    't-SNE2': tsne_results[:, 1],
    'Cluster': labels
})

# Plotting
plt.figure(figsize=(10, 8))
scatter = plt.scatter(tsne_df['t-SNE1'], tsne_df['t-SNE2'], c=tsne_df['Cluster'], cmap='viridis', s=50)
plt.title('t-SNE Plot of Building Energy Consumption Clusters')
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
# legend_elements() yields its handles in ascending order of the colour value,
# so the legend texts must be sorted as well; a bare set has no guaranteed
# order and could pair a cluster id with the wrong colour.
legend_handles = scatter.legend_elements()[0]
plt.legend(handles=legend_handles, labels=sorted(set(labels)))
plt.colorbar(scatter, label='Cluster Label')
plt.show()

In [None]:
# Attach each building's cluster label to its aggregated features.
df_labels = pd.DataFrame({'labels': labels})
df_interpret = df_aggregations.join(df_labels)

In [None]:
# Number of buildings assigned to each cluster
df_interpret.loc[:, "labels"].value_counts()

In [None]:
# Summary statistics of the first cluster (label 0)
df_interpret.loc[df_interpret["labels"].eq(0)].describe()

In [None]:
# Summary statistics of the third cluster (label 2)
df_interpret.loc[df_interpret["labels"].eq(2)].describe()

In [None]:
# Summary statistics of the second cluster (label 1)
df_interpret.loc[df_interpret["labels"].eq(1)].describe()