    "Quick Chargers": Sessions with short durations and high charging rates.
    
    "Extended Sessions": Sessions that last longer and involve slower charging rates.
    
    "Off-Peak Chargers": Sessions that primarily occur during non-peak hours.
    
    "Weekend Chargers": Sessions that are more common on weekends.
    
    "Rapid Chargers": Sessions characterized by a high charging rate, regardless of duration.

    "Night Owls": Charging sessions that predominantly occur during nighttime hours.

    "Daytime Chargers": Sessions concentrated during daylight hours.

    "Intermittent Chargers": Sessions with frequent start-stop patterns, suggesting sporadic usage.

    "High Utilizers": Users or sessions with consistently high charging demand.

    "Low Utilizers": Users or sessions with consistently low charging demand.

    "Morning Commuters": Sessions that commonly occur during morning rush hours.

    "Afternoon Surges": Clusters with increased charging demand in the afternoon.

    "Weekday Warriors": High-demand sessions on weekdays, potentially indicating work-related charging.

    "Holiday Chargers": Sessions with distinct patterns during public holidays.

    "Energy Savers": Sessions that exhibit efforts to charge during lower energy cost periods.

    "Regular Routines": Sessions with predictable and consistent patterns.

    "Irregular Patterns": Clusters with unpredictable charging behavior.

    "Peak Load Chargers": Sessions contributing to high energy demand during peak load times.

    "Efficiency Seekers": Users optimizing for faster charging or energy efficiency.

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the prepared session table from disk and continue on an independent
# copy, so the frame read from the file is never modified by the cells below.
# NOTE: unpickling can execute arbitrary code - only load trusted pickle files.
original_data = pd.read_pickle("preparedData.pkl")
data = original_data.copy(deep=True)

In [None]:
# Latest and earliest connection timestamp before any filtering is applied.
connection_times = data['connectionTime']
print(connection_times.max())
print(connection_times.min())

In [None]:
# Drop every session whose disconnect time lies in the window
# [2020-04-01, 2021-01-01); keep the sessions before and after it.
# The two conditions can never both be true, so a plain OR states the intent
# directly (the former XOR only produced the same mask because of that).
disconnect_time = data['disconnectTime']
outside_excluded_window = (disconnect_time < "2020-04-01") | (disconnect_time >= "2021-01-01")
data = data[outside_excluded_window].copy()
data

In [None]:
# Latest and earliest connection timestamp that remain after the date filter.
latest_connection = data['connectionTime'].max()
earliest_connection = data['connectionTime'].min()
print(latest_connection)
print(earliest_connection)

In [None]:
data = data[data["siteID"] == 1].copy()

In [None]:
data.info()

In [None]:
data.dropna(inplace=True)

In [None]:
data.info()

In [None]:
# Convert the three session timestamps to the fixed-offset zone 'Etc/GMT-8'.
# NOTE(review): in the tz database 'Etc/GMT-8' denotes UTC+8 (the sign is
# inverted relative to ISO 8601) - confirm this is the zone intended for the site.
for time_column in ('connectionTime', 'disconnectTime', 'doneChargingTime'):
    data[time_column] = data[time_column].dt.tz_convert(tz='Etc/GMT-8')

# Hour of day and day of week (Monday=0 ... Sunday=6) of each converted
# connection timestamp.
converted_connection = data['connectionTime']
data["ConnectionHour"] = converted_connection.dt.hour
data["weekday"] = converted_connection.dt.day_of_week

In [None]:
# Berechne die gesamte zur Verfügung stehende Zeit in der Woche in Minuten
total_time_in_week = 7 * 24 * 60  # 7 Tage * 24 Stunden * 60 Minuten
data["progressInWeek"] = (data["weekday"] * 24 * 60 + data["ConnectionHour"] * 60 + data['connectionTime'].dt.minute) / total_time_in_week

In [None]:
data

In [None]:
# Keep only the numeric columns and show their summary statistics.
numeric_data = data.select_dtypes(include="number")
numeric_data.describe()

In [None]:
# Calculate the correlation matrix
correlation_matrix = numeric_data.corr()

# Create a heatmap of the correlation matrix
plt.figure(figsize=(10, 8))  # Adjust the figure size as needed
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")

plt.title("Correlation Heatmap")
plt.show()

In [None]:
correlation_matrix

In [None]:
cluster_data = pd.DataFrame()
cluster_data['progressInWeek'] = numeric_data['progressInWeek']
cluster_data['kWhDelivered'] = numeric_data['kWhDelivered']

In [None]:
cluster_data

In [None]:
# Correlation between the two clustering features.
correlation_matrix = cluster_data.corr()

# Render the matrix as an annotated heatmap on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 8))  # adjust the figure size as needed
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise both features so that neither dominates the Euclidean
# distances used by k-means.
scaler = StandardScaler().fit(cluster_data)
cluster_data_scaled = scaler.transform(cluster_data)

# Wrap the scaled array in a DataFrame that keeps the original row and column labels.
cluster_data_scaled_df = pd.DataFrame(
    cluster_data_scaled,
    index=cluster_data.index,
    columns=cluster_data.columns,
)
cluster_data_scaled_df.describe()

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Upper bound on the number of clusters tried when searching for the elbow.
k_max = 25  # We have a lot of datapoints, however more than 25 clusters are definitely
            # not reasonable!

In [None]:
# Elbow method: fit k-means for every cluster count from 1 to k_max and record
# the inertia (sum of squared distances of the samples to their nearest
# cluster centre) of each fitted model.
clusters = list(range(1, k_max + 1))
losses = []

for n_clusters in clusters:
    # n_init='auto' lets scikit-learn choose the number of centroid
    # initialisations; the fixed random_state makes the resulting curve
    # identical on every run instead of depending on a random start.
    model = KMeans(n_clusters=n_clusters, n_init='auto', random_state=0)
    model.fit(cluster_data_scaled)
    losses.append(model.inertia_)

In [None]:
plt.plot(clusters, losses)
plt.ylabel("Loss")
plt.xlabel("Number of clusters")
plt.show()

In [None]:
plt.plot(clusters, losses)
plt.ylabel("Loss")
plt.xlabel("Number of clusters")
plt.xlim([0,15])

In [None]:
plt.plot(clusters, losses)
plt.ylabel("Loss")
plt.xlabel("Number of clusters")
plt.xlim([0,6])

In [None]:
# Readable names for the integer cluster labels 0, 1 and 2.
numbers = ["zero", "one", "two"]

# Work on a copy so the scaled feature table itself receives no label column.
sessions_scaled = cluster_data_scaled_df.copy()

# Final model with three clusters. The fixed random_state keeps the centroids -
# and therefore which cluster is called "zero", "one" or "two" - stable
# between runs, so the interpretation of the clusters does not shift.
three_means = KMeans(n_clusters=3, n_init='auto', random_state=0)
three_means.fit(cluster_data_scaled)

In [None]:
# Assign every session to its nearest centroid and store the readable label.
cluster_ids = three_means.predict(cluster_data_scaled)
sessions_scaled["three"] = [numbers[cluster_id] for cluster_id in cluster_ids]

# Pairwise plots of the scaled features, coloured by cluster label.
sns.pairplot(data=sessions_scaled, hue="three")

In [None]:
cluster_sizes = sessions_scaled["three"].value_counts()
print("\nNumber of Datapoints in Each Cluster:")
print(cluster_sizes)

In [None]:
average_efficiency_by_cluster = sessions_scaled.groupby('three')['kWhDelivered'].mean()
average_efficiency_by_cluster

Cut-Off 1

In [None]:
cluster_data = pd.DataFrame()
cluster_data['progressInWeek'] = numeric_data['progressInWeek']
cluster_data['kWhDelivered'] = numeric_data['kWhDelivered']

In [None]:
cluster_data

In [None]:
# Correlation between the two clustering features.
correlation_matrix = cluster_data.corr()

# Render the matrix as an annotated heatmap on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 8))  # adjust the figure size as needed
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Standardise both features so that neither dominates the Euclidean
# distances used by k-means.
scaler = StandardScaler().fit(cluster_data)
cluster_data_scaled = scaler.transform(cluster_data)

# Wrap the scaled array in a DataFrame that keeps the original row and column labels.
cluster_data_scaled_df = pd.DataFrame(
    cluster_data_scaled,
    index=cluster_data.index,
    columns=cluster_data.columns,
)
cluster_data_scaled_df.describe()

In [None]:
# Elbow method: fit k-means for every cluster count from 1 to k_max and record
# the inertia (sum of squared distances of the samples to their nearest
# cluster centre) of each fitted model.
clusters = list(range(1, k_max + 1))
losses = []

for n_clusters in clusters:
    # n_init='auto' lets scikit-learn choose the number of centroid
    # initialisations; the fixed random_state makes the resulting curve
    # identical on every run instead of depending on a random start.
    model = KMeans(n_clusters=n_clusters, n_init='auto', random_state=0)
    model.fit(cluster_data_scaled)
    losses.append(model.inertia_)

In [None]:
plt.plot(clusters, losses)
plt.ylabel("Loss")
plt.xlabel("Number of clusters")
plt.show()

In [None]:
plt.plot(clusters, losses)
plt.ylabel("Loss")
plt.xlabel("Number of clusters")
plt.xlim([0,15])

In [None]:
plt.plot(clusters, losses)
plt.ylabel("Loss")
plt.xlabel("Number of clusters")
plt.xlim([0,6])

In [None]:
# Readable names for the integer cluster labels 0, 1 and 2.
numbers = ["zero", "one", "two"]

# Work on a copy so the scaled feature table itself receives no label column.
sessions_scaled = cluster_data_scaled_df.copy()

# Three-cluster model. The fixed random_state keeps the centroids - and
# therefore which cluster is called "zero", "one" or "two" - stable between
# runs, so the interpretation of the clusters does not shift.
three_means = KMeans(n_clusters=3, n_init='auto', random_state=0)
three_means.fit(cluster_data_scaled)

In [None]:
# Assign every session to its nearest centroid and store the readable label.
cluster_ids = three_means.predict(cluster_data_scaled)
sessions_scaled["three"] = [numbers[cluster_id] for cluster_id in cluster_ids]

# Pairwise plots of the scaled features, coloured by cluster label.
sns.pairplot(data=sessions_scaled, hue="three")

In [None]:
cluster_sizes = sessions_scaled["three"].value_counts()
print("\nNumber of Datapoints in Each Cluster:")
print(cluster_sizes)

In [None]:
average_efficiency_by_cluster = sessions_scaled.groupby('three')['kWhDelivered'].mean()
average_efficiency_by_cluster

Cut-Off 2

In [None]:
cluster_data = pd.DataFrame()
cluster_data['progressInWeek'] = numeric_data['progressInWeek']
cluster_data['kWhDelivered'] = numeric_data['kWhDelivered']

In [None]:
cluster_data

In [None]:
# Correlation between the two clustering features.
correlation_matrix = cluster_data.corr()

# Render the matrix as an annotated heatmap on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 8))  # adjust the figure size as needed
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Standardise both features so that neither dominates the Euclidean
# distances used by k-means.
scaler = StandardScaler().fit(cluster_data)
cluster_data_scaled = scaler.transform(cluster_data)

# Wrap the scaled array in a DataFrame that keeps the original row and column labels.
cluster_data_scaled_df = pd.DataFrame(
    cluster_data_scaled,
    index=cluster_data.index,
    columns=cluster_data.columns,
)
cluster_data_scaled_df.describe()

In [None]:
# Elbow method: fit k-means for every cluster count from 1 to k_max and record
# the inertia (sum of squared distances of the samples to their nearest
# cluster centre) of each fitted model.
clusters = list(range(1, k_max + 1))
losses = []

for n_clusters in clusters:
    # n_init='auto' lets scikit-learn choose the number of centroid
    # initialisations; the fixed random_state makes the resulting curve
    # identical on every run instead of depending on a random start.
    model = KMeans(n_clusters=n_clusters, n_init='auto', random_state=0)
    model.fit(cluster_data_scaled)
    losses.append(model.inertia_)

In [None]:
plt.plot(clusters, losses)
plt.ylabel("Loss")
plt.xlabel("Number of clusters")
plt.show()

In [None]:
plt.plot(clusters, losses)
plt.ylabel("Loss")
plt.xlabel("Number of clusters")
plt.xlim([0,15])

In [None]:
plt.plot(clusters, losses)
plt.ylabel("Loss")
plt.xlabel("Number of clusters")
plt.xlim([0,6])

In [None]:
# Readable names for the integer cluster labels 0, 1 and 2.
numbers = ["zero", "one", "two"]

# Work on a copy so the scaled feature table itself receives no label column.
sessions_scaled = cluster_data_scaled_df.copy()

# Three-cluster model. The fixed random_state keeps the centroids - and
# therefore which cluster is called "zero", "one" or "two" - stable between
# runs, so the interpretation of the clusters does not shift.
three_means = KMeans(n_clusters=3, n_init='auto', random_state=0)
three_means.fit(cluster_data_scaled)

In [None]:
# Assign every session to its nearest centroid and store the readable label.
cluster_ids = three_means.predict(cluster_data_scaled)
sessions_scaled["three"] = [numbers[cluster_id] for cluster_id in cluster_ids]

# Pairwise plots of the scaled features, coloured by cluster label.
sns.pairplot(data=sessions_scaled, hue="three")

In [None]:
cluster_sizes = sessions_scaled["three"].value_counts()
print("\nNumber of Datapoints in Each Cluster:")
print(cluster_sizes)

In [None]:
average_efficiency_by_cluster = sessions_scaled.groupby('three')['kWhDelivered'].mean()
average_efficiency_by_cluster