In [1]:
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
import plotly.express as px

# Load the dataset.
# NOTE(review): the recorded output of this cell shows pandas emitting a warning
# on this read_csv call (the message itself is cut off) — possibly a mixed-dtype
# warning; confirm and consider passing explicit `dtype=` if so.
data = pd.read_csv("merged_output.csv")

# Clean the 'EuroCost' column: strip thousands-separator commas, then convert to
# numeric. Values that still cannot be parsed become NaN (errors='coerce').
data['EuroCost'] = pd.to_numeric(data['EuroCost'].replace({',': ''}, regex=True), errors='coerce')

# Drop rows where 'EuroCost' is still NaN after conversion.
# The explicit .copy() makes data_clean an independent frame, so adding the
# 'Cluster' column below no longer triggers pandas' SettingWithCopyWarning.
data_clean = data.dropna(subset=['EuroCost']).copy()

# Select features for DBSCAN
features = ['ReportYear', 'ReportMonth', 'OIPID', 'SupplierID', 'NRENID', 'Country', 'ServiceType', 'ConsumptionType']

# Feature matrix (X); EuroCost itself is not used for clustering
X = data_clean[features]

# Split the features by how they must be preprocessed
categorical_features = ['OIPID', 'SupplierID', 'NRENID', 'Country', 'ServiceType', 'ConsumptionType']
numerical_features = ['ReportYear', 'ReportMonth']

# Preprocessing pipeline:
#   numeric     -> fill gaps with the column mean, then standardise
#   categorical -> fill gaps with the most frequent value, then one-hot encode
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=True))  # Sparse output
        ]), categorical_features)
    ])

# Apply the preprocessing to the data
X_preprocessed = preprocessor.fit_transform(X)

# Apply DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=5)  # Adjust eps and min_samples based on your data
dbscan_labels = dbscan.fit_predict(X_preprocessed)

# Add cluster labels to the cleaned data (one label per row, in row order)
data_clean['Cluster'] = dbscan_labels

# Separate noise points (which DBSCAN labels as -1) from clustered rows
noise_points = data_clean[data_clean['Cluster'] == -1]
clusters = data_clean[data_clean['Cluster'] != -1]


  data = pd.read_csv("merged_output.csv")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['Cluster'] = dbscan_labels


#### Dominant Countries and Service Types in Each Cluster

In [3]:
def _most_frequent(values):
    """Return the most common non-null entry of ``values``, or None if there is none."""
    counts = values.value_counts()  # sorted by frequency; NaN is excluded by default
    if len(counts) == 0:
        return None
    return counts.index[0]


# For every cluster, pick the most frequent Country and the most frequent ServiceType
dominant_columns = ['Country', 'ServiceType']
cluster_country_service = clusters.groupby('Cluster').agg(
    dict.fromkeys(dominant_columns, _most_frequent)
)

# Show the dominant country and service type per cluster
print("Dominant Country and Service Type per Cluster (DBSCAN):")
print(cluster_country_service)



Dominant Country and Service Type per Cluster (DBSCAN):
             Country ServiceType
Cluster                         
0            FINLAND        None
1            CROATIA        IaaS
2            CROATIA        IaaS
3                 UK        IaaS
4                 UK        IaaS
...              ...         ...
1071     NETHERLANDS     Compute
1072     NETHERLANDS     Compute
1073         BELGIUM     Compute
1074           SPAIN     Compute
1075         AUSTRIA     Compute

[1076 rows x 2 columns]


#### Suppliers Driving Costs in Each Cluster

In [4]:
# Mean EuroCost for every (Cluster, SupplierID) pair ...
supplier_means = clusters.groupby(['Cluster', 'SupplierID'])[['EuroCost']].mean()

# ... ordered from the most expensive pair to the cheapest (a global ordering,
# not a per-cluster one)
cluster_supplier_cost = supplier_means.sort_values(by='EuroCost', ascending=False)

# Show the ranked supplier costs
print("Supplier Driving Costs in Each Cluster (Sorted by EuroCost - DBSCAN):")
print(cluster_supplier_cost)


Supplier Driving Costs in Each Cluster (Sorted by EuroCost - DBSCAN):
                                    EuroCost
Cluster SupplierID                          
80      Telefonica Soluciones  354204.909091
174     CloudFerro             288313.000000
108     Safespring AB          100421.363636
208     Safespring              97111.502176
202     Safespring AB           90187.596176
...                                      ...
764     SoftwareONE AG             -4.258333
977     SoftwareONE               -62.123043
599     SoftwareONE AG           -508.500000
528     Vancis                  -1196.965500
959     Vancis                  -1196.965500

[1072 rows x 1 columns]


#### Clusters with Consistently Higher Cost

In [5]:
# Average EuroCost of the rows in each cluster (a Series indexed by Cluster)
euro_cost_by_cluster = clusters.groupby('Cluster')['EuroCost']
cluster_cost = euro_cost_by_cluster.agg('mean')

# Show the per-cluster averages
print("Average EuroCost per Cluster (DBSCAN):")
print(cluster_cost)


Average EuroCost per Cluster (DBSCAN):
Cluster
0           0.000000
1        4395.193333
2        4548.338889
3       28361.720181
4       11820.988693
            ...     
1071    63271.277619
1072      927.918333
1073     1400.948889
1074     2408.281667
1075       87.175000
Name: EuroCost, Length: 1076, dtype: float64


#### General Profiling of Clusters

In [7]:
def _top_value_or(fallback):
    """Build an aggregator that returns a group's most common non-null value, or ``fallback`` if it has none."""
    # A lambda is returned on purpose: the printed profile labels these columns
    # '<lambda>', and that label comes from the callable's name.
    # `counts` is bound while the condition is evaluated, before the true branch is read.
    return lambda values: counts.index[0] if len(counts := values.value_counts()) > 0 else fallback


# Per-cluster profile: dominant categorical values plus cost and year statistics
most_common_or_na = _top_value_or('N/A')
cluster_profile = clusters.groupby('Cluster').agg({
    'Country': most_common_or_na,              # most frequent country, or 'N/A' if none
    'ServiceType': most_common_or_na,          # most frequent service type, or 'N/A'
    'SupplierID': most_common_or_na,           # most frequent supplier, or 'N/A'
    'EuroCost': ['mean', 'median', 'count'],   # average, median, and count of EuroCost
    'ReportYear': 'mean',                      # average report year
})

# Show the cluster profiling
print("Cluster Profiling Summary (DBSCAN):")
print(cluster_profile)


Cluster Profiling Summary (DBSCAN):
             Country ServiceType                    SupplierID      EuroCost  \
            <lambda>    <lambda>                      <lambda>          mean   
Cluster                                                                        
0            FINLAND         N/A                   Computas AS      0.000000   
1            CROATIA        IaaS                 SETCOR d.o.o.   4395.193333   
2            CROATIA        IaaS                 SETCOR d.o.o.   4548.338889   
3                 UK        IaaS                          Jisc  28361.720181   
4                 UK        IaaS                          Jisc  11820.988693   
...              ...         ...                           ...           ...   
1071     NETHERLANDS     Compute  Rackspace International GmbH  63271.277619   
1072     NETHERLANDS     Compute  Rackspace International GmbH    927.918333   
1073         BELGIUM     Compute  Rackspace International GmbH   1400.948889   
1074

#### Outlier Detection (Noise Points)

In [8]:
# Rows that DBSCAN labelled -1 are noise; report how many there are
noise_count = len(noise_points)
print(f"Number of noise points detected by DBSCAN: {noise_count}")

# Then list the noise rows themselves for inspection
print("Noise Points:", noise_points, sep="\n")


Number of noise points detected by DBSCAN: 1215
Noise Points:
             Date  ReportYear  ReportMonth       OIPID   SupplierID   NRENID  \
3      30/06/2023      2023.0          4.0         AWS      Sparkle  UNINETT   
4      30/06/2023      2023.0          4.0         AWS      Sparkle  Rediris   
5      30/06/2023      2023.0          4.0         AWS      Sparkle      DFN   
6      30/06/2023      2023.0          4.0         AWS      Sparkle  UNINETT   
7      30/06/2023      2023.0          4.0         AWS      Sparkle  Rediris   
...           ...         ...          ...         ...          ...      ...   
98964  10/12/2023      2023.0         10.0          AZ      Atea AS    SUNET   
98965  29/04/2022      2022.0          1.0  CloudSigma   CloudSigma   SWITCH   
99342  07/06/2021      2021.0          4.0      Google  Computas AS     DeiC   
99343  07/06/2021      2021.0          4.0      Google  Computas AS     DeiC   
99344  07/06/2021      2021.0          4.0      Google  Co