In [1]:
import numpy as np

In [6]:
class DataGenerator:
    def __init__(self, d, k, n_points_per_cluster, relevant_dims, radius, n_outliers):
        self.d = d
        self.k = k
        self.n_points_per_cluster = n_points_per_cluster
        self.relevant_dims = relevant_dims
        self.radius = radius
        self.n_outliers = n_outliers
        self.data = []

    def generate_clusters(self):
        clusters = []
        for i in range(self.k):
            cluster_center = np.random.uniform(0, 100, self.d)
            relevant_indices = np.random.choice(self.d, self.relevant_dims, replace=False)
            cluster_points = np.random.uniform(0, 100, (self.n_points_per_cluster, self.d))
            for point in cluster_points:
                for idx in relevant_indices:
                    point[idx] = np.random.uniform(
                        max(0, cluster_center[idx] - self.radius),
                        min(100, cluster_center[idx] + self.radius)
                    )
            clusters.append(cluster_points)
        self.data.extend(clusters)

    def generate_outliers(self):
        outliers = np.random.uniform(0, 100, (self.n_outliers, self.d))
        self.data.extend(outliers)

    def get_data(self):
        self.data = []  # Reset data
        self.generate_clusters()
        self.generate_outliers()
        return np.vstack(self.data)


In [4]:
d = 5  # Dimensionality of the data
k = 3  # Number of clusters
n_points_per_cluster = 50  # Number of points in each cluster
relevant_dims = 3  # Number of relevant dimensions for each cluster
radius = 10  # Radius within which the cluster points are uniformly distributed in the relevant dimensions
n_outliers = 20  # Number of outliers

generator = DataGenerator(d, k, n_points_per_cluster, relevant_dims, radius, n_outliers)
data = generator.get_data()

print(data)

[[8.13490519e+01 7.59248427e+01 5.59763934e+01 8.43679911e+01
  6.63386680e+01]
 [8.14570504e+01 9.90408287e+01 2.44778463e+01 8.18494466e+01
  5.20524332e+01]
 [8.03560590e+01 7.40999121e+01 8.50669410e+00 7.27463774e+01
  5.40834884e+01]
 [8.08429224e+01 2.32551269e+01 9.85018130e+01 9.12292556e+01
  5.56355633e+01]
 [7.44528830e+01 1.04519191e+01 5.02692817e+01 8.38830513e+01
  6.37316460e+01]
 [8.31803355e+01 5.94547516e+01 5.08241241e+01 8.30447445e+01
  6.79500952e+01]
 [8.02291799e+01 4.33111179e+01 1.52899275e+01 9.10335797e+01
  5.72913149e+01]
 [7.13238719e+01 8.86443320e+01 5.35439906e+00 7.64658476e+01
  5.86564683e+01]
 [7.46239616e+01 9.02453826e+01 9.24108710e+01 7.51593868e+01
  6.67750719e+01]
 [7.75162867e+01 4.45986813e+01 7.17502188e+01 8.53634765e+01
  5.23437633e+01]
 [6.98510671e+01 1.25550403e-01 3.81668099e+01 7.80200917e+01
  6.88318400e+01]
 [7.03656738e+01 5.16759382e+01 9.18882989e+01 7.95245032e+01
  6.48248660e+01]
 [7.11767403e+01 5.82731066e+01 4.983991

In [7]:
# Define parameters for the datasets
dims = [1, 10, 25, 50, 100]  # Increasing dimensionality
k = 3  # Number of clusters
n_points_per_cluster = 50  # Number of points in each cluster
radius = 10  # Radius for the clusters
n_outliers = 20  # Number of outliers

datasets = []

for d in dims:
    relevant_dims = max(1, d // 5)  # Use a fraction of dimensions as relevant
    generator = DataGenerator(d, k, n_points_per_cluster, relevant_dims, radius, n_outliers)
    data = generator.get_data()
    datasets.append(data)
    print(f"Generated dataset with dimensionality {d}:\n", data)

# datasets list contains the five generated datasets with increasing dimensionality

Generated dataset with dimensionality 1:
 [[82.86027388]
 [74.5955119 ]
 [76.31236298]
 [76.94955389]
 [77.15752004]
 [87.39214853]
 [71.36481602]
 [87.75388025]
 [74.81132361]
 [84.27733231]
 [73.23972836]
 [83.80377469]
 [72.89045839]
 [73.35031511]
 [84.21039531]
 [68.68847167]
 [82.6125931 ]
 [80.53746822]
 [73.29337445]
 [75.52502862]
 [87.19590219]
 [70.8134522 ]
 [81.1184663 ]
 [85.93958441]
 [68.88750983]
 [86.89191247]
 [83.82674393]
 [70.6726298 ]
 [73.94482532]
 [73.97376413]
 [84.30933434]
 [75.81312673]
 [71.45154896]
 [84.7898659 ]
 [77.56581649]
 [69.79775438]
 [82.88145756]
 [77.1713636 ]
 [71.71901837]
 [71.38507574]
 [84.29584594]
 [87.76501509]
 [68.86572103]
 [74.86099114]
 [73.71791382]
 [76.18649492]
 [73.44688436]
 [71.78694385]
 [77.5576553 ]
 [77.7873967 ]
 [45.61625722]
 [44.12800589]
 [48.82607672]
 [50.8149938 ]
 [38.46062323]
 [55.36790834]
 [47.94916509]
 [52.32536572]
 [51.6401322 ]
 [38.87133281]
 [54.48646263]
 [55.13442575]
 [44.31591948]
 [38.86931518