<a href="https://colab.research.google.com/github/Ashail33/Masters-work/blob/master/One%20script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Environment prep**

In [None]:
## Set up for data generation
# !pip uninstall -y mdcgenpy
!pip install git+https://github.com/Ashail33/mdcgenpy.git

from  mdcgenpy import clusters as cl
import numpy as np
import pandas as pd
import json
import os
import glob
import re
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from googleapiclient.http import MediaFileUpload
# import cl
from google.colab import drive

# **Create Clustering Dataset**




In [None]:
# The properties I will be varying across the datasets are (the associated parameter name follows each item):
#     1) Outliers - binary (two options) - outliers
#     2) Noise - binary (two options) - add_noise=0, n_noise=None
#     3) Number of clusters - I will select three values - k
#     4) Number of data points - I will select three values (100 000, 1 000 000, 100 000 000) - n_samples
#     5) Number of features - I will select five values (2, 10, 50, 100, 500) - n_feats
#     6) Density - I will select three values - compactness_factor
#     --7) Cluster shape - I will select three types: clusters within a radius, hollow-shaped clusters, s-shaped / c-shaped clusters - distributions
#          Available distributions:
#              'uniform': lambda shape, param: np.random.uniform(-param, param, shape),
#              'gaussian': lambda shape, param: np.random.normal(0, param, shape),
#              'logistic': lambda shape, param: np.random.logistic(0, param, shape),
#              'triangular': lambda shape, param: np.random.triangular(-param, 0, param, shape),
#              'gamma': lambda shape, param: np.random.gamma(2 + 8 * np.random.rand(), param / 5, shape),
#              'gap': lambda shape, param: gap(shape, param)

#     --8) Missing values - these will need to be created by randomly removing values, up to a certain number of columns and records

# Mount Google Drive; the generated datasets are written to a folder beneath the mount point.
drive.mount('/content/gdrive', force_remount=False)

# Path to the 'Masters_data' folder inside the mounted drive
base_path = '/content/gdrive/MyDrive/'
folder_name = 'Masters_data'
folder_path = os.path.join(base_path, folder_name)

# Create the 'Masters_data' folder if it doesn't exist. exist_ok=True makes this a no-op
# when the folder is already there and avoids the check-then-create race of testing
# os.path.exists() first.
os.makedirs(folder_path, exist_ok=True)

# Parameter grid for dataset generation: each list holds the candidate values of one property.

# Number of outliers
outliers = [0, 500]

# Noise settings as (add_noise, n_noise) pairs
noise_options = [
    (0, None),
    (10, 100),
]

# Number of clusters (k)
n_clusters = [3, 5, 10]

# Number of data points per dataset
n_samples = [10_000, 50_000, 100_000]

# Number of features (dimensions)
n_feats = [2, 10, 100]

# Compactness factor values (cluster density)
compactness_factors = [0.01, 0.5, 1]

# Distribution names used for the cluster shapes
dist_options = ['gaussian']

# Find the highest dataset id already present in the output folder so that generation
# can resume after it.
existing_files = glob.glob(os.path.join(folder_path, 'dataset_*.csv'))

# Only files named exactly 'dataset_<digits>.csv' count. The glob above is looser than
# that (it also matches e.g. 'dataset_old.csv'), so the base name is checked with an
# anchored pattern whose dot is escaped, and non-matching files are skipped instead of
# raising AttributeError on a failed match.
dataset_name_pattern = re.compile(r'dataset_(\d+)\.csv')
name_matches = (dataset_name_pattern.fullmatch(os.path.basename(path)) for path in existing_files)
dataset_numbers = [int(name_match.group(1)) for name_match in name_matches if name_match]

# -1 means "nothing generated yet": it is smaller than every non-negative dataset id.
last_dataset_id = max(dataset_numbers, default=-1)

import itertools

# metadata.json is rewritten from the `metadata` list after every generated dataset, so
# when resuming the list has to start from the records saved by the earlier run(s);
# otherwise the entries of the datasets skipped below would be erased by the first write.
metadata_file_path = os.path.join(folder_path, 'metadata.json')
metadata = []
if last_dataset_id >= 0:
    try:
        with open(metadata_file_path) as f:
            previous_metadata = json.load(f)
    except FileNotFoundError:
        # Datasets exist but no metadata file was saved; nothing to carry over.
        previous_metadata = []
    except json.JSONDecodeError:
        # The file is unreadable (for example a write that was cut short); start afresh
        # rather than abort the generation run.
        print(f'Warning: could not parse {metadata_file_path}; metadata of earlier datasets is not carried over.')
        previous_metadata = []
    # Keep only the records of datasets that are skipped below; any dataset with a higher
    # id is (re)generated in this run and gets a fresh record.
    metadata = [entry for entry in previous_metadata if entry['id'] <= last_dataset_id]

# Enumerating the Cartesian product numbers the combinations exactly as nested loops would
# (first list slowest, last list fastest). A dataset id is the position of its combination
# in this order, so ids are only comparable between runs while the parameter lists stay
# unchanged.
parameter_grid = itertools.product(
    outliers,
    noise_options,
    n_clusters,
    n_samples,
    n_feats,
    compactness_factors,
    dist_options,
)

# Continue from the last generated dataset
for current_dataset_id, combination in enumerate(parameter_grid):
    if current_dataset_id <= last_dataset_id:
        continue

    outlier, (add_noise, n_noise), k, n_sample, n_feat, compactness_factor, distribution = combination

    file_name = f'dataset_{current_dataset_id}.csv'
    file_path = os.path.join(folder_path, file_name)
    # One distribution name per cluster: all k clusters use the same one.
    distributions = [distribution] * k

    # NOTE(review): add_noise and n_noise are recorded in the metadata below but are not
    # passed to ClusterGenerator, so the noise setting does not influence the arguments the
    # generator receives - confirm whether they should be forwarded.
    cluster_gen = cl.ClusterGenerator(
        n_samples=n_sample,
        outliers=outlier,
        n_feats=n_feat,
        k=k,
        distributions=distributions,
        compactness_factor=compactness_factor,
    )

    data = cluster_gen.generate_data(output_file=file_path)

    dataset_properties = {
        'id': current_dataset_id,
        'outliers': outlier,
        'add_noise': add_noise,
        'n_noise': n_noise,
        'n_clusters': k,
        'n_samples': n_sample,
        'n_feats': n_feat,
        'compactness_factor': compactness_factor,
        'distribution': distribution,
        'file_path': file_path,
    }

    # Save the metadata after every dataset so an interrupted run keeps what it has
    # recorded so far.
    metadata.append(dataset_properties)
    with open(metadata_file_path, 'w') as f:
        json.dump(metadata, f)
