In [1]:
""" 15.10.2024
We create 'n' fake datasets, each made of a 'y' target vector (class
    labels) and an 'x' matrix of feature columns.
For each fake dataset we derive:
    - 'ry' (Y): the best hyperparameter found, i.e. the 'n_neighbors'
      of the kNN model;
    - 'rx' (X): an array of descriptive variables (called "dependent
      variables" in this cell) derived from the (y, x) dataset.
Stacking one (ry, rx) row per fake dataset gives a single dataset (Y, X).

Functions:
- get_y/get_x/gen_dataset : generate the fake dataset.
- get_hp/get_dv           : derive a row for our dataset.
- simulate                : generates our dataset.
Note: 'get_x' determines the distribution for kNN
      'get_dv' determines our dependent variables
Note: Everything in this cell is purely for "data collection".
"""

from typing import Literal
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier as sk_kNN
from sklearn.metrics import accuracy_score as sk_acc
from sklearn.model_selection import train_test_split as sk_tts

## Fake datasets
def get_y(yc: int=3, ly: int=100) -> np.array:
    """Draws a random 'y' label vector (fake dataset).

    Parameters:
    - yc       number of classes; labels are taken from 0..yc-1.
    - ly       number of labels to draw.
    Returns a 1-D array of 'ly' labels picked with np.random.choice."""
    classes = list(range(yc))
    return np.random.choice(classes, size=ly)
def get_x(y: np.array, lx: int=3) -> np.array:
    """Builds the 'x' matrix that goes with a 'y' vector (fake dataset).

    One column, chosen at random, is drawn from a normal law centred on 50;
    every other column is centred on 0 (standard deviation 5 in both cases).
    Each drawn value is then multiplied by (label + 1), so the spread of a
    row depends on its class.

    Parameters:
    - y        label vector; one output row is produced per label.
    - lx       number of columns to generate.
    Returns an array with one row per label and 'lx' values per row."""
    # define "distribution". Add parameters if need be. Default below.
    shifted_col = np.random.randint(lx)   # the column centred on 50
    rows = []
    for label in y:
        factor = label + 1
        row = []
        for col in range(lx):
            centre = 50 if col == shifted_col else 0
            row.append(np.random.normal(centre, 5) * factor)
        rows.append(row)
    return np.array(rows)
def gen_dataset(yc: int=3, ny: tuple=(100, 20), nx: tuple=(3, 0)) -> tuple:
    """Generates a fake dataset.

    Parameters:
    - yc       number of 'y' classes.
    - ny       (mu, sigma) of the normal law used to draw len(y).
    - nx       (mu, sigma) of the normal law used to draw the number of
               'x' columns.
    'ny' and 'nx' are only indexed, so any 2-item sequence (tuple or list)
    is accepted.
    Returns the tuple (y, x)."""
    # The drawn sizes are truncated to int and can land on 0 or below when
    # sigma is large relative to mu; clamp to 1 so the generators never
    # receive an empty or negative size.
    n_rows = max(1, int(np.random.normal(ny[0], ny[1])))
    y = get_y(yc=yc, ly=n_rows)
    n_cols = max(1, int(np.random.normal(nx[0], nx[1])))
    return (y, get_x(y, lx=n_cols))

## Hyperparameter / dependent variables
def get_hp(y: np.array, x: np.array) -> int:
    """Runs a kNN for every candidate 'k' and returns the best one.

    The data is split once into a train and a test part. For each k from 1
    up to (but excluding) int(len(y)/2)-1, a classifier is fitted on the
    train part and its accuracy is measured on the test part.

    Parameters:
    - y        label vector.
    - x        feature matrix, one row per label.
    Returns the k with the highest test accuracy (the smallest such k on
    ties), or 0 when there is no candidate k to try."""
    x_train, x_test, y_train, y_test = sk_tts(x, y)   # train/test
    best_k, best_acc = 0, -1.
    k_stop = int(len(y)/2)-1
    for k in np.arange(1, k_stop, 1):                 # all possible k's
        model = sk_kNN(k)                             # kNN model
        model.fit(x_train, y_train)                   # fit on training data
        acc = sk_acc(y_test, model.predict(x_test))   # accuracy on test data
        if acc > best_acc:                            # strict: first best wins
            best_k, best_acc = k, acc
    return best_k
def get_dv(y: np.array, x: np.array) -> np.array:
    """Summarises a y/x dataset as a row of descriptive variables.

    Parameters:
    - y        label vector.
    - x        feature matrix; its first row gives the column count.
    Returns an array [nb of rows, nb of distinct labels, nb of columns]."""
    n_rows = len(y)
    n_classes = len(np.unique(y))
    n_columns = len(x[0])
    # add variables...
    return np.array([n_rows, n_classes, n_columns])

## Main function
def simulate(n: int=1000, yc: int=3, ny: tuple=(100, 20),
             nx: tuple=(3, 0)) -> np.array:
    """Main function to generate 'n' datapoints for our model.

    Each datapoint comes from one freshly generated fake dataset: the best
    kNN 'k' found for it, followed by the variables describing it.

    Parameters:
    - n        number of fake datasets.
    - yc       number of 'y' classes.
    - ny       tuple (mu, sigma) for len(y).
    - nx       tuple (mu, sigma) for number of dependent variables.
    'ny' and 'nx' are passed through unchanged, so lists work as well.
    Returns an array with one row per fake dataset, laid out as
    [ry, *rx]."""
    rows = []
    for _ in range(n):
        y, x = gen_dataset(yc, ny, nx)             # fake dataset
        # hyperparameter first, then the descriptive variables
        rows.append([get_hp(y, x), *get_dv(y, x)])
    return np.array(rows)

# Build the meta-dataset: the first column is the target "y", the remaining
# columns are named "x1", "x2", ... after their position.
df = pd.DataFrame(simulate(1000)).rename(
    columns=lambda pos: "y" if pos == 0 else f"x{pos}"
)
df.to_csv("metadb.csv")   # saved for later use
df.head()

Unnamed: 0,y,x1,x2,x3
0,1,127,3,3
1,1,73,3,3
2,1,112,3,3
3,2,122,3,3
4,1,108,3,3
