# In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Paths
# Folder scanned for raw ``.csv`` datasets. The path is relative, so it is
# resolved against the current working directory at run time.
RAW_DATA_FOLDER = "../raw_generated_data"
# CSV file that the processed feature table is written to.
PROCESSED_DATA_FILE = "processed_dataset.csv"

def load_raw_datasets(raw_data_folder):
    """Load every ``.csv`` file found directly inside a folder.

    Files are read in sorted filename order. ``os.listdir`` returns entries
    in arbitrary order, so sorting keeps the order of the returned datasets
    (and of anything derived from them) reproducible across runs and
    platforms.

    Args:
        raw_data_folder: Path of the directory to scan (not recursive).

    Returns:
        A list with one NumPy array per ``.csv`` file. Each file is read
        with ``header=None``, i.e. its first line is treated as data.
    """
    datasets = []
    for file_name in sorted(os.listdir(raw_data_folder)):
        if not file_name.endswith(".csv"):
            continue
        file_path = os.path.join(raw_data_folder, file_name)
        datasets.append(pd.read_csv(file_path, header=None).values)
    return datasets

def extract_features(dataset):
    """Summarise a dataset as a short list of descriptive statistics.

    Column 0 of ``dataset`` is read as the class label and every remaining
    column as a feature.

    Returns:
        ``[n_rows, n_classes, n_features, noise_level]`` where
        ``noise_level`` is the per-feature variance averaged over all
        feature columns.
    """
    labels = dataset[:, 0]
    feature_block = dataset[:, 1:]

    sample_count = dataset.shape[0]
    class_count = len(np.unique(labels))
    feature_count = dataset.shape[1] - 1

    # Variance is taken column-wise first, then averaged across columns.
    per_feature_variance = np.var(feature_block, axis=0)
    mean_variance = per_feature_variance.mean()

    return [sample_count, class_count, feature_count, mean_variance]

def find_best_k(dataset, max_k=49, random_state=None):
    """Find the kNN ``n_neighbors`` value with the best hold-out accuracy.

    Column 0 of ``dataset`` is used as the label and the remaining columns
    as features. The data is split 80/20 into train and test sets, a
    ``KNeighborsClassifier`` is fitted for each candidate ``k``, and the
    ``k`` with the highest test accuracy is returned.

    Args:
        dataset: 2-D array with the class label in column 0.
        max_k: Largest ``k`` to try. Defaults to 49, the previous
            hard-coded limit.
        random_state: Passed to ``train_test_split``. Leave as ``None``
            for a different random split on every call, or set an int for
            reproducible results.

    Returns:
        The best ``k`` as an int. Ties go to the smallest ``k`` because only
        a strictly better accuracy replaces the current best. Returns 1 if
        no candidate scores above zero.
    """
    X, y = dataset[:, 1:], dataset[:, 0]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    # A kNN model cannot query more neighbours than it has training samples.
    # Bound k by the size of the training split, not the full dataset;
    # otherwise predict() raises ValueError on small datasets.
    largest_k = min(len(y_train), max_k)

    best_k, best_accuracy = 1, 0
    for k in range(1, largest_k + 1):
        model = KNeighborsClassifier(n_neighbors=k)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        if accuracy > best_accuracy:
            best_k, best_accuracy = k, accuracy
    return best_k

def create_dataset_of_interest(raw_data_folder, output_file):
    """Build the per-dataset summary table and write it to a CSV file.

    For every raw dataset loaded from ``raw_data_folder``, one row is
    produced holding the best kNN ``k`` followed by the extracted features.
    The table is saved to ``output_file`` without an index column and a
    confirmation message is printed.
    """
    column_names = ['best_k', 'n_rows', 'n_classes', 'n_features', 'noise_level']

    rows = []
    for raw in load_raw_datasets(raw_data_folder):
        # Features are computed before the kNN search, one dataset at a time.
        feature_values = extract_features(raw)
        rows.append([find_best_k(raw), *feature_values])

    # Save the processed dataset
    pd.DataFrame(rows, columns=column_names).to_csv(output_file, index=False)
    print(f"Processed dataset saved to {output_file}")

# Main execution
# Build the processed dataset only when this file is run as a script, so
# importing the module does not trigger any file I/O or model training.
if __name__ == "__main__":
    create_dataset_of_interest(RAW_DATA_FOLDER, PROCESSED_DATA_FILE)