Import the libraries

In [1]:
import openml
import os
import zipfile
# Load the data
import pandas as pd
from ucimlrepo import fetch_ucirepo
import pickle 
from sklearn.model_selection import train_test_split

# Simulating Datasets with scikit-learn's `make_regression`

In [2]:
import numpy as np
import pandas as pd
import os
import pickle
from sklearn.datasets import make_regression

# Seed NumPy's global RNG; the generator call below additionally receives
# its own explicit random_state.
np.random.seed(42)

# Number of features per simulated dataset: 10, 20, ..., 500 (50 datasets).
feature_counts = [10 * step for step in range(1, 51)]

# Maps "sim_<n_features>" -> {"X": feature DataFrame, "y": target Series}.
dataset_dict = {}

for n_features in feature_counts:
    # Every dataset has 500 samples and is generated with the same seed (42).
    X, y = make_regression(n_samples=500, n_features=n_features, noise=10.0, random_state=42)

    # Feature columns are labelled f1 .. f<n_features>; the target is named "target".
    column_names = [f"f{idx}" for idx in range(1, n_features + 1)]
    X_df = pd.DataFrame(X, columns=column_names)
    y_df = pd.Series(y, name="target")

    dataset_dict[f"sim_{n_features}"] = {"X": X_df, "y": y_df}

# Create the output folder if it is missing, then pickle the whole dictionary.
os.makedirs("Data", exist_ok=True)
with open("Data/Regression_Original_Simulated.pkl", "wb") as out_file:
    pickle.dump(dataset_dict, out_file)

print("Saved to Data/Regression_Original_Simulated.pkl")


Saved to Data/Regression_Original_Simulated.pkl


Splitting each dataset into train and test sets

In [3]:
from sklearn.model_selection import train_test_split
# Maps each dataset name to its train/test partitions
# (keys: "X_train", "X_test", "y_train", "y_test").
encoded_split_data = {}

for name in dataset_dict:
    data = dataset_dict[name]

    # Work on a copy of the feature frame; the target Series is used as-is.
    X, y = data["X"].copy(), data["y"]

    # Train/test split only (no validation set): test_size=0.2 with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
    )

    encoded_split_data[name] = {"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test}

print("✅ All datasets split into train/test.")

# Persist all splits in a single pickle under the Data folder.
os.makedirs("Data", exist_ok=True)
with open("Data/encoded_split_data_simulated.pkl", "wb") as pkl_out:
    pickle.dump(encoded_split_data, pkl_out)

print("📦 Saved to 'Data/encoded_split_data_simulated.pkl'")


✅ All datasets split into train/test.
📦 Saved to 'Data/encoded_split_data_simulated.pkl'
