### We use cross-validation to check how well the synthetic dataset conforms to its own data-generating model, treating the generated dataset as the base (ground-truth) set.

In [149]:
from sklearn.base import BaseEstimator
from sklearn.base import RegressorMixin
import pandas as pd 

In [151]:
class FunctionRegressor(BaseEstimator, RegressorMixin):
    """Regressor whose predictions come from a fixed, user-supplied function.

    Built on scikit-learn's ``BaseEstimator`` and ``RegressorMixin``.
    Nothing is learned: ``fit`` is a no-op and ``predict`` delegates to
    ``func``.

    Parameters
    ----------
    func : callable
        Called as ``func(X)``; its return value is used as the prediction.
    """

    def __init__(self, func):
        self.func = func

    def fit(self, X, y=None):
        """Ignore ``X`` and ``y`` and return the estimator itself."""
        return self

    def predict(self, X):
        """Return the result of ``self.func(X)`` unchanged."""
        predictions = self.func(X)
        return predictions

In [153]:
## We reuse the generating model from the Synthetic_data notebook.


In [155]:
def f(X):
    """Evaluate the hand-written accident-risk formula on ``X``.

    ``X`` is indexed by column name and must provide ``curvature``,
    ``lighting``, ``weather``, ``speed_limit`` and
    ``num_reported_accidents``.

    The result is ``0.3 * curvature`` plus a fixed increment for each
    condition that holds: 0.2 for night lighting, 0.1 for non-clear
    weather, 0.2 for a speed limit of at least 60 and 0.1 for more than
    two reported accidents.
    """
    curvature = X["curvature"]
    is_night = (X["lighting"] == "night").astype(int)
    not_clear = (X["weather"] != "clear").astype(int)
    fast_road = (X["speed_limit"] >= 60).astype(int)
    many_accidents = (X["num_reported_accidents"] > 2).astype(int)

    # Terms are summed in the same left-to-right order as the formula above.
    return (
        0.3 * curvature
        + 0.2 * is_night
        + 0.1 * not_clear
        + 0.2 * fast_road
        + 0.1 * many_accidents
    )

# Read the synthetic dataset, then split it into the target column and the
# remaining feature columns.
X = pd.read_csv("synthetic_road_accidents_100k.csv")
y = X["accident_risk"]
X = X.drop(columns="accident_risk")


In [157]:
from sklearn.model_selection import cross_val_score,KFold

In [159]:
# 10 shuffled folds with a fixed seed so the split is reproducible.
kfold = KFold(n_splits=10, shuffle=True, random_state=0)

# The estimator only wraps f; its fit step does nothing.
model = FunctionRegressor(f)

# One R^2 score per held-out fold, computed sequentially (n_jobs=1).
scores = cross_val_score(model, X, y, cv=kfold, scoring="r2", n_jobs=1)


In [161]:
# Mean and standard deviation of the per-fold R^2 scores, both with 5 decimals.
# The std previously used ":5f" (minimum width 5, default 6 decimals) instead
# of ":.5f", so the two numbers were printed with different precision.
print(f'{scores.mean():.5f}±{scores.std():.5f}')

0.92251±0.001293


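### Here, cross-validation verifies that the synthetic dataset is consistent with its own generating function; the generated data serves as a reference (ground truth), not as a learning problem.

Because `FunctionRegressor.fit` is a no-op, no parameters are learned: each of the 10 folds simply scores the fixed function `f` on its held-out split, so the mean and spread of $R^2$ describe how closely the dataset follows `f`.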