# Goal

Understand and use exca in a very basic scenario: simply to create a simple model, config and cache it.



In [None]:
"""
A minimalist example with sklearn to show how to develop and explore a model with exca.
"""
import typing as tp
import numpy as np
import pydantic
import sys
import exca
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error


class Dataset(pydantic.BaseModel):
    n_samples: int = 100
    noise: float = 0.1
    random_state: int = 42
    test_size: float = 0.2
    model_config = pydantic.ConfigDict(extra="forbid")

    def get(self) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        # Generate synthetic data
        X, y = make_regression(
            n_samples=self.n_samples,
            noise=self.noise,
            random_state=self.random_state
        )
        # Split into training and testing datasets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, 
            test_size=self.test_size, 
            random_state=self.random_state
        )
        return X_train, X_test, y_train, y_test


class Model(pydantic.BaseModel):
    data: Dataset = Dataset()
    alpha: float = 1.0
    max_iter: int = 1000
    infra: exca.TaskInfra = exca.TaskInfra(folder='.cache/', version='v1.0')

    @infra.apply
    def score(self):
        # Get data
        X_train, X_test, y_train, y_test = self.data.get()

        # Train a Ridge regression model
        print('Fit...')
        model = Ridge(alpha=self.alpha, max_iter=self.max_iter)
        model.fit(X_train, y_train)

        # Evaluate
        print('Score...')
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        return mse


if __name__ == "__main__":
    # Validate config
    basic_config = {"alpha": 1.0, "max_iter": 1000}
    config = exca.ConfDict(basic_config)
    model = Model(**config)
    print(model.infra.config)

    # Score
    mse = model.score()
    print(mse)

<bound method BaseInfra.config of TaskInfra(folder='.cache/', cluster=None, logs='{folder}/logs/{user}/%j', job_name=None, timeout_min=None, nodes=1, tasks_per_node=1, cpus_per_task=None, gpus_per_node=None, mem_gb=None, slurm_constraint=None, slurm_partition=None, slurm_account=None, slurm_qos=None, slurm_use_srun=False, slurm_additional_parameters=None, conda_env=None, workdir=None, permissions=511, version='0', mode='cached', keep_in_ram=False)>
4352.458846189906


In [4]:
ls .cache/

[0m[01;34m__main__.Model.score,0[0m/


## Update the scoring function: make it a new version !

In [6]:

class Model(pydantic.BaseModel):
    data: Dataset = Dataset()
    alpha: float = 1.0
    max_iter: int = 1000
    infra: exca.TaskInfra = exca.TaskInfra(folder='.cache/', version='v2.0')

    @infra.apply
    def score(self):
        # Get data
        X_train, X_test, y_train, y_test = self.data.get()

        # Train a Ridge regression model
        print('Fit...')
        # model = Ridge(alpha=self.alpha, max_iter=self.max_iter)

        ## NEW VERSION: use not a Ridge model but a LinearRegression model
        from sklearn.linear_model import LinearRegression
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Evaluate
        print('Score...')
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        return mse


if __name__ == "__main__":
    # Validate config
    basic_config = {"alpha": 1.0, "max_iter": 1000}
    config = exca.ConfDict(basic_config)
    model = Model(**config)
    print(model.infra.config)

    # Score
    mse = model.score()
    print(mse)

<bound method BaseInfra.config of TaskInfra(folder='.cache/', cluster=None, logs='{folder}/logs/{user}/%j', job_name=None, timeout_min=None, nodes=1, tasks_per_node=1, cpus_per_task=None, gpus_per_node=None, mem_gb=None, slurm_constraint=None, slurm_partition=None, slurm_account=None, slurm_qos=None, slurm_use_srun=False, slurm_additional_parameters=None, conda_env=None, workdir=None, permissions=511, version='v2.0', mode='cached', keep_in_ram=False)>
4501.658453980022


In [7]:
ls .cache/

[0m[01;34m__main__.Model.score,0[0m/  [01;34m__main__.Model.score,v2.0[0m/
