In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, cross_validate, KFold

from darwin.config import (
    PROCESSED_DATA_DIR,
    RANDOM_STATE,
    METRICS,
    RANDOM_SEEDS
)

[32m2025-03-27 20:12:00.709[0m | [1mINFO    [0m | [36mdarwin.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /Users/eduardoduarte/Projects/refactor-darwin/darwin[0m


In [2]:
# Both CSVs live in the processed-data directory: the features that are loaded
# into `df`, and the file whose 'class' column is used as the label.
input_path = PROCESSED_DATA_DIR.joinpath('feature_imp.csv')
target_path = PROCESSED_DATA_DIR.joinpath('target.csv')

In [3]:
# Load the feature table; it is later passed to evaluate_model as the input data.
df = pd.read_csv(filepath_or_buffer=input_path)

In [4]:
# k-nearest-neighbours classifier: 5 neighbours, unweighted vote, Minkowski distance.
knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='uniform',
    metric='minkowski',
)

In [5]:
# Entropy-based decision tree capped at depth 5; seeded with the project-wide RANDOM_STATE.
tree = DecisionTreeClassifier(
    random_state=RANDOM_STATE,
    criterion='entropy',
    max_depth=5,
)

In [None]:
# Multi-layer perceptron: one hidden layer of 100 ReLU units, trained with Adam.
# NOTE: `learning_rate='adaptive'` used to be passed here, but scikit-learn only
# consults the learning-rate schedule when solver='sgd'. With solver='adam' it
# was silently ignored, so it has been removed to keep the configuration honest.
mlp = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation='relu',
    solver='adam',
    random_state=RANDOM_STATE,
    early_stopping=True,  # stop training once the validation score stops improving
    max_iter=1000,
)

In [7]:
def evaluate_model(model, df: pd.DataFrame) -> pd.DataFrame:
    """
    Evaluate a model using repeated 5-fold cross-validation.

    For each seed in RANDOM_SEEDS the rows are shuffled into 5 folds (KFold
    seeded with that value). The model is fitted once per fold and every
    metric in METRICS is scored on that same fit; the per-fold scores are then
    averaged, producing one row per seed.

    Only the fold assignment changes between seeds: the estimator's own
    ``random_state`` is left exactly as the caller configured it.

        model: estimator
            A sklearn estimator with fit() and predict() methods
        df: pd.DataFrame
            The input data (feature matrix). Its rows are paired positionally
            with the 'class' column of the CSV at target_path.
        return: pd.DataFrame
            A DataFrame with |RANDOM_SEEDS| rows (indexed by seed) and
            |METRICS| columns holding the mean cross-validated score of each
            metric for that seed
    """
    X = df
    # The labels are read from target_path on every call.
    y = pd.read_csv(target_path)['class']

    # Assumes METRICS holds sklearn scorer names (strings), as the original
    # per-metric `scoring=metric` call did.
    metrics = list(METRICS)

    # Maps seed -> {metric: mean score}; becomes a seeds x metrics table below.
    results = {}

    # Repeating the CV with different shuffles shows how sensitive the scores
    # are to the particular train/test partition.
    for seed in RANDOM_SEEDS:
        kf = KFold(n_splits=5, shuffle=True, random_state=seed)

        # A single multi-metric run fits the model once per fold and scores all
        # metrics on it, instead of refitting the model for every metric.
        cv_out = cross_validate(model, X, y, scoring=metrics, cv=kf, n_jobs=-1)

        results[seed] = {
            metric: cv_out[f'test_{metric}'].mean() for metric in metrics
        }

    # The dict of dicts is column-per-seed; transpose to get one row per seed.
    return pd.DataFrame(results).T

In [8]:
# Cross-validate the MLP on the loaded features and show the per-seed metric table.
score = evaluate_model(model=mlp, df=df)
score

Unnamed: 0,accuracy,precision,recall,f1
454,0.827731,0.781375,0.936775,0.846539
167,0.833782,0.787273,0.936458,0.849998
332,0.856303,0.811169,0.936187,0.867337
322,0.833109,0.787137,0.946815,0.858401
222,0.832941,0.793667,0.919444,0.848417
464,0.80437,0.760514,0.923436,0.831083
955,0.85042,0.798061,0.938039,0.860231
35,0.839664,0.797387,0.934284,0.858312
691,0.84521,0.817922,0.896732,0.851438
292,0.838655,0.814421,0.89827,0.84518


In [11]:
# Average of every metric across all seeds (bare expression so the notebook renders it).
score.mean()

accuracy     0.853101
precision    0.814780
recall       0.930969
f1           0.865631
dtype: float64


In [10]:
# evaluate_model must return one row per seed and one column per metric.
assert score.shape == (len(RANDOM_SEEDS), len(METRICS)), (
    f"expected shape {(len(RANDOM_SEEDS), len(METRICS))} (seeds x metrics), "
    f"got {score.shape}"
)