In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, cross_validate, KFold

from darwin.config import (
    PROCESSED_DATA_DIR,
    RANDOM_STATE,
    METRICS,
    RANDOM_SEEDS
)

[32m2025-04-01 10:44:06.576[0m | [1mINFO    [0m | [36mdarwin.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /Users/eduardoduarte/Projects/refactor-darwin/darwin[0m


In [2]:
# CSV locations inside PROCESSED_DATA_DIR: the feature table and the file
# whose 'class' column holds the labels.
input_path = PROCESSED_DATA_DIR / 'feature_imp.csv'
target_path = PROCESSED_DATA_DIR / 'target.csv'

In [3]:
# Feature table; passed as X to evaluate_model and to the grid search.
df = pd.read_csv(input_path)

In [4]:
# k-nearest-neighbours classifier: 5 neighbours, uniform weights, Minkowski metric.
knn = KNeighborsClassifier(
    n_neighbors=5,
    weights='uniform',
    metric='minkowski',
)

In [5]:
# Decision tree capped at depth 5, entropy split criterion, seeded with RANDOM_STATE.
tree = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=5,
    random_state=RANDOM_STATE,
)

In [24]:
# Multi-layer perceptron with a single hidden layer of 100 tanh units.
mlp = MLPClassifier(
    # Architecture
    hidden_layer_sizes=(100,),
    activation="tanh",
    # Optimisation
    solver="adam",
    # NOTE(review): per the scikit-learn docs `learning_rate` is only used
    # when solver="sgd" - confirm it is kept here on purpose.
    learning_rate="adaptive",
    alpha=0.0001,
    max_iter=1000,
    early_stopping=False,
    # Reproducibility
    random_state=RANDOM_STATE,
)

In [7]:
def evaluate_model(model, df: pd.DataFrame) -> pd.DataFrame:
    """
    Evaluate a model using repeated, seeded 5-fold cross-validation
    For each seed defined in RANDOM_SEEDS a shuffled 5-fold split is built and
    the model is scored on every metric defined in METRICS
    All metrics of a seed come from a single cross-validation run, so the model
    is fitted once per fold instead of once per fold and metric
    The seed only controls how the rows are split into folds; the random state
    of the model itself is left untouched
    The resulting DataFrame has |RANDOM_SEEDS|x|METRICS| dimensions
        model: estimator
            A sklearn estimator with fit() and predict() methods
        df: pd.DataFrame
            The input features; the labels are read from the 'class' column
            of the CSV at target_path
        return: pd.DataFrame
            One row per seed and one column per metric, each cell being the
            mean of that metric over the 5 folds
    """
    # Load the data
    X = df
    y = pd.read_csv(target_path)['class']

    # Scorer names, in the order they should appear as columns
    metric_names = list(METRICS)

    # Results form a 2D matrix with |METRICS| columns and |RANDOM_SEEDS| rows
    results = {}

    # Repeating the evaluation over several seeds makes the result robust
    # to the randomness of the fold assignment
    for seed in RANDOM_SEEDS:
        kf = KFold(n_splits=5, shuffle=True, random_state=seed)

        # One multi-metric run: every fold is fitted once and scored on all metrics
        cv_scores = cross_validate(model, X, y, scoring=metric_names, cv=kf, n_jobs=-1)

        results[seed] = {
            metric: cv_scores[f"test_{metric}"].mean() for metric in metric_names
        }

    return pd.DataFrame(results).T

In [8]:
# Cross-validated scores of the MLP: one row per seed, one column per metric.
score = evaluate_model(mlp, df)
score

Unnamed: 0,accuracy,precision,recall,f1
454,0.80437,0.833733,0.789691,0.804258
167,0.787899,0.826947,0.767508,0.788125
332,0.769748,0.779006,0.783897,0.773668
322,0.815966,0.860539,0.777974,0.813971
222,0.769244,0.776044,0.79451,0.781613
464,0.769076,0.752895,0.817684,0.782552
955,0.787395,0.830367,0.740298,0.774141
35,0.804874,0.845314,0.773985,0.798161
691,0.747563,0.730197,0.80915,0.766679
292,0.816303,0.858187,0.764868,0.802361


In [9]:
# Column-wise mean: each metric averaged over all seeds.
print(score.mean())

accuracy     0.786151
precision    0.803450
recall       0.784994
f1           0.788361
dtype: float64


In [10]:
# Sanity check: one row per seed and one column per metric.
expected_shape = (len(RANDOM_SEEDS), len(METRICS))
assert score.shape == expected_shape, (
    f"unexpected score shape {score.shape}, expected {expected_shape}"
)

In [42]:
from sklearn.model_selection import GridSearchCV

# Base estimator for the search; the hyper-parameters listed in
# parameter_space are set per candidate by the grid search.
mlp_gs = MLPClassifier(
    random_state=RANDOM_STATE,
    early_stopping=False,
    max_iter=1000,
)

# Candidate values; every combination is tried.
parameter_space = {
    'hidden_layer_sizes': [(100,100),(100,10),(100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

# The search is run on the whole feature table (no separate hold-out split
# is made in this cell) with the labels from target.csv.
X = df
y = pd.read_csv(target_path)['class']

# 5-fold cross-validated search that ranks candidates by recall.
clf = GridSearchCV(
    estimator=mlp_gs,
    param_grid=parameter_space,
    scoring='recall',
    cv=5,
    n_jobs=-1,
)
clf.fit(X, y)



In [43]:
# Hyper-parameter combination that ranked best in the recall-scored grid search.
print('Best parameters found:\n', clf.best_params_)

Best parameters found:
 {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (100, 100), 'learning_rate': 'constant', 'solver': 'adam'}


In [25]:
# Display the MLP estimator and its configuration.
mlp

In [26]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from darwin.config import CLASS_WEIGHT

def select_rfe(df: pd.DataFrame, n: int) -> list[str]:
    """
    Given a DataFrame, returns the n features kept by Recursive Feature
    Elimination (RFE) driven by a Logistic Regression estimator
        df: pd.DataFrame
            DataFrame holding the feature columns plus the "class" target column
        n: int
            Number of features to keep
        return: list
            Names of the n selected feature columns, in their original column order
    """
    # Separate the target from the candidate features
    target = df["class"]
    features = df.drop(columns="class")

    # RFE drops one feature per iteration (step=1) until n remain
    base_model = LogisticRegression(
        class_weight=CLASS_WEIGHT,
        random_state=RANDOM_STATE,
    )
    rfe = RFE(base_model, n_features_to_select=n, step=1).fit(features, target)

    # support_ is a boolean mask over the feature columns
    return features.columns[rfe.support_].tolist()

In [35]:
prepro_path = PROCESSED_DATA_DIR / 'preprocessed_data.csv'

# Preprocessed table; select_rfe expects it to contain the "class" column.
og_data = pd.read_csv(prepro_path)

# Keep the selected column names and the reduced frame under separate names,
# so `rfe_data` is always a DataFrame and never the list of names.
rfe_features = select_rfe(og_data, n=50)
rfe_data = og_data[rfe_features]
rfe_data.head()

Unnamed: 0,gmrt_on_paper2,mean_speed_on_paper3,num_of_pendown3,disp_index4,gmrt_in_air4,mean_acc_in_air4,pressure_var5,air_time6,gmrt_on_paper6,total_time6,...,max_x_extension21,mean_jerk_in_air22,gmrt_in_air23,mean_speed_in_air23,air_time24,mean_jerk_on_paper24,pressure_mean24,total_time24,max_x_extension25,max_y_extension25
0,-0.842912,-0.952938,0.634383,0.352673,0.549306,1.259456,0.321565,-0.240794,1.548961,-0.350925,...,0.226036,-0.384138,-0.755326,-0.757625,-0.091876,1.149563,0.235056,-0.130212,0.919237,-1.291872
1,-0.392468,-0.956263,2.429223,4.256555,-0.93815,-0.269097,0.655566,0.50608,-1.565836,1.292099,...,0.432118,-0.625082,-1.433867,-1.313514,0.103754,0.109019,-0.193304,0.280857,-0.214399,-0.333294
2,0.018541,0.460417,-0.711747,-0.410656,-0.586813,-0.309289,-0.632098,-0.508424,1.008714,-0.521791,...,0.408586,0.298176,-0.903415,-0.790423,0.619504,0.08955,-1.34253,0.911061,-0.078833,-0.874162
3,-0.867751,-0.763791,1.083093,0.09096,0.428695,0.495633,1.109872,1.537032,-0.812476,1.284033,...,0.382915,-0.255785,0.655712,0.770137,1.654826,0.523499,-0.760555,1.716944,-0.620679,-0.506091
4,-0.591601,-0.792495,0.634383,0.287245,0.292532,0.298772,1.473205,0.090853,-0.684978,-0.043293,...,0.340843,-0.570567,-1.126999,-1.255739,2.175055,0.261479,-0.213721,2.252624,-1.364824,-0.216223


In [41]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Take the labels from the same frame the selected features were cut from, so
# rows and labels are aligned by construction and the cell does not rely on a
# `y` left in the kernel by another cell.
rfe_target = og_data["class"]

# NOTE(review): the columns of rfe_data were chosen by RFE fitted on every row
# of og_data, including the rows that land in the test split below, so this
# hold-out score is optimistic. Selecting features on the training rows only
# would remove that leakage.
X_train, X_test, y_train, y_test = train_test_split(
    rfe_data, rfe_target, test_size=0.2, random_state=RANDOM_STATE
)

# Refits the shared `mlp` estimator on the training split.
mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)
print(classification_report(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.94      1.00      0.97        15
        True       1.00      0.95      0.97        20

    accuracy                           0.97        35
   macro avg       0.97      0.97      0.97        35
weighted avg       0.97      0.97      0.97        35

[[15  0]
 [ 1 19]]
