In [32]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, KFold

from darwin.config import (
    PROCESSED_DATA_DIR,
    RANDOM_STATE,
    METRICS,
    RANDOM_SEEDS
)

In [33]:
input_path = PROCESSED_DATA_DIR / 'feature_imp.csv'
target_path = PROCESSED_DATA_DIR / 'target.csv'

In [34]:
df = pd.read_csv(input_path)

In [35]:
knn = KNeighborsClassifier(n_neighbors=5,
                           weights='uniform',
                           metric='minkowski',
                           )

In [36]:
tree = DecisionTreeClassifier(random_state=RANDOM_STATE,
                              criterion='entropy',
                              max_depth=5,
                              )

In [37]:
mlp = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation="tanh",
    alpha=0.0001,
    solver="adam",
    random_state=RANDOM_STATE,
    learning_rate="adaptive",
    early_stopping=False,
    max_iter=1000,
)

In [38]:
def evaluate_model(model, df: pd.DataFrame) -> pd.DataFrame:
    """
    Evaluate a model using cross-validation
    The model is evaluated using 5-fold cross-validation, 
    metrics used to calculate the final score are defined in METRICS
    For each seed defined in RANDOM_SEEDS, the model is trained and evaluated
    The result is returned as a DataFrame with the metrics evaluated of the model in each seed
    The resulting DataFrame has |RANDOM_SEEDS|x|METRICS| dimensions
        model: estimator
            A sklearn estimator with fit() and predict() methods
        df: pd.DataFrame
            The input data
        return: pd.DataFrame
            A DataFrame with the metrics evaluated of the model in each seed
    """
    # Load the data
    X = df
    y = pd.read_csv(target_path)['class']

    # Results should be e 2D matrix with |METRICS| columns and |RANDOM_SEEDS| rows
    results = {}

    # Loops through each seed
    # This is done to ensure that the results are robust to the randomness of the data
    for seed in RANDOM_SEEDS:
        scores = {}

        # Define the cross-validation strategy
        kf = KFold(n_splits=5, shuffle=True, random_state=seed)
        # model.random_state = seed

        # Evaluate the model using cross-validation
        for metric in METRICS:
            scores[metric] = cross_val_score(model, X, y, scoring=metric, cv=kf, n_jobs=-1).mean()

        results[seed] = scores

    return pd.DataFrame(results).T

In [39]:
score = evaluate_model(mlp, df)
score



KeyboardInterrupt: 

In [None]:
print(score.mean())

accuracy     0.865513
precision    0.880258
recall       0.858392
f1           0.865130
dtype: float64


In [None]:
assert score.shape == (len(RANDOM_SEEDS), len(METRICS))

In [None]:
prepro_path = PROCESSED_DATA_DIR / 'preprocessed_data.csv'
df = pd.read_csv(prepro_path)

In [40]:
mlp_gs = MLPClassifier(max_iter=1000,
                       early_stopping=False,
                       random_state=RANDOM_STATE,
                       )

input_path = PROCESSED_DATA_DIR / 'rfe.csv'
df = pd.read_csv(input_path)

parameter_space = {
    'hidden_layer_sizes': [(100,100),(100,10),(100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

from sklearn.model_selection import GridSearchCV
X = df
y = pd.read_csv(target_path)['class']

clf = GridSearchCV(mlp_gs, parameter_space, n_jobs=-1, cv=5, scoring='recall')
clf.fit(X, y) # X is train samples and y is the corresponding labels

print('Best parameters found:\n', clf.best_params_)

Best parameters found:
 {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'solver': 'adam'}


In [None]:
tree_gs = DecisionTreeClassifier(random_state=RANDOM_STATE)

input_path = PROCESSED_DATA_DIR / 'feature_imp.csv'
df = pd.read_csv(input_path)

parameter_space = {
    'criterion': ['entropy', 'gini', 'log_loss'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 3, 4, 5, 6, 7, 8],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['sqrt', 'log2', None],
    'class_weight': ['balanced', None],
}

from sklearn.model_selection import GridSearchCV
df = pd.read_csv(prepro_path)

X = df
y = pd.read_csv(target_path)['class']

clf = GridSearchCV(tree_gs, parameter_space, n_jobs=-1, cv=5, scoring='recall')
clf.fit(X, y) # X is train samples and y is the corresponding labels

print('Best parameters found:\n', clf.best_params_)

Best parameters found:
 {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}


In [None]:
knn_gs = KNeighborsClassifier()

input_path = PROCESSED_DATA_DIR / 'anova.csv'
df = pd.read_csv(input_path)

parameter_space = {
    'n_neighbors': [3,5,7,9,11],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 20, 30, 40, 50],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
    'p': [1, 2],
}

from sklearn.model_selection import GridSearchCV

X = df
y = pd.read_csv(target_path)['class']

clf = GridSearchCV(knn_gs, parameter_space, n_jobs=-1, cv=5, scoring='recall')
clf.fit(X, y) # X is train samples and y is the corresponding labels

print('Best parameters found:\n', clf.best_params_)
print('Best score found:\n', clf.best_score_)

Best parameters found:
 {'algorithm': 'auto', 'leaf_size': 10, 'metric': 'euclidean', 'n_neighbors': 3, 'p': 1, 'weights': 'uniform'}
Best score found:
 0.6849673202614379


In [None]:
from sklearn.model_selection import StratifiedKFold
def evaluate_model(model, df: pd.DataFrame) -> pd.DataFrame:
    """
    Evaluate a model using cross-validation
    The model is evaluated using 5-fold cross-validation,
    metrics used to calculate the final score are defined in METRICS
    For each seed defined in RANDOM_SEEDS, the model is trained and evaluated
    The result is returned as a DataFrame with the metrics evaluated of the model in each seed
    The resulting DataFrame has |RANDOM_SEEDS|x|METRICS| dimensions
        model: estimator
            A sklearn estimator with fit() and predict() methods
        df: pd.DataFrame
            The input data
        return: pd.DataFrame
            A DataFrame with the metrics evaluated of the model in each seed
    """
    # Load the data
    X = df
    y = pd.read_csv(target_path)["class"]

    # Results should be e 2D matrix with |METRICS| columns and |RANDOM_SEEDS| rows
    results = {}

    # Loops through each seed
    # This is done to ensure that the results are robust to the randomness of the data
    for seed in RANDOM_SEEDS:
        scores = {}

        # Define the cross-validation strategy
        kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
        # model.random_state = seed

        # Evaluate the model using cross-validation
        for metric in METRICS:
            scores[metric] = cross_val_score(model, X, y, scoring=metric, cv=kf, n_jobs=-1).mean()

        results[seed] = scores

    return pd.DataFrame(results).T

In [None]:
tree = DecisionTreeClassifier(random_state=RANDOM_STATE)


evaluate_model(tree, X_transformed)

Unnamed: 0,accuracy,precision,recall,f1
454,0.805378,0.829244,0.787582,0.806976
167,0.753109,0.757413,0.764706,0.760463
332,0.764538,0.766672,0.786928,0.775495
322,0.770084,0.812557,0.719608,0.752831
222,0.793613,0.818145,0.786928,0.797423
464,0.747395,0.763411,0.752941,0.751353
955,0.787731,0.819167,0.753595,0.782681
35,0.788067,0.796977,0.787582,0.78817
691,0.74084,0.744094,0.763399,0.751143
292,0.770084,0.798897,0.741176,0.765417


In [54]:
from sklearn.feature_selection import RFE
from sklearn.model_selection import RepeatedStratifiedKFold
import numpy as np

input_path = PROCESSED_DATA_DIR / 'preprocessed_data.csv'
df = pd.read_csv(input_path)

X = df.drop(columns=['class'])
y = pd.read_csv(target_path)['class']

new_model = DecisionTreeClassifier(random_state=RANDOM_STATE)
rfe = RFE(estimator=new_model, n_features_to_select=20)
X_transformed = rfe.fit_transform(X, y)

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=RANDOM_STATE)

scores = cross_val_score(new_model, X_transformed, y, cv=cv, scoring='accuracy')
precision = cross_val_score(new_model, X_transformed, y, cv=cv, scoring='precision')
recall = cross_val_score(new_model, X_transformed, y, cv=cv, scoring='recall')
f1 = cross_val_score(new_model, X_transformed, y, cv=cv, scoring='f1')

print('Accuracy: ', np.mean(scores))
print('Precision: ', np.mean(precision))
print('Recall: ', np.mean(recall))
print('F1: ', np.mean(f1))


Accuracy:  0.8657703081232494
Precision:  0.8800011541033214
Recall:  0.8568627450980392
F1:  0.8655381890708467
