In [7]:
import pandas as pd
# Load the cleaned feature table produced by the first pipeline stage.
features_df = pd.read_csv(
    filepath_or_buffer="/Volumes/Mac/DatasetSP/pipeline1/cleaned_features_1.csv",
)

In [8]:
# Identifier columns that describe each row; they are kept apart from the
# feature columns.
metadata = features_df.loc[:, ["Subject", "Test", "Trial", "Bout", "Position"]]

# Everything that is not an identifier column is treated as a feature.
only_features = features_df.drop(columns=metadata.columns)

In [11]:
from distfit import distfit
import pandas as pd
from joblib import Parallel, delayed
from tqdm import tqdm


def analyze_feature(feature, data):
    """Fit one feature with distfit and summarise the best-scoring model.

    Args:
        feature: Label stored under the "Feature" key of the result.
        data: Object exposing ``dropna().values`` (e.g. a pandas Series)
            holding the observations of the feature.

    Returns:
        None when fewer than 10 non-missing values remain. Otherwise a dict
        with the keys "Feature", "Best_Distribution", "Params", "Score" and
        "P_Value". "P_Value" is None unless the fitted model carries a
        ``stats`` dict that contains a ``pvalue`` entry.
    """
    values = data.dropna().values

    # Require at least 10 valid observations before attempting a fit.
    if len(values) < 10:
        return None

    # NOTE(review): the run log recorded with this notebook shows fits with
    # scale=0.000 and RSS nan; presumably these come from constant columns.
    # Consider skipping such columns -- confirm the desired handling.
    dist = distfit()
    dist.fit_transform(values)

    best_dist = dist.model

    # Only read a p-value when `stats` really is a dict holding one.
    # Indexing it unconditionally raises TypeError/KeyError when the entry
    # has another shape.
    # NOTE(review): some distfit releases appear to store the name of the
    # scoring statistic (a string) under "stats" -- confirm for the
    # installed version.
    stats = best_dist["stats"] if "stats" in best_dist else None
    p_value = stats.get("pvalue") if isinstance(stats, dict) else None

    return {
        "Feature": feature,
        "Best_Distribution": best_dist["name"],
        "Params": best_dist["params"],
        "Score": best_dist["score"],
        "P_Value": p_value,
    }

# Fit every feature column in parallel and collect the per-feature summaries.
#
# The progress bar wraps the task generator, so its total is the number of
# feature columns actually processed and it advances while joblib pulls tasks
# from the generator. It therefore tracks task dispatch, which runs slightly
# ahead of completion, instead of staying at 0% until all work has finished.
parallel_results = Parallel(n_jobs=6)(
    delayed(analyze_feature)(feature, only_features[feature])
    for feature in tqdm(only_features.columns, desc="Analizzando le feature")
)

# analyze_feature returns None for features it skipped; keep only real results.
results = [result for result in parallel_results if result is not None]

results_df = pd.DataFrame(results)
display(results_df.head())

results_df.to_csv("/Volumes/Mac/DatasetSP/preprocessed/distribution_analysis.csv", index=False)


Analizzando le feature:   0%|          | 0/1571 [00:00<?, ?it/s][distfit] >INFO> fit
[distfit] >INFO> transform
[distfit] >INFO> fit
[distfit] >INFO> transform
[distfit] >INFO> fit
[distfit] >INFO> transform
[distfit] >INFO> fit
[distfit] >INFO> transform
[distfit] >INFO> fit
[distfit] >INFO> transform
[distfit] >INFO> [norm      ] [0.00 sec] [RSS: nan] [loc=0.000 scale=0.000]
[distfit] >INFO> [norm      ] [0.00 sec] [RSS: 6.01808e-07] [loc=12294.189 scale=2996.608]
[distfit] >INFO> [expon     ] [0.00 sec] [RSS: nan] [loc=0.000 scale=0.000]
[distfit] >INFO> [expon     ] [0.00 sec] [RSS: 1.16885e-06] [loc=7462.510 scale=4831.679]
[distfit] >INFO> [norm      ] [0.00 sec] [RSS: 0.000520693] [loc=1057.213 scale=85.813]
[distfit] >INFO> [norm      ] [0.00 sec] [RSS: 259.197] [loc=0.887 scale=0.317]
[distfit] >INFO> fit
[distfit] >INFO> transform
[distfit] >INFO> [expon     ] [0.00 sec] [RSS: 280.63] [loc=0.000 scale=0.887]
[distfit] >INFO> [norm      ] [0.00 sec] [RSS: nan] [loc=0.000 scale

Unnamed: 0,Feature,Best_Distribution,Params,Score,P_Value
0,Acc_value__variance_larger_than_standard_devia...,genextreme,"(1.3769186559939617, 0.9348152659754445, 0.089...",89.96084,
1,Acc_value__has_duplicate_max,pareto,"(2.25803497307119, -7.535551383120264e-19, 4.8...",1.178546e+36,
2,Acc_value__has_duplicate_min,pareto,"(2.25803497307119, -7.535551383120264e-19, 4.8...",1.178546e+36,
3,Acc_value__has_duplicate,pareto,"(2.25803497307119, -7.535551383120264e-19, 4.8...",1.178546e+36,
4,Acc_value__sum_values,t,"(2.119236326338349, 1032.7420136762894, 42.010...",0.0001685691,
