In [175]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import pickle
from pprint import pp
from mlv2.vectorize import FpVectSupervised
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder


In [176]:
# Load pickle
filePath = "../save/S05_2024-10-27_13-19-50/FpVectSupervised_71390.pickle"
with open(filePath, "rb") as handle:
    fpVectSup: FpVectSupervised = pickle.load(handle)

In [177]:
X = fpVectSup.getX()
y = fpVectSup.getLabels()



In [178]:
le = LabelEncoder()

In [179]:
le.fit(y)
pp(le.classes_)

array([ 0,  1,  4,  5,  7,  8,  9, 10, 11, 12, 13, 15, 16, 19, 20, 21, 22,
       23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
       40, 41, 42, 43, 44, 45, 46, 47, 48, 50, 51, 52, 53, 54, 55, 56, 58,
       59, 60, 62, 63, 64, 65, 66, 67, 68, 71, 72, 74, 75, 77, 78],
      dtype=int64)


In [180]:
pp(y.values)
# yt = pd.Series(le.transform(y))
yt =y
pp(yt.values)

array([72, 72, 72, ..., 78, 78, 78], dtype=int64)
array([72, 72, 72, ..., 78, 78, 78], dtype=int64)


In [181]:
X_train, X_test, y_train, y_test = train_test_split(X, yt, random_state=0)

In [182]:
pipe = Pipeline([('scaler', StandardScaler()), ('lr', LogisticRegression())])

In [183]:
pipe.get_params()

{'memory': None,
 'steps': [('scaler', StandardScaler()), ('lr', LogisticRegression())],
 'verbose': False,
 'scaler': StandardScaler(),
 'lr': LogisticRegression(),
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,
 'lr__C': 1.0,
 'lr__class_weight': None,
 'lr__dual': False,
 'lr__fit_intercept': True,
 'lr__intercept_scaling': 1,
 'lr__l1_ratio': None,
 'lr__max_iter': 100,
 'lr__multi_class': 'deprecated',
 'lr__n_jobs': None,
 'lr__penalty': 'l2',
 'lr__random_state': None,
 'lr__solver': 'lbfgs',
 'lr__tol': 0.0001,
 'lr__verbose': 0,
 'lr__warm_start': False}

In [184]:
params_grid = {"lr__C": [0.01, 0.1, 1, 10, 100]}

In [185]:
gs = GridSearchCV(
    estimator=pipe, param_grid=params_grid, cv=5, scoring="f1_weighted"
)

In [186]:
gs.fit(X_train, y_train)

In [187]:
gs.best_params_

{'lr__C': 10}

In [188]:
pd.DataFrame(gs.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lr__C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.02944,0.002534,0.0024,0.000489,0.01,{'lr__C': 0.01},0.715696,0.726685,0.741444,0.733939,0.710702,0.725693,0.011329,5
1,0.045151,0.003818,0.001898,0.000667,0.1,{'lr__C': 0.1},0.984045,0.962314,0.965112,0.984091,0.976561,0.974425,0.009208,4
2,0.068432,0.005159,0.0022,0.000399,1.0,{'lr__C': 1},0.984045,0.973196,0.975994,0.994711,0.987364,0.983062,0.007778,3
3,0.067477,0.010632,0.002199,0.000399,10.0,{'lr__C': 10},0.984045,0.977663,0.983211,1.0,0.989637,0.986911,0.007565,1
4,0.067421,0.010216,0.0024,0.000489,100.0,{'lr__C': 100},0.984045,0.979823,0.983211,0.994993,0.989637,0.986342,0.005353,2


In [189]:
yPred = gs.predict(X_test)
pp(balanced_accuracy_score(yPred,y_test))

0.9833333333333333
