In [149]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import pickle
from pprint import pp
from mlv2.vectorize import FpVectSupervised
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from imblearn.combine import SMOTEENN, SMOTETomek

In [150]:
# Load pickle
filePath = "../save/S05_2024-10-28_05-48-58/FpVectSupervised_5330a.pickle"
with open(filePath, "rb") as handle:
    fpVectSup: FpVectSupervised = pickle.load(handle)

In [151]:
X = fpVectSup.getX()
y = fpVectSup.getLabels()

In [152]:
le = LabelEncoder()

In [153]:
le.fit(y)
pp(le.classes_)

array([ 0,  1,  4,  5,  7,  8,  9, 10, 11, 12, 13, 15, 16, 19, 20, 21, 22,
       23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
       40, 41, 42, 43, 44, 45, 46, 47, 48, 50, 51, 52, 53, 54, 55, 56, 58,
       59, 60, 62, 63, 64, 65, 66, 67, 68, 71, 72, 74, 75, 77, 78],
      dtype=int64)


In [154]:
pp(y.values)
# yt = pd.Series(le.transform(y))
yt = y
pp(yt.values)

array([72, 72, 72, ..., 78, 78, 78], dtype=int64)
array([72, 72, 72, ..., 78, 78, 78], dtype=int64)


In [155]:
X_train, X_test, y_train, y_test = train_test_split(X, yt, random_state=0)

In [156]:
y_train.value_counts()

y
9     40
15    33
62    28
24    26
1     25
      ..
20     8
38     8
35     8
22     7
41     7
Name: count, Length: 66, dtype: int64

In [157]:
stats = y_train.value_counts().describe()
display(stats)

count    66.000000
mean     14.681818
std       6.635937
min       7.000000
25%      10.000000
50%      12.000000
75%      18.000000
max      40.000000
Name: count, dtype: float64

In [158]:
# target = int(np.ceil(stats["mean"]))
target = int(np.ceil(stats["75%"]))

def rowFn(row):
    y = row["y"]
    _count = row["count"]
    if _count < target:
        count = target
    else:
        count = _count 
    return pd.Series([y, count], index=["y", "count"])

res = y_train.value_counts().reset_index().apply(rowFn, axis=1)
samplingStrategy = res.set_index("y", drop=True).to_dict()["count"]


In [159]:
samplingStrategy

{9: 40,
 15: 33,
 62: 28,
 24: 26,
 1: 25,
 51: 24,
 33: 22,
 5: 22,
 28: 21,
 68: 21,
 55: 21,
 32: 20,
 43: 20,
 44: 20,
 56: 20,
 37: 20,
 72: 18,
 64: 18,
 75: 18,
 40: 18,
 23: 18,
 65: 18,
 19: 18,
 30: 18,
 4: 18,
 67: 18,
 48: 18,
 7: 18,
 63: 18,
 8: 18,
 66: 18,
 21: 18,
 27: 18,
 60: 18,
 25: 18,
 54: 18,
 53: 18,
 26: 18,
 12: 18,
 52: 18,
 36: 18,
 29: 18,
 34: 18,
 10: 18,
 45: 18,
 50: 18,
 74: 18,
 59: 18,
 78: 18,
 58: 18,
 31: 18,
 46: 18,
 16: 18,
 71: 18,
 0: 18,
 47: 18,
 11: 18,
 39: 18,
 77: 18,
 13: 18,
 42: 18,
 20: 18,
 38: 18,
 35: 18,
 22: 18,
 41: 18}

In [160]:
# sme = SMOTEENN(sampling_strategy=samplingStrategy, random_state=42)
sme = SMOTETomek(sampling_strategy=samplingStrategy, random_state=42)
X_res, y_res = sme.fit_resample(X_train, y_train)

In [161]:
y_res.value_counts()

y
9     40
15    33
62    28
24    26
1     25
      ..
11    18
16    18
21    18
35    18
53    17
Name: count, Length: 66, dtype: int64

In [162]:
pipe = Pipeline([('scaler', StandardScaler()), ('lr', LogisticRegression())])

In [163]:
pipe.get_params()

{'memory': None,
 'steps': [('scaler', StandardScaler()), ('lr', LogisticRegression())],
 'verbose': False,
 'scaler': StandardScaler(),
 'lr': LogisticRegression(),
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,
 'lr__C': 1.0,
 'lr__class_weight': None,
 'lr__dual': False,
 'lr__fit_intercept': True,
 'lr__intercept_scaling': 1,
 'lr__l1_ratio': None,
 'lr__max_iter': 100,
 'lr__multi_class': 'deprecated',
 'lr__n_jobs': None,
 'lr__penalty': 'l2',
 'lr__random_state': None,
 'lr__solver': 'lbfgs',
 'lr__tol': 0.0001,
 'lr__verbose': 0,
 'lr__warm_start': False}

In [164]:
params_grid = {"lr__C": [0.01, 0.1, 1, 10, 100]}

In [165]:
gs = GridSearchCV(
    estimator=pipe, param_grid=params_grid, cv=5, scoring="f1_micro"
)

In [166]:
gs.fit(X_train, y_train)

In [167]:
gs.best_params_

{'lr__C': 10}

In [168]:
pd.DataFrame(gs.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lr__C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.025964,0.00136,0.002365,0.0004489374,0.01,{'lr__C': 0.01},0.783505,0.793814,0.804124,0.793814,0.777202,0.790492,0.00931,5
1,0.041196,0.001509,0.001999,2.40602e-06,0.1,{'lr__C': 0.1},0.984536,0.969072,0.969072,0.984536,0.979275,0.977298,0.006986,4
2,0.063278,0.001425,0.002446,0.0004604441,1.0,{'lr__C': 1},0.984536,0.979381,0.979381,0.994845,0.989637,0.985556,0.006004,3
3,0.062045,0.00259,0.002007,8.577769e-06,10.0,{'lr__C': 10},0.984536,0.979381,0.984536,1.0,0.989637,0.987618,0.006989,1
4,0.065138,0.010569,0.002,3.234067e-07,100.0,{'lr__C': 100},0.984536,0.979381,0.984536,0.994845,0.989637,0.986587,0.00525,2


In [169]:
yPred = gs.predict(X_test)
pp(balanced_accuracy_score(yPred,y_test))

0.9833333333333333
