In [38]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import pickle
from pprint import pp
from mlv2.vectorize import FpVectSupervised
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.under_sampling import ClusterCentroids, EditedNearestNeighbours
from sklearn.cluster import MiniBatchKMeans

In [39]:
# Load pickle
filePath = "../save/S05_2024-10-28_05-48-58/FpVectSupervised_5330a.pickle"
with open(filePath, "rb") as handle:
    fpVectSup: FpVectSupervised = pickle.load(handle)

In [40]:
X = fpVectSup.getX()
y = fpVectSup.getLabels()

In [41]:
le = LabelEncoder()

In [42]:
le.fit(y)
pp(le.classes_)

array([ 0,  1,  4,  5,  7,  8,  9, 10, 11, 12, 13, 15, 16, 19, 20, 21, 22,
       23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
       40, 41, 42, 43, 44, 45, 46, 47, 48, 50, 51, 52, 53, 54, 55, 56, 58,
       59, 60, 62, 63, 64, 65, 66, 67, 68, 71, 72, 74, 75, 77, 78],
      dtype=int64)


In [43]:
pp(y.values)
# yt = pd.Series(le.transform(y))
yt = y
pp(yt.values)

array([72, 72, 72, ..., 78, 78, 78], dtype=int64)
array([72, 72, 72, ..., 78, 78, 78], dtype=int64)


In [44]:
X_train1, X_test, y_train1, y_test = train_test_split(X, yt, test_size=0.3, random_state=42, stratify=yt)

In [45]:
y_train1.value_counts()

y
9     38
24    29
15    29
51    27
62    27
      ..
71     7
42     7
46     7
41     6
38     6
Name: count, Length: 66, dtype: int64

In [46]:
stats = y_train1.value_counts().describe()
display(stats)

count    66.00000
mean     13.69697
std       6.57494
min       6.00000
25%       8.25000
50%      12.00000
75%      17.00000
max      38.00000
Name: count, dtype: float64

In [47]:
# target = int(np.ceil(stats["mean"]))
target = int(np.ceil(stats["75%"]))

def rowFn(row):
    y = row["y"]
    _count = row["count"]
    if _count < target:
        count = target
    else:
        count = _count 
    return pd.Series([y, count], index=["y", "count"])

res = y_train1.value_counts().reset_index().apply(rowFn, axis=1)
overSamplingStrategy = res.set_index("y", drop=True).to_dict()["count"]
display(overSamplingStrategy)


{9: 38,
 24: 29,
 15: 29,
 51: 27,
 62: 27,
 1: 26,
 68: 22,
 55: 20,
 5: 20,
 43: 20,
 33: 20,
 56: 20,
 28: 19,
 72: 18,
 32: 18,
 7: 17,
 30: 17,
 37: 17,
 40: 17,
 64: 17,
 44: 17,
 23: 17,
 65: 17,
 48: 17,
 75: 17,
 19: 17,
 21: 17,
 4: 17,
 63: 17,
 67: 17,
 59: 17,
 66: 17,
 10: 17,
 8: 17,
 78: 17,
 74: 17,
 25: 17,
 31: 17,
 29: 17,
 52: 17,
 53: 17,
 11: 17,
 54: 17,
 13: 17,
 36: 17,
 34: 17,
 45: 17,
 27: 17,
 12: 17,
 20: 17,
 35: 17,
 22: 17,
 16: 17,
 77: 17,
 47: 17,
 50: 17,
 0: 17,
 58: 17,
 26: 17,
 60: 17,
 39: 17,
 71: 17,
 42: 17,
 46: 17,
 41: 17,
 38: 17}

In [48]:
kNeighborsMax = 6
minNumSample = y_train1.value_counts().min() - 1 
kNeightbors = minNumSample if minNumSample <= kNeighborsMax else kNeighborsMax
pp(kNeightbors)

5


In [49]:
oversampler = SMOTEENN(sampling_strategy=overSamplingStrategy, random_state=42, smote=SMOTE(sampling_strategy=overSamplingStrategy, k_neighbors=kNeightbors))
# oversampler = SMOTETomek(sampling_strategy=overSamplingStrategy, random_state=42, k_neighbors=kNeightbors)
# oversampler = SMOTE(sampling_strategy=overSamplingStrategy, random_state=42, k_neighbors=kNeightbors)
X_train2, y_train2 = oversampler.fit_resample(X_train1, y_train1)

In [50]:
y_train2.value_counts()

y
9     38
24    29
15    29
51    27
62    27
      ..
31    16
77    16
75    14
19    11
20     9
Name: count, Length: 66, dtype: int64

In [51]:
stats = y_train2.value_counts().describe()
target = int(np.ceil(stats["75%"]))

def rowFn(row):
    y = row["y"]
    _count = row["count"]
    if _count > target:
        count = target
    else:
        count = _count 
    return pd.Series([y, count], index=["y", "count"])

res = y_train2.value_counts().reset_index().apply(rowFn, axis=1)
underSamplingStrategy = res.set_index("y", drop=True).to_dict()["count"]

In [52]:
underSampler = ClusterCentroids(
    sampling_strategy=underSamplingStrategy,
    estimator=MiniBatchKMeans(n_init=1, random_state=0),
    random_state=42,
)
X_train3, y_train3 = underSampler.fit_resample(X_train2, y_train2)
y_train3.value_counts()

y
0     17
37    17
1     17
40    17
41    17
      ..
31    16
64    16
75    14
19    11
20     9
Name: count, Length: 66, dtype: int64

In [53]:
X_train = X_train3
y_train = y_train3

In [54]:
pipe = Pipeline([('scaler', StandardScaler()), ('lr', LogisticRegression())])

In [55]:
pipe.get_params()

{'memory': None,
 'steps': [('scaler', StandardScaler()), ('lr', LogisticRegression())],
 'verbose': False,
 'scaler': StandardScaler(),
 'lr': LogisticRegression(),
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,
 'lr__C': 1.0,
 'lr__class_weight': None,
 'lr__dual': False,
 'lr__fit_intercept': True,
 'lr__intercept_scaling': 1,
 'lr__l1_ratio': None,
 'lr__max_iter': 100,
 'lr__multi_class': 'deprecated',
 'lr__n_jobs': None,
 'lr__penalty': 'l2',
 'lr__random_state': None,
 'lr__solver': 'lbfgs',
 'lr__tol': 0.0001,
 'lr__verbose': 0,
 'lr__warm_start': False}

In [56]:
params_grid = {"lr__C": [0.01, 0.1, 1, 10, 100]}

In [57]:
gs = GridSearchCV(
    estimator=pipe, param_grid=params_grid, cv=5, scoring="f1_micro"
)

In [58]:
gs.fit(X_train, y_train)

In [59]:
gs.best_params_

{'lr__C': 1}

In [60]:
pd.DataFrame(gs.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_lr__C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.023146,0.002993,0.002342,0.000372,0.01,{'lr__C': 0.01},0.968182,0.927273,0.927273,0.981818,0.963636,0.953636,0.022342,5
1,0.041498,0.001473,0.0022,0.0004,0.1,{'lr__C': 0.1},0.995455,0.977273,0.981818,0.990909,0.990909,0.987273,0.00668,4
2,0.055009,0.006467,0.002512,0.000451,1.0,{'lr__C': 1},1.0,0.986364,1.0,1.0,1.0,0.997273,0.005455,1
3,0.042196,0.002372,0.0028,0.000401,10.0,{'lr__C': 10},1.0,0.986364,1.0,1.0,1.0,0.997273,0.005455,1
4,0.034225,0.001084,0.002604,0.000492,100.0,{'lr__C': 100},1.0,0.986364,1.0,1.0,1.0,0.997273,0.005455,1


In [61]:
yPred = gs.predict(X_test)
pp(balanced_accuracy_score(yPred,y_test))

0.9793040293040292




In [62]:
from sklearn.metrics import classification_report

In [63]:
report = classification_report(y_test, yPred, output_dict=True)
dfReport = pd.DataFrame(report).transpose()
dfReport


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,precision,recall,f1-score,support
0,1.000000,1.000000,1.000000,3.000000
1,1.000000,0.909091,0.952381,11.000000
4,0.750000,1.000000,0.857143,6.000000
5,1.000000,0.888889,0.941176,9.000000
7,1.000000,1.000000,1.000000,7.000000
...,...,...,...,...
77,0.000000,0.000000,0.000000,3.000000
78,1.000000,1.000000,1.000000,5.000000
accuracy,0.976804,0.976804,0.976804,0.976804
macro avg,0.964466,0.971687,0.964818,388.000000


In [76]:
filt  = dfReport.iloc[:-3, :]["f1-score"] < 0.9
dfReport.iloc[:-3, :][filt].sort_values

Unnamed: 0,precision,recall,f1-score,support
4,0.75,1.0,0.857143,6.0
13,0.666667,1.0,0.8,4.0
19,1.0,0.666667,0.8,6.0
20,0.666667,1.0,0.8,4.0
60,0.571429,1.0,0.727273,4.0
75,1.0,0.666667,0.8,6.0
77,0.0,0.0,0.0,3.0
