In [1]:
import os
import pandas as pd

In [2]:
cat_attrs: list[str] = ['f12', 'f11', 'f5', 'f9']  # TODO
n_jobs: int = 8

In [3]:
# Parameters
n_jobs = 8


In [4]:
pd.set_option('display.float_format', '{:.3f}'.format)

In [5]:
csv_path = os.path.join('data', 'data.csv')
data = pd.read_csv(csv_path)

In [6]:
X = data.drop(columns=['target', 'p'])
y = data['target']

In [7]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
y = pd.Series(y, name='target')

In [8]:
from sklearn.model_selection import StratifiedShuffleSplit

test_size_ratio = 0.2
test_split = StratifiedShuffleSplit(n_splits=1, test_size=test_size_ratio, random_state=42)
train_index, test_index = next(test_split.split(X, y))
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

print("Train:", X_train.shape, y_train.shape)
print("Test: ", X_test.shape, y_test.shape)

Train: (838860, 17) (838860,)
Test:  (209715, 17) (209715,)


In [9]:
from collections import Counter  # TODO

df = X_train.copy()
df['target'] = y_train

class_counts = Counter(df['target'])
print(f"Class distribution before balancing: {class_counts}")

# Find the number of samples in the minority class
min_class_count = min(class_counts.values())

# Undersample the majority class
df_balanced = df.groupby('target').apply(lambda x: x.sample(min_class_count)).reset_index(drop=True)

# Separate the balanced X_train and y_train
X_train = df_balanced.drop('target', axis=1)
y_train = df_balanced['target']

Class distribution before balancing: Counter({0: 824658, 1: 14202})


In [10]:
y_train_pos_ratio = y_train[y_train == 1].shape[0] / y_train.shape[0]
y_test_pos_ratio = y_test[y_test == 1].shape[0] / y_test.shape[0]

print("Train positives ratio:", f"{y_train_pos_ratio * 100:.5f} %")
print("Test positives ratio: ", f"{y_test_pos_ratio * 100:.5f} %")

Train positives ratio: 50.00000 %
Test positives ratio:  1.69325 %


In [11]:
train = pd.concat([X_train, y_train], axis=1)

new_train = train.drop_duplicates()
print("Duplicates removed:", train.shape[0] - new_train.shape[0])
train = new_train

new_train = train.drop_duplicates(subset=train.columns.difference(['target']))
print("Collisions removed:", train.shape[0] - new_train.shape[0])
train = new_train

X_train = train.drop(columns=['target'])
y_train = train['target']

Duplicates removed: 1138
Collisions removed: 1


In [12]:
from sklearn.compose import ColumnTransformer
from itertools import combinations
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

cat_encoder = OneHotEncoder().fit(X[cat_attrs])
cat_encoder.categories_  # TODO

[array([0, 1], dtype=int64),
 array([0, 1], dtype=int64),
 array([1, 2, 3, 4, 5, 6, 8], dtype=int64),
 array([0, 1, 2, 3, 4, 5, 6, 7], dtype=int64)]

In [13]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(categories=cat_encoder.categories_), cat_attrs)
    ],
    remainder=StandardScaler()
)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

In [14]:
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, HistGradientBoostingClassifier

classifiers = {
    'LogisticRegression': LogisticRegression(random_state=42),
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
    'SVC': SVC(random_state=42),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=42),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=42),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=42),
    'MLPClassifier': MLPClassifier(random_state=42),
    'HistGradientBoostingClassifier': HistGradientBoostingClassifier(random_state=42),
}
param_grid = [
    # {
    #     'classifier': [classifiers['RandomForestClassifier']],
    #     'classifier__n_estimators': [100],
    #     'classifier__max_depth': [None],
    # },
    # {
    #     'classifier': [classifiers['MLPClassifier']],
    #     'classifier__hidden_layer_sizes': [(512, ), (256, ), (256, 64), (64, 64, 64)],
    #     'classifier__batch_size': [32],
    #     'classifier__early_stopping': [True],
    # },
    # {
    #     'classifier': [classifiers['MLPClassifier']],
    #     'classifier__hidden_layer_sizes': [(64,), (64, 64), (128, 64), (64, 128), (128, 128), (64, 64, 64)],
    #     'classifier__learning_rate_init': [0.00001, 0.0001, 0.001],
    #     'classifier__alpha': [0.00001, 0.0001, 0.001],
    #     'classifier__early_stopping': [True],
    # },
    {
        'classifier': [classifiers['HistGradientBoostingClassifier']],
        'classifier__learning_rate': [0.001, 0.01, 0.1],
        'classifier__max_leaf_nodes': [None, 20, 50],
        'classifier__max_depth': [None, 10, 20],
        'classifier__min_samples_leaf': [None, 20, 50],
        'classifier__l2_regularization': [0.0, 0.1, 1.0],
        'classifier__early_stopping': [True],
        'classifier__class_weight': [None, 'balanced'],
    },
    # {
    #     'classifier': [classifiers['KNeighborsClassifier']],
    #     'classifier__n_neighbors': [3, 5, 7, 9],
    #     'classifier__weights': ['uniform', 'distance'],
    # },
]
# param_grid = [ 
#     {
#         'classifier': [classifiers['LogisticRegression']],
#         'classifier__C': [0.01, 0.1, 1, 10],
#         'classifier__solver': ['liblinear', 'lbfgs'],
#         'preprocessor__dynamic_column_transformer__cat_attrs': cat_attrs_combinations
#     },
#     {
#         'classifier': [classifiers['RandomForestClassifier']],
#         'classifier__n_estimators': [100, 200, 300],
#         'classifier__max_depth': [None, 30],
#         'preprocessor__dynamic_column_transformer__cat_attrs': cat_attrs_combinations
#     },
#     {
#         'classifier': [classifiers['SVC']],
#         'classifier__C': [0.1, 1, 10, 100],
#         'classifier__kernel': ['linear', 'rbf', 'poly'],
#         'preprocessor__dynamic_column_transformer__cat_attrs': cat_attrs_combinations
#     },
#     {
#         'classifier': [classifiers['KNeighborsClassifier']],
#         'classifier__n_neighbors': [3, 5, 7, 9],
#         'classifier__weights': ['uniform', 'distance'],
#         'preprocessor__dynamic_column_transformer__cat_attrs': cat_attrs_combinations
#     },
#     {
#         'classifier': [classifiers['DecisionTreeClassifier']],
#         'classifier__max_depth': [None, 10, 20, 30],
#         'classifier__min_samples_split': [2, 10, 20],
#         'preprocessor__dynamic_column_transformer__cat_attrs': cat_attrs_combinations
#     },
#     {
#         'classifier': [classifiers['GradientBoostingClassifier']],
#         'classifier__n_estimators': [50, 100, 200],
#         'classifier__learning_rate': [0.01, 0.1, 0.2],
#         'classifier__max_depth': [3, 5, 7],
#         'preprocessor__dynamic_column_transformer__cat_attrs': cat_attrs_combinations
#     },
#     {
#         'classifier': [classifiers['AdaBoostClassifier']],
#         'classifier__n_estimators': [50, 100, 200],
#         'classifier__learning_rate': [0.01, 0.1, 0.2],
#         'preprocessor__dynamic_column_transformer__cat_attrs': cat_attrs_combinations
#     },
#     {
#         'classifier': [classifiers['MLPClassifier']],
#         'classifier__hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50), (50, 100), (100, 100), (50, 100, 50)],
#         'classifier__learning_rate_init': [0.001, 0.01, 0.1],
#         'classifier__alpha': [0.0001, 0.001, 0.01],
#         'classifier__early_stopping': [True],
#         'preprocessor__dynamic_column_transformer__cat_attrs': cat_attrs_combinations
#     }
# ]

In [15]:
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(
    pipeline, param_grid,
    cv=5,
    scoring=['f1', 'precision', 'recall'],
    refit='f1',
    n_jobs=n_jobs,
    verbose=3
)
grid_search.fit(X_train, y_train);

Fitting 5 folds for each of 486 candidates, totalling 2430 fits


810 fits failed out of a total of 2430.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
810 fits failed with the following error:
Traceback (most recent call last):
  File "C:\GAMES and PROGRAMS\my programs\univ5.1_ML\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\GAMES and PROGRAMS\my programs\univ5.1_ML\.venv\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\GAMES and PROGRAMS\my programs\univ5.1_ML\.venv\lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\GAMES and PROGRAMS\my progra

In [16]:
grid_search.best_params_

{'classifier': HistGradientBoostingClassifier(random_state=42),
 'classifier__class_weight': None,
 'classifier__early_stopping': True,
 'classifier__l2_regularization': 0.0,
 'classifier__learning_rate': 0.1,
 'classifier__max_depth': None,
 'classifier__max_leaf_nodes': 50,
 'classifier__min_samples_leaf': 20}

In [17]:
grid_search.best_estimator_

In [18]:
grid_search.best_score_

0.9517033812280695

In [19]:
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results = cv_results.sort_values('rank_test_f1')
cv_results[[
    'rank_test_f1',
    'mean_test_f1',
    'param_classifier',
    # 'param_preprocessor__dynamic_column_transformer__cat_attrs',
    'params'
]].head(10)

Unnamed: 0,rank_test_f1,mean_test_f1,param_classifier,params
61,1,0.952,HistGradientBoostingClassifier(random_state=42),{'classifier': HistGradientBoostingClassifier(...
394,2,0.952,HistGradientBoostingClassifier(random_state=42),{'classifier': HistGradientBoostingClassifier(...
466,3,0.952,HistGradientBoostingClassifier(random_state=42),{'classifier': HistGradientBoostingClassifier(...
476,4,0.952,HistGradientBoostingClassifier(random_state=42),{'classifier': HistGradientBoostingClassifier(...
232,5,0.952,HistGradientBoostingClassifier(random_state=42),{'classifier': HistGradientBoostingClassifier(...
241,6,0.952,HistGradientBoostingClassifier(random_state=42),{'classifier': HistGradientBoostingClassifier(...
143,7,0.952,HistGradientBoostingClassifier(random_state=42),{'classifier': HistGradientBoostingClassifier(...
484,8,0.952,HistGradientBoostingClassifier(random_state=42),{'classifier': HistGradientBoostingClassifier(...
470,9,0.951,HistGradientBoostingClassifier(random_state=42),{'classifier': HistGradientBoostingClassifier(...
161,10,0.951,HistGradientBoostingClassifier(random_state=42),{'classifier': HistGradientBoostingClassifier(...


In [20]:
from sklearn.metrics import f1_score, precision_score, recall_score

y_pred = grid_search.best_estimator_.predict(X_test)
f1 = f1_score(y_test, y_pred)
p = precision_score(y_test, y_pred)
r = recall_score(y_test, y_pred)

print("F1:", f1)
print("Precision:", p)
print("Recall:", r)

F1: 0.48063354931605473
Precision: 0.32285520843408455
Recall: 0.9400168966488314


In [21]:
from joblib import dump

if not os.path.exists('out'):
    os.makedirs('out')
dump(grid_search, os.path.join('out', 'GridSearchCV_dump.pkl'), compress=True)

['out\\GridSearchCV_dump.pkl']