In [None]:
import os
import pandas as pd

In [None]:
# Notebook-wide settings read by the cells below.

# Feature columns that are one-hot encoded; all remaining features are scaled.
cat_attrs: list[str] = ['f12', 'f11']
# When True, the training split is undersampled to equal class sizes.
balance_data: bool = False
# When True, duplicated / colliding rows are dropped from the training split.
clear_data: bool = True
# Forwarded unchanged to GridSearchCV(n_jobs=..., verbose=...).
n_jobs: int = 1
verbose: int = 10

In [None]:
# Render every float in pandas output with exactly three decimals.
pd.options.display.float_format = '{:.3f}'.format

In [None]:
# Load the raw dataset; the path is relative to the current working directory.
csv_path = os.path.join('data', 'data.csv')
data = pd.read_csv(csv_path)

In [None]:
# Labels come from the 'target' column; every column except 'target' and 'p'
# is kept as a model feature.
y = data['target']
X = data.drop(['target', 'p'], axis=1)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map the raw target values to integer codes and wrap them in a Series named
# 'target' (fresh default index) so later cells can concat it with the features.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y = pd.Series(label_encoder.transform(y), name='target')

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Hold out a stratified share of the rows as the final test set.
test_size_ratio = 0.2
test_split = StratifiedShuffleSplit(
    n_splits=1,
    test_size=test_size_ratio,
    random_state=42,
)
# n_splits=1, so the splitter yields exactly one (train, test) pair of
# positional index arrays; unpack that single pair directly.
[(train_index, test_index)] = test_split.split(X, y)
X_train, y_train = X.iloc[train_index], y.iloc[train_index]
X_test, y_test = X.iloc[test_index], y.iloc[test_index]

print(f"Train: X={X_train.shape} y={y_train.shape}")
print(f"Test: X={X_test.shape} y={y_test.shape}")

In [None]:
from collections import Counter

if balance_data:
    # Undersample the training split: every class keeps as many rows as the
    # rarest class has, so all classes end up the same size.
    train = pd.concat([X_train, y_train], axis=1)
    train_n = train.shape[0]

    class_counts = Counter(train['target'])
    min_class = min(class_counts, key=class_counts.get)
    min_count = class_counts[min_class]

    # GroupBy.sample keeps the 'target' column in the result (groupby.apply on
    # the grouping column is deprecated since pandas 2.2), and the fixed seed
    # makes the undersampling reproducible like the rest of the notebook.
    new_train = (
        train.groupby('target')
        .sample(n=min_count, random_state=42)
        .reset_index(drop=True)
    )
    removed_n = train_n - new_train.shape[0]
    removed_ratio = removed_n / train_n

    # min_class is the least frequent label, i.e. the minority class.
    print(f"Minority class: '{min_class}'")
    print(f"Records removed: {removed_n} ({removed_ratio * 100:.3f} %)")
    X_train = new_train.drop(columns=['target'])
    y_train = new_train['target']
    print(f"\nTrain: X={X_train.shape} y={y_train.shape}")
else:
    print("balance_data set to False")

In [None]:
# Fraction of rows whose encoded label equals 1, for each split.
y_train_pos_ratio = int((y_train == 1).sum()) / len(y_train)
y_test_pos_ratio = int((y_test == 1).sum()) / len(y_test)

print("Train positives ratio:", f"{y_train_pos_ratio * 100:.5f} %")
print("Test positives ratio: ", f"{y_test_pos_ratio * 100:.5f} %")

In [None]:
if not clear_data:
    print("clear_data set to False")
else:
    # Keep features and label side by side so both are filtered together.
    train = pd.concat([X_train, y_train], axis=1)
    train_n = len(train)

    # Pass 1: rows that are identical in every column, label included.
    new_train = train.drop_duplicates()
    duplicates_n = len(train) - len(new_train)
    duplicates_ratio = duplicates_n / train_n
    print(f"Duplicates removed: {duplicates_n} ({duplicates_ratio * 100:.3f} %)")
    train = new_train

    # Pass 2: rows whose features repeat an earlier row. Exact copies are
    # already gone, so what is left differs only in the label; the first
    # occurrence of each feature combination is kept.
    feature_columns = train.columns.difference(['target'])
    new_train = train.drop_duplicates(subset=feature_columns)
    collisions_n = len(train) - len(new_train)
    collisions_ratio = collisions_n / train_n
    print(f"Collisions removed: {collisions_n} ({collisions_ratio * 100:.3f} %)")
    train = new_train

    # Both ratios are relative to the row count before any cleaning.
    print(f"Total removed:      {duplicates_n + collisions_n} "
          f"({(duplicates_ratio + collisions_ratio) * 100:.3f} %)")
    X_train = train.drop('target', axis=1)
    y_train = train['target']
    print(f"\nTrain: X={X_train.shape} y={y_train.shape}")

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Discover the category values of every categorical column up front; the
# result is handed to the pipeline's encoder as a fixed category list.
# NOTE(review): this is fitted on all of X (train and test rows) — presumably
# so every CV fold shares one encoding layout; confirm this is intended.
cat_encoder = OneHotEncoder()
cat_encoder.fit(X[cat_attrs])
cat_encoder.categories_  # TODO

In [None]:
# Categorical columns are one-hot encoded with the pre-computed category
# lists; every other column goes through StandardScaler.
preprocessor = ColumnTransformer(
    [('cat', OneHotEncoder(categories=cat_encoder.categories_), cat_attrs)],
    remainder=StandardScaler(),
)
# The 'classifier' step is a placeholder: the parameter grid replaces it with
# the estimator under evaluation.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier
from imblearn.ensemble import BalancedBaggingClassifier

# Candidate estimators, one instance per class. The lookup key of each entry
# is the estimator's class name, so the list must not contain the same class
# twice (a later instance would overwrite the earlier one).
_estimators = [
    LogisticRegression(random_state=42),
    RandomForestClassifier(random_state=42),
    SVC(random_state=42),
    KNeighborsClassifier(),
    DecisionTreeClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    MLPClassifier(random_state=42),
    HistGradientBoostingClassifier(random_state=42),
    XGBClassifier(scale_pos_weight=1.69, eval_metric='logloss'),
    BalancedBaggingClassifier(random_state=42),
]
classifiers = {type(estimator).__name__: estimator for estimator in _estimators}
# Search space for GridSearchCV: a list of grids, each of which swaps the
# pipeline's 'classifier' step for one estimator and lists that estimator's
# hyper-parameter values. Only the first entry (a single random-forest
# configuration) is active; the commented-out entries are disabled experiments.
# NOTE(review): two disabled entries list None for integer-only parameters
# (n_estimators of the random forest, min_samples_leaf of
# HistGradientBoostingClassifier); scikit-learn's parameter validation would
# reject those values, so fix them before re-enabling the entries.
param_grid = [
    {
        'classifier': [classifiers['RandomForestClassifier']],
        'classifier__n_estimators': [100],
        'classifier__max_depth': [None],
    },
    # {
    #     'classifier': [classifiers['MLPClassifier']],
    #     'classifier__hidden_layer_sizes': [(512, ), (256, ), (256, 64), (64, 64, 64)],
    #     'classifier__batch_size': [32],
    #     'classifier__early_stopping': [True],
    # },
    # {
    #     'classifier': [classifiers['MLPClassifier']],
    #     'classifier__hidden_layer_sizes': [(64,), (64, 64), (128, 64), (64, 128), (128, 128), (64, 64, 64)],
    #     'classifier__learning_rate_init': [0.00001, 0.0001, 0.001],
    #     'classifier__alpha': [0.00001, 0.0001, 0.001],
    #     'classifier__early_stopping': [True],
    # },
    # {
    #     'classifier': [classifiers['HistGradientBoostingClassifier']],
    #     'classifier__learning_rate': [0.001, 0.01, 0.1],
    #     'classifier__max_leaf_nodes': [None, 20, 50],
    #     'classifier__max_depth': [None, 10, 20],
    #     'classifier__min_samples_leaf': [None, 20, 50],
    #     'classifier__l2_regularization': [0.0, 0.1, 1.0],
    #     'classifier__early_stopping': [True],
    #     'classifier__class_weight': [None, 'balanced'],
    # },
    # {
    #     'classifier': [classifiers['KNeighborsClassifier']],
    #     'classifier__n_neighbors': [3, 5, 7, 9],
    #     'classifier__weights': ['uniform', 'distance'],
    # },
    
    # {
    #     'classifier': [classifiers['RandomForestClassifier']],
    #     'classifier__n_estimators': [None, 200],  # 100
    #     'classifier__max_depth': [None],
    #     'classifier__class_weight': [None, 'balanced'],
    # },
    # {
    #     # 'classifier': [classifiers['XGBClassifier']],
    #     # 'classifier__n_estimators': [50, 100, 150],           # Number of boosting rounds
    #     # 'classifier__max_depth': [3, 5, 7],                   # Maximum depth of the trees
    #     # 'classifier__learning_rate': [0.01, 0.1, 0.2],        # Learning rate for weight updates
    #     # 'classifier__colsample_bytree': [0.7, 0.8, 1.0],      # Fraction of columns to be randomly sampled for each tree
    #     # 'classifier__gamma': [0, 0.1, 0.2],                   # Minimum loss reduction required to make a further partition on a leaf node
    # },
    # {
    #     'classifier': [classifiers['BalancedBaggingClassifier']],
    #     'classifier__estimator': [DecisionTreeClassifier(random_state=42),],
    #     'classifier__bootstrap': [True, False],
    #     # 'classifier__max_features': [1.0,2.0,3.0],
    #     'classifier__n_estimators': [100, 200],
    #     'classifier__estimator__class_weight': [None, 'balanced'],
    #     'classifier__estimator__criterion': ['gini', 'entropy'],
    #     # 'classifier__estimator__max_depth':[1,2,3,4,5,6],
    #     # 'classifier__estimator__max_features':[1, 'auto'],
    #     'classifier__estimator__min_weight_fraction_leaf': [0.001, 0.005, 0.01],
    #     # 'classifier__estimator__splitter': ['best','random'],
    # }
]
# param_grid = [ 
#     {
#         'classifier': [classifiers['LogisticRegression']],
#         'classifier__C': [0.01, 0.1, 1, 10],
#         'classifier__solver': ['liblinear', 'lbfgs'],
#         'preprocessor__dynamic_column_transformer__cat_attrs': cat_attrs_combinations
#     },
#     {
#         'classifier': [classifiers['RandomForestClassifier']],
#         'classifier__n_estimators': [100, 200, 300],
#         'classifier__max_depth': [None, 30],
#         'preprocessor__dynamic_column_transformer__cat_attrs': cat_attrs_combinations
#     },
#     {
#         'classifier': [classifiers['SVC']],
#         'classifier__C': [0.1, 1, 10, 100],
#         'classifier__kernel': ['linear', 'rbf', 'poly'],
#         'preprocessor__dynamic_column_transformer__cat_attrs': cat_attrs_combinations
#     },
#     {
#         'classifier': [classifiers['KNeighborsClassifier']],
#         'classifier__n_neighbors': [3, 5, 7, 9],
#         'classifier__weights': ['uniform', 'distance'],
#         'preprocessor__dynamic_column_transformer__cat_attrs': cat_attrs_combinations
#     },
#     {
#         'classifier': [classifiers['DecisionTreeClassifier']],
#         'classifier__max_depth': [None, 10, 20, 30],
#         'classifier__min_samples_split': [2, 10, 20],
#         'preprocessor__dynamic_column_transformer__cat_attrs': cat_attrs_combinations
#     },
#     {
#         'classifier': [classifiers['GradientBoostingClassifier']],
#         'classifier__n_estimators': [50, 100, 200],
#         'classifier__learning_rate': [0.01, 0.1, 0.2],
#         'classifier__max_depth': [3, 5, 7],
#         'preprocessor__dynamic_column_transformer__cat_attrs': cat_attrs_combinations
#     },
#     {
#         'classifier': [classifiers['AdaBoostClassifier']],
#         'classifier__n_estimators': [50, 100, 200],
#         'classifier__learning_rate': [0.01, 0.1, 0.2],
#         'preprocessor__dynamic_column_transformer__cat_attrs': cat_attrs_combinations
#     },
#     {
#         'classifier': [classifiers['MLPClassifier']],
#         'classifier__hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50), (50, 100), (100, 100), (50, 100, 50)],
#         'classifier__learning_rate_init': [0.001, 0.01, 0.1],
#         'classifier__alpha': [0.0001, 0.001, 0.01],
#         'classifier__early_stopping': [True],
#         'preprocessor__dynamic_column_transformer__cat_attrs': cat_attrs_combinations
#     }
# ]

In [None]:
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validated search over param_grid. F1, precision and recall are
# all recorded; F1 picks the winner, which is then refitted on the full
# training split.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=['f1', 'precision', 'recall'],
    refit='f1',
    cv=5,
    n_jobs=n_jobs,
    verbose=verbose,
)
grid_search.fit(X_train, y_train);

In [None]:
# Parameter combination that ranked first on the refit metric ('f1').
grid_search.best_params_

In [None]:
# Pipeline refitted with the best parameter combination.
grid_search.best_estimator_

In [None]:
# Mean cross-validated score of the best combination on the refit metric ('f1').
grid_search.best_score_

In [None]:
# Tabulate the search results, best F1 rank first, and show the top ten rows.
cv_results = pd.DataFrame(grid_search.cv_results_).sort_values('rank_test_f1')
cv_results.loc[
    :,
    ['rank_test_f1', 'mean_test_f1', 'param_classifier', 'params'],
].head(10)

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Score the refitted best pipeline on the held-out test split.
y_pred = grid_search.best_estimator_.predict(X_test)
f1, p, r = (
    metric(y_test, y_pred)
    for metric in (f1_score, precision_score, recall_score)
)

print("F1:", f1)
print("Precision:", p)
print("Recall:", r)

In [None]:
from joblib import dump

# Persist the fitted search object (all CV results plus the refitted pipeline).
# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs('out', exist_ok=True)
# NOTE: joblib dumps are pickle-based — only load this file from a trusted source.
dump(grid_search, os.path.join('out', 'GridSearchCV_dump.pkl'), compress=True)