In [52]:
import os
import pandas as pd

In [53]:
# Columns that are one-hot encoded by the preprocessing step.  TODO: confirm the selection.
cat_attrs: list[str] = [
    'f12',
    'f11',
    'f5',
    'f9',
]
# Default number of parallel workers handed to GridSearchCV; the "Parameters"
# cell may override it.
n_jobs: int = 1

In [54]:
# Parameters
# Run-time override of the default worker count used by the grid search.
n_jobs: int = 8


In [55]:
# Render floats with three decimals in DataFrame/Series output.
pd.options.display.float_format = '{:.3f}'.format

In [56]:
# Path of the raw dataset, relative to the working directory.
csv_path = os.path.join('data', 'data.csv')
# Read the whole table into memory; features and target are separated afterwards.
data = pd.read_csv(filepath_or_buffer=csv_path)

In [57]:
# Features are every column except the label ('target') and 'p'.
# NOTE(review): the reason 'p' is excluded is not visible here -- confirm.
X = data.drop(['target', 'p'], axis=1)
# Raw (not yet encoded) label column.
y = data.loc[:, 'target']

In [58]:
from sklearn.preprocessing import LabelEncoder

# Encode the raw target labels as consecutive integers; the fitted encoder is kept
# so the original labels can be recovered via label_encoder.classes_.
label_encoder = LabelEncoder()
# Preserve the original row index on the encoded target.  The duplicate-removal
# cell joins X_train and y_train with pd.concat(axis=1), which aligns on index
# labels (not on position), so X and y must keep sharing the same index.
y = pd.Series(label_encoder.fit_transform(y), index=y.index, name='target')

In [59]:
from sklearn.model_selection import StratifiedShuffleSplit

# Single stratified hold-out split: 20 % of the rows go to the test set and the
# class proportions of y are preserved in both parts.
test_size_ratio = 0.2
test_split = StratifiedShuffleSplit(
    n_splits=1,
    test_size=test_size_ratio,
    random_state=42,
)
# n_splits=1, so the splitter yields exactly one pair of positional index arrays.
[(train_index, test_index)] = test_split.split(X, y)

X_train = X.iloc[train_index]
X_test = X.iloc[test_index]
y_train = y.iloc[train_index]
y_test = y.iloc[test_index]

for label, features, target in (("Train:", X_train, y_train), ("Test: ", X_test, y_test)):
    print(label, features.shape, target.shape)

Train: (838860, 17) (838860,)
Test:  (209715, 17) (209715,)


In [60]:
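# TODO: disabled experiment -- randomly undersample every class down to the minority-class size.
# If this is re-enabled, pass random_state to .sample() so the result is reproducible.
# from collections import Counter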
# 
# df = X_train.copy()
# df['target'] = y_train
# 
# class_counts = Counter(df['target'])
# print(f"Class distribution before balancing: {class_counts}")
# 
# # Find the number of samples in the minority class
# min_class_count = min(class_counts.values())
# 
# # Undersample the majority class
# df_balanced = df.groupby('target').apply(lambda x: x.sample(min_class_count)).reset_index(drop=True)
# 
# # Separate the balanced X_train and y_train
# X_train = df_balanced.drop('target', axis=1)
# y_train = df_balanced['target']

In [61]:
# Fraction of rows labelled 1 in each split; shown as a sanity check that the
# stratified split kept the class balance the same in train and test.
y_train_pos_ratio = int((y_train == 1).sum()) / len(y_train)
y_test_pos_ratio = int((y_test == 1).sum()) / len(y_test)

print("Train positives ratio:", f"{y_train_pos_ratio * 100:.5f} %")
print("Test positives ratio: ", f"{y_test_pos_ratio * 100:.5f} %")

Train positives ratio: 1.69301 %
Test positives ratio:  1.69325 %


In [62]:
# Work on features and label side by side so duplicates can be judged on both.
train = pd.concat([X_train, y_train], axis=1)

# Pass 1: exact duplicates -- identical features AND identical label.
rows_before = len(train)
train = train.drop_duplicates()
print("Duplicates removed:", rows_before - len(train))

# Pass 2: "collisions" -- rows that share all feature values; after pass 1 these can
# only differ in their label.  The first occurrence is kept.
feature_columns = train.columns.difference(['target'])
rows_before = len(train)
train = train.drop_duplicates(subset=feature_columns)
print("Collisions removed:", rows_before - len(train))

# Split the cleaned frame back into features and label.
# NOTE(review): only the training split is de-duplicated here; whether rows in the
# test split duplicate training rows is not checked in this cell -- confirm.
X_train = train.drop(columns=['target'])
y_train = train['target']

Duplicates removed: 61923
Collisions removed: 56


In [63]:
from sklearn.compose import ColumnTransformer
from itertools import combinations
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from imblearn.pipeline import Pipeline as ImPipeline

# Learn the category list of every categorical column from the full feature table
# (train and test rows alike); the lists are reused to pin the encoder's categories
# inside the pipeline.
cat_encoder = OneHotEncoder()
cat_encoder.fit(X[cat_attrs])
cat_encoder.categories_  # TODO

[array([0, 1], dtype=int64),
 array([0, 1], dtype=int64),
 array([1, 2, 3, 4, 5, 6, 8], dtype=int64),
 array([0, 1, 2, 3, 4, 5, 6, 7], dtype=int64)]

In [64]:
from imblearn.over_sampling import SMOTE

# Column-wise preprocessing: one-hot encode the categorical attributes with the
# category lists taken from cat_encoder, and standardise every remaining column.
# NOTE(review): SMOTE below operates on the already one-hot-encoded columns --
# confirm that is intended (imblearn also provides SMOTENC for categorical data).
categorical_step = ('cat', OneHotEncoder(categories=cat_encoder.categories_), cat_attrs)
preprocessor = ColumnTransformer(
    transformers=[categorical_step],
    remainder=StandardScaler(),
)

# LogisticRegression is only a placeholder here: the grid search swaps the
# 'classifier' step through the 'classifier' entry of its param_grid.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', LogisticRegression()),
]
pipeline = ImPipeline(steps=pipeline_steps)

In [65]:
from sklearn.linear_model import SGDOneClassSVM
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier

# Candidate estimators, keyed by class name.  Each one can be plugged into the
# pipeline's 'classifier' step from a param_grid entry.  Every estimator that
# accepts a seed is created with random_state=42 so runs are repeatable.
classifiers = {
    'LogisticRegression': LogisticRegression(random_state=42),
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
    'SVC': SVC(random_state=42),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=42),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=42),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=42),
    'MLPClassifier': MLPClassifier(random_state=42),
    'HistGradientBoostingClassifier': HistGradientBoostingClassifier(random_state=42),
    # random_state added for consistency with the other seeded estimators, so results
    # stay reproducible if row/column subsampling (e.g. colsample_bytree below) is enabled.
    # NOTE(review): scale_pos_weight=1.69 coincides with the train positive rate in
    # percent (1.693 %) printed earlier.  XGBoost documents this parameter as the weight
    # of the positive class, typically sum(negative) / sum(positive), and the pipeline
    # already applies SMOTE before the classifier -- confirm the value is intentional.
    'XGBClassifier': XGBClassifier(scale_pos_weight=1.69, eval_metric='logloss', random_state=42),
}

# Search space for GridSearchCV: a list of grids, each selecting one estimator for
# the 'classifier' step plus its hyper-parameters.  Only the XGBClassifier grid is
# active; the commented-out grids are earlier experiments kept for reference.
param_grid = [
    # {
    #     'classifier': [classifiers['RandomForestClassifier']],
    #     'classifier__n_estimators': [100],
    #     'classifier__max_depth': [None],
    # },
    # {
    #     'classifier': [classifiers['MLPClassifier']],
    #     'classifier__hidden_layer_sizes': [(512, ), (256, ), (256, 64), (64, 64, 64)],
    #     'classifier__batch_size': [32],
    #     'classifier__early_stopping': [True],
    # },
    # {
    #     'classifier': [classifiers['MLPClassifier']],
    #     'classifier__hidden_layer_sizes': [(64,), (64, 64), (128, 64), (64, 128), (128, 128), (64, 64, 64)],
    #     'classifier__learning_rate_init': [0.00001, 0.0001, 0.001],
    #     'classifier__alpha': [0.00001, 0.0001, 0.001],
    #     'classifier__early_stopping': [True],
    # },
    # {
    #     'classifier': [classifiers['HistGradientBoostingClassifier']],
    #     'classifier__learning_rate': [0.001, 0.01, 0.1],
    #     'classifier__max_leaf_nodes': [None, 20, 50],
    #     'classifier__max_depth': [None, 10, 20],
    #     'classifier__min_samples_leaf': [None, 20, 50],
    #     'classifier__l2_regularization': [0.0, 0.1, 1.0],
    #     'classifier__early_stopping': [True],
    #     'classifier__class_weight': [None, 'balanced'],
    # },
    # {
    #     'classifier': [classifiers['KNeighborsClassifier']],
    #     'classifier__n_neighbors': [3, 5, 7, 9],
    #     'classifier__weights': ['uniform', 'distance'],
    # },
    
    # {
    #     'classifier': [classifiers['RandomForestClassifier']],
    #     'classifier__n_estimators': [None, 200],  # 100
    #     'classifier__max_depth': [None],
    #     'classifier__class_weight': [None, 'balanced'],
    # },
    {
        'classifier': [classifiers['XGBClassifier']],
        'classifier__n_estimators': [150],           # Number of boosting rounds
        'classifier__max_depth': [7],                   # Maximum depth of the trees
        # 'classifier__learning_rate': [0.01, 0.1, 0.2],        # Learning rate for weight updates
        # 'classifier__colsample_bytree': [0.7, 0.8, 1.0],      # Fraction of columns to be randomly sampled for each tree
        # 'classifier__gamma': [0, 0.1, 0.2],                   # Minimum loss reduction required to make a further partition on a leaf node
    }
]
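# NOTE: legacy grid kept for reference only.  It refers to `cat_attrs_combinations` and a
# `dynamic_column_transformer` preprocessing step, neither of which is defined in the visible cells.
# param_grid = [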
#     {
#         'classifier': [classifiers['LogisticRegression']],
#         'classifier__C': [0.01, 0.1, 1, 10],
#         'classifier__solver': ['liblinear', 'lbfgs'],
#         'preprocessor__dynamic_column_transformer__cat_attrs': cat_attrs_combinations
#     },
#     {
#         'classifier': [classifiers['RandomForestClassifier']],
#         'classifier__n_estimators': [100, 200, 300],
#         'classifier__max_depth': [None, 30],
#         'preprocessor__dynamic_column_transformer__cat_attrs': cat_attrs_combinations
#     },
#     {
#         'classifier': [classifiers['SVC']],
#         'classifier__C': [0.1, 1, 10, 100],
#         'classifier__kernel': ['linear', 'rbf', 'poly'],
#         'preprocessor__dynamic_column_transformer__cat_attrs': cat_attrs_combinations
#     },
#     {
#         'classifier': [classifiers['KNeighborsClassifier']],
#         'classifier__n_neighbors': [3, 5, 7, 9],
#         'classifier__weights': ['uniform', 'distance'],
#         'preprocessor__dynamic_column_transformer__cat_attrs': cat_attrs_combinations
#     },
#     {
#         'classifier': [classifiers['DecisionTreeClassifier']],
#         'classifier__max_depth': [None, 10, 20, 30],
#         'classifier__min_samples_split': [2, 10, 20],
#         'preprocessor__dynamic_column_transformer__cat_attrs': cat_attrs_combinations
#     },
#     {
#         'classifier': [classifiers['GradientBoostingClassifier']],
#         'classifier__n_estimators': [50, 100, 200],
#         'classifier__learning_rate': [0.01, 0.1, 0.2],
#         'classifier__max_depth': [3, 5, 7],
#         'preprocessor__dynamic_column_transformer__cat_attrs': cat_attrs_combinations
#     },
#     {
#         'classifier': [classifiers['AdaBoostClassifier']],
#         'classifier__n_estimators': [50, 100, 200],
#         'classifier__learning_rate': [0.01, 0.1, 0.2],
#         'preprocessor__dynamic_column_transformer__cat_attrs': cat_attrs_combinations
#     },
#     {
#         'classifier': [classifiers['MLPClassifier']],
#         'classifier__hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50), (50, 100), (100, 100), (50, 100, 50)],
#         'classifier__learning_rate_init': [0.001, 0.01, 0.1],
#         'classifier__alpha': [0.0001, 0.001, 0.01],
#         'classifier__early_stopping': [True],
#         'preprocessor__dynamic_column_transformer__cat_attrs': cat_attrs_combinations
#     }
# ]

In [66]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over param_grid with 5-fold cross-validation.  Three metrics
# are recorded per candidate; F1 decides the winner and the refitted model.
scoring_metrics = ['f1', 'precision', 'recall']
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=scoring_metrics,
    refit='f1',
    cv=5,
    n_jobs=n_jobs,
    verbose=10,
    return_train_score=True,
)
# Trailing semicolon suppresses the estimator repr in the cell output.
grid_search.fit(X_train, y_train);

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [67]:
# Parameter combination that ranked first on the refit metric ('f1').
grid_search.best_params_

{'classifier': XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric='logloss',
               feature_types=None, gamma=None, grow_policy=None,
               importance_type=None, interaction_constraints=None,
               learning_rate=None, max_bin=None, max_cat_threshold=None,
               max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
               max_leaves=None, min_child_weight=None, missing=nan,
               monotone_constraints=None, multi_strategy=None, n_estimators=None,
               n_jobs=None, num_parallel_tree=None, random_state=None, ...),
 'classifier__max_depth': 7,
 'classifier__n_estimators': 150}

In [68]:
# Pipeline refitted by GridSearchCV (refit='f1') with the best parameter combination.
grid_search.best_estimator_

In [69]:
# Mean cross-validated F1 of the best candidate (same value as its mean_test_f1).
grid_search.best_score_

0.7205547816668724

In [70]:
# Tabulate the per-candidate CV results, best F1 rank first, and show the
# ten best candidates with their parameters.
summary_columns = [
    'rank_test_f1',
    'mean_test_f1',
    'param_classifier',
    # 'param_preprocessor__dynamic_column_transformer__cat_attrs',
    'params',
]
cv_results = pd.DataFrame(grid_search.cv_results_).sort_values('rank_test_f1')
cv_results[summary_columns].head(10)

Unnamed: 0,rank_test_f1,mean_test_f1,param_classifier,params
0,1,0.721,"XGBClassifier(base_score=None, booster=None, c...","{'classifier': XGBClassifier(base_score=None, ..."


In [71]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Score the refitted best pipeline once on the held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# F1, precision and recall of the positive class, in that order.
f1, p, r = (
    metric(y_test, y_pred)
    for metric in (f1_score, precision_score, recall_score)
)

for label, value in (("F1:", f1), ("Precision:", p), ("Recall:", r)):
    print(label, value)

F1: 0.7269887046383081
Precision: 0.6340389855376232
Recall: 0.8518727119121374


In [72]:
from joblib import dump

# Persist the fitted GridSearchCV object (compressed) under ./out.
# exist_ok=True creates the directory only when it is missing and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs('out', exist_ok=True)
dump(grid_search, os.path.join('out', 'GridSearchCV_dump.pkl'), compress=True)

['out\\GridSearchCV_dump.pkl']