In [1]:
import os
import pandas as pd

In [2]:
cat_attrs: str = "f12 f11"
balance_data: bool = False
clear_data: bool = True
n_jobs: int = 1
verbose: int = 10

In [3]:
# Parameters
n_jobs = 8


In [4]:
cat_attrs: list[str] = cat_attrs.split()
cat_attrs

['f12', 'f11']

In [5]:
pd.set_option('display.float_format', '{:.3f}'.format)

In [6]:
csv_path = os.path.join('data', 'data.csv')
data = pd.read_csv(csv_path)

In [7]:
X = data.drop(columns=['target', 'p', 'f16'])
y = data['target']

In [8]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
y = pd.Series(y, name='target')

In [9]:
from sklearn.model_selection import train_test_split

test_size_ratio = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=test_size_ratio, 
    stratify=y, 
    random_state=42
)

print(f"Train: X={X_train.shape} y={y_train.shape}")
print(f"Test: X={X_test.shape} y={y_test.shape}")

Train: X=(838860, 16) y=(838860,)
Test: X=(209715, 16) y=(209715,)


In [10]:
from collections import Counter

if balance_data:
    train = pd.concat([X_train, y_train], axis=1)
    train_n = train.shape[0]
    
    class_counts = Counter(train['target'])
    min_class = min(class_counts, key=lambda x: class_counts[x])
    
    new_train = train.groupby('target').apply(
        lambda x: x.sample(class_counts[min_class])
    ).reset_index(drop=True)
    removed_n = train.shape[0] - new_train.shape[0]
    removed_ratio = removed_n / train_n
    
    print(f"Majority class: '{min_class}'")
    print(f"Records removed: {removed_n} ({removed_ratio * 100:.3f} %)")
    X_train = new_train.drop(columns=['target'])
    y_train = new_train['target']
    print(f"\nTrain: X={X_train.shape} y={y_train.shape}")
else:
    print("balance_data set to False")

balance_data set to False


In [11]:
y_train_pos_ratio = y_train[y_train == 1].shape[0] / y_train.shape[0]
y_test_pos_ratio = y_test[y_test == 1].shape[0] / y_test.shape[0]

print("Train positives ratio:", f"{y_train_pos_ratio * 100:.5f} %")
print("Test positives ratio: ", f"{y_test_pos_ratio * 100:.5f} %")

Train positives ratio: 1.69301 %
Test positives ratio:  1.69325 %


In [12]:
if clear_data:
    train = pd.concat([X_train, y_train], axis=1)
    train_n = train.shape[0]
    
    new_train = train.drop_duplicates()
    duplicates_n = train.shape[0] - new_train.shape[0]
    duplicates_ratio = duplicates_n / train_n
    print(f"Duplicates removed: {duplicates_n} ({duplicates_ratio * 100:.3f} %)")
    train = new_train
    
    new_train = train.drop_duplicates(subset=train.columns.difference(['target']))
    collisions_n = train.shape[0] - new_train.shape[0]
    collisions_ratio = collisions_n / train_n
    print(f"Collisions removed: {collisions_n} ({collisions_ratio * 100:.3f} %)")
    train = new_train
    
    print(f"Total removed:      {duplicates_n + collisions_n} "
          f"({(duplicates_ratio + collisions_ratio) * 100:.3f} %)")
    X_train = train.drop(columns=['target'])
    y_train = train['target']
    print(f"\nTrain: X={X_train.shape} y={y_train.shape}")
else:
    print("clear_data set to False")

Duplicates removed: 61923 (7.382 %)


Collisions removed: 56 (0.007 %)
Total removed:      61979 (7.388 %)

Train: X=(776881, 16) y=(776881,)


In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

cat_encoder = OneHotEncoder().fit(X[cat_attrs])
cat_encoder.categories_  # TODO

[array([0, 1], dtype=int64), array([0, 1], dtype=int64)]

In [14]:
from imblearn.over_sampling import SMOTE, SMOTENC
from imblearn.pipeline import Pipeline as ImPipeline

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', 'passthrough', cat_attrs)
    ],
    remainder=StandardScaler(),
    verbose_feature_names_out=True
)
preprocessor = preprocessor.set_output(transform='pandas')

one_hot = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), list(map(lambda attr: 'cat__' + attr, cat_attrs)))
    ],
    remainder='passthrough',
    verbose_feature_names_out=True
)
# one_hot = one_hot.set_output(transform='pandas')

pipeline = ImPipeline(steps=[
    ('preprocessor', preprocessor),
    ('sampling', SMOTENC(categorical_features=list(map(lambda attr: 'cat__' + attr, cat_attrs)), random_state=42)),
    ('one_hot', one_hot),
    ('classifier', LogisticRegression())
])

In [15]:
# z_x, z_y = pipeline.fit_resample(X_train, y_train)
# z_x, z_y

In [16]:
# (z_y == 0).sum(), (z_y == 1).sum()

In [17]:
from sklearn.utils import class_weight

class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=y_train.unique(), y=y_train)
class_weights = dict(enumerate(class_weights))
class_weights

{0: 0.5087769277216532, 1: 28.98377107894344}

In [18]:
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from imblearn.over_sampling import RandomOverSampler

param_grid = [
    {
        'sampling': [
            SMOTENC(categorical_features=list(map(lambda attr: 'cat__' + attr, cat_attrs)), random_state=42),
            # SMOTE(random_state=42),
            'passthrough'
        ],

        'classifier': [RandomForestClassifier()],
        'classifier__n_estimators': [100],  # 100
        'classifier__max_depth': [None, 20],  # None
        'classifier__min_samples_split': [2, 4],  # 2
        # 'classifier__min_samples_leaf': [1, 2],  # 1
        # 'classifier__max_features': ['sqrt', None],  # 'sqrt'
        # 'classifier__max_leaf_nodes': [None, 20],  # None
        # 'classifier__bootstrap': [True, False],  # True
        'classifier__criterion': ['gini'],  # 'gini'
        'classifier__class_weight': [None, class_weights],  # None
    },
    # {
    #     'sampling': [
    #         SMOTENC(categorical_features=list(map(lambda attr: 'cat__' + attr, cat_attrs)), random_state=42),
    #         # SMOTE(random_state=42),
    #         'passthrough'
    #     ],
    # 
    #     'classifier': [RandomForestClassifier()],
    #     'classifier__n_estimators': [100, 200],  # 100
    #     'classifier__max_depth': [None, 20],  # None
    #     'classifier__min_samples_split': [2, 4],  # 2
    #     'classifier__min_samples_leaf': [1, 2],  # 1
    #     'classifier__max_features': ['sqrt', None, 'log2'],  # 'sqrt'
    #     'classifier__max_leaf_nodes': [None, 10, 50],  # None
    #     'classifier__bootstrap': [True, False],  # True
    #     'classifier__criterion': ['gini', 'log_loss'],  # 'gini'
    #     'classifier__class_weight': [None, class_weights],  # None
    # },
]

In [19]:
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit

grid_search = GridSearchCV(
    pipeline, param_grid,
    cv=StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42),
    scoring=['f1', 'precision', 'recall'],
    refit='f1',
    return_train_score=True,
    n_jobs=n_jobs,
    verbose=verbose
)

grid_search.fit(X_train, y_train);

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [20]:
grid_search.best_params_

{'classifier': RandomForestClassifier(),
 'classifier__class_weight': None,
 'classifier__criterion': 'gini',
 'classifier__max_depth': 20,
 'classifier__min_samples_split': 2,
 'classifier__n_estimators': 100,
 'sampling': 'passthrough'}

In [21]:
grid_search.best_estimator_

In [22]:
grid_search.best_score_

0.7843903660807936

In [23]:
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results = cv_results.sort_values('rank_test_f1')
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier,param_classifier__class_weight,param_classifier__criterion,param_classifier__max_depth,param_classifier__min_samples_split,param_classifier__n_estimators,...,mean_test_recall,std_test_recall,rank_test_recall,split0_train_recall,split1_train_recall,split2_train_recall,split3_train_recall,split4_train_recall,mean_train_recall,std_train_recall
5,84.18,1.119,1.271,0.055,RandomForestClassifier(),,gini,20.0,2,100,...,0.746,0.004,12,0.915,0.911,0.911,0.909,0.917,0.913,0.003
7,85.695,1.116,1.273,0.068,RandomForestClassifier(),,gini,20.0,4,100,...,0.743,0.004,15,0.859,0.86,0.858,0.859,0.861,0.859,0.001
11,82.658,1.456,1.342,0.041,RandomForestClassifier(),"{0: 0.5087769277216532, 1: 28.98377107894344}",gini,,4,100,...,0.778,0.005,11,1.0,0.999,1.0,1.0,1.0,1.0,0.0
3,88.067,1.063,1.417,0.08,RandomForestClassifier(),,gini,,4,100,...,0.744,0.005,13,0.942,0.943,0.943,0.941,0.945,0.943,0.001
9,82.863,1.432,1.371,0.027,RandomForestClassifier(),"{0: 0.5087769277216532, 1: 28.98377107894344}",gini,,2,100,...,0.74,0.003,16,1.0,0.999,1.0,0.999,1.0,1.0,0.0
1,92.723,5.608,1.408,0.087,RandomForestClassifier(),,gini,,2,100,...,0.744,0.003,14,1.0,1.0,1.0,1.0,1.0,1.0,0.0
13,88.254,2.962,1.349,0.034,RandomForestClassifier(),"{0: 0.5087769277216532, 1: 28.98377107894344}",gini,20.0,2,100,...,0.841,0.006,4,0.977,0.976,0.977,0.974,0.978,0.976,0.001
15,86.927,1.799,1.241,0.036,RandomForestClassifier(),"{0: 0.5087769277216532, 1: 28.98377107894344}",gini,20.0,4,100,...,0.849,0.005,3,0.976,0.975,0.977,0.974,0.978,0.976,0.001
2,410.952,6.806,1.748,0.017,RandomForestClassifier(),,gini,,4,100,...,0.788,0.003,8,0.973,0.974,0.972,0.972,0.973,0.973,0.001
4,377.34,2.041,1.429,0.059,RandomForestClassifier(),,gini,20.0,2,100,...,0.84,0.006,5,0.955,0.954,0.956,0.953,0.957,0.955,0.002


In [24]:
from sklearn.metrics import f1_score, precision_score, recall_score

y_pred = grid_search.best_estimator_.predict(X_test)
f1 = f1_score(y_test, y_pred)
p = precision_score(y_test, y_pred)
r = recall_score(y_test, y_pred)

print("F1:", f1)
print("Precision:", p)
print("Recall:", r)

F1: 0.8101679797829642
Precision: 0.8579974811083123
Recall: 0.7673894677555618


In [25]:
from joblib import dump

if not os.path.exists('out'):
    os.makedirs('out')
dump(grid_search, os.path.join('out', 'GridSearchCV_dump.pkl'), compress=True)

['out\\GridSearchCV_dump.pkl']