In [1]:
import os
import pandas as pd

In [2]:
cat_attrs: str = "f12 f11"
balance_data: bool = False
clear_data: bool = True
n_jobs: int = 1
verbose: int = 10

In [3]:
# Parameters
n_jobs = 8


In [4]:
cat_attrs: list[str] = cat_attrs.split()
cat_attrs

['f12', 'f11']

In [5]:
pd.set_option('display.float_format', '{:.3f}'.format)

In [6]:
csv_path = os.path.join('data', 'data.csv')
data = pd.read_csv(csv_path)

In [7]:
X = data.drop(columns=['target', 'p', 'f16'])
y = data['target']

In [8]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
y = pd.Series(y, name='target')

In [9]:
from sklearn.model_selection import train_test_split

test_size_ratio = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=test_size_ratio, 
    stratify=y, 
    random_state=42
)

print(f"Train: X={X_train.shape} y={y_train.shape}")
print(f"Test: X={X_test.shape} y={y_test.shape}")

Train: X=(838860, 16) y=(838860,)
Test: X=(209715, 16) y=(209715,)


In [10]:
from collections import Counter

if balance_data:
    train = pd.concat([X_train, y_train], axis=1)
    train_n = train.shape[0]
    
    class_counts = Counter(train['target'])
    min_class = min(class_counts, key=lambda x: class_counts[x])
    
    new_train = train.groupby('target').apply(
        lambda x: x.sample(class_counts[min_class])
    ).reset_index(drop=True)
    removed_n = train.shape[0] - new_train.shape[0]
    removed_ratio = removed_n / train_n
    
    print(f"Majority class: '{min_class}'")
    print(f"Records removed: {removed_n} ({removed_ratio * 100:.3f} %)")
    X_train = new_train.drop(columns=['target'])
    y_train = new_train['target']
    print(f"\nTrain: X={X_train.shape} y={y_train.shape}")
else:
    print("balance_data set to False")

balance_data set to False


In [11]:
y_train_pos_ratio = y_train[y_train == 1].shape[0] / y_train.shape[0]
y_test_pos_ratio = y_test[y_test == 1].shape[0] / y_test.shape[0]

print("Train positives ratio:", f"{y_train_pos_ratio * 100:.5f} %")
print("Test positives ratio: ", f"{y_test_pos_ratio * 100:.5f} %")

Train positives ratio: 1.69301 %
Test positives ratio:  1.69325 %


In [12]:
if clear_data:
    train = pd.concat([X_train, y_train], axis=1)
    train_n = train.shape[0]
    
    new_train = train.drop_duplicates()
    duplicates_n = train.shape[0] - new_train.shape[0]
    duplicates_ratio = duplicates_n / train_n
    print(f"Duplicates removed: {duplicates_n} ({duplicates_ratio * 100:.3f} %)")
    train = new_train
    
    new_train = train.drop_duplicates(subset=train.columns.difference(['target']))
    collisions_n = train.shape[0] - new_train.shape[0]
    collisions_ratio = collisions_n / train_n
    print(f"Collisions removed: {collisions_n} ({collisions_ratio * 100:.3f} %)")
    train = new_train
    
    print(f"Total removed:      {duplicates_n + collisions_n} "
          f"({(duplicates_ratio + collisions_ratio) * 100:.3f} %)")
    X_train = train.drop(columns=['target'])
    y_train = train['target']
    print(f"\nTrain: X={X_train.shape} y={y_train.shape}")
else:
    print("clear_data set to False")

Duplicates removed: 61923 (7.382 %)


Collisions removed: 56 (0.007 %)
Total removed:      61979 (7.388 %)

Train: X=(776881, 16) y=(776881,)


In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

cat_encoder = OneHotEncoder().fit(X[cat_attrs])
cat_encoder.categories_  # TODO

[array([0, 1], dtype=int64), array([0, 1], dtype=int64)]

In [14]:
from imblearn.over_sampling import SMOTE, SMOTENC
from imblearn.pipeline import Pipeline as ImPipeline

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', 'passthrough', cat_attrs)
    ],
    remainder=StandardScaler(),
    verbose_feature_names_out=True
)
preprocessor = preprocessor.set_output(transform='pandas')

one_hot = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), list(map(lambda attr: 'cat__' + attr, cat_attrs)))
    ],
    remainder='passthrough',
    verbose_feature_names_out=True
)
# one_hot = one_hot.set_output(transform='pandas')

pipeline = ImPipeline(steps=[
    ('preprocessor', preprocessor),
    ('sampling', SMOTENC(categorical_features=list(map(lambda attr: 'cat__' + attr, cat_attrs)), random_state=42)),
    ('one_hot', one_hot),
    ('classifier', LogisticRegression())
])

In [15]:
# z_x, z_y = pipeline.fit_resample(X_train, y_train)
# z_x, z_y

In [16]:
# (z_y == 0).sum(), (z_y == 1).sum()

In [17]:
from sklearn.utils import class_weight

class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=y_train.unique(), y=y_train)
class_weights = dict(enumerate(class_weights))
class_weights

{0: 0.5087769277216532, 1: 28.98377107894344}

In [18]:
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from imblearn.over_sampling import RandomOverSampler

param_grid = [
    {
        'sampling': [
            SMOTENC(categorical_features=list(map(lambda attr: 'cat__' + attr, cat_attrs)), random_state=42),
            # SMOTE(random_state=42),
            'passthrough'
        ],

        'classifier': [RandomForestClassifier()],
        'classifier__n_estimators': [100, 150],  # 100
        'classifier__max_depth': [15, 20],  # None
        'classifier__min_samples_split': [2],  # 2
        # 'classifier__min_samples_leaf': [1, 2],  # 1
        # 'classifier__max_features': ['sqrt', None],  # 'sqrt'
        # 'classifier__max_leaf_nodes': [None, 20],  # None
        # 'classifier__bootstrap': [True, False],  # True
        'classifier__criterion': ['gini'],  # 'gini'
        'classifier__class_weight': [None],  # None
    },
    # {
    #     'sampling': [
    #         SMOTENC(categorical_features=list(map(lambda attr: 'cat__' + attr, cat_attrs)), random_state=42),
    #         # SMOTE(random_state=42),
    #         'passthrough'
    #     ],
    # 
    #     'classifier': [RandomForestClassifier()],
    #     'classifier__n_estimators': [100, 200],  # 100
    #     'classifier__max_depth': [None, 20],  # None
    #     'classifier__min_samples_split': [2, 4],  # 2
    #     'classifier__min_samples_leaf': [1, 2],  # 1
    #     'classifier__max_features': ['sqrt', None, 'log2'],  # 'sqrt'
    #     'classifier__max_leaf_nodes': [None, 10, 50],  # None
    #     'classifier__bootstrap': [True, False],  # True
    #     'classifier__criterion': ['gini', 'log_loss'],  # 'gini'
    #     'classifier__class_weight': [None, class_weights],  # None
    # },
]

In [19]:
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit

grid_search = GridSearchCV(
    pipeline, param_grid,
    cv=StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42),
    scoring=['f1', 'precision', 'recall'],
    refit='f1',
    return_train_score=True,
    n_jobs=n_jobs,
    verbose=verbose
)

grid_search.fit(X_train, y_train);

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [20]:
grid_search.best_params_

{'classifier': RandomForestClassifier(),
 'classifier__class_weight': None,
 'classifier__criterion': 'gini',
 'classifier__max_depth': 20,
 'classifier__min_samples_split': 2,
 'classifier__n_estimators': 150,
 'sampling': 'passthrough'}

In [21]:
grid_search.best_estimator_

In [22]:
grid_search.best_score_

0.7852452416435567

In [23]:
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results = cv_results.sort_values('rank_test_f1')
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier,param_classifier__class_weight,param_classifier__criterion,param_classifier__max_depth,param_classifier__min_samples_split,param_classifier__n_estimators,...,mean_test_recall,std_test_recall,rank_test_recall,split0_train_recall,split1_train_recall,split2_train_recall,split3_train_recall,split4_train_recall,mean_train_recall,std_train_recall
7,133.78,3.985,1.827,0.055,RandomForestClassifier(),,gini,20,2,150,...,0.746,0.004,5,0.914,0.913,0.913,0.911,0.915,0.913,0.001
5,88.461,0.971,1.348,0.124,RandomForestClassifier(),,gini,20,2,100,...,0.743,0.004,6,0.913,0.913,0.913,0.908,0.913,0.912,0.002
3,126.436,1.735,1.638,0.023,RandomForestClassifier(),,gini,15,2,150,...,0.742,0.004,7,0.802,0.806,0.806,0.802,0.805,0.804,0.002
1,93.605,6.315,1.189,0.066,RandomForestClassifier(),,gini,15,2,100,...,0.739,0.003,8,0.805,0.805,0.805,0.802,0.804,0.804,0.002
6,542.592,8.249,1.837,0.07,RandomForestClassifier(),,gini,20,2,150,...,0.84,0.008,3,0.957,0.956,0.958,0.954,0.957,0.956,0.001
4,392.819,4.104,1.45,0.034,RandomForestClassifier(),,gini,20,2,100,...,0.839,0.007,4,0.956,0.954,0.955,0.953,0.957,0.955,0.001
2,507.237,2.862,1.724,0.037,RandomForestClassifier(),,gini,15,2,150,...,0.881,0.004,1,0.93,0.932,0.932,0.928,0.932,0.931,0.002
0,373.562,3.076,1.299,0.039,RandomForestClassifier(),,gini,15,2,100,...,0.88,0.005,2,0.93,0.931,0.931,0.928,0.932,0.93,0.002


In [24]:
from sklearn.metrics import f1_score, precision_score, recall_score

y_pred = grid_search.best_estimator_.predict(X_test)
f1 = f1_score(y_test, y_pred)
p = precision_score(y_test, y_pred)
r = recall_score(y_test, y_pred)

print("F1:", f1)
print("Precision:", p)
print("Recall:", r)

F1: 0.8114558472553699
Precision: 0.8626704725658103
Recall: 0.7659814136862856


In [25]:
from joblib import dump

if not os.path.exists('out'):
    os.makedirs('out')
dump(grid_search, os.path.join('out', 'GridSearchCV_dump.pkl'), compress=True)

['out\\GridSearchCV_dump.pkl']