In [1]:
import os
import pandas as pd

In [2]:
# Run configuration for this notebook. The following "# Parameters" cell
# reassigns n_jobs, so the values here act as defaults.
cat_attrs: str = "f12 f11"  # space-separated names of the categorical feature columns
balance_data: bool = False  # when True, the training set is undersampled to equal class sizes
clear_data: bool = True  # when True, duplicate and colliding training rows are dropped
n_jobs: int = 1  # forwarded to GridSearchCV(n_jobs=...)
verbose: int = 10  # forwarded to GridSearchCV(verbose=...)

In [3]:
# Parameters
# NOTE(review): looks like an injected override cell from a parameterised run — confirm.
n_jobs = 8  # replaces the n_jobs = 1 default assigned in the configuration cell


In [4]:
# Turn the space-separated configuration string into a list of column names.
# The isinstance guard keeps this cell safe to re-run: once cat_attrs is
# already a list, calling .split() on it would raise AttributeError.
if isinstance(cat_attrs, str):
    cat_attrs = cat_attrs.split()
cat_attrs

['f12', 'f11']

In [5]:
# Render floats with exactly three decimals wherever pandas displays them.
pd.options.display.float_format = '{:.3f}'.format

In [6]:
# Read the dataset from data/data.csv; the path is relative to the current
# working directory and is built with os.path.join to stay OS-independent.
csv_path = os.path.join('data', 'data.csv')
data = pd.read_csv(csv_path)

In [7]:
# Columns kept out of the feature matrix: the label itself plus four others.
non_feature_columns = ['target', 'p', 'f16', 'f2', 'f5']

y = data['target']
X = data.drop(columns=non_feature_columns)

In [8]:
from sklearn.preprocessing import LabelEncoder

# Encode the raw target values as integers and wrap the result in a Series
# named 'target' so it can be concatenated with the features by column name.
label_encoder = LabelEncoder()
y = pd.Series(label_encoder.fit_transform(y), name='target')

In [9]:
from sklearn.model_selection import train_test_split

# Hold out a stratified 20 % test split; the seed makes the split repeatable.
test_size_ratio = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
    test_size=test_size_ratio,
)

# Report the shape of both splits.
for split_name, split_X, split_y in (
    ("Train", X_train, y_train),
    ("Test", X_test, y_test),
):
    print(f"{split_name}: X={split_X.shape} y={split_y.shape}")

Train: X=(838860, 14) y=(838860,)
Test: X=(209715, 14) y=(209715,)


In [10]:
from collections import Counter

if balance_data:
    # Undersample every class down to the size of the rarest class so the
    # training set ends up with equally many rows per target value.
    train = pd.concat([X_train, y_train], axis=1)
    train_n = train.shape[0]

    class_counts = Counter(train['target'])
    min_class = min(class_counts, key=class_counts.get)

    # DataFrameGroupBy.sample keeps the 'target' column in the result, whereas
    # groupby().apply() operating on the grouping column is deprecated.
    # The fixed seed makes the undersampling repeatable, matching the
    # random_state=42 used for the train/test split.
    new_train = (
        train.groupby('target')
        .sample(n=class_counts[min_class], random_state=42)
        .reset_index(drop=True)
    )
    removed_n = train_n - new_train.shape[0]
    removed_ratio = removed_n / train_n

    # min_class is the least frequent label, i.e. the minority class.
    print(f"Minority class: '{min_class}'")
    print(f"Records removed: {removed_n} ({removed_ratio * 100:.3f} %)")
    X_train = new_train.drop(columns=['target'])
    y_train = new_train['target']
    print(f"\nTrain: X={X_train.shape} y={y_train.shape}")
else:
    print("balance_data set to False")

balance_data set to False


In [11]:
# Fraction of labels equal to 1 in each split.
y_train_pos_ratio = int((y_train == 1).sum()) / len(y_train)
y_test_pos_ratio = int((y_test == 1).sum()) / len(y_test)

for ratio_label, pos_ratio in (
    ("Train positives ratio:", y_train_pos_ratio),
    ("Test positives ratio: ", y_test_pos_ratio),
):
    print(ratio_label, f"{pos_ratio * 100:.5f} %")

Train positives ratio: 1.69301 %
Test positives ratio:  1.69325 %


In [12]:
if not clear_data:
    print("clear_data set to False")
else:
    # Keep features and label side by side so row removals stay aligned.
    train = pd.concat([X_train, y_train], axis=1)
    train_n = len(train)

    # Pass 1: rows that are identical in every column, label included.
    new_train = train.drop_duplicates()
    duplicates_n = len(train) - len(new_train)
    duplicates_ratio = duplicates_n / train_n
    print(f"Duplicates removed: {duplicates_n} ({duplicates_ratio * 100:.3f} %)")
    train = new_train

    # Pass 2: rows sharing all feature values with an earlier row. After
    # pass 1 such rows can only differ in 'target'; the first one is kept.
    feature_columns = train.columns.difference(['target'])
    new_train = train.drop_duplicates(subset=feature_columns)
    collisions_n = len(train) - len(new_train)
    collisions_ratio = collisions_n / train_n
    print(f"Collisions removed: {collisions_n} ({collisions_ratio * 100:.3f} %)")
    train = new_train

    # Both ratios are relative to the row count before any removal.
    total_removed_n = duplicates_n + collisions_n
    total_removed_pct = (duplicates_ratio + collisions_ratio) * 100
    print(f"Total removed:      {total_removed_n} "
          f"({total_removed_pct:.3f} %)")

    y_train = train['target']
    X_train = train.drop(columns=['target'])
    print(f"\nTrain: X={X_train.shape} y={y_train.shape}")

Duplicates removed: 62020 (7.393 %)


Collisions removed: 57 (0.007 %)
Total removed:      62077 (7.400 %)

Train: X=(776783, 14) y=(776783,)


In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Fit an encoder on the categorical columns of the full feature matrix and
# show the category values it found for each column.
cat_encoder = OneHotEncoder()
cat_encoder.fit(X[cat_attrs])
cat_encoder.categories_  # TODO (left unspecified in the original cell)

[array([0, 1], dtype=int64), array([0, 1], dtype=int64)]

In [14]:
from imblearn.over_sampling import SMOTE, SMOTENC
from imblearn.pipeline import Pipeline as ImPipeline

# Names of the categorical columns as they leave the 'preprocessor' step:
# each original name prefixed with 'cat__'.
prefixed_cat_attrs = [f'cat__{attr}' for attr in cat_attrs]

# Step 'preprocessor': categorical columns pass through untouched, every other
# column is standardised. Pandas output is requested so that the following
# steps can address columns by name.
preprocessor = ColumnTransformer(
    transformers=[('cat', 'passthrough', cat_attrs)],
    remainder=StandardScaler(),
    verbose_feature_names_out=True,
).set_output(transform='pandas')

# Step 'sampling': SMOTENC oversampling that treats the prefixed columns as
# categorical; seeded for repeatability.
smote_nc = SMOTENC(categorical_features=prefixed_cat_attrs, random_state=42)

# Step 'one_hot': one-hot encode the prefixed categorical columns and pass the
# remaining columns through unchanged.
one_hot = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(), prefixed_cat_attrs)],
    remainder='passthrough',
    verbose_feature_names_out=True,
)
# one_hot = one_hot.set_output(transform='pandas')

# Full pipeline; the 'sampling' and 'classifier' steps are the ones that the
# parameter grid in a later cell replaces by name.
pipeline = ImPipeline(steps=[
    ('preprocessor', preprocessor),
    ('sampling', smote_nc),
    ('one_hot', one_hot),
    ('classifier', LogisticRegression()),
])

In [15]:
# z_x, z_y = pipeline.fit_resample(X_train, y_train)
# z_x, z_y

In [16]:
# (z_y == 0).sum(), (z_y == 1).sum()

In [17]:
from sklearn.utils import class_weight

# Balanced class weights for the training labels, as a {label: weight} dict.
# compute_class_weight returns one weight per entry of `classes`, in that same
# order. Series.unique() lists labels in order of first appearance, so each
# weight is paired with its label explicitly; pairing by position (enumerate)
# would swap the weights whenever the first training row is not class 0.
train_classes = y_train.unique()
class_weights = class_weight.compute_class_weight(
    class_weight='balanced', classes=train_classes, y=y_train
)
class_weights = dict(sorted(zip(train_classes.tolist(), class_weights.tolist())))
class_weights

{0: 0.5087780544708344, 1: 28.980114908222653}

In [18]:
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from imblearn.over_sampling import RandomOverSampler

# Hyper-parameter grid for the grid search. Keys address pipeline steps by
# name ('sampling', 'classifier'); each value is the list of candidates to try.
# The trailing comments appear to record the library default of each setting.
param_grid = [
    {
        # 'passthrough' disables the resampling step; the SMOTENC alternative
        # is kept commented out for reference.
        'sampling': [
            # SMOTENC(categorical_features=list(map(lambda attr: 'cat__' + attr, cat_attrs)), random_state=42),
            'passthrough'
        ],

        # Seeded so repeated runs build the same forest, in line with the
        # random_state=42 used by the other stochastic steps in this notebook.
        'classifier': [RandomForestClassifier(random_state=42)],
        'classifier__n_estimators': [150],  # 100
        'classifier__max_depth': [20],  # None
        'classifier__min_samples_split': [2],  # 2
        'classifier__criterion': ['gini'],  # 'gini'
        'classifier__class_weight': [None],  # None
    },
]

In [19]:
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit

# Five stratified 80/20 shuffle splits of the training data, seeded for
# repeatability.
cv_splitter = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# Every candidate is scored on F1, precision and recall (train scores are
# recorded too); the candidate with the best F1 is refitted afterwards.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=['f1', 'precision', 'recall'],
    refit='f1',
    cv=cv_splitter,
    n_jobs=n_jobs,
    verbose=verbose,
    return_train_score=True,
)

grid_search.fit(X_train, y_train);

Fitting 5 folds for each of 1 candidates, totalling 5 fits


KeyboardInterrupt: 

In [None]:
# Parameter combination selected by the search (F1 is the refit metric).
grid_search.best_params_

In [None]:
# Pipeline refitted with the selected parameter combination.
grid_search.best_estimator_

In [None]:
# Cross-validated score of the selected candidate for the refit metric ('f1').
grid_search.best_score_

In [None]:
# Feature importances exposed by the final 'classifier' step of the refitted
# best pipeline.
best_classifier = grid_search.best_estimator_.named_steps['classifier']
importances = best_classifier.feature_importances_
importances

In [None]:
# feature_importance = pd.DataFrame(importances, index=X_train.columns, columns=['Importance'])
# feature_importance.sort_values(by='Importance')

In [None]:
# Per-candidate cross-validation results as a table, best F1 rank first.
cv_results = pd.DataFrame(grid_search.cv_results_).sort_values(by='rank_test_f1')
cv_results

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Score the refitted best pipeline on the held-out test split.
y_pred = grid_search.best_estimator_.predict(X_test)
f1, p, r = (
    metric(y_test, y_pred)
    for metric in (f1_score, precision_score, recall_score)
)

for metric_label, metric_value in (("F1:", f1), ("Precision:", p), ("Recall:", r)):
    print(metric_label, metric_value)

In [None]:
from joblib import dump

# Persist the fitted search object, compressed, as out/GridSearchCV_dump.pkl.
# exist_ok=True creates the directory only when it is missing, without the
# separate exists() check that could race with another process.
os.makedirs('out', exist_ok=True)
dump(grid_search, os.path.join('out', 'GridSearchCV_dump.pkl'), compress=True)