In [1]:
import os
import pandas as pd

In [2]:
cat_attrs: str = "f12 f11"
balance_data: bool = False
clear_data: bool = True
n_jobs: int = 1
verbose: int = 10

In [3]:
# Parameters
n_jobs = 8


In [4]:
pd.set_option('display.float_format', '{:.3f}'.format)

cat_attrs: list[str] = cat_attrs.split()
cat_attrs

['f12', 'f11']

In [5]:
csv_path = os.path.join('data', 'data.csv')
data = pd.read_csv(csv_path)

In [6]:
X = data.drop(columns=['target', 'p', 'f16', 'f2', 'f5'])
y = data['target']

In [7]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
y = pd.Series(y, name='target')

In [8]:
from sklearn.model_selection import train_test_split

test_size_ratio = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=test_size_ratio, 
    stratify=y, 
    random_state=42
)

print(f"Train: X={X_train.shape} y={y_train.shape}")
print(f"Test: X={X_test.shape} y={y_test.shape}")

Train: X=(838860, 14) y=(838860,)
Test: X=(209715, 14) y=(209715,)


In [9]:
from collections import Counter

if balance_data:
    train = pd.concat([X_train, y_train], axis=1)
    train_n = train.shape[0]
    
    class_counts = Counter(train['target'])
    min_class = min(class_counts, key=lambda x: class_counts[x])
    
    new_train = train.groupby('target').apply(
        lambda x: x.sample(class_counts[min_class])
    ).reset_index(drop=True)
    removed_n = train.shape[0] - new_train.shape[0]
    removed_ratio = removed_n / train_n
    
    print(f"Majority class: '{min_class}'")
    print(f"Records removed: {removed_n} ({removed_ratio * 100:.3f} %)")
    X_train = new_train.drop(columns=['target'])
    y_train = new_train['target']
    print(f"\nTrain: X={X_train.shape} y={y_train.shape}")
else:
    print("balance_data set to False")

balance_data set to False


In [10]:
y_train_pos_ratio = y_train[y_train == 1].shape[0] / y_train.shape[0]
y_test_pos_ratio = y_test[y_test == 1].shape[0] / y_test.shape[0]

print("Train positives ratio:", f"{y_train_pos_ratio * 100:.5f} %")
print("Test positives ratio: ", f"{y_test_pos_ratio * 100:.5f} %")

Train positives ratio: 1.69301 %
Test positives ratio:  1.69325 %


In [11]:
if clear_data:
    train = pd.concat([X_train, y_train], axis=1)
    train_n = train.shape[0]
    
    new_train = train.drop_duplicates()
    duplicates_n = train.shape[0] - new_train.shape[0]
    duplicates_ratio = duplicates_n / train_n
    print(f"Duplicates removed: {duplicates_n} ({duplicates_ratio * 100:.3f} %)")
    train = new_train
    
    new_train = train.drop_duplicates(subset=train.columns.difference(['target']))
    collisions_n = train.shape[0] - new_train.shape[0]
    collisions_ratio = collisions_n / train_n
    print(f"Collisions removed: {collisions_n} ({collisions_ratio * 100:.3f} %)")
    train = new_train
    
    print(f"Total removed:      {duplicates_n + collisions_n} "
          f"({(duplicates_ratio + collisions_ratio) * 100:.3f} %)")
    X_train = train.drop(columns=['target'])
    y_train = train['target']
    print(f"\nTrain: X={X_train.shape} y={y_train.shape}")
else:
    print("clear_data set to False")

Duplicates removed: 62020 (7.393 %)
Collisions removed: 57 (0.007 %)
Total removed:      62077 (7.400 %)



Train: X=(776783, 14) y=(776783,)


In [12]:
len(y_train[y_train==0]) / len(y_train[y_train==1])

56.960229816445306

In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

cat_encoder = OneHotEncoder().fit(X[cat_attrs])
cat_encoder.categories_  # TODO

[array([0, 1], dtype=int64), array([0, 1], dtype=int64)]

In [14]:
from imblearn.under_sampling import NeighbourhoodCleaningRule
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE, SMOTENC
from imblearn.pipeline import Pipeline as ImPipeline
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, StackingClassifier
from imblearn.over_sampling import RandomOverSampler
from lightgbm import LGBMClassifier
import scipy.stats as stats

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', 'passthrough', cat_attrs)
    ],
    remainder=StandardScaler(),
    verbose_feature_names_out=True
)
preprocessor = preprocessor.set_output(transform='pandas')

one_hot = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), list(map(lambda attr: 'cat__' + attr, cat_attrs)))
    ],
    remainder='passthrough',
    verbose_feature_names_out=True
)

classifiers = [
    # ('boost', HistGradientBoostingClassifier(random_state=42)),
    ('xgb', XGBClassifier()),
    # ('lgbm', LGBMClassifier(random_state=42, class_weight='balanced')),
    # ('rf', RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced')),
    ('svr', make_pipeline(StandardScaler(), 
                          LinearSVC(random_state=42, dual=False)))
]
clf = StackingClassifier(
    estimators=classifiers, final_estimator=LogisticRegression()
)

pipeline = ImPipeline(steps=[
    ('preprocessor', preprocessor),
    # ('sampling', SMOTE(random_state=42)),
    # ('sampling', NeighbourhoodCleaningRule()),
    # ('sampling', SMOTENC(categorical_features=list(map(lambda attr: 'cat__' + attr, cat_attrs)), random_state=42)),
    ('one_hot', one_hot),
    ('classifier', clf)
])

In [15]:
from sklearn.model_selection import StratifiedShuffleSplit

param_grid = [
    {      
        'classifier__xgb__n_estimators': [1000],
        'classifier__xgb__max_depth': [8],
        'classifier__xgb__scale_pos_weight': [1, 56.97],
        'classifier__xgb__learning_rate': [0.3],  # 0.3
    },
]

In [16]:
grid_search = GridSearchCV(
    pipeline, param_grid,
    cv=StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42),
    scoring=['f1', 'precision', 'recall'],
    refit='f1',
    return_train_score=True,
    n_jobs=n_jobs,
    verbose=verbose
)

grid_search.fit(X_train, y_train);

Fitting 5 folds for each of 2 candidates, totalling 10 fits


In [17]:
grid_search.best_params_

{'classifier__xgb__learning_rate': 0.3,
 'classifier__xgb__max_depth': 8,
 'classifier__xgb__n_estimators': 1000,
 'classifier__xgb__scale_pos_weight': 1}

In [18]:
grid_search.best_estimator_

In [19]:
grid_search.best_score_

0.7685613542658857

In [20]:
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results = cv_results.sort_values('rank_test_f1')
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__xgb__learning_rate,param_classifier__xgb__max_depth,param_classifier__xgb__n_estimators,param_classifier__xgb__scale_pos_weight,params,split0_test_f1,...,mean_test_recall,std_test_recall,rank_test_recall,split0_train_recall,split1_train_recall,split2_train_recall,split3_train_recall,split4_train_recall,mean_train_recall,std_train_recall
0,473.627,1.578,7.487,0.354,0.3,8,1000,1.0,"{'classifier__xgb__learning_rate': 0.3, 'class...",0.773,...,0.702,0.005,2,0.925,0.923,0.928,0.924,0.926,0.925,0.002
1,381.801,118.243,6.133,1.452,0.3,8,1000,56.97,"{'classifier__xgb__learning_rate': 0.3, 'class...",0.753,...,0.733,0.006,1,0.914,0.908,0.91,0.908,0.911,0.91,0.002


In [21]:
from sklearn.metrics import f1_score, precision_score, recall_score

y_pred = grid_search.best_estimator_.predict(X_test)
f1 = f1_score(y_test, y_pred)
p = precision_score(y_test, y_pred)
r = recall_score(y_test, y_pred)

print("F1:", f1)
print("Precision:", p)
print("Recall:", r)

F1: 0.7960957755070917
Precision: 0.8682634730538922
Recall: 0.7350042241622078


In [22]:
from joblib import dump

if not os.path.exists('out'):
    os.makedirs('out')
dump(grid_search, os.path.join('out', 'GridSearchCV_dump.pkl'), compress=True)

['out\\GridSearchCV_dump.pkl']