In [1]:
import os
import pandas as pd

In [2]:
# Notebook-level parameters with their default values.
# NOTE(review): presumably a runner-facing "parameters" cell (the next cell rebinds n_jobs) — confirm.
cat_attrs: str = ""  # whitespace-separated categorical column names; split into a list further down
balance_data: bool = False  # True -> undersample the training set to the smallest class
clear_data: bool = True  # True -> drop duplicated rows from the training set
n_jobs: int = 1  # forwarded as n_jobs to the hyper-parameter search
verbose: int = 10  # forwarded as verbose to the hyper-parameter search

In [3]:
# Parameters
# Rebinds n_jobs, replacing the default of 1 assigned in the previous cell.
# NOTE(review): looks like a cell injected by a parameterised runner (e.g. papermill) — confirm.
n_jobs = 8


In [4]:
# Render floats with three decimals in every DataFrame/Series display below.
pd.set_option('display.float_format', '{:.3f}'.format)

# `cat_attrs` arrives as a whitespace-separated string parameter. Split it only
# while it is still a string, so that re-running this cell (when it has already
# been turned into a list) does not fail with AttributeError.
if isinstance(cat_attrs, str):
    cat_attrs = cat_attrs.split()
cat_attrs

[]

In [5]:
# Read the whole dataset from data/data.csv, resolved against the working directory.
csv_path = os.path.join('data', 'data.csv')
data = pd.read_csv(filepath_or_buffer=csv_path)

In [6]:
# Separate the label column from the predictors. Column 'p' is removed together
# with the label, so it is never offered to the model as a feature.
# NOTE(review): presumably 'p' is target-derived or otherwise unusable — confirm.
y = data['target']
X = data.drop(['target', 'p'], axis=1)

In [7]:
from sklearn.preprocessing import LabelEncoder

# Encode the raw target labels as consecutive integers. The fitted encoder
# stays available as `label_encoder`.
label_encoder = LabelEncoder()

# `fit_transform` returns a bare NumPy array. Rebuild the Series on the original
# row index so `y` stays aligned with `X`: the later `pd.concat(..., axis=1)`
# calls align on index labels, and a fresh RangeIndex would silently misalign
# rows if `data` were ever filtered or loaded with a non-default index.
target_index = y.index
y = pd.Series(label_encoder.fit_transform(y), index=target_index, name='target')

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the rows for the final evaluation. The split is stratified
# on `y` and uses a fixed seed so it is repeatable.
test_size_ratio = 0.2
split_options = dict(test_size=test_size_ratio, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the shape of each partition.
for split_name, features, labels in (
    ("Train", X_train, y_train),
    ("Test", X_test, y_test),
):
    print(f"{split_name}: X={features.shape} y={labels.shape}")

Train: X=(838860, 17) y=(838860,)
Test: X=(209715, 17) y=(209715,)


In [9]:
from collections import Counter

if balance_data:
    # Undersample every class down to the size of the least frequent one.
    train = pd.concat([X_train, y_train], axis=1)
    train_n = train.shape[0]

    class_counts = Counter(train['target'])
    min_class = min(class_counts, key=class_counts.get)
    min_count = class_counts[min_class]

    # GroupBy.sample keeps every column (including 'target') and is seeded so
    # that re-running the notebook yields the same balanced sample.
    new_train = (
        train
        .groupby('target')
        .sample(n=min_count, random_state=42)
        .reset_index(drop=True)
    )
    removed_n = train_n - new_train.shape[0]
    removed_ratio = removed_n / train_n

    # `min_class` is the label with the fewest rows, i.e. the minority class.
    print(f"Minority class: '{min_class}'")
    print(f"Records removed: {removed_n} ({removed_ratio * 100:.3f} %)")
    X_train = new_train.drop(columns=['target'])
    y_train = new_train['target']
    print(f"\nTrain: X={X_train.shape} y={y_train.shape}")
else:
    print("balance_data set to False")

balance_data set to False


In [10]:
# Share of rows carrying the encoded label 1 in each partition.
y_train_pos_ratio = int((y_train == 1).sum()) / len(y_train)
y_test_pos_ratio = int((y_test == 1).sum()) / len(y_test)

print("Train positives ratio:", f"{y_train_pos_ratio * 100:.5f} %")
print("Test positives ratio: ", f"{y_test_pos_ratio * 100:.5f} %")

Train positives ratio: 1.69301 %
Test positives ratio:  1.69325 %


In [11]:
if not clear_data:
    print("clear_data set to False")
else:
    # Put features and label side by side so both kinds of duplicates can be found.
    train = pd.concat([X_train, y_train], axis=1)
    train_n = len(train)

    # Pass 1: rows that are identical in every column, label included.
    new_train = train.drop_duplicates()
    duplicates_n = len(train) - len(new_train)
    duplicates_ratio = duplicates_n / train_n
    print(f"Duplicates removed: {duplicates_n} ({duplicates_ratio * 100:.3f} %)")
    train = new_train

    # Pass 2: rows that share all feature values with an earlier row. Exact
    # duplicates are already gone, so these differ only in 'target'; the first
    # occurrence is kept (drop_duplicates default).
    feature_columns = train.columns.difference(['target'])
    new_train = train.drop_duplicates(subset=feature_columns)
    collisions_n = len(train) - len(new_train)
    collisions_ratio = collisions_n / train_n
    print(f"Collisions removed: {collisions_n} ({collisions_ratio * 100:.3f} %)")
    train = new_train

    # Both ratios are relative to the original row count, so they can be summed.
    print(f"Total removed:      {duplicates_n + collisions_n} "
          f"({(duplicates_ratio + collisions_ratio) * 100:.3f} %)")
    y_train = train['target']
    X_train = train.drop(columns=['target'])
    print(f"\nTrain: X={X_train.shape} y={y_train.shape}")

Duplicates removed: 61923 (7.382 %)


Collisions removed: 56 (0.007 %)
Total removed:      61979 (7.388 %)

Train: X=(776881, 17) y=(776881,)


In [12]:
# Rank the features by the absolute value of their correlation with the target.
comb_attributes = list(X_train.columns)
train = pd.concat([X_train, y_train], axis=1)

# Pairwise ratio features ("<a>_per_<b>") used to be generated here and added to
# `train`, but the generating loop iterated over an empty list and never ran.
# The unreachable loop has been removed; the list stays empty so the names this
# cell defines are unchanged.
attributes_combinations = []

# NOTE: despite its name, `corr_price` holds |correlation with 'target'|.
corr_price = (
    train.corr(numeric_only=True)["target"]
    .abs()
    .sort_values(ascending=False)
)
corr_price

target   1.000
f11      0.185
f12      0.185
f1       0.119
f15      0.117
f14      0.117
f13      0.116
f4       0.107
f10      0.054
f6       0.048
f0       0.034
f9       0.032
f8       0.023
f2       0.017
f16      0.010
f7       0.010
f5       0.005
f3       0.003
Name: target, dtype: float64

In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

# Fit a one-hot encoder on the categorical columns of the full feature table `X`
# and show the category levels it found; `categories_` is handed to the
# pipeline's encoder when the preprocessor is built.
cat_encoder = OneHotEncoder()
cat_encoder.fit(X[cat_attrs])
cat_encoder.categories_  # TODO

[]

In [14]:
from imblearn.under_sampling import NeighbourhoodCleaningRule
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, StackingClassifier
from imblearn.over_sampling import RandomOverSampler
from lightgbm import LGBMClassifier
import scipy.stats as stats

# Column-wise preprocessing: one-hot encode the categorical columns with the
# category levels found by `cat_encoder`, standardise every remaining column.
cat_step = ('cat', OneHotEncoder(categories=cat_encoder.categories_), cat_attrs)
preprocessor = ColumnTransformer(transformers=[cat_step], remainder=StandardScaler())
# preprocessor = preprocessor.set_output(transform='pandas')

# Base learners of the stacked ensemble. The step names are part of the
# hyper-parameter paths (e.g. `classifier__xgb__...`), so they must stay as they
# are; note that 'svr' wraps a LinearSVC classifier.
xgb_learner = XGBClassifier()
svc_learner = make_pipeline(
    StandardScaler(),
    LinearSVC(random_state=42, dual=False),
)
classifiers = [
    # ('boost', HistGradientBoostingClassifier(random_state=42)),
    ('xgb', xgb_learner),
    # ('lgbm', LGBMClassifier(random_state=42, class_weight='balanced')),
    # ('rf', RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced')),
    ('svr', svc_learner),
]

# A logistic regression combines the base learners' outputs.
clf = StackingClassifier(estimators=classifiers, final_estimator=LogisticRegression())

# imblearn's Pipeline is used for the full model; the resampling steps below are
# currently disabled.
pipeline_steps = [
    ('preprocessor', preprocessor),
    # ('sampling', SMOTE(random_state=42)),
    # ('sampling', NeighbourhoodCleaningRule()),
    ('classifier', clf),
]
pipeline = ImPipeline(steps=pipeline_steps)

In [15]:
# Exhaustive search over a single-point grid for the XGBoost base learner.
# NOTE(review): the following cell rebinds `param_grid` and `grid_search` to a
# RandomizedSearchCV before anything is fitted, so in a top-to-bottom run this
# object is never used — confirm whether this cell is still needed.
xgb_fixed_params = {
    'classifier__xgb__n_estimators': [150],
    'classifier__xgb__max_depth': [7],
    # NOTE(review): 1.69 coincides with the positive-class percentage printed
    # earlier (~1.69 %); XGBoost's documentation suggests
    # sum(negative) / sum(positive) as a typical value — confirm it is intended.
    'classifier__xgb__scale_pos_weight': [1.69],
}
param_grid = [xgb_fixed_params]

# Settings explored earlier, kept for reference (trailing comments carry the
# author's original notes):
#   'preprocessor__cat': ['passthrough'],
#   'sampling': ['passthrough'],
#   'sampling__sampling_strategy': [0.5],
#   'classifier__boost__categorical_features': [list(map(lambda attr: 'cat__' + attr, cat_attrs))],
#   'classifier__boost__learning_rate': [0.1],  # 0.1
#   'classifier__boost__max_iter': [1000],  # 100
#   'classifier__boost__max_depth': [20],  # 31
#   'classifier__boost__l2_regularization': [1.0],  # 0.0
#   'classifier__xgb__eval_metric': ['logloss'],
#   'classifier__xgb__learning_rate': [0.5],  # 1.0
#   'classifier__lgbm__n_estimators': [10],
#   'classifier__lgbm__learning_rate': [0.01],
#   'classifier__lgbm__num_iterations': [1000],
#   'classifier__lgbm__max_depth': [20],
#   'classifier__svr__linearsvc__C': [0.5],  # 1.0
#   'classifier__final_estimator__C': [0.5],  # 1.0
#   'classifier__final_estimator__penalty': ['l1'],
#   'classifier__final_estimator__solver': ['saga'],
#   'classifier__final_estimator__max_iter': [1000],

# 5-fold CV scored on F1, precision and recall; the best-F1 candidate is refitted.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=['f1', 'precision', 'recall'],
    refit='f1',
    cv=5,
    n_jobs=n_jobs,
    verbose=verbose,
    return_train_score=True,
)

In [16]:
# Sampling distributions for the XGBoost base learner. `randint` excludes its
# upper bound and `uniform(loc, scale)` spans [loc, loc + scale].
# The name `param_grid` is reused from the previous cell although the values
# here are distributions rather than lists.
param_grid = dict(
    classifier__xgb__n_estimators=stats.randint(low=50, high=200),
    classifier__xgb__max_depth=stats.randint(low=5, high=10),
    classifier__xgb__learning_rate=stats.loguniform(0.01, 0.1),
    classifier__xgb__subsample=stats.uniform(loc=0.5, scale=0.5),
    classifier__xgb__colsample_bytree=stats.uniform(loc=0.5, scale=0.5),
    classifier__xgb__gamma=stats.loguniform(0.01, 0.5),
    classifier__xgb__reg_alpha=stats.loguniform(0.01, 1.0),
    classifier__xgb__reg_lambda=stats.loguniform(0.01, 1.0),
)

# Number of parameter settings drawn from the distributions above.
n_iter = 24

# 5-fold CV scored on F1, precision and recall; the best-F1 candidate is
# refitted. The seed fixes which candidates are drawn.
grid_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=n_iter,
    scoring=['f1', 'precision', 'recall'],
    refit='f1',
    cv=5,
    random_state=42,
    n_jobs=n_jobs,
    verbose=verbose,
    return_train_score=True,
)

In [17]:
# Run the hyper-parameter search on the training split. The search was built
# with refit='f1', so the best-F1 candidate is refitted once the CV is done.
# The trailing ';' suppresses the estimator repr in the cell output.
grid_search.fit(X_train, y_train);

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [18]:
# Parameter setting of the candidate selected by the search (refit metric: F1).
grid_search.best_params_

{'classifier__xgb__colsample_bytree': 0.9961057796456088,
 'classifier__xgb__gamma': 0.1119650718398326,
 'classifier__xgb__learning_rate': 0.040893394339653055,
 'classifier__xgb__max_depth': 9,
 'classifier__xgb__n_estimators': 138,
 'classifier__xgb__reg_alpha': 0.038234752246751866,
 'classifier__xgb__reg_lambda': 0.1673808578875213,
 'classifier__xgb__subsample': 0.569746930326021}

In [19]:
# Display the refitted pipeline built with the selected parameter setting.
grid_search.best_estimator_

In [20]:
# Mean cross-validated score of the selected candidate on the refit metric (F1).
grid_search.best_score_

0.7750304012698745

In [21]:
# Per-candidate cross-validation results as a table, ordered by F1 rank (best first).
cv_results = pd.DataFrame(grid_search.cv_results_).sort_values(by='rank_test_f1')
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__xgb__colsample_bytree,param_classifier__xgb__gamma,param_classifier__xgb__learning_rate,param_classifier__xgb__max_depth,param_classifier__xgb__n_estimators,param_classifier__xgb__reg_alpha,...,mean_test_recall,std_test_recall,rank_test_recall,split0_train_recall,split1_train_recall,split2_train_recall,split3_train_recall,split4_train_recall,mean_train_recall,std_train_recall
2,104.026,2.206,1.388,0.25,0.996,0.112,0.041,9,138,0.038,...,0.731,0.003,13,0.755,0.749,0.756,0.755,0.754,0.754,0.002
13,98.253,2.372,1.448,0.103,0.601,0.333,0.03,9,145,0.246,...,0.732,0.004,11,0.752,0.747,0.751,0.752,0.752,0.751,0.002
12,113.545,2.229,1.243,0.079,0.88,0.09,0.059,7,186,0.264,...,0.729,0.003,18,0.749,0.745,0.748,0.749,0.749,0.748,0.002
20,106.229,2.037,1.157,0.137,0.993,0.026,0.047,7,172,0.031,...,0.732,0.004,12,0.744,0.738,0.745,0.746,0.745,0.744,0.003
8,70.859,2.37,0.822,0.041,0.793,0.436,0.04,9,90,0.039,...,0.731,0.005,14,0.748,0.744,0.747,0.747,0.748,0.747,0.001
7,65.707,2.233,0.609,0.023,0.97,0.331,0.04,8,89,0.489,...,0.734,0.003,6,0.744,0.739,0.744,0.745,0.744,0.743,0.002
22,116.432,4.256,1.155,0.157,0.663,0.024,0.051,7,197,0.195,...,0.728,0.004,19,0.745,0.743,0.745,0.748,0.746,0.745,0.002
14,90.9,2.272,0.871,0.03,0.77,0.022,0.088,7,135,0.577,...,0.726,0.002,21,0.747,0.74,0.745,0.748,0.747,0.745,0.003
19,117.061,2.577,2.159,0.216,0.805,0.071,0.011,9,178,0.03,...,0.734,0.005,8,0.748,0.742,0.749,0.748,0.748,0.747,0.002
0,69.723,2.18,0.697,0.072,0.687,0.412,0.054,9,70,0.021,...,0.729,0.003,17,0.748,0.743,0.746,0.748,0.747,0.746,0.002


In [22]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Score the refitted best pipeline on the held-out test split.
y_pred = grid_search.best_estimator_.predict(X_test)

# Same three metrics as the search scoring, computed in the same order.
f1, p, r = (
    metric(y_test, y_pred)
    for metric in (f1_score, precision_score, recall_score)
)

for label, value in (("F1", f1), ("Precision", p), ("Recall", r)):
    print(f"{label}:", value)

F1: 0.7940691927512356
Precision: 0.8480486244401791
Recall: 0.7465502675302732


In [23]:
from joblib import dump

# Persist the fitted search object (compressed) under ./out.
# `exist_ok=True` creates the directory when missing and is a no-op otherwise,
# which avoids the check-then-create race of a separate os.path.exists() test.
out_dir = 'out'
os.makedirs(out_dir, exist_ok=True)

# NOTE: the file name says "GridSearchCV" although `grid_search` is bound to a
# RandomizedSearchCV at this point; the name is kept so existing consumers of
# the file keep working.
dump(grid_search, os.path.join(out_dir, 'GridSearchCV_dump.pkl'), compress=True)

['out\\GridSearchCV_dump.pkl']