In [1]:
import os
import pandas as pd

In [2]:
# Default values for the notebook's tunable parameters.
# NOTE(review): this looks like a papermill-style "parameters" cell — confirm
# before restructuring these assignments.
cat_attrs: str = ""  # whitespace-separated names of categorical columns (split into a list further down)
balance_data: bool = False  # True -> undersample the training split down to the rarest class
clear_data: bool = True  # True -> drop duplicate rows and feature collisions from the training split
n_jobs: int = 1  # forwarded to GridSearchCV(n_jobs=...)
verbose: int = 10  # forwarded to GridSearchCV(verbose=...)

In [3]:
# Parameters
# Run-specific overrides: these assignments replace the defaults of the same
# names declared in the preceding cell.
n_jobs = 8
clear_data = False


In [4]:
# Show floats with three decimals in every DataFrame/Series repr below.
pd.set_option('display.float_format', '{:.3f}'.format)

# `cat_attrs` is supplied as a single whitespace-separated string. Split it
# only while it still is a string, so re-running this cell is a no-op instead
# of failing with AttributeError on an already-split list.
if isinstance(cat_attrs, str):
    cat_attrs = cat_attrs.split()
cat_attrs

[]

In [5]:
# Load the dataset; the location is relative to the current working directory.
data_dir, csv_name = 'data', 'data.csv'
csv_path = os.path.join(data_dir, csv_name)
data = pd.read_csv(csv_path)

In [6]:
# Separate the label from the features. The 'p' column is excluded from the
# features as well (reason not visible in this notebook — TODO confirm).
target_column = 'target'
y = data[target_column]
X = data.drop(columns=[target_column, 'p'])

In [7]:
from sklearn.preprocessing import LabelEncoder

# Map the raw target labels to integer codes. The fitted encoder is kept so
# the codes can be translated back to the original labels later.
label_encoder = LabelEncoder()
encoded_target = label_encoder.fit_transform(y)
# Re-wrap the encoded values as a Series named 'target' so later cells can
# concatenate it with the features.
y = pd.Series(encoded_target, name='target')

In [8]:
from sklearn.model_selection import train_test_split

# Hold out a stratified test split; the fixed seed keeps the split stable
# between runs.
test_size_ratio = 0.2
split_options = dict(test_size=test_size_ratio, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the shape of both splits.
print(f"Train: X={X_train.shape} y={y_train.shape}")
print(f"Test: X={X_test.shape} y={y_test.shape}")

Train: X=(838860, 17) y=(838860,)
Test: X=(209715, 17) y=(209715,)


In [9]:
from collections import Counter

# Optional random undersampling of the training split: every class is reduced
# to the size of the rarest class. The test split is never touched.
if balance_data:
    train = pd.concat([X_train, y_train], axis=1)
    train_n = train.shape[0]

    class_counts = Counter(train['target'])
    # Rarest class and its size — the size every class is sampled down to.
    min_class = min(class_counts, key=lambda x: class_counts[x])
    min_count = class_counts[min_class]

    # GroupBy.sample keeps the 'target' column in the result (GroupBy.apply on
    # the grouping column is deprecated and newer pandas excludes it), and the
    # seed makes the balanced subset reproducible like the rest of the notebook.
    new_train = (
        train.groupby('target')
        .sample(n=min_count, random_state=42)
        .reset_index(drop=True)
    )
    removed_n = train_n - new_train.shape[0]
    removed_ratio = removed_n / train_n

    # `min_class` is the least frequent class, i.e. the minority class.
    print(f"Minority class: '{min_class}'")
    print(f"Records removed: {removed_n} ({removed_ratio * 100:.3f} %)")
    X_train = new_train.drop(columns=['target'])
    y_train = new_train['target']
    print(f"\nTrain: X={X_train.shape} y={y_train.shape}")
else:
    print("balance_data set to False")

balance_data set to False


In [10]:
# Fraction of samples labelled 1 in each split; the two should be close
# because the split above was stratified on the target.
train_positives_n = int((y_train == 1).sum())
test_positives_n = int((y_test == 1).sum())
y_train_pos_ratio = train_positives_n / len(y_train)
y_test_pos_ratio = test_positives_n / len(y_test)

print("Train positives ratio:", f"{y_train_pos_ratio * 100:.5f} %")
print("Test positives ratio: ", f"{y_test_pos_ratio * 100:.5f} %")

Train positives ratio: 1.69301 %
Test positives ratio:  1.69325 %


In [11]:
# Optional clean-up of the training split in two passes:
#   1. drop rows that are exact duplicates (features and target),
#   2. drop "collisions": rows whose features repeat an earlier row,
#      whatever their target (the first occurrence is kept).
# Both ratios are relative to the size of the training split before clean-up.
if not clear_data:
    print("clear_data set to False")
else:
    train = pd.concat([X_train, y_train], axis=1)
    train_n = len(train)

    # Pass 1: exact duplicates.
    without_duplicates = train.drop_duplicates()
    duplicates_n = train_n - len(without_duplicates)
    duplicates_ratio = duplicates_n / train_n
    print(f"Duplicates removed: {duplicates_n} ({duplicates_ratio * 100:.3f} %)")

    # Pass 2: identical feature vectors, compared on every column but the target.
    feature_columns = without_duplicates.columns.difference(['target'])
    without_collisions = without_duplicates.drop_duplicates(subset=feature_columns)
    collisions_n = len(without_duplicates) - len(without_collisions)
    collisions_ratio = collisions_n / train_n
    print(f"Collisions removed: {collisions_n} ({collisions_ratio * 100:.3f} %)")

    print(f"Total removed:      {duplicates_n + collisions_n} "
          f"({(duplicates_ratio + collisions_ratio) * 100:.3f} %)")

    # Publish the cleaned split under the names the following cells use.
    train = without_collisions
    X_train = train.drop(columns=['target'])
    y_train = train['target']
    print(f"\nTrain: X={X_train.shape} y={y_train.shape}")

clear_data set to False


In [12]:
# Number of class-0 samples per class-1 sample in the training split.
negatives_n = int((y_train == 0).sum())
positives_n = int((y_train == 1).sum())
negatives_n / positives_n

58.06632868610055

In [13]:
from itertools import combinations

# Rank features by the absolute value of their correlation with the target.
#
# Pairwise ratio features ("<a>_per_<b>") can be scored as well; this used to
# be switched off by looping over an empty list, and is now an explicit flag.
# Ratios divide by raw feature values, so zeros in the denominator give
# inf/NaN entries.
add_ratio_features = False

comb_attributes = list(X_train.columns)
train = pd.concat([X_train, y_train], axis=1)

attributes_combinations = []
if add_ratio_features:
    # Build every ratio first and attach them in a single concat, which avoids
    # growing the frame one column at a time.
    ratio_features = {
        f"{atr1}_per_{atr2}": train[atr1] / train[atr2]
        for atr1, atr2 in combinations(comb_attributes, 2)
    }
    attributes_combinations = list(ratio_features)
    train = pd.concat([train, pd.DataFrame(ratio_features)], axis=1)

# The name `corr_price` is kept unchanged; the values are |corr(feature, target)|.
corr_price = (
    train.corr(numeric_only=True)["target"]
    .abs()
    .sort_values(ascending=False)
)
corr_price
# TODO: if ratio features are enabled, keep only the few with the highest
# correlation. The ratios live in `train`, not `X_train`, so selected columns
# have to be copied into X_train/X_test rather than dropped from X_train.

target   1.000
f11      0.178
f12      0.178
f1       0.132
f14      0.130
f15      0.129
f13      0.127
f4       0.119
f10      0.062
f6       0.052
f0       0.040
f9       0.027
f2       0.024
f8       0.020
f5       0.005
f3       0.004
f16      0.002
f7       0.002
Name: target, dtype: float64

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

# Learn the category values of every categorical column from the full feature
# frame (train and test rows), so the encoder built into the pipeline later
# can be given a fixed category list.
cat_encoder = OneHotEncoder()
cat_encoder.fit(X[cat_attrs])
cat_encoder.categories_  # TODO

[]

In [15]:
from imblearn.under_sampling import NeighbourhoodCleaningRule
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, StackingClassifier
from imblearn.over_sampling import RandomOverSampler
from lightgbm import LGBMClassifier
import scipy.stats as stats

# Preprocessing: one-hot encode the categorical columns with the category
# lists discovered earlier; every remaining column is standardised.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(categories=cat_encoder.categories_), cat_attrs)
    ],
    remainder=StandardScaler()
)
# preprocessor = preprocessor.set_output(transform='pandas')

# Base learners of the stacking ensemble.
# - 'xgb' is now seeded like the other estimators in this notebook so
#   repeated runs stay comparable.
# - 'svr' is a LinearSVC classifier despite its name; the key is kept because
#   parameter paths such as 'classifier__svr__linearsvc__C' depend on it. It
#   carries its own StandardScaler in addition to the one in `preprocessor`.
classifiers = [
    # ('boost', HistGradientBoostingClassifier(random_state=42)),
    ('xgb', XGBClassifier(random_state=42)),
    # ('lgbm', LGBMClassifier(random_state=42, class_weight='balanced')),
    # ('rf', RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced')),
    ('svr', make_pipeline(StandardScaler(),
                          LinearSVC(random_state=42, dual=False)))
]
# A logistic regression combines the base learners' outputs.
clf = StackingClassifier(
    estimators=classifiers, final_estimator=LogisticRegression()
)

# imblearn's Pipeline is used so a resampling step can be slotted in between
# preprocessing and the classifier (the commented-out 'sampling' entries).
pipeline = ImPipeline(steps=[
    ('preprocessor', preprocessor),
    # ('sampling', SMOTE(random_state=42)),
    # ('sampling', NeighbourhoodCleaningRule()),
    ('classifier', clf)
])

In [16]:
# Hyper-parameter grid. Only the XGBoost entries are active and each lists a
# single value, so the search evaluates exactly one candidate; the
# commented-out keys record other knobs of the pipeline (trailing comments
# show the values noted next to them in earlier experiments).
xgb_grid = {
    'classifier__xgb__n_estimators': [150],
    'classifier__xgb__max_depth': [8],
    'classifier__xgb__scale_pos_weight': [1],
    # 'classifier__xgb__eval_metric': ['logloss'],
    'classifier__xgb__learning_rate': [0.3],  # 0.3

    # --- preprocessing / sampling ---
    # 'preprocessor__cat': ['passthrough'],
    # 'sampling': ['passthrough'],
    # 'sampling__sampling_strategy': [0.5],

    # --- HistGradientBoosting ---
    # 'classifier__boost__categorical_features': [list(map(lambda attr: 'cat__' + attr, cat_attrs))],
    # 'classifier__boost__learning_rate': [0.1],  # 0.1
    # 'classifier__boost__max_iter': [1000],  # 100
    # 'classifier__boost__max_depth': [20],  # 31
    # 'classifier__boost__l2_regularization': [1.0],  # 0.0

    # --- LightGBM ---
    # 'classifier__lgbm__n_estimators': [10],
    # 'classifier__lgbm__learning_rate': [0.01],
    # 'classifier__lgbm__num_iterations': [1000],
    # 'classifier__lgbm__max_depth': [20],

    # --- LinearSVC ---
    # 'classifier__svr__linearsvc__C': [0.5],  # 1.0

    # --- final estimator ---
    # 'classifier__final_estimator__C': [0.5],  # 1.0
    # 'classifier__final_estimator__penalty': ['l1'],
    # 'classifier__final_estimator__solver': ['saga'],
    # 'classifier__final_estimator__max_iter': [1000],
}
param_grid = [xgb_grid]

# 5-fold cross-validated search scored on F1, precision and recall; the
# candidate with the best F1 is the one that gets refit.
scoring_metrics = ['f1', 'precision', 'recall']
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=scoring_metrics,
    refit='f1',
    cv=5,
    return_train_score=True,
    n_jobs=n_jobs,
    verbose=verbose,
)

In [17]:
# param_grid = {
#     'classifier__xgb__n_estimators': stats.randint(50, 200),
#     'classifier__xgb__max_depth': stats.randint(5, 10),
#     'classifier__xgb__learning_rate': stats.loguniform(0.01, 0.1),
#     'classifier__xgb__subsample': stats.uniform(0.5, 0.5),
#     'classifier__xgb__colsample_bytree': stats.uniform(0.5, 0.5),
#     'classifier__xgb__gamma': stats.loguniform(0.01, 0.5),
#     'classifier__xgb__reg_alpha': stats.loguniform(0.01, 1.0),
#     'classifier__xgb__reg_lambda': stats.loguniform(0.01, 1.0),
# }
# 
# n_iter = 24
# grid_search = RandomizedSearchCV(
#     pipeline,
#     param_distributions=param_grid,  # `param_grid` can be reused as it serves a similar purpose here
#     n_iter=n_iter,
#     cv=5,
#     scoring=['f1', 'precision', 'recall'],
#     refit='f1',
#     return_train_score=True,
#     n_jobs=n_jobs,
#     verbose=verbose,
#     random_state=42
# )

In [18]:
# Run the cross-validated search on the training split. The trailing ';'
# keeps the notebook from rendering the returned object.
grid_search.fit(X_train, y_train);

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [19]:
# Parameter combination selected by the search (ranked on F1, see refit='f1').
grid_search.best_params_

{'classifier__xgb__learning_rate': 0.3,
 'classifier__xgb__max_depth': 8,
 'classifier__xgb__n_estimators': 150,
 'classifier__xgb__scale_pos_weight': 1}

In [20]:
# Display the refit pipeline that is later used for test-set predictions.
grid_search.best_estimator_

In [21]:
# Cross-validated score of the selected candidate for the refit metric ('f1').
grid_search.best_score_

0.7823570837106631

In [22]:
# Per-candidate cross-validation table, best F1 rank first.
cv_results = (
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by='rank_test_f1')
)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__xgb__learning_rate,param_classifier__xgb__max_depth,param_classifier__xgb__n_estimators,param_classifier__xgb__scale_pos_weight,params,split0_test_f1,...,mean_test_recall,std_test_recall,rank_test_recall,split0_train_recall,split1_train_recall,split2_train_recall,split3_train_recall,split4_train_recall,mean_train_recall,std_train_recall
0,89.121,4.752,1.038,0.184,0.3,8,150,1,"{'classifier__xgb__learning_rate': 0.3, 'class...",0.775,...,0.72,0.009,1,0.817,0.817,0.813,0.816,0.811,0.815,0.002


In [23]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Score the refit best pipeline on the held-out test split with the same
# three metrics used during the search.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

f1, p, r = (
    metric(y_test, y_pred)
    for metric in (f1_score, precision_score, recall_score)
)

print("F1:", f1)
print("Precision:", p)
print("Recall:", r)

F1: 0.7992689613158697
Precision: 0.8703150912106136
Recall: 0.7389467755561814


In [24]:
from joblib import dump

# Persist the whole fitted search object (CV results plus the refit pipeline)
# as a compressed joblib file under ./out.
# exist_ok=True creates the directory when missing and is a no-op otherwise,
# replacing the separate exists-then-create check.
# NOTE: joblib files are pickles — only load them from trusted sources.
output_dir = 'out'
os.makedirs(output_dir, exist_ok=True)
dump(grid_search, os.path.join(output_dir, 'GridSearchCV_dump.pkl'), compress=True)

['out\\GridSearchCV_dump.pkl']