In [None]:
import os
import pandas as pd

In [None]:
# Notebook parameters: every tunable switch lives in this cell.

# Space-separated names of the categorical feature columns; a later cell splits it into a list.
cat_attrs: str = "f12 f11"
# When True, the training split is undersampled so each class keeps as many rows as the smallest class.
balance_data: bool = False
# When True, exact duplicate rows and rows repeating the same feature values are dropped from the training split.
clear_data: bool = True
# Forwarded to GridSearchCV as its n_jobs argument.
n_jobs: int = 1
# Forwarded to GridSearchCV as its verbose argument.
verbose: int = 10

In [None]:
# Turn the space-separated parameter string into a list of column names.
# The isinstance guard keeps this cell idempotent: on a second execution
# cat_attrs is already a list, and calling .split() on it would raise
# AttributeError.
if isinstance(cat_attrs, str):
    cat_attrs = cat_attrs.split()
cat_attrs

In [None]:
# Render every float shown by pandas with exactly three decimal places.
pd.options.display.float_format = '{:.3f}'.format

In [None]:
# Input dataset location, relative to the current working directory.
csv_path = os.path.join('data', 'data.csv')
# Read the whole CSV into memory with pandas' default parsing options.
data = pd.read_csv(filepath_or_buffer=csv_path)

In [None]:
# The label column, taken as-is from the loaded frame.
y = data['target']
# Everything except the label and the 'p' column is used as a feature.
non_feature_columns = ['target', 'p']
X = data.drop(columns=non_feature_columns)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the raw target labels as integers.
# The encoder is always fitted on data['target'] rather than on `y`, so
# re-running this cell does not re-fit it on already-encoded integers
# (which would overwrite label_encoder.classes_ with the codes and break
# inverse_transform).
label_encoder = LabelEncoder()
y = pd.Series(
    label_encoder.fit_transform(data['target']),
    index=data.index,  # keep y row-aligned with X, which shares data's index
    name='target',
)

In [None]:
from sklearn.model_selection import train_test_split

# Fraction of the rows held out for the final evaluation.
test_size_ratio = 0.2

# Stratified, seeded split so both parts keep the class proportions of y
# and the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size_ratio, stratify=y, random_state=42
)

# Report the shape of each partition.
for split_name, features, labels in (
    ("Train", X_train, y_train),
    ("Test", X_test, y_test),
):
    print(f"{split_name}: X={features.shape} y={labels.shape}")

In [None]:
from collections import Counter

if balance_data:
    # Random undersampling: keep, for every class, as many rows as the
    # least frequent class has.
    train = pd.concat([X_train, y_train], axis=1)
    train_n = train.shape[0]

    class_counts = Counter(train['target'])
    min_class = min(class_counts, key=class_counts.get)
    min_count = class_counts[min_class]

    # GroupBy.sample keeps the 'target' column in the result (groupby.apply
    # on the grouping column is deprecated) and random_state makes the
    # selection reproducible across runs.
    new_train = (
        train
        .groupby('target')
        .sample(n=min_count, random_state=42)
        .reset_index(drop=True)
    )
    removed_n = train_n - new_train.shape[0]
    removed_ratio = removed_n / train_n

    # min_class is the least frequent class, i.e. the minority class.
    print(f"Minority class: '{min_class}'")
    print(f"Records removed: {removed_n} ({removed_ratio * 100:.3f} %)")
    X_train = new_train.drop(columns=['target'])
    y_train = new_train['target']
    print(f"\nTrain: X={X_train.shape} y={y_train.shape}")
else:
    print("balance_data set to False")

In [None]:
# Share of rows labelled 1 in each partition.
y_train_pos_ratio = int((y_train == 1).sum()) / len(y_train)
y_test_pos_ratio = int((y_test == 1).sum()) / len(y_test)

# Print both ratios as percentages with five decimals.
for caption, ratio in (
    ("Train positives ratio:", y_train_pos_ratio),
    ("Test positives ratio: ", y_test_pos_ratio),
):
    print(caption, f"{ratio * 100:.5f} %")

In [None]:
if not clear_data:
    print("clear_data set to False")
else:
    # Work on features and label side by side so rows can be compared whole.
    train = pd.concat([X_train, y_train], axis=1)
    train_n = len(train)

    # Pass 1: rows identical in every column, label included.
    deduplicated = train.drop_duplicates()
    duplicates_n = train_n - len(deduplicated)
    duplicates_ratio = duplicates_n / train_n
    print(f"Duplicates removed: {duplicates_n} ({duplicates_ratio * 100:.3f} %)")

    # Pass 2: rows sharing all feature values (label ignored); the first
    # occurrence of each feature combination is the one that is kept.
    feature_columns = deduplicated.columns.difference(['target'])
    train = deduplicated.drop_duplicates(subset=feature_columns)
    collisions_n = len(deduplicated) - len(train)
    collisions_ratio = collisions_n / train_n
    print(f"Collisions removed: {collisions_n} ({collisions_ratio * 100:.3f} %)")

    print(f"Total removed:      {duplicates_n + collisions_n} "
          f"({(duplicates_ratio + collisions_ratio) * 100:.3f} %)")

    # Split the cleaned frame back into features and label.
    y_train = train['target']
    X_train = train.drop(columns=['target'])
    print(f"\nTrain: X={X_train.shape} y={y_train.shape}")

In [None]:
# Exploratory cell: absolute correlation of every numeric column with the target.
comb_attributes = X_train.columns.tolist()
# combinations_n = 3

train = pd.concat([X_train, y_train], axis=1)

# Pairwise ratio features "<a>_per_<b>". The generator is currently switched
# off: the outer loop iterates over an empty list, so no column is added.
attributes_combinations = []
for i in []:  # range(len(comb_attributes)):
    atr1 = comb_attributes[i]
    for atr2 in comb_attributes[i + 1:]:
        ratio_name = f"{atr1}_per_{atr2}"
        attributes_combinations.append(ratio_name)
        train[ratio_name] = train[atr1] / train[atr2]

# Strongest absolute correlations first ('target' itself is part of the result).
corr_price = train.corr(numeric_only=True)["target"].abs().sort_values(ascending=False)
corr_price
# corr_price_combinations = corr_price[corr_price.index.isin(attributes_combinations)]
# for atr in corr_price_combinations.iloc[combinations_n:].index:
#     X_train.drop(atr, axis=1, inplace=True)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Learn the category vocabulary of the categorical columns from the full
# feature frame X; the resulting lists are reused when the pipeline's
# encoder is built.
cat_encoder = OneHotEncoder()
cat_encoder.fit(X[cat_attrs])
cat_encoder.categories_  # TODO

In [None]:
from imblearn.over_sampling import SMOTE, SMOTENC
from imblearn.pipeline import Pipeline as ImPipeline

from sklearn.preprocessing import OrdinalEncoder

# SMOTENC needs ONE column per categorical feature. Feeding it one-hot output
# with categorical_features=[0, 1] marks only the first two indicator columns
# as categorical and interpolates the remaining indicators as if they were
# continuous. The categorical columns are therefore ordinal-encoded first,
# resampled, and only one-hot encoded afterwards.

# ColumnTransformer emits the listed transformers first and the remainder
# last, so the encoded categorical features occupy the leading positions.
cat_positions = list(range(len(cat_attrs)))

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OrdinalEncoder(categories=cat_encoder.categories_), cat_attrs)
    ],
    remainder=StandardScaler(),
    verbose_feature_names_out=True
)
# preprocessor = preprocessor.set_output(transform='pandas')

# One-hot expansion applied after resampling. The ordinal codes are
# 0..k-1 for a feature with k known categories.
onehot = ColumnTransformer(
    transformers=[
        (
            'cat',
            OneHotEncoder(
                categories=[list(range(len(cats))) for cats in cat_encoder.categories_]
            ),
            cat_positions,
        )
    ],
    remainder='passthrough'
)

# Samplers in an imblearn pipeline only run during fit; at predict time the
# data flows preprocessor -> onehot -> classifier.
pipeline = ImPipeline(steps=[
    ('preprocessor', preprocessor),
    ('sampling', SMOTENC(categorical_features=cat_positions, random_state=42)),
    ('onehot', onehot),
    ('classifier', LogisticRegression())
])

In [None]:
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from imblearn.over_sampling import RandomOverSampler

# Search space for the pipeline's 'classifier' step: a seeded XGBoost model
# with a single candidate value per tuned hyperparameter.
xgb_search_space = {
    'classifier': [XGBClassifier(random_state=42)],
    'classifier__n_estimators': [150],  # Number of boosting rounds
    'classifier__max_depth': [7],       # Maximum depth of the trees
    # Further hyperparameters, currently not searched:
    # 'classifier__learning_rate': [0.01, 0.1, 0.2],        # Learning rate for weight updates
    # 'classifier__colsample_bytree': [0.7, 0.8, 1.0],      # Fraction of columns to be randomly sampled for each tree
    # 'classifier__gamma': [0, 0.1, 0.2],                   # Minimum loss reduction required to make a further partition on a leaf node
}

# GridSearchCV takes a list of grids; only one is defined here.
param_grid = [xgb_search_space]

In [None]:
from sklearn.model_selection import GridSearchCV

# Metrics recorded for every candidate; 'f1' picks the winner and is the
# metric the final model is refitted for.
scoring_metrics = ['f1', 'precision', 'recall']

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=scoring_metrics,
    refit='f1',
    cv=5,
    return_train_score=True,
    n_jobs=n_jobs,
    verbose=verbose,
)
# Trailing semicolon suppresses the estimator repr in the cell output.
grid_search.fit(X_train, y_train);

In [None]:
# Show the parameter combination selected by the search (ranked by 'f1', the refit metric).
grid_search.best_params_

In [None]:
# Show the pipeline refitted with the selected parameters.
grid_search.best_estimator_

In [None]:
# Show the cross-validated score of the selected candidate for the refit metric ('f1').
grid_search.best_score_

In [None]:
# Per-candidate cross-validation results as a table, best F1 rank first.
cv_results = pd.DataFrame(grid_search.cv_results_).sort_values(by='rank_test_f1')
cv_results

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Score the refitted best pipeline on the held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

f1, p, r = (
    metric(y_test, y_pred)
    for metric in (f1_score, precision_score, recall_score)
)

for caption, value in (("F1:", f1), ("Precision:", p), ("Recall:", r)):
    print(caption, value)

In [None]:
from joblib import dump

# Persist the whole fitted search object (results and refitted pipeline).
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check that could race with another process.
os.makedirs('out', exist_ok=True)
# NOTE: the dump is pickle-based; only load it back from trusted locations.
dump(grid_search, os.path.join('out', 'GridSearchCV_dump.pkl'), compress=True)