> TL;DR <br>
В коде находятся:
* Catboost для разных видов задач + optuna + feats. importance
* Xgboost для разных видов задач + optuna
* Stacking
* Voiting ensemble
* Всякая предобработка данных (Fill NaN, feats. selection, etc.)
* Другое (Кроссвалидация, подбор трешхолда, etc.)

In [16]:
# imports
import pandas as pd
import numpy as np
import catboost
import xgboost
from sklearn.preprocessing import PolynomialFeatures
from sklearn.experimental import enable_iterative_imputer
from sklearn.linear_model import LinearRegression
from sklearn.impute import IterativeImputer
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score, classification_report
from sklearn.metrics import roc_auc_score, precision_recall_curve
import optuna
import sklearn
import pickle
import catboost as cat
from utils import set_seed_no_torch

In [17]:
SEED = 11
set_seed_no_torch(SEED)

In [18]:
def load_dfs(names):
    dfs = []
    for name in names:
        with open(f'data/{name}.pickle', 'rb') as f:
            dfs.append(pickle.load(f))
    return dfs

train, test = load_dfs(['train_processed', 'test_processed'])

In [19]:
train, val = sklearn.model_selection.train_test_split(
    train,
    test_size=0.15,
    # stratify=''
)

In [20]:
X_train, y_train = train.drop(columns=['fare_amount', 'pickup_datetime']).drop(columns=[]), train['fare_amount']
X_val, y_val = val.drop(columns=['fare_amount', 'pickup_datetime']), val['fare_amount']

In [21]:
cat_train_pool = cat.Pool(
    data=X_train,
    label=y_train,
)

cat_valid_pool = cat.Pool(
    data=X_val,
    label=y_val,
)

cat_test_pool = cat.Pool(
    data=test.drop(columns=['pickup_datetime']),
)

# **Catboosts**

## Defult catboost

**Catboost params**:
* eval_metric {'F1', 'RMSE'}
* auto_class_weights {'default', 'Balanced', 'SqrtBalanced'}
* text_features {None, text_features}
* loss_function {'MultiClass', 'LogLoss', 'CrossEntropy'}

In [None]:
# classification
cat_classif = catboost.CatBoostClassifier(eval_metric='F1', iterations=1000, random_seed=42, 
                                             task_type='GPU', auto_class_weights='default')

cat_classif.fit(X_train, y_train, eval_set=(X_val, y_val),
             verbose=100, early_stopping_rounds=300)

In [22]:
# regressor
cat_reg = catboost.CatBoostRegressor(
    eval_metric='RMSE', 
    iterations=1000, 
    random_seed=SEED, 
    task_type='GPU',
)

cat_reg.fit(
    X_train, 
    y_train, 
    eval_set=(X_val, y_val),
    verbose=100, 
    early_stopping_rounds=300
)

Learning rate set to 0.118628
0:	learn: 7.8717308	test: 7.8149516	best: 7.8149516 (0)	total: 261ms	remaining: 4m 21s
100:	learn: 4.0987639	test: 4.0414625	best: 4.0414625 (100)	total: 3.34s	remaining: 29.8s
200:	learn: 3.9597662	test: 3.9494369	best: 3.9494340 (199)	total: 6.44s	remaining: 25.6s
300:	learn: 3.8804194	test: 3.9186903	best: 3.9186903 (300)	total: 9.54s	remaining: 22.2s
400:	learn: 3.8262360	test: 3.9034796	best: 3.9034796 (400)	total: 12.7s	remaining: 19s
500:	learn: 3.7905484	test: 3.8991309	best: 3.8991309 (500)	total: 16s	remaining: 15.9s
600:	learn: 3.7639580	test: 3.8933820	best: 3.8933820 (600)	total: 19.2s	remaining: 12.7s
700:	learn: 3.7365328	test: 3.8858852	best: 3.8858602 (698)	total: 22.1s	remaining: 9.44s
800:	learn: 3.7159472	test: 3.8825382	best: 3.8820889 (788)	total: 25.2s	remaining: 6.26s
900:	learn: 3.6950012	test: 3.8773762	best: 3.8773762 (900)	total: 28s	remaining: 3.08s
999:	learn: 3.6766270	test: 3.8734903	best: 3.8731503 (994)	total: 30.9s	remain

<catboost.core.CatBoostRegressor at 0x1b51716be90>

In [23]:
sub = pd.concat(
    [
        pd.read_csv('test (1).csv')['key'],
        pd.Series(cat_reg.predict(cat_test_pool)).rename('fare_amount'),
    ],
    axis=1
)
sub.to_csv('taxi_second.csv', index=False)

## Catboost Optuna

In [24]:
def objective(trial):
    param = {
        'iterations': trial.suggest_int('iterations', 500, 5000),
        "objective": trial.suggest_categorical("objective", ["Logloss"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1, log=True),
        "depth": trial.suggest_int("depth", 1, 12),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"])
        }

    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 20)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)
    if param["objective"] == "Logloss":
        param["objective"] = trial.suggest_categorical("auto_class_weights", ["default", "Balanced", "SqrtBalanced"])
        
    cat_cls = catboost.CatBoostClassifier(**param, eval_metric='F1')

    cat_cls.fit(X_train, y_train, eval_set=[(X_val, y_val)] ,verbose=0, early_stopping_rounds=500)
    
    preds = cat_cls.predict(X_test)
    f1 = f1_score(y_test, preds)
    return f1

if __name__ == "__main__":
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, timeout=7200)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[I 2024-11-12 23:41:07,386] A new study created in memory with name: no-name-9e4d7e8e-4805-4123-9bfb-ae5f8da8e562
[W 2024-11-12 23:41:07,398] Trial 0 failed with parameters: {'iterations': 1953, 'objective': 'Logloss', 'colsample_bylevel': 0.05730051197511123, 'learning_rate': 0.04230509319887437, 'depth': 6, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 15.449299899277744, 'auto_class_weights': 'SqrtBalanced'} because of the following error: CatBoostError('C:/Go_Agent/pipelines/BuildMaster/catboost.git/catboost/private/libs/options/loss_description.cpp:18: SqrtBalanced loss is not supported').
Traceback (most recent call last):
  File "C:\Users\Mi\AppData\Roaming\Python\Python311\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\Mi\AppData\Local\Temp\ipykernel_24064\4127364583.py", line 20, in objective
    cat_cls.fit(X_train, y_train, eval_set=[(X_val, y_val)] ,verbose=0, earl

CatBoostError: C:/Go_Agent/pipelines/BuildMaster/catboost.git/catboost/private/libs/options/loss_description.cpp:18: SqrtBalanced loss is not supported

In [None]:
sorted_trials = sorted(study.trials, key=lambda trial: -trial.value)
top_10_trials = sorted_trials[:50] 
top_trials_params = []
for trial in top_10_trials:
    top_trials_params.append(trial.params)
    print(f"Trial number: {trial.number}")
    print(f"Parameters: {trial.params}")
    print(f"Value: {trial.value}")
    print(f"---")

## Feature importance catboost

In [None]:
feature_importance = catboost.get_feature_importance()
feature_names = X_train.columns
names_more_treshold = []
top_feats = []
cnt = 0
# # Display feature importance
for importance, name in sorted(list(zip(feature_importance, feature_names))):
    if importance >= 0.05:
        cnt += 1
        print(f"Feature: {name}, Importance: {importance:.2f}")
        names_more_treshold.append(name)
print(cnt)

## Batch training

In [None]:
model = cat_classif

batch_size = 512

for i in range(0, len(X_train), batch_size):
    X_batch = X_train[i:i + batch_size]
    y_batch = y_train[i:i + batch_size]
    
    model.fit(X_batch, y_batch, verbose=False)

## Stratified model training

In [None]:
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
models = []

for train_index, test_index in skf.split(X_train, y_train):
    X_train, X_test = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train, y_test = y_train.iloc[train_index], y_train.iloc[test_index]

    model = catboost.CatBoostClassifier()
    model.fit(X_train, y_train)
    models.append(model)
    
    model.save_model(f'catboost_model_{len(models)}.cbm')

## Read chunks

In [None]:
destribiution = pd.DataFrame('your_file.csv',  usecols=['Target'])
n_splits = 10 # столько чтобы не полетела оперативка
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
id = [[x] for x in range(len(destribiution))]

for _, chunk_index in skf.split(id, destribiution):
    df_chunk = pd.read_csv('your_file.csv', skiprows=lambda x: x not in id)

# **Xgboost**

## Defult xgboost

In [None]:
# classification
xgb_classif = xgboost.XGBClassifier(random_state=42, tree_method = 'gpu_hist', device='CUDA')
xgb_classif.fit(X_train, y_train)

In [None]:
# regression
xgb_reg = xgboost.XGBRegressor(random_state=42, tree_method = 'gpu_hist', device='CUDA')
xgb_reg.fit(X_train, y_train)

## Optuna xgboost

In [None]:
def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 5000),
        "objective": trial.suggest_categorical("objective", ["binary:logistic"]),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.01, 1),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1, log=True),
        "max_depth": trial.suggest_int("max_depth", 1, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 100),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 100.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 100.0),
        "subsample": trial.suggest_float("subsample", 0.1, 1),
        "gamma": trial.suggest_float("gamma", 0, 10),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 1, 10),
        'max_delta_step': trial.suggest_float('max_delta_step', 0.0, 10.0),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        'device': 'cuda',  
        'tree_method': 'hist'
    }

    xgb_cls = xgb.XGBClassifier(**param, eval_metric='f1')

    xgb_cls.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0, early_stopping_rounds=500)
    
    preds = xgb_cls.predict(X_test)
    f1 = f1_score(y_test, preds)
    return f1

if __name__ == "__main__":
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, timeout=7200)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

# **Stacking**

> Код грязный, но пока так

In [25]:
class DjStacking(BaseEstimator, ClassifierMixin):
    """Стэкинг моделей scikit-learn"""

    def __init__(self, models, ens_model):
        """
        Инициализация
        models - базовые модели для стекинга
        ens_model - мета-модель
        """
        self.models = models
        self.ens_model = ens_model
        self.n = len(models)
        self.valid = None
        
    def fit(self, X, y=None, p=0.25, cv=3, err=0.001, random_state=None, dop_models=None):
        """
        Обучение стекинга
        p - в каком отношении делить на обучение / тест
            если p = 0 - используем всё обучение!
        cv  (при p=0) - сколько фолдов использовать
        err (при p=0) - величина случайной добавки к метапризнакам
        random_state - инициализация генератора
            
        """
        if (p > 0): # делим на обучение и тест
            # разбиение на обучение моделей и метамодели
            train, valid, y_train, y_valid = train_test_split(X, y, test_size=p, random_state=random_state)
            
            # заполнение матрицы для обучения метамодели
            self.valid = np.zeros((valid.shape[0], self.n))
            for t, clf in enumerate(self.models):
                clf.fit(train, y_train)
                self.valid[:, t] = clf.predict_proba(valid)[:, 1]
                
            # обучение метамодели
            self.ens_model.fit(self.valid, y_valid)
            print(f'F1: {round(f1_score(self.ens_model.predict(self.valid), y_valid), 3)}')

        else: # используем всё обучение
        
            # для регуляризации - берём случайные добавки
            self.valid = err*np.random.randn(X.shape[0], self.n)
            
            for t, clf in enumerate(self.models):
                # это oob-ответы алгоритмов
                self.valid[:, t] += cross_val_predict(clf, X, y, cv=cv, method='predict_proba')[:, 1]
                # но сам алгоритм надо настроить
                clf.fit(X, y)
            
            # обучение метамодели
            self.ens_model.fit(self.valid, y)
            print(f'F1: {round(f1_score(self.ens_model.predict(self.valid), y), 3)}')
        

        return self
    

    def predict(self, X, y=None):
        """
        Работа стэкинга
        """
        # заполение матрицы для мета-классификатора
        X_meta = np.zeros((X.shape[0], self.n))
        
        for t, clf in enumerate(self.models):
            X_meta[:, t] = clf.predict(X)
        
        a = self.ens_model.predict(X_meta)
        
        return (a)
    
    def predict_proba(self, X, y=None):
        """
        Работа стэкинга
        """
        # заполение матрицы для мета-классификатора
        X_meta = np.zeros((X.shape[0], self.n))
        
        for t, clf in enumerate(self.models):
            X_meta[:, t] = clf.predict_proba(X)[:, 1]

        a = self.ens_model.predict_proba(X_meta)
        
        return (a)
    
    def fit_ens_model(self, X, y=None, cv=3, err=0.001):
        self.valid = err*np.random.randn(X.shape[0], self.n)
        
        for t, clf in enumerate(self.models):
            self.valid[:, t] += cross_val_predict(clf, X, y, cv=cv, n_jobs=-1, method='predict_proba')[:, 1]

        # Полиномиальные признаки до второй степени

        self.ens_model.fit(self.valid, y)
        
        return self

In [None]:
# базовые модели для стекинга

'''
gbm1 = lgb.LGBMClassifier(random_state=54, device="gpu", learning_rate=0.143)

gbm2 = lgb.LGBMClassifier(random_state=8743, device="gpu", learning_rate=0.1)    

gbm3 = lgb.LGBMClassifier(random_state=2367, device="gpu", learning_rate=0.3)

xgb1 = XGBClassifier(random_state=13, tree_method = 'gpu_hist', device='CUDA', learning_rate=0.15)

xgb2 = XGBClassifier(random_state=74, tree_method = 'gpu_hist', device='CUDA', learning_rate=0.1)

xgb3 = XGBClassifier(random_state=788, tree_method = 'gpu_hist', device='CUDA', learning_rate=0.19)
'''
cat1 = catboost.CatBoostRegressor(random_seed=42, verbose=200, eval_metric='RMSE', task_type="GPU")

cat2 = catboost.CatBoostRegressor(random_seed=472, verbose=200, eval_metric='RMSE', task_type="GPU")

cat3 = catboost.CatBoostRegressor(random_seed=12, verbose=200, eval_metric='RMSE', task_type="GPU")

cat4 = catboost.CatBoostRegressor(random_seed=125, verbose=200, eval_metric='RMSE', task_type="GPU")

cat5 = catboost.CatBoostRegressor(random_seed=132, verbose=200, eval_metric='RMSE', task_type="GPU")

In [None]:
#models = [gbm1, gbm2, gbm3, xgb1, xgb2, xgb3, cat1, cat2, cat3]
models = [cat1, cat2, cat3, cat4, cat5]
ens_model = catboost.CatBoostClassifier(verbose=200, task_type="GPU", random_seed=62)

s2 = DjStacking(models, ens_model)
s2.fit(X_train, y_train, p=-1, cv=5, random_state=42)
#print(f'F1: {round(f1_score(y_test, preds), 3)}')

# **Cross_val emsemble**

In [None]:
# top_trials_params - лучшее из optuna
top_trials_params = []

In [None]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
data = X_train
metrics_stratified = []
cv_models = []
indx = 0
for train_index, test_index in skf.split(data, y_train):
    x_train_fold, x_test_fold = data.iloc[train_index, :], data.iloc[test_index, :]
    y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]
    cat_cv = catboost.CatBoostClassifier(**top_trials_params[indx], eval_metric='F1')
    cat_cv.fit(x_train_fold, y_train_fold,
             verbose=100, early_stopping_rounds=500)
    pred = cat_cv.predict(x_test_fold)
    metrics_stratified.append((f1_score(pred, y_test_fold).round(3), roc_auc_score(pred, y_test_fold).round(3)))
    cv_models.append(cat_cv)
    indx += 1

print('\n'.join(map(str, metrics_stratified)))

In [None]:
# Для большего  кол-ва моделей, для которых фолдов уже не хватает
# Здесь несколько моделей обучаются на одинаковых фолдах
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
data = X_train
metrics_stratified = []
cv_models = []
indx = 0
for train_index, test_index in skf.split(data, y_train):
    x_train_fold, x_test_fold = data.iloc[train_index, :], data.iloc[test_index, :]
    y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]

    catboost1 = catboost.CatBoostClassifier(**top_trials_params[indx], eval_metric='F1')
    catboost2 = catboost.CatBoostClassifier(**top_trials_params[indx + 1], eval_metric='F1')
    catboost3 = catboost.CatBoostClassifier(**top_trials_params[indx + 2], eval_metric='F1')

    catboost1.fit(x_train_fold, y_train_fold, verbose=300, early_stopping_rounds=500),
    catboost2.fit(x_train_fold, y_train_fold, verbose=300, early_stopping_rounds=500)
    catboost3.fit(x_train_fold, y_train_fold, verbose=300, early_stopping_rounds=500)

    pred1, pred2, pred3 = catboost1.predict(x_test_fold), catboost2.predict(x_test_fold), catboost3.predict(x_test_fold)
    metrics_stratified.append((f1_score(pred1, y_test_fold).round(3), roc_auc_score(pred1, y_test_fold).round(3)))
    metrics_stratified.append((f1_score(pred2, y_test_fold).round(3), roc_auc_score(pred2, y_test_fold).round(3)))
    metrics_stratified.append((f1_score(pred3, y_test_fold).round(3), roc_auc_score(pred3, y_test_fold).round(3)))
    cv_models.append(catboost1)
    cv_models.append(catboost2)
    cv_models.append(catboost3)
    indx += 3

print('\n'.join(map(str, metrics_stratified)))

In [None]:
mean_auc = 0
mean_f1 = 0
for metric in metrics_stratified:
    mean_auc += metric[1]
    mean_f1 += metric[0]

print('ROC_AUC:', (mean_auc / len(metrics_stratified)).round(3))
print('F1:', (mean_f1 / len(metrics_stratified)).round(3))

In [None]:
# Cross Val predict
preds = []
for model in cv_models:
    pred = model.predict_proba(X_test)[:,1]
    preds.append(pred)

arr_np = np.array(preds)
mean_arr = np.mean(arr_np, axis=0)

pred = (mean_arr >= 0.5).astype(int)

# **Data preprocess**

## Fill NaN

In [None]:
def fillna_mice(trainX):
    scaler = StandardScaler()
    trainX_mice = trainX.copy()
    trainX_mice = pd.DataFrame(scaler.fit_transform(trainX_mice), columns = trainX.columns)
    mice_imputer = IterativeImputer(initial_strategy = 'mean',
                                    estimator = LinearRegression(n_jobs=-1),
                                    random_state = 42, verbose=2, max_iter=10)

    mice = mice_imputer.fit_transform(trainX_mice)
    return pd.DataFrame(scaler.inverse_transform(mice), columns = trainX.columns), mice_imputer, scaler


# new dataframe
trainX_mice, mice_imputer, scaler = fillna_mice(X_train)
trainX_mice.head()

In [None]:
# Simple Imputer
imputer = SimpleImputer(strategy='mean')
X_simple = imputer.fit_transform(X_train)
X_simple = pd.DataFrame(X_simple, columns = X_train.columns)

## Feature engineering

In [None]:
poly_features = X_train[top_feats] # Предварительно отобрать топ
poly_transformer = PolynomialFeatures(degree = 3)

poly_transformer.fit(poly_features)
poly_features = poly_transformer.transform(poly_features)
print('Polynomial Features shape: ', poly_features.shape)

## Feature selection

In [None]:
# статичные фичи
sel = VarianceThreshold(threshold=0)
columns_df = X_train.columns
sel.fit(X_train)
get_sup_col = sel.get_support()

# мультикор.
df_transformed = sel.transform(X_train)
df_transformed = pd.DataFrame(df_transformed, columns=columns_df[get_sup_col])
corr_matrix = df_transformed.corr()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
df_transformed = df_transformed.drop(to_drop, axis=1)

## Class Disbalance

> Если метрика F1, то лучше будет просто использовать class_weights + подбор трешхолда

In [None]:
# Создание экземпляра RandomOverSampler
ros = RandomOverSampler()

# Применение увеличения выборки к данным
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [None]:
# Создание экземпляра RandomUnderSampler
rus = RandomUnderSampler()

# Применение уменьшения выборки к данным
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

In [None]:
# Создание экземпляра SMOTE
smote = SMOTE()

# Применение SMOTE к данным
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# **Other**

## Treshold

In [None]:
pred = cat_reg.predict_proba(X_val)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_val, pred)
fscore = (2 * precision * recall) / (precision + recall)
best_threshold = thresholds[np.argmax(fscore)]
print(f"Best threshold: {best_threshold}")

## Validation model

In [None]:
model = catboost
pred = model.predict(X_val)
# pred = [1 if x >= 0.5 else 0 for x in pred]
print(classification_report(pred, y_val))
print(f'F1_score: {f1_score(pred, y_val).round(3)}')
print(f'Roc_auc: {roc_auc_score(pred, y_val).round(3)}')

## Stratified cross-val

In [None]:
model = cat_classif
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = cross_val_score(model, X_train, y_train, cv=skf, scoring='f1_macro')

# Выводим результаты
print("Стратифицированная кросс-валидация F1-метрика (macro):", cv_scores)
print("Среднее значение F1:", np.mean(cv_scores))
print("Стандартное отклонение F1:", np.std(cv_scores))