In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, recall_score, precision_score
from sklearn.model_selection import GridSearchCV
import joblib

In [None]:
def prepare_dataset(filepath, scaler_filepath, pca_filepath):
  """Load the dataset, split it, and pass both splits through the saved scaler and PCA.

  Args:
    filepath: Path to a parquet file that contains a 'target' column.
    scaler_filepath: Path to a joblib file holding the scaler object.
    pca_filepath: Path to a joblib file holding the PCA object.

  Returns:
    Tuple (X_train_pca, X_test_pca, y_train, y_test) obtained from a stratified
    70/30 split with random_state=42. The scaler and PCA are only applied with
    .transform here; they are never fitted in this function.
  """
  frame = pd.read_parquet(filepath)
  features = frame.drop(['target'], axis=1)
  labels = frame['target']

  X_train, X_test, y_train, y_test = train_test_split(
      features, labels, test_size=0.3, stratify=labels, random_state=42)

  scaler = joblib.load(scaler_filepath)
  pca = joblib.load(pca_filepath)

  def _project(part):
    # Scale first, then reduce dimensionality with the loaded PCA.
    return pca.transform(scaler.transform(part))

  train_projected = _project(X_train)
  test_projected = _project(X_test)
  return train_projected, test_projected, y_train, y_test

In [None]:
def get_metrics(y_test, y_pred, y_score=None):
  """Print ROC AUC, F1, recall and precision for a set of predictions.

  Args:
    y_test: True labels.
    y_pred: Predicted hard class labels; used for F1, recall and precision.
    y_score: Optional continuous scores (e.g. positive-class probabilities).
      When given, ROC AUC is computed from these scores. When omitted, ROC AUC
      is computed from y_pred, exactly as before, which evaluates only the
      single operating point defined by the thresholded labels.

  Returns:
    None. The four metrics are printed.
  """
  # Prefer continuous scores for ROC AUC; fall back to the hard labels so that
  # existing two-argument calls keep their behaviour.
  auc_input = y_pred if y_score is None else y_score
  roc_auc = roc_auc_score(y_test, auc_input)
  f1 = f1_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)

  print(f"ROC AUC: {roc_auc}")
  print(f"F1 Score: {f1}")
  print(f"Recall: {recall}")
  print(f"Precision: {precision}")

In [None]:
def make_test(model, X_train, y_train, X_test, y_test):
  """Fit `model` on the training split and print its metrics on the test split.

  Args:
    model: Estimator exposing fit(X, y) and predict(X).
    X_train, y_train: Data the model is fitted on.
    X_test, y_test: Data the fitted model is scored on via get_metrics.
  """
  model.fit(X_train, y_train)
  test_labels = model.predict(X_test)
  get_metrics(y_test, test_labels)

In [None]:
# Paths to the dataset and to the saved scaler and PCA files.
X_train, X_test, y_train, y_test = prepare_dataset(
    filepath='../data/balanced_data_80_20_new.parquet',
    scaler_filepath='scaler_80_20_new.joblib',
    pca_filepath='pca_80_20_new.joblib',
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Hyperparameter grid explored for the KNN classifier.
knn_parameters = dict(
    n_neighbors=[2, 3, 5, 7, 10, 15],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    p=[1, 2],
)

# 5-fold cross-validated search, ranked by ROC AUC.
knn_model = KNeighborsClassifier()
grid_search_knn = GridSearchCV(
    knn_model,
    knn_parameters,
    scoring='roc_auc',
    cv=5,
)
grid_search_knn.fit(X_train, y_train)

# Best configuration found by the search.
best_knn_model = grid_search_knn.best_estimator_
best_params_knn = grid_search_knn.best_params_
print(f'Best Hyperparameters for KNN: {best_params_knn}')
print(f'ROC AUC for KNN: {grid_search_knn.best_score_}')


In [None]:
# If an import fails with "No module named ...", install the missing package first.
%pip install catboost

In [None]:
from catboost import CatBoostClassifier #тут загружаем нужную нам модель

In [None]:

# Parameter grid: each key is a hyperparameter to vary, each value is the list
# of candidate values to try.
parameters = dict(
    learning_rate=[0.01, 0.05, 0.1],
    depth=[3, 5, 7, 10],
    l2_leaf_reg=[5, 7, 9, 20, 40],
    min_data_in_leaf=[1, 3, 5, 10],
)

# Replace with the model under test; always keep random_state=42.
model = CatBoostClassifier(
    iterations=200,
    loss_function='Logloss',
    early_stopping_rounds=50,
    random_state=42,
    silent=True,
)
# 3-fold cross-validated search, ranked by ROC AUC.
grid_search_clas = GridSearchCV(model, parameters, scoring='roc_auc', cv=3, verbose=3)
grid_search_clas.fit(X_train, y_train)

# Best parameters found by the search.
best_params_clas = grid_search_clas.best_params_
print(f'Best Hyperparameters: {best_params_clas}')

In [None]:
# `best_log_model` is not assigned in any visible cell, so on a fresh kernel this
# cell failed with NameError. Bind it to the refit winner of the CatBoost grid search.
best_log_model = grid_search_clas.best_estimator_

# Refit on the training split and print the test metrics.
make_test(best_log_model, X_train, y_train, X_test, y_test)

# Save the model; the argument is the output file name. Saving after every run is recommended.
best_log_model.save_model('best_catboost_model')
# Alternative way to persist the model (uncomment to use):
#joblib.dump(best_log_model, 'best_catboost.joblib')

ROC AUC: 0.6520465142023415
F1 Score: 0.6425272492148532
Recall: 0.6259899208063355
Precision: 0.6599620493358633


In [None]:
# Carve a validation split out of the training data.
# Stratified on the labels, like the train/test split in prepare_dataset, so the
# class ratio is the same in the training and validation parts.
# NOTE: this rebinds X_train / y_train, so the cell is not idempotent; re-running
# it without re-running the data preparation cell shrinks the training set again.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

In [None]:
# CatBoost model with fixed hyperparameters and a large iteration budget;
# the validation split is passed as eval_set and early_stopping_rounds is 100.
model = CatBoostClassifier(
    loss_function='Logloss',
    iterations=10000,
    learning_rate=0.01,
    depth=5,
    l2_leaf_reg=7,
    min_data_in_leaf=3,
    early_stopping_rounds=100,
    random_state=42,
)
model.fit(X_train, y_train, eval_set=(X_valid, y_valid), verbose=True)

In [None]:
# Keep only the second column of predict_proba (index 1) for every test row.
pred = model.predict_proba(X_test)[:, 1]

# Hard labels at a 0.1 cut-off.
pred_binary = pred >= 0.1

In [None]:
# Metrics of the 0.1-threshold labels on the test split.
get_metrics(y_test=y_test, y_pred=pred_binary)

ROC AUC: 0.5701493669043178
F1 Score: 0.3660098865139595
Recall: 0.9453335730983636
Precision: 0.22693718972587956


In [None]:
# Pick the decision threshold that maximises F1.
# Two fixes over the previous version of this cell:
#   1. it called roc_auc_score while storing and printing the result as "F1";
#      the search now really uses f1_score;
#   2. it searched on the test split, which makes the reported test metrics
#      optimistic; the search now runs on the validation split and the test
#      split is only used for the final report.
valid_pred = model.predict_proba(X_valid)[:, 1]

best_threshold = 0.5
best_f1_score = 0.0

for threshold in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    f1 = f1_score(y_valid, valid_pred >= threshold)
    # Strict comparison: on ties the lowest threshold seen first is kept.
    if f1 > best_f1_score:
        best_f1_score = f1
        best_threshold = threshold

# The F1 printed here is the validation-split score of the chosen threshold.
print(f"Best F1 Score: {best_f1_score} at threshold {best_threshold}")

# Final report on the held-out test split at the chosen threshold.
pred_binary = (pred >= best_threshold)
get_metrics(y_test, pred_binary)

Best F1 Score: 0.6540708419976339 at threshold 0.2
ROC AUC: 0.6540708419976339
F1 Score: 0.4329070433288795
Recall: 0.6117604747347599
Precision: 0.33497439936983064


In [None]:
# Write the fitted model to a file; the argument is the output file name.
model.save_model('best_catboost_model_80_20_new_more_iter')