<a href="https://colab.research.google.com/github/FedyaBadyilo/predictive-assessment-of-the-occurrence-of-forest-fires/blob/main/experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# data import

In [1]:
pip install rasterio

Collecting rasterio
  Downloading rasterio-1.3.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Collecting affine (from rasterio)
  Downloading affine-2.4.0-py3-none-any.whl.metadata (4.0 kB)
Collecting snuggs>=1.4.1 (from rasterio)
  Downloading snuggs-1.4.7-py3-none-any.whl.metadata (3.4 kB)
Downloading rasterio-1.3.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.7/21.7 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading snuggs-1.4.7-py3-none-any.whl (5.4 kB)
Downloading affine-2.4.0-py3-none-any.whl (15 kB)
Installing collected packages: snuggs, affine, rasterio
Successfully installed affine-2.4.0 rasterio-1.3.11 snuggs-1.4.7


In [2]:
import rasterio
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import pandas as pd
import os
import json

import warnings
warnings.filterwarnings('ignore')
from warnings import simplefilter
simplefilter("ignore", category=RuntimeWarning)

кастомные функции для работы со снимками

In [3]:
def print_geotiff_info(path):
    try:
        # Открываем файл
        with rasterio.open(path) as src:
            # Основные метаданные
            print(f"File Path: {path}")
            print(f"Driver: {src.driver}")
            print(f"Width: {src.width}")
            print(f"Height: {src.height}")
            print(f"Count (Bands): {src.count}")
            print(f"CRS: {src.crs}")
            print(f"Transform: {src.transform}")
            print(f"Bounding Box: {src.bounds}")
            print(f"Datum: {src.dtypes}")

            # Информация по каждому каналу
            for i in range(1, src.count + 1):
                band = src.read(i)
                print(f"\nBand {i}:")
                print(f"  Data Type: {src.dtypes[i - 1]}")
                print(f"  Min Value: {band.min()}")
                print(f"  Max Value: {band.max()}")
                print(f"  Mean Value: {band.mean()}")
                print(f"  Standard Deviation: {band.std()}")

    except Exception as e:
        print(f'Error: {e}')



# In[5]:

def visualize_rgb_geotiff(file_path, r_band, g_band, b_band, ik_band, mask_band):
    try:
        with rasterio.open(file_path) as src:
            num_bands = src.count
            print(f"Number of bands: {num_bands}")
            red = src.read(r_band)  # B02 - Blue
            green = src.read(g_band)  # B03 - Green
            blue = src.read(b_band)  # B04 - Red
            ik = src.read(ik_band)
            mask = src.read(mask_band)

            photos = [
              np.stack([red, green, blue], axis=-1),
              np.stack([ik], axis=-1), # Отрисовка ИК-слоя изображения
              np.stack([mask], axis=-1)  # Отрисовка маски изображения
            ]

            for i in range(3):
                fig, ax = plt.subplots(figsize=(10, 10))
                if i == 0:
                    p2 = np.percentile(photos[i], 1)    # Lower cutoff
                    p98 = np.percentile(photos[i], 99)  # Upper cutoff

                    photos[i] = np.clip(photos[i], p2, p98)
                    photos[i] = ((photos[i] - p2) / (p98 - p2)) * 255

                photos[i] = photos[i].astype(np.uint8)
                ax.imshow(photos[i])
                ax.axis('off')
                plt.show()

    except Exception as e:
        print(f'Ошибка: {e}')



def get_mask(path, r_band, g_band, b_band, ik_band, mask_band, task='mask'):
    try:

      # Открываем файл
        with rasterio.open(path) as src:
            red = src.read(r_band)  # B02 - Blue
            green = src.read(g_band)  # B03 - Green
            blue = src.read(b_band)  # B04 - Red
            ik = src.read(ik_band)
            mask = src.read(mask_band)
            if task == 'mask':
              return mask
            elif task == 'ir':
              return ik
            else:
              return np.stack([red, green, blue], axis=-1)
            # Основные метаданные
            print(f"File Path: {path}")
            print(f"Driver: {src.driver}")
            print(f"Width: {src.width}")
            print(f"Height: {src.height}")
            print(f"Count (Bands): {src.count}")
            print(f"CRS: {src.crs}")
            print(f"Transform: {src.transform}")
            print(f"Bounding Box: {src.bounds}")
            print(f"Datum: {src.dtypes}")

            # Информация по каждому каналу
            for i in range(1, src.count + 1):
                band = src.read(i)
                print(f"\nBand {i}:")
                print(f"  Data Type: {src.dtypes[i - 1]}")
                print(f"  Min Value: {band.min()}")
                print(f"  Max Value: {band.max()}")
                print(f"  Mean Value: {band.mean()}")
                print(f"  Standard Deviation: {band.std()}")

    except Exception as e:
        print(f'Error: {e}')

# набор данных - реестр пожаров

In [96]:
root_dir = '/content/drive/MyDrive/Colab_Notebooks/forest_fires_hack/minprirody_train/'
general_info_path = '/content/drive/MyDrive/Colab_Notebooks/forest_fires_hack/minprirody_train/Реестр пожаров 2015-2021.xls'

In [97]:
df_gen = pd.read_excel(general_info_path, skiprows=7)

In [98]:
new_cols = ['date', 'time', 'name', 'azimuth', 'coordinates', 'wind_speed', 'curr_fire_area', 'overall_fire_area', 'forest_fire_area', 'vegetation_fire_area', 'cause']

In [99]:
df_gen.rename(columns={old_col: new_col for old_col, new_col in zip(df_gen.columns, new_cols)}, inplace=True)

In [100]:
df_gen.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1971 entries, 0 to 1970
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   date                  1971 non-null   object 
 1   time                  1971 non-null   object 
 2   name                  1967 non-null   object 
 3   azimuth               1966 non-null   float64
 4   coordinates           1971 non-null   object 
 5   wind_speed            1971 non-null   object 
 6   curr_fire_area        1902 non-null   float64
 7   overall_fire_area     1971 non-null   float64
 8   forest_fire_area      1971 non-null   float64
 9   vegetation_fire_area  1971 non-null   float64
 10  cause                 1971 non-null   float64
 11  Unnamed: 11           1971 non-null   object 
dtypes: float64(6), object(6)
memory usage: 184.9+ KB


# набор данных - все снимки из тренировочного набора данных

сопоставим все точки с их координатами из собранного набора pixel_coordinate_mappings.json

In [101]:
coordinates_path = '/content/drive/MyDrive/Colab_Notebooks/forest_fires_hack/pixel_coordinate_mappings.json'

In [102]:
with open(coordinates_path) as f:
    json_file = json.load(f)

функция, которая поможет в будущем преобразовывать исторические данные

In [103]:
def min_max_avg(arr):
  if not np.any(arr):
    return -666, -666, -666
  return min(arr), max(arr), np.mean(arr)

функция для сбора датасета исторических данных, снимков и их координат

In [104]:
def set_dataset(number):
  if number > 9:
    curr_number = f'{number}'
  else:
    curr_number = f'0{number}'

  curr_dir = root_dir + curr_number
  curr_file = f'/{"".join(filter(lambda x: False if "csv" in x else True, os.listdir(curr_dir)))}'
  date =  curr_file.split('.tiff')[0][1:]
  number_date = curr_number + curr_file.split('.tiff')[0]
  file_name = curr_dir + curr_file

  rgb = get_mask(file_name, 1, 2, 3, 4, 5, task = 'img') # достаем значения каналов
  mask = get_mask(file_name, 1, 2, 3, 4, 5, task = 'mask')
  ir = get_mask(file_name, 1, 2, 3, 4, 5, task = 'ir')
  curr_lat, curr_lon = np.array(json_file[0][number_date + '.tiff']).reshape(-1, 2).T # получаем координаты

  data_dct = {'red': rgb[:, :, 0].flatten(), # добавляем значеня каналов и координаты в словарь
                      'green': rgb[:, :, 1].flatten(),
                      'blue': rgb[:, :, 2].flatten(),
                      'info_red': ir.flatten(),
                      'mask': mask.flatten(),
                      'latitude': curr_lat,
                      'longitude': curr_lon
                      }

  hist_data_curr_file = f'/{"".join(filter(lambda x: True if "csv" in x else False, os.listdir(curr_dir)))}' # импортируем исторические данные в датасет
  his_data_file = curr_dir + hist_data_curr_file
  hist_data = pd.read_csv(his_data_file)

  new_hist_cols = ['time', 't_avg', 't_min', 't_max', 'total_precipitation', 'wind_dir', 'wind_speed_hist', 'wind_gust', 'sea_level_pressure'] # заменяем старые названия колонок на новые
  hist_data.rename(columns={old_col: new_col for old_col, new_col in zip(hist_data.columns, new_hist_cols)}, inplace=True)

  df = pd.DataFrame(data_dct)
  df['date'] = [number_date] * len(df) # добавляем в новый датасет название папки в качестве элемента группировки

  for col in new_hist_cols[1:]:
    mi, ma, av = min_max_avg(hist_data[col]) # получаем минимальное, максимальное и средние значения каждого из исторических признаков и добавляем в новый датасет
    df[f'{col}_min'] = [mi] * len(df)
    df[f'{col}_max'] = [ma] * len(df)
    df[f'{col}_avg'] = [av] * len(df)

  hist_data['time'] = pd.to_datetime(hist_data['time'])
  specific_date = pd.to_datetime(date)
  end_date = specific_date
  last_month = end_date - pd.DateOffset(months=1)
  last_last_month = end_date - pd.DateOffset(months=2)

  filtered_df = hist_data[(hist_data['time'] >= last_month) & (hist_data['time'] < end_date)] # отбираем признаки только за последний месяц до пожара и проводим ту же операцию
  for col in new_hist_cols[1:]:
    mi, ma, av = min_max_avg(filtered_df[col])
    df[f'{col}_min_last_month'] = [mi] * len(df)
    df[f'{col}_max_last_month'] = [ma] * len(df)
    df[f'{col}_avg_last_month'] = [av] * len(df)


  FEATURE_INGENEERING = True
  # немного инженерии признаков
  if FEATURE_INGENEERING:
    # 1. Температурный градиент
    df['t_gradient'] = np.mean(hist_data['t_max'] - hist_data['t_min']) * len(df)
    # 2. Среднедневная температура
    df['t_daily_avg'] = np.mean((hist_data['t_min'] + hist_data['t_max']) / 2) * len(df)
    # 3. Усредненные параметры ветра
    df['v_avg'] = np.mean((hist_data['wind_speed_hist'] + hist_data['wind_gust']) / 2) * len(df)
    # 4. Эффективная скорость ветра
    df['v_eff'] = np.mean(np.sqrt(hist_data['wind_speed_hist']**2 + hist_data['wind_gust']**2)) * len(df)
    # 6. Вертикальный градиент давления (разница между текущим и предыдущим значением)
    df['p_gradient'] = np.mean(hist_data['sea_level_pressure'].diff() * len(df))
    # 7. Интенсивность осадков
    df['rain_intensity'] = np.mean(hist_data['total_precipitation'] / (len(hist_data) * 24)) * len(df) #
    # 8. Вектор ветра
    df['u'] = np.mean(hist_data['wind_speed_hist'] * np.cos(np.radians(hist_data['wind_dir']))) * len(df)
    df['v'] = np.mean(hist_data['wind_speed_hist'] * np.sin(np.radians(hist_data['wind_dir']))) * len(df)


  # filtered_df_2 = hist_data[(hist_data['time'] >= last_last_month) & (hist_data['time'] < last_month)]
  # for col in new_hist_cols[1:]:
  #   mi, ma, av = min_max_avg(filtered_df_2[col])
  #   df[f'{col}_min_last_last_month'] = [mi] * len(df)
  #   df[f'{col}_max_last_last_month'] = [ma] * len(df)
  #   df[f'{col}_avg_last_last_month'] = [av] * len(df)

  print(number_date, len(df.columns))

  cols = list(set(df.columns) - set(['date']))
  df[cols] = df[cols].astype('float16')
  return df

обхединение всех данных из папок 0-20 в один датасет

In [105]:
try:
  del train_data
except:
  pass

for ind in range(1, 21): # итерируемся по номерам папок
  try:
    if ind > 1:
      train_data = pd.concat([train_data, set_dataset(ind)], ignore_index = True, axis = 0)
    else:
      train_data = set_dataset(ind)
  except:
    continue

train_data.replace([np.inf, -np.inf], -66666, inplace=True)
train_data.fillna(-666, inplace = True);

train_data

01/2021-05-26 64
02/2021-06-03 64
03/2021-06-03 64
04/2021-06-06 64
05/2021-06-03 64
06/2021-06-03 64
07/2021-06-03 64
08/2021-06-08 64
09/2021-06-16 64
10/2021-06-16 64
11/2018-06-01 64
12/2019-05-10 64
13/2019-05-17 64
16/2020-05-09 64
18/2021-05-02 64
19/2021-05-04 64
20/2021-05-15 64


Unnamed: 0,red,green,blue,info_red,mask,latitude,longitude,date,t_avg_min,t_avg_max,...,sea_level_pressure_max_last_month,sea_level_pressure_avg_last_month,t_gradient,t_daily_avg,v_avg,v_eff,p_gradient,rain_intensity,u,v
0,12.0,14.0,17.0,29.0,0.0,75.1250,55.78125,01/2021-05-26,-7.300781,20.500000,...,1024.0,1016.5,-66666.0,-66666.0,-666.0,-666.0,2788.0,13.390625,-66666.0,-66666.0
1,11.0,14.0,14.0,27.0,0.0,75.1250,55.78125,01/2021-05-26,-7.300781,20.500000,...,1024.0,1016.5,-66666.0,-66666.0,-666.0,-666.0,2788.0,13.390625,-66666.0,-66666.0
2,11.0,13.0,14.0,27.0,0.0,75.1250,55.78125,01/2021-05-26,-7.300781,20.500000,...,1024.0,1016.5,-66666.0,-66666.0,-666.0,-666.0,2788.0,13.390625,-66666.0,-66666.0
3,12.0,14.0,14.0,29.0,0.0,75.1250,55.78125,01/2021-05-26,-7.300781,20.500000,...,1024.0,1016.5,-66666.0,-66666.0,-666.0,-666.0,2788.0,13.390625,-66666.0,-66666.0
4,11.0,14.0,13.0,28.0,0.0,75.1250,55.78125,01/2021-05-26,-7.300781,20.500000,...,1024.0,1016.5,-66666.0,-66666.0,-666.0,-666.0,2788.0,13.390625,-66666.0,-66666.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3900917,6.0,8.0,13.0,22.0,1.0,71.1875,55.12500,20/2021-05-15,-24.796875,7.601562,...,1024.0,1020.0,-66666.0,-66666.0,-666.0,-666.0,-11096.0,129.375000,-66666.0,-66666.0
3900918,6.0,8.0,12.0,22.0,1.0,71.1875,55.12500,20/2021-05-15,-24.796875,7.601562,...,1024.0,1020.0,-66666.0,-66666.0,-666.0,-666.0,-11096.0,129.375000,-66666.0,-66666.0
3900919,5.0,8.0,13.0,21.0,1.0,71.1875,55.12500,20/2021-05-15,-24.796875,7.601562,...,1024.0,1020.0,-66666.0,-66666.0,-666.0,-666.0,-11096.0,129.375000,-66666.0,-66666.0
3900920,5.0,8.0,13.0,22.0,1.0,71.1875,55.12500,20/2021-05-15,-24.796875,7.601562,...,1024.0,1020.0,-66666.0,-66666.0,-666.0,-666.0,-11096.0,129.375000,-66666.0,-66666.0


In [95]:
# train_data.to_csv('/content/drive/MyDrive/Colab_Notebooks/forest_fires_hack/train_data')

In [27]:
# import requests

# url = "https://weatherapi-com.p.rapidapi.com/current.json"

# querystring = {"q":f"{lat}, {lon}"}

# headers = {
# 	"x-rapidapi-key": "2cb2c64b1cmshfc30b41f0cfc97ap170310jsn734cc4d88d58",
# 	"x-rapidapi-host": "weatherapi-com.p.rapidapi.com"
# }

# response = requests.get(url, headers=headers, params=querystring)

# print(response.json())

{'location': {'name': 'Boston', 'region': 'Lincolnshire', 'country': 'United Kingdom', 'lat': 53.1, 'lon': -0.13, 'tz_id': 'Europe/London', 'localtime_epoch': 1725742119, 'localtime': '2024-09-07 21:48'}, 'current': {'last_updated_epoch': 1725741900, 'last_updated': '2024-09-07 21:45', 'temp_c': 18.2, 'temp_f': 64.8, 'is_day': 0, 'condition': {'text': 'Mist', 'icon': '//cdn.weatherapi.com/weather/64x64/night/143.png', 'code': 1030}, 'wind_mph': 5.6, 'wind_kph': 9.0, 'wind_degree': 50, 'wind_dir': 'NE', 'pressure_mb': 1008.0, 'pressure_in': 29.77, 'precip_mm': 0.0, 'precip_in': 0.0, 'humidity': 88, 'cloud': 50, 'feelslike_c': 18.2, 'feelslike_f': 64.8, 'windchill_c': 16.2, 'windchill_f': 61.1, 'heatindex_c': 16.2, 'heatindex_f': 61.1, 'dewpoint_c': 15.2, 'dewpoint_f': 59.4, 'vis_km': 6.0, 'vis_miles': 3.0, 'uv': 1.0, 'gust_mph': 12.8, 'gust_kph': 20.6}}


In [106]:
num_cols = list(filter(lambda x: False if x == 'date' or x == 'mask' else True, train_data.columns))
target_col = 'mask'
date_col = 'date'
groups = 'date'

# ml approach

использование различных ml моделей для предсказаний

In [None]:
pip install catboost; pip install optuna; pip install dask[dataframe]; pip install shap

In [109]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import StratifiedGroupKFold, cross_val_score, train_test_split, cross_validate
from sklearn.metrics import f1_score, matthews_corrcoef
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.pipeline import make_pipeline

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier, Pool

# import optuna

In [None]:
# def custom_data_loader(X, y):
#   data = pd.concat([X, y], axis = 1)
#   unique_dates = data['date'].unique()

#   batches = []
#   for i in range(len(unique_dates)):
#     batches.append((data[data['date'] == unique_dates[i]].drop([target_col], axis = 1).values, data[data['date'] == unique_dates[i]][target_col].values.ravel()))

#   return batches

# train_dl = custom_data_loader(X_train, y_train)

In [None]:
# for x_batch, y_batch in train_dl:
#   print(x_batch.shape, y_batch.shape)


In [107]:
import gc

gc.collect()

0

разбиение данных на тренировочный, валидационный и тестовый датасеты

In [110]:
X_train, X_test, y_train, y_test = train_test_split(train_data[num_cols], train_data[target_col], random_state = 42, test_size = 0.4)
# X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size = .5)

первые результаты

In [116]:
for model in [
              LGBMClassifier(),
              XGBClassifier()]:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    print(matthews_corrcoef(y_val, y_pred))

[LightGBM] [Info] Number of positive: 100577, number of negative: 2239976
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.379917 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1215
[LightGBM] [Info] Number of data points in the train set: 2340553, number of used features: 53
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.042971 -> initscore=-3.103297
[LightGBM] [Info] Start training from score -3.103297
0.3140033093713797
0.26045436538302114


значимость признаков для XGBClassifier

In [93]:
sorted(list(zip(num_cols, model.feature_importances_)), key = lambda x: x[1], reverse = True)

[('total_precipitation_avg_last_month', 0.3813037),
 ('total_precipitation_max_last_month', 0.17081237),
 ('t_max_avg_last_month', 0.12591946),
 ('t_avg_min_last_month', 0.047218792),
 ('wind_speed_hist_avg', 0.04474059),
 ('latitude', 0.023847276),
 ('t_avg_avg', 0.018727954),
 ('wind_dir_max', 0.017762804),
 ('t_avg_min', 0.016339304),
 ('red', 0.011789361),
 ('t_avg_max', 0.011589689),
 ('longitude', 0.0101243025),
 ('total_precipitation_max', 0.009878924),
 ('p_gradient', 0.008591981),
 ('blue', 0.008502832),
 ('rain_intensity', 0.008339441),
 ('green', 0.0069074864),
 ('t_max_max', 0.006579744),
 ('wind_dir_min_last_month', 0.005797334),
 ('wind_speed_hist_avg_last_month', 0.005510876),
 ('sea_level_pressure_avg', 0.005189037),
 ('info_red', 0.004905478),
 ('total_precipitation_avg', 0.004420612),
 ('t_max_min', 0.0042112786),
 ('wind_dir_max_last_month', 0.0038148814),
 ('sea_level_pressure_max_last_month', 0.0033260973),
 ('sea_level_pressure_max', 0.0030651568),
 ('t_avg_avg_la

# hypo tuning

воспользуемся фреймворком optuna, чтобы получить наилучшие гиперпараметры для бустингов

In [97]:
def objective_lgb(trial):
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', -1, 30),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 20, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }

    model = LGBMClassifier(**params)
    model.fit(X_train, y_train)

    preds = model.predict(X_test)
    mcc = matthews_corrcoef(y_test, preds)
    return mcc

# Оптимизация гиперпараметров LightGBM
lgb_study = optuna.create_study(direction='maximize')
lgb_study.optimize(objective_lgb, n_trials=50)

print("Best parameters for LightGBM:", lgb_study.best_params)
print("Best MCC score for LightGBM:", lgb_study.best_value)

[I 2024-09-07 22:04:11,266] A new study created in memory with name: no-name-d3ea0806-2e7e-4ab6-938d-c08ee0d101d9


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.331758 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:04:29,247] Trial 0 finished with value: 0.0 and parameters: {'num_leaves': 110, 'max_depth': 1, 'learning_rate': 0.04665435979160618, 'n_estimators': 24, 'subsample': 0.7695921103985472, 'colsample_bytree': 0.5092636851981066}. Best is trial 0 with value: 0.0.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.426766 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:05:23,184] Trial 1 finished with value: 0.4954100100019296 and parameters: {'num_leaves': 42, 'max_depth': 22, 'learning_rate': 0.07040347568522215, 'n_estimators': 59, 'subsample': 0.8605141541312862, 'colsample_bytree': 0.7852932961160316}. Best is trial 1 with value: 0.4954100100019296.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.315887 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:05:52,053] Trial 2 finished with value: 0.3435964848437707 and parameters: {'num_leaves': 39, 'max_depth': 11, 'learning_rate': 0.04501381496326246, 'n_estimators': 34, 'subsample': 0.5883872426330732, 'colsample_bytree': 0.5520327730254913}. Best is trial 1 with value: 0.4954100100019296.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.453190 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:07:12,153] Trial 3 finished with value: 0.3751392912500508 and parameters: {'num_leaves': 98, 'max_depth': 11, 'learning_rate': 0.014081233679489345, 'n_estimators': 83, 'subsample': 0.6657278074731843, 'colsample_bytree': 0.8611289841204062}. Best is trial 1 with value: 0.4954100100019296.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.405679 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:07:41,175] Trial 4 finished with value: 0.4732996141778475 and parameters: {'num_leaves': 131, 'max_depth': 22, 'learning_rate': 0.09098644264820112, 'n_estimators': 29, 'subsample': 0.9446418380136525, 'colsample_bytree': 0.5246107204350607}. Best is trial 1 with value: 0.4954100100019296.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.427167 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:08:41,056] Trial 5 finished with value: 0.2678124763616096 and parameters: {'num_leaves': 130, 'max_depth': 0, 'learning_rate': 0.015541354823367427, 'n_estimators': 56, 'subsample': 0.721933023028055, 'colsample_bytree': 0.8030280487280539}. Best is trial 1 with value: 0.4954100100019296.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.680983 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:10:23,350] Trial 6 finished with value: 0.5046460006581245 and parameters: {'num_leaves': 135, 'max_depth': 27, 'learning_rate': 0.03015336178654314, 'n_estimators': 95, 'subsample': 0.6929379237322278, 'colsample_bytree': 0.863495223787241}. Best is trial 6 with value: 0.5046460006581245.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.682040 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:11:38,285] Trial 7 finished with value: 0.5237023217523016 and parameters: {'num_leaves': 119, 'max_depth': 16, 'learning_rate': 0.06229030047083742, 'n_estimators': 68, 'subsample': 0.890847220762397, 'colsample_bytree': 0.8499040933639068}. Best is trial 7 with value: 0.5237023217523016.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.399527 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:12:07,590] Trial 8 finished with value: 0.28633173449045746 and parameters: {'num_leaves': 110, 'max_depth': 2, 'learning_rate': 0.0639857979278736, 'n_estimators': 68, 'subsample': 0.7753067636214088, 'colsample_bytree': 0.691415367249377}. Best is trial 7 with value: 0.5237023217523016.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.506648 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:12:32,787] Trial 9 finished with value: 0.2270975426644161 and parameters: {'num_leaves': 59, 'max_depth': 6, 'learning_rate': 0.03312587862436182, 'n_estimators': 25, 'subsample': 0.9874538621748712, 'colsample_bytree': 0.8950411083120052}. Best is trial 7 with value: 0.5237023217523016.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.525660 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:13:21,674] Trial 10 finished with value: 0.5036747700788264 and parameters: {'num_leaves': 73, 'max_depth': 19, 'learning_rate': 0.08800984514334438, 'n_estimators': 46, 'subsample': 0.8955740224970792, 'colsample_bytree': 0.9975851618392695}. Best is trial 7 with value: 0.5237023217523016.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.500291 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:15:07,289] Trial 11 finished with value: 0.5092748974268306 and parameters: {'num_leaves': 144, 'max_depth': 30, 'learning_rate': 0.0304848335778432, 'n_estimators': 98, 'subsample': 0.5215696678516148, 'colsample_bytree': 0.9478041179012795}. Best is trial 7 with value: 0.5237023217523016.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.470879 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:16:45,008] Trial 12 finished with value: 0.5350628912728176 and parameters: {'num_leaves': 150, 'max_depth': 30, 'learning_rate': 0.0725891144091528, 'n_estimators': 100, 'subsample': 0.5056544749922685, 'colsample_bytree': 0.979474197308235}. Best is trial 12 with value: 0.5350628912728176.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.390147 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:18:11,671] Trial 13 finished with value: 0.5335323373891188 and parameters: {'num_leaves': 146, 'max_depth': 17, 'learning_rate': 0.07472078188782408, 'n_estimators': 78, 'subsample': 0.8375146651490033, 'colsample_bytree': 0.6667348006746951}. Best is trial 12 with value: 0.5350628912728176.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.398064 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:19:43,901] Trial 14 finished with value: 0.5314553276155024 and parameters: {'num_leaves': 149, 'max_depth': 25, 'learning_rate': 0.07306206307671242, 'n_estimators': 83, 'subsample': 0.8341084692713856, 'colsample_bytree': 0.6902743644293171}. Best is trial 12 with value: 0.5350628912728176.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.383533 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:21:03,157] Trial 15 finished with value: 0.5211851883907953 and parameters: {'num_leaves': 81, 'max_depth': 14, 'learning_rate': 0.07986855553636288, 'n_estimators': 85, 'subsample': 0.6188452031926133, 'colsample_bytree': 0.6047855085348777}. Best is trial 12 with value: 0.5350628912728176.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.763947 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:21:56,571] Trial 16 finished with value: 0.49276178761654976 and parameters: {'num_leaves': 23, 'max_depth': 30, 'learning_rate': 0.09730297770004756, 'n_estimators': 73, 'subsample': 0.532745400581323, 'colsample_bytree': 0.6917305473435659}. Best is trial 12 with value: 0.5350628912728176.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.382639 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:23:21,351] Trial 17 finished with value: 0.520966220159487 and parameters: {'num_leaves': 150, 'max_depth': 8, 'learning_rate': 0.07940215610990047, 'n_estimators': 90, 'subsample': 0.8048343777756106, 'colsample_bytree': 0.6300038426872249}. Best is trial 12 with value: 0.5350628912728176.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.420764 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:25:05,714] Trial 18 finished with value: 0.5265673882691934 and parameters: {'num_leaves': 120, 'max_depth': 18, 'learning_rate': 0.05403173057071387, 'n_estimators': 100, 'subsample': 0.6178092172639945, 'colsample_bytree': 0.7316724947740405}. Best is trial 12 with value: 0.5350628912728176.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.396774 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:26:21,799] Trial 19 finished with value: 0.5230926645279981 and parameters: {'num_leaves': 97, 'max_depth': 24, 'learning_rate': 0.08164376295139472, 'n_estimators': 73, 'subsample': 0.7159492240004055, 'colsample_bytree': 0.6430953884435167}. Best is trial 12 with value: 0.5350628912728176.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.419478 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:27:44,962] Trial 20 finished with value: 0.5246366177237536 and parameters: {'num_leaves': 134, 'max_depth': 27, 'learning_rate': 0.05940170496071944, 'n_estimators': 76, 'subsample': 0.5734776025148978, 'colsample_bytree': 0.766570882824184}. Best is trial 12 with value: 0.5350628912728176.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.405646 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:29:16,464] Trial 21 finished with value: 0.5348619921063791 and parameters: {'num_leaves': 149, 'max_depth': 25, 'learning_rate': 0.07462635790771016, 'n_estimators': 84, 'subsample': 0.8303892189450517, 'colsample_bytree': 0.6856033046970633}. Best is trial 12 with value: 0.5350628912728176.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.315903 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:30:37,718] Trial 22 finished with value: 0.5317843546624181 and parameters: {'num_leaves': 140, 'max_depth': 20, 'learning_rate': 0.07250129156310922, 'n_estimators': 91, 'subsample': 0.8207952823787049, 'colsample_bytree': 0.5788173270388811}. Best is trial 12 with value: 0.5350628912728176.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.416915 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:32:10,800] Trial 23 finished with value: 0.5317218269999301 and parameters: {'num_leaves': 123, 'max_depth': 27, 'learning_rate': 0.06904671426237913, 'n_estimators': 88, 'subsample': 0.9276647690914339, 'colsample_bytree': 0.7236060771036227}. Best is trial 12 with value: 0.5350628912728176.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.399247 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:33:37,341] Trial 24 finished with value: 0.5381899173122265 and parameters: {'num_leaves': 149, 'max_depth': 24, 'learning_rate': 0.08847877238689464, 'n_estimators': 80, 'subsample': 0.8601882473816211, 'colsample_bytree': 0.6639178583135487}. Best is trial 24 with value: 0.5381899173122265.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.443888 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:34:32,239] Trial 25 finished with value: 0.520068085258249 and parameters: {'num_leaves': 106, 'max_depth': 30, 'learning_rate': 0.08812918771531045, 'n_estimators': 50, 'subsample': 0.7877882774173365, 'colsample_bytree': 0.823305386604873}. Best is trial 24 with value: 0.5381899173122265.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.411119 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:36:02,699] Trial 26 finished with value: 0.5413397794086484 and parameters: {'num_leaves': 125, 'max_depth': 24, 'learning_rate': 0.09965662548937204, 'n_estimators': 96, 'subsample': 0.8779686128434079, 'colsample_bytree': 0.7452699761318937}. Best is trial 26 with value: 0.5413397794086484.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.461978 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:37:28,633] Trial 27 finished with value: 0.5361862000461813 and parameters: {'num_leaves': 125, 'max_depth': 22, 'learning_rate': 0.09450432275372428, 'n_estimators': 95, 'subsample': 0.9946730388894407, 'colsample_bytree': 0.9376191797961411}. Best is trial 26 with value: 0.5413397794086484.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.461376 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:38:47,795] Trial 28 finished with value: 0.5297733818202398 and parameters: {'num_leaves': 92, 'max_depth': 22, 'learning_rate': 0.09708688324824108, 'n_estimators': 91, 'subsample': 0.9962473697247536, 'colsample_bytree': 0.925389738418493}. Best is trial 26 with value: 0.5413397794086484.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.425993 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:40:14,728] Trial 29 finished with value: 0.5361694387450318 and parameters: {'num_leaves': 111, 'max_depth': 14, 'learning_rate': 0.0995645178778326, 'n_estimators': 94, 'subsample': 0.9648035147923388, 'colsample_bytree': 0.7577899360218158}. Best is trial 26 with value: 0.5413397794086484.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.435460 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:41:23,930] Trial 30 finished with value: 0.5307413580464374 and parameters: {'num_leaves': 117, 'max_depth': 23, 'learning_rate': 0.0917038945502325, 'n_estimators': 65, 'subsample': 0.9122445651001845, 'colsample_bytree': 0.8262904755529576}. Best is trial 26 with value: 0.5413397794086484.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.449109 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:43:00,443] Trial 31 finished with value: 0.532902591895751 and parameters: {'num_leaves': 108, 'max_depth': 15, 'learning_rate': 0.0997324759102568, 'n_estimators': 94, 'subsample': 0.9592042076644465, 'colsample_bytree': 0.7500461946598118}. Best is trial 26 with value: 0.5413397794086484.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.426890 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:44:22,531] Trial 32 finished with value: 0.5341144695032948 and parameters: {'num_leaves': 126, 'max_depth': 20, 'learning_rate': 0.08509606983586714, 'n_estimators': 80, 'subsample': 0.8739779777685737, 'colsample_bytree': 0.7796142428496466}. Best is trial 26 with value: 0.5413397794086484.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.411616 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:45:50,751] Trial 33 finished with value: 0.5349150478501551 and parameters: {'num_leaves': 112, 'max_depth': 12, 'learning_rate': 0.09404801064512351, 'n_estimators': 94, 'subsample': 0.9632031678020174, 'colsample_bytree': 0.7227152026988908}. Best is trial 26 with value: 0.5413397794086484.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.392037 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:47:20,615] Trial 34 finished with value: 0.5373717742540675 and parameters: {'num_leaves': 137, 'max_depth': 21, 'learning_rate': 0.09384612858710749, 'n_estimators': 87, 'subsample': 0.8649561811984362, 'colsample_bytree': 0.6425370461770894}. Best is trial 26 with value: 0.5413397794086484.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.315779 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:48:35,407] Trial 35 finished with value: 0.5341624092324224 and parameters: {'num_leaves': 138, 'max_depth': 22, 'learning_rate': 0.08354534532145917, 'n_estimators': 87, 'subsample': 0.8622929378649183, 'colsample_bytree': 0.5880497031323718}. Best is trial 26 with value: 0.5413397794086484.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.412021 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:50:01,520] Trial 36 finished with value: 0.5335569447202544 and parameters: {'num_leaves': 127, 'max_depth': 20, 'learning_rate': 0.09225040766908765, 'n_estimators': 80, 'subsample': 0.7530860775779535, 'colsample_bytree': 0.6391775509373191}. Best is trial 26 with value: 0.5413397794086484.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.424046 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:50:37,154] Trial 37 finished with value: 0.49075247899180074 and parameters: {'num_leaves': 139, 'max_depth': 26, 'learning_rate': 0.08765243409562973, 'n_estimators': 39, 'subsample': 0.9213582440476777, 'colsample_bytree': 0.5044610378819137}. Best is trial 26 with value: 0.5413397794086484.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.337843 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:51:29,557] Trial 38 finished with value: 0.5205181935149557 and parameters: {'num_leaves': 133, 'max_depth': 22, 'learning_rate': 0.09399565799331132, 'n_estimators': 60, 'subsample': 0.8651105473958973, 'colsample_bytree': 0.534826415579813}. Best is trial 26 with value: 0.5413397794086484.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.308099 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:52:48,961] Trial 39 finished with value: 0.5090391927752445 and parameters: {'num_leaves': 103, 'max_depth': 28, 'learning_rate': 0.04211929494193901, 'n_estimators': 97, 'subsample': 0.8922480359519097, 'colsample_bytree': 0.5626372751219899}. Best is trial 26 with value: 0.5413397794086484.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.916797 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:54:09,051] Trial 40 finished with value: 0.5338540296061135 and parameters: {'num_leaves': 130, 'max_depth': 24, 'learning_rate': 0.09580985870853065, 'n_estimators': 74, 'subsample': 0.7939808416597732, 'colsample_bytree': 0.6605200849190624}. Best is trial 26 with value: 0.5413397794086484.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.380913 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:55:36,335] Trial 41 finished with value: 0.5337505516383387 and parameters: {'num_leaves': 114, 'max_depth': 13, 'learning_rate': 0.09944726555777801, 'n_estimators': 93, 'subsample': 0.9653868174242549, 'colsample_bytree': 0.6165983677562559}. Best is trial 26 with value: 0.5413397794086484.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.670387 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:56:59,790] Trial 42 finished with value: 0.5365070162369693 and parameters: {'num_leaves': 123, 'max_depth': 18, 'learning_rate': 0.08877783297866153, 'n_estimators': 88, 'subsample': 0.9461710852524068, 'colsample_bytree': 0.8829863034879264}. Best is trial 26 with value: 0.5413397794086484.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.458134 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:58:26,812] Trial 43 finished with value: 0.5417509096912376 and parameters: {'num_leaves': 141, 'max_depth': 18, 'learning_rate': 0.0898623914049592, 'n_estimators': 88, 'subsample': 0.9394283269044714, 'colsample_bytree': 0.9097759487542745}. Best is trial 43 with value: 0.5417509096912376.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.545675 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 22:59:52,795] Trial 44 finished with value: 0.5403645237391166 and parameters: {'num_leaves': 142, 'max_depth': 18, 'learning_rate': 0.08866861976777299, 'n_estimators': 88, 'subsample': 0.9253045672933954, 'colsample_bytree': 0.8861831128080011}. Best is trial 43 with value: 0.5417509096912376.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.706947 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 23:01:17,279] Trial 45 finished with value: 0.5368199823946174 and parameters: {'num_leaves': 142, 'max_depth': 16, 'learning_rate': 0.07821715043901568, 'n_estimators': 80, 'subsample': 0.9077480189257079, 'colsample_bytree': 0.9068717257143546}. Best is trial 43 with value: 0.5417509096912376.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.449546 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 23:02:27,911] Trial 46 finished with value: 0.0 and parameters: {'num_leaves': 143, 'max_depth': 19, 'learning_rate': 0.010254320406453472, 'n_estimators': 67, 'subsample': 0.9282581605217852, 'colsample_bytree': 0.8636308000815527}. Best is trial 43 with value: 0.5417509096912376.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.518167 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 23:03:57,002] Trial 47 finished with value: 0.5314220777642306 and parameters: {'num_leaves': 135, 'max_depth': 21, 'learning_rate': 0.0676265852136463, 'n_estimators': 86, 'subsample': 0.8781784868718445, 'colsample_bytree': 0.9666324532020054}. Best is trial 43 with value: 0.5417509096912376.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.536464 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 23:05:19,357] Trial 48 finished with value: 0.517997445349591 and parameters: {'num_leaves': 61, 'max_depth': 10, 'learning_rate': 0.08496808199935389, 'n_estimators': 82, 'subsample': 0.8512128078781721, 'colsample_bytree': 0.8196758849649648}. Best is trial 43 with value: 0.5417509096912376.


[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.438453 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


[I 2024-09-07 23:06:35,595] Trial 49 finished with value: 0.53722906763715 and parameters: {'num_leaves': 142, 'max_depth': 17, 'learning_rate': 0.09008181985910747, 'n_estimators': 71, 'subsample': 0.8936023940973493, 'colsample_bytree': 0.7884999094969503}. Best is trial 43 with value: 0.5417509096912376.


Best parameters for LightGBM: {'num_leaves': 141, 'max_depth': 18, 'learning_rate': 0.0898623914049592, 'n_estimators': 88, 'subsample': 0.9394283269044714, 'colsample_bytree': 0.9097759487542745}
Best MCC score for LightGBM: 0.5417509096912376


In [99]:
lgbm_best_params = {'num_leaves': 141, 'max_depth': 18, 'learning_rate': 0.0898623914049592, 'n_estimators': 88, 'subsample': 0.9394283269044714, 'colsample_bytree': 0.9097759487542745}

In [98]:
def objective_xgb(trial):
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 20, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }

    model = XGBClassifier(**params)
    model.fit(X_train, y_train)

    preds = model.predict(X_test)
    mcc = matthews_corrcoef(y_test, preds)
    return mcc

# Оптимизация гиперпараметров XGBoost
xgb_study = optuna.create_study(direction='maximize')
xgb_study.optimize(objective_xgb, n_trials=50)

print("Best parameters for XGBoost:", xgb_study.best_params)
print("Best MCC score for XGBoost:", xgb_study.best_value)

[I 2024-09-07 23:06:35,622] A new study created in memory with name: no-name-e9501b3f-2bab-4359-bcb9-d8f250a3a26f
[I 2024-09-07 23:07:03,072] Trial 0 finished with value: 0.49065098579628613 and parameters: {'max_depth': 6, 'learning_rate': 0.15355381575759622, 'n_estimators': 36, 'subsample': 0.5311407353244333, 'colsample_bytree': 0.59982035326213}. Best is trial 0 with value: 0.49065098579628613.
[I 2024-09-07 23:07:52,348] Trial 1 finished with value: 0.5459055809276583 and parameters: {'max_depth': 9, 'learning_rate': 0.16983101942399112, 'n_estimators': 58, 'subsample': 0.6297069828729402, 'colsample_bytree': 0.7596211231073581}. Best is trial 1 with value: 0.5459055809276583.
[I 2024-09-07 23:08:31,623] Trial 2 finished with value: 0.3552392782782319 and parameters: {'max_depth': 3, 'learning_rate': 0.05478793017356273, 'n_estimators': 86, 'subsample': 0.8933938338208862, 'colsample_bytree': 0.8171611685660667}. Best is trial 1 with value: 0.5459055809276583.
[I 2024-09-07 23:09

Best parameters for XGBoost: {'max_depth': 10, 'learning_rate': 0.2626502900369224, 'n_estimators': 56, 'subsample': 0.8657708954490477, 'colsample_bytree': 0.8407201782750098}
Best MCC score for XGBoost: 0.5580452235301456


In [104]:
xgb_best_params = {'max_depth': 10, 'learning_rate': 0.2626502900369224, 'n_estimators': 56, 'subsample': 0.8657708954490477, 'colsample_bytree': 0.8407201782750098}

In [105]:
xgb_model = XGBClassifier(**xgb_best_params)
lgbm_model = LGBMClassifier(**lgbm_best_params)

voting_clf = VotingClassifier(estimators=[
    ('xgb', xgb_model),
    ('lgbm', lgbm_model)
], voting='hard')

In [106]:
voting_clf.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 148884, number of negative: 2857552
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.488682 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1252
[LightGBM] [Info] Number of data points in the train set: 3006436, number of used features: 54
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049522 -> initscore=-2.954553
[LightGBM] [Info] Start training from score -2.954553


In [107]:
y_pred = voting_clf.predict(X_test)

matthews_corrcoef(y_test, y_pred)

0.5382830095969036

итак, в итоге валидационная метрика равна 0.538

In [13]:
from joblib import dump, load

In [None]:
dump(voting_clf, '/content/drive/MyDrive/Colab_Notebooks/forest_fires_hack.voting_classifier_model.joblib')

In [15]:
voting_clf = load('/content/drive/MyDrive/Colab_Notebooks/forest_fires_hack.voting_classifier_model.joblib')

# test

переопределим функцию сбора датасета, чтобы собрать тестоывй датасет

In [18]:
import math

In [36]:
TEST_DATA_DIR = '/content/drive/MyDrive/Colab_Notebooks/forest_fires_hack/test_dataset_minprirody_test_dataset'
METERS_PER_DEGREE_LAT = 111320
IMAGE_SCALE_M = 10 # one pixel corresponds to 10x10 meters squared

def map_pixels_to_coordinates(train_data_dir):
    mapping = dict()

    for file_name in os.listdir(train_data_dir):
      print(file_name)
      if file_name.endswith('.tiff'):
          full_path = os.path.join(train_data_dir, file_name)
          with rasterio.open(full_path) as src:
              band1 = src.read(1)
              transform = src.transform

              coordinates = [[[0, 0] for j in range(src.shape[1])] for i in range(src.shape[0])]
              coordinates[0][0] = (src.bounds.left, src.bounds.top)

              for i in range(band1.shape[0]):
                  for j in range(band1.shape[1]):
                      if j == 0:
                          continue

                      lat = coordinates[i][j - 1][1]
                      lat_radians = math.radians(lat)
                      meters_per_degree_lon = METERS_PER_DEGREE_LAT * math.cos(lat_radians)
                      lon_offset = IMAGE_SCALE_M / meters_per_degree_lon

                      coordinates[i][j] = [coordinates[i][j - 1][0] + lon_offset,
                                            lat] # moving east

                  if i + 1 < band1.shape[0]:
                      lat_offset = IMAGE_SCALE_M / METERS_PER_DEGREE_LAT
                      coordinates[i + 1][0] = [coordinates[i][0][0],
                                                coordinates[i][0][1] - lat_offset] # moving south

              mapping[os.path.join(file_name)] = coordinates

    return mapping

test_pixels = map_pixels_to_coordinates(TEST_DATA_DIR)

sample.csv
test.tiff
test_weather.csv


In [17]:
test_root_dir = '/content/drive/MyDrive/Colab_Notebooks/forest_fires_hack/test_dataset_minprirody_test_dataset'

In [49]:
for i, j in test_pixels.items():
  print(i)

test.tiff


In [66]:
def set_dataset(path):
  curr_file_pic = path + '/test.tiff'
  curr_file_data = path + '/test_weather.csv'


  rgb = get_mask(curr_file_pic, 1, 1, 1, 1, 1, task = 'img') # достаем значения каналов
  mask = get_mask(curr_file_pic, 1, 1, 1, 1, 1, task = 'mask')
  ir = get_mask(curr_file_pic, 1, 1, 1, 1, 1, task = 'ir')
  curr_lat, curr_lon = np.array(test_pixels['test.tiff']).reshape(-1, 2).T # получаем координаты



  data_dct = {'red': rgb[:, :, 0].flatten(), # добавляем значеня каналов и координаты в словарь
                      'green': rgb[:, :, 1].flatten(),
                      'blue': rgb[:, :, 2].flatten(),
                      'info_red': ir.flatten(),
                      'mask': mask.flatten(),
                      'latitude': curr_lat,
                      'longitude': curr_lon
                      }

  hist_data = pd.read_csv(curr_file_data, sep=';')


  new_hist_cols = ['time', 't_avg', 't_min', 't_max', 'total_precipitation', 'wind_dir', 'wind_speed_hist', 'wind_gust', 'sea_level_pressure'] # заменяем старые названия колонок на новые
  hist_data.rename(columns={old_col: new_col for old_col, new_col in zip(hist_data.columns, new_hist_cols)}, inplace=True)

  date = 0
  df = pd.DataFrame(data_dct)
  df['date'] = 0 * len(df) # добавляем в новый датасет название папки в качестве элемента группировки

  for col in new_hist_cols[1:]:
    mi, ma, av = min_max_avg(hist_data[col]) # получаем минимальное, максимальное и средние значения каждого из исторических признаков и добавляем в новый датасет
    df[f'{col}_min'] = [mi] * len(df)
    df[f'{col}_max'] = [ma] * len(df)
    df[f'{col}_avg'] = [av] * len(df)

  hist_data['time'] = pd.to_datetime(hist_data['time'])
  specific_date = pd.to_datetime(date)
  end_date = specific_date
  last_month = end_date - pd.DateOffset(months=1)
  last_last_month = end_date - pd.DateOffset(months=2)

  filtered_df = hist_data[(hist_data['time'] >= last_month) & (hist_data['time'] < end_date)] # отбираем признаки только за последний месяц до пожара и проводим ту же операцию
  for col in new_hist_cols[1:]:
    mi, ma, av = min_max_avg(filtered_df[col])
    df[f'{col}_min_last_month'] = [mi] * len(df)
    df[f'{col}_max_last_month'] = [ma] * len(df)
    df[f'{col}_avg_last_month'] = [av] * len(df)


  FEATURE_INGENEERING = True
  # немного инженерии признаков
  if FEATURE_INGENEERING:
    # 1. Температурный градиент
    df['t_gradient'] = np.mean(hist_data['t_max'] - hist_data['t_min']) * len(df)
    # 2. Среднедневная температура
    df['t_daily_avg'] = np.mean((hist_data['t_min'] + hist_data['t_max']) / 2) * len(df)
    # 3. Усредненные параметры ветра
    df['v_avg'] = np.mean((hist_data['wind_speed_hist'] + hist_data['wind_gust']) / 2) * len(df)
    # 4. Эффективная скорость ветра
    df['v_eff'] = np.mean(np.sqrt(hist_data['wind_speed_hist']**2 + hist_data['wind_gust']**2)) * len(df)
    # 6. Вертикальный градиент давления (разница между текущим и предыдущим значением)
    df['p_gradient'] = np.mean(hist_data['sea_level_pressure'].diff() * len(df))
    # 7. Интенсивность осадков
    df['rain_intensity'] = np.mean(hist_data['total_precipitation'] / (len(hist_data) * 24)) * len(df) #
    # 8. Вектор ветра
    df['u'] = np.mean(hist_data['wind_speed_hist'] * np.cos(np.radians(hist_data['wind_dir']))) * len(df)
    df['v'] = np.mean(hist_data['wind_speed_hist'] * np.sin(np.radians(hist_data['wind_dir']))) * len(df)


  # filtered_df_2 = hist_data[(hist_data['time'] >= last_last_month) & (hist_data['time'] < last_month)]
  # for col in new_hist_cols[1:]:
  #   mi, ma, av = min_max_avg(filtered_df_2[col])
  #   df[f'{col}_min_last_last_month'] = [mi] * len(df)
  #   df[f'{col}_max_last_last_month'] = [ma] * len(df)
  #   df[f'{col}_avg_last_last_month'] = [av] * len(df)

  cols = list(set(df.columns) - set(['date']))
  df[cols] = df[cols].astype('float16')
  return df

test_dataset = set_dataset(test_root_dir)

test_dataset.replace([np.inf, -np.inf], -66666, inplace=True)
test_dataset.fillna(-666, inplace = True);

test_dataset.head()

Unnamed: 0,red,green,blue,info_red,mask,latitude,longitude,date,t_avg_min,t_avg_max,...,sea_level_pressure_max_last_month,sea_level_pressure_avg_last_month,t_gradient,t_daily_avg,v_avg,v_eff,p_gradient,rain_intensity,u,v
0,9.0,9.0,9.0,9.0,9.0,0.0,0.0,0,7.300781,22.90625,...,-666.0,-666.0,-66666.0,-66666.0,-666.0,-666.0,43040.0,107.4375,-66666.0,-66666.0
1,8.0,8.0,8.0,8.0,8.0,9e-05,0.0,0,7.300781,22.90625,...,-666.0,-666.0,-66666.0,-66666.0,-666.0,-666.0,43040.0,107.4375,-66666.0,-66666.0
2,9.0,9.0,9.0,9.0,9.0,0.00018,0.0,0,7.300781,22.90625,...,-666.0,-666.0,-66666.0,-66666.0,-666.0,-666.0,43040.0,107.4375,-66666.0,-66666.0
3,9.0,9.0,9.0,9.0,9.0,0.000269,0.0,0,7.300781,22.90625,...,-666.0,-666.0,-66666.0,-66666.0,-666.0,-666.0,43040.0,107.4375,-66666.0,-66666.0
4,9.0,9.0,9.0,9.0,9.0,0.000359,0.0,0,7.300781,22.90625,...,-666.0,-666.0,-66666.0,-66666.0,-666.0,-666.0,43040.0,107.4375,-66666.0,-66666.0


In [68]:
num_cols = list(filter(lambda x: False if x == 'date' or x == 'mask' else True, test_dataset.columns))

In [71]:
y_pred = voting_clf.predict(test_dataset[num_cols])

In [72]:
print_geotiff_info(test_root_dir + '/test.tiff')

File Path: /content/drive/MyDrive/Colab_Notebooks/forest_fires_hack/test_dataset_minprirody_test_dataset/test.tiff
Driver: GTiff
Width: 516
Height: 316
Count (Bands): 4
CRS: None
Transform: | 10.00, 0.00, 0.00|
| 0.00,-10.00, 0.00|
| 0.00, 0.00, 1.00|
Bounding Box: BoundingBox(left=0.0, bottom=-3160.0, right=5160.0, top=0.0)
Datum: ('float32', 'float32', 'float32', 'float32')

Band 1:
  Data Type: float32
  Min Value: 0.0
  Max Value: 44.0
  Mean Value: 8.811193466186523
  Standard Deviation: 2.267102003097534

Band 2:
  Data Type: float32
  Min Value: 0.0
  Max Value: 40.0
  Mean Value: 15.68852424621582
  Standard Deviation: 3.6198506355285645

Band 3:
  Data Type: float32
  Min Value: 0.0
  Max Value: 57.0
  Mean Value: 18.702402114868164
  Standard Deviation: 4.5382561683654785

Band 4:
  Data Type: float32
  Min Value: 0.0
  Max Value: 126.0
  Mean Value: 58.862972259521484
  Standard Deviation: 14.22008991241455


In [86]:
y_pred = y_pred.reshape(316, -1)

In [81]:
sample = pd.read_csv(test_root_dir + '/sample.csv')

In [87]:
y_pred.shape

(316, 516)

In [88]:
df_pred = pd.DataFrame(y_pred)
df_pred

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,506,507,508,509,510,511,512,513,514,515
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
311,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
312,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
314,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [90]:
df_pred.to_csv('/content/drive/MyDrive/Colab_Notebooks/forest_fires_hack/pred.csv', header = False, index = False)