In [13]:
# Подгрузка стандартных библиотек
import logging
import os
import time
import requests
logging.basicConfig(format='[%(asctime)s] (%(levelname)s): %(message)s', level=logging.INFO)

# Подгрузка библиотек
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch

# Подгрузка lightautoml
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task

In [None]:
# Combine data from hydro stations and meteo stations into one date-indexed table
def hydro_meteo_combine(hydro: pd.DataFrame, meteo: pd.DataFrame) -> pd.DataFrame:
    """Join per-station meteo aggregates with per-station hydro values by date.

    Parameters
    ----------
    hydro : pd.DataFrame
        Long table with the columns ``date``, ``station_id`` and
        ``delta_stage_max``.
    meteo : pd.DataFrame
        Long table with the columns ``date_local``, ``station_id``,
        ``air_temperature``, ``precipitation`` and ``wind_speed_aver``.

    Returns
    -------
    pd.DataFrame
        One row per index value present in both pivoted tables (``pd.merge``
        default inner join on the indexes). Meteo columns are named
        ``<feature>_<station_id>``; hydro columns are named ``<station_id>``
        and hold ``delta_stage_max``.
    """
    # Aggregate all meteo rows sharing the same (date_local, station_id).
    # String aggregator names are used instead of np.nanmean / np.nansum:
    # they skip NaN the same way and avoid the pandas deprecation warning
    # raised when numpy callables are passed to groupby.aggregate.
    meteo_agg = (
        meteo
        .groupby(['date_local', 'station_id'])
        .aggregate({
            'air_temperature': 'mean',
            'precipitation': 'sum',
            'wind_speed_aver': 'mean',
        })
    )

    # Wide layout: one row per date_local, one column per (feature, station_id).
    meteo_wide = meteo_agg.reset_index().pivot(index='date_local', columns='station_id')
    # NOTE: per-station diff() features for air_temperature and precipitation
    # were prototyped at this point in an earlier revision but are not enabled.
    meteo_wide.columns = [
        str(feature) + '_' + str(station_id)
        for feature, station_id in meteo_wide.columns
    ]

    # Wide layout for the hydro values: one column per station, named by its id.
    hydro_wide = hydro.pivot(index='date', columns='station_id', values='delta_stage_max')
    hydro_wide.columns = [str(station_id) for station_id in hydro_wide.columns]

    return pd.merge(meteo_wide, hydro_wide, left_index=True, right_index=True)

In [12]:
# Load data
train_data = pd.read_csv('meteo_3_hours_agg_hydro_full.csv')
test_data = hydro_meteo_combine(
    pd.read_csv("cp4_test.csv"),
    pd.read_csv("forecast_meteo_3hours.csv"),
)
# reset_index() moves the merged index into a regular column; the rename
# below targets a column called 'index' and labels it 'date_local'.
test_data = test_data.reset_index().rename(columns={'index': 'date_local'})
# For the 27 trailing columns keep only the text after the last underscore.
train_data = train_data.rename(
    columns={col: col.split('_')[-1] for col in train_data.columns[-27:]}
)
train_data

Unnamed: 0,date_local,air_temperature_24538,air_temperature_24641,air_temperature_24643,air_temperature_24661,air_temperature_24671,air_temperature_24713,air_temperature_24726,air_temperature_24738,air_temperature_24763,...,3048,3050,3087,3106,3169,3180,3229,3230,3554,3555
0,1985-01-01,-43.4375,-40.5625,-39.4625,-39.5500,-46.0875,-43.4250,-32.4625,-40.5125,-45.4250,...,,,,,,,,,,
1,1985-01-02,-47.4875,-41.9625,-41.0750,-44.7125,-43.5875,-41.1250,-34.2500,-43.5375,-47.1625,...,,-1.0,0.0,-4.0,0.0,0.0,0.0,-1.0,-1.0,6.0
2,1985-01-03,-47.0125,-41.2625,-37.8750,-26.7125,-25.7375,-41.0250,-34.9625,-43.0375,-41.0250,...,,-2.0,0.0,-1.0,0.0,-1.0,-2.0,-1.0,-1.0,5.0
3,1985-01-04,-44.7125,-38.8500,-38.2500,-43.3000,-33.5375,-39.8625,-35.3000,-42.1750,-45.1375,...,,-1.0,0.0,-2.0,-1.0,-1.0,-4.0,-2.0,0.0,4.0
4,1985-01-05,-44.8000,-36.1250,-37.2125,-46.3750,-41.1375,-39.7875,-32.7750,-36.7750,-45.5750,...,,-1.0,0.0,-1.0,0.0,-1.0,-5.0,-2.0,-1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12419,2019-12-27,,-42.9125,-40.4125,-45.7500,-41.5000,-31.4125,-39.7250,-41.8625,-47.4250,...,,-2.0,0.0,-1.0,-2.0,-1.0,,,0.0,-1.0
12420,2019-12-28,,-42.6875,-44.5625,-43.4000,-44.6000,-38.7125,-42.0750,-43.0250,-43.8875,...,,-3.0,0.0,-1.0,-3.0,0.0,,,0.0,1.0
12421,2019-12-29,,-40.4625,-45.4250,-44.8625,-45.7125,-33.8250,-37.8250,-40.2125,-43.6625,...,,-2.0,0.0,-1.0,-4.0,-1.0,,,0.0,0.0
12422,2019-12-30,,-30.6750,-42.3625,-47.2875,-46.6750,-27.7500,-28.2375,-25.5375,-44.5250,...,,-2.0,0.0,-1.0,-3.0,0.0,,,0.0,0.0


In [10]:
# Helper list of column-name lists: entry k holds every column except the
# 27 trailing ones, followed by the k-th of those 27 trailing columns.
df_list = [
    list(train_data.columns[:-27]) + [target_col]
    for target_col in train_data.columns[-27:]
]

### Обучение моделей
Здесь мы используем AutoML-фреймворк LightAutoML от Сбера. Обучаем 10 моделей — по одной на каждый из выбранных гидропостов — и решаем задачу регрессии.

In [None]:
N_THREADS = 4  # number of CPU cores to use
N_FOLDS = 5  # number of cross-validation folds
RANDOM_STATE = 42
TEST_SIZE = 0.2  # hold-out fraction (not referenced in this cell)
TIMEOUT = 200  # training budget per model, in seconds

automl = []
oof_pred = []
# One model per selected entry of df_list; the last name in each entry is the target.
for i in [0, 3, 4, 5, 6, 9, 13, 15, 18, 24]:
    print(' Модель ' + str(i))
    TARGET_NAME = df_list[i][-1]  # target column name
    task = Task('reg', loss='mse', metric='mse')
    roles = {'target': TARGET_NAME}

    # Keep only this model's columns and only the rows where the target is known.
    train = train_data.loc[train_data[TARGET_NAME].notna(), df_list[i]]

    model = TabularUtilizedAutoML(
        task=task,
        timeout=TIMEOUT,
        cpu_limit=N_THREADS,
        general_params={'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]},
        reader_params={'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE},
    )
    # The model is registered before fitting, so automl keeps it even if fitting fails.
    automl.append(model)

    fold_pred = model.fit_predict(train, roles=roles)
    oof_pred.append(fold_pred)
    logging.info('oof_pred:\n{}\nShape = {}'.format(fold_pred, fold_pred.shape))
    

In [None]:
# Read task2_cp4.csv; a later cell fills its 'delta_stage_max' column using
# the 'station_id' and 'date' values of each row.
df_test = pd.read_csv('task2_cp4.csv')


In [None]:
for i in range(0, len(df_test)):
    df_test['delta_stage_max'].loc[i] = float(test_data[str(df_test['station_id'].iloc[i])].loc[test_data['date_local'] == df_test['date'].iloc[i]])

In [None]:
j = 0
for i in [0, 3, 4, 5, 6, 9, 13, 15, 18, 24]:
    test_pred = automl[j].predict(test_data[df_list[i][:-1]]).data[:,0]
    test_data[df_list[i][-1]] = test_pred
    j += 1
test_data

In [None]:
# Write df_test to sub.csv. to_csv is called with its defaults, so the row
# index is written as an extra first column.
# NOTE(review): confirm that the expected submission format includes it.
df_test.to_csv('sub.csv')

In [None]:
import joblib

# Persist every trained model to its own file, numbered by its position in
# automl. An explicit loop is used because the dump is a pure side effect.
# NOTE: these are pickle-based files; only load them from a trusted source.
for model_idx, model in enumerate(automl):
    joblib.dump(model, 'sub_4_model_' + str(model_idx) + '.pkl')