In [51]:
import os
import re
import time
import pandas as pd
from tqdm import tqdm
from typing import Dict, Tuple, List
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

In [39]:
# Regex for the position number: a run of digits enclosed by underscores in the path.
POS_NUM_PATTERN = r'(?<=_)\d+(?=_)'
# Columns removed from every dataset right after loading (ignored when absent).
DROP_COLS = ['salary_from_rub']
# CSV column used as the DataFrame index.
INDEX_COL = 'id'
# Name of the regression target column.
TARGET = 'new_salary'

def open_data_set(path: str) -> Tuple[int, pd.DataFrame]:
    """Read one position dataset from CSV and extract its position number.

    The position number is the first run of digits enclosed by underscores
    in the file name (see ``POS_NUM_PATTERN``). Columns listed in
    ``DROP_COLS`` are removed when present; no other transformation is
    applied here.

    Args:
        path: Path to the CSV file. Its file name must contain a
            ``_<digits>_`` fragment.

    Returns:
        A ``(pos_num, data)`` pair: the integer position number and the
        loaded frame indexed by ``INDEX_COL``.

    Raises:
        ValueError: If the file name contains no ``_<digits>_`` fragment.
    """
    # Match against the file name only, so digits in a parent directory
    # (e.g. "run_2023_v1/") cannot be mistaken for the position number.
    # Checking before the read also avoids parsing a file we would reject.
    match = re.search(POS_NUM_PATTERN, os.path.basename(path))
    if match is None:
        raise ValueError(
            f"Cannot extract position number from file name: {path!r}"
        )
    data = pd.read_csv(path, index_col=INDEX_COL)
    data = data.drop(columns=DROP_COLS, errors='ignore')
    return int(match.group(0)), data
    
def open_files(basepath: str) -> Dict[int, pd.DataFrame]:
    """Load every dataset file in a directory into a dict keyed by position number.

    Args:
        basepath: Directory containing the dataset files. A trailing path
            separator is optional.

    Returns:
        Mapping from position number to its DataFrame. When several files
        share a position number, the first one in sorted file-name order
        is kept and the rest are ignored.
    """
    # os.listdir returns entries in arbitrary order; sorting makes both the
    # resulting dict order and the duplicate-resolution rule deterministic.
    candidates = (
        os.path.join(basepath, name) for name in sorted(os.listdir(basepath))
    )
    # Skip sub-directories (e.g. ".ipynb_checkpoints"), which cannot be read as CSV.
    paths = [path for path in candidates if os.path.isfile(path)]

    datasets: Dict[int, pd.DataFrame] = {}
    for path in tqdm(paths, ascii=True):
        key, value = open_data_set(path)
        # setdefault keeps the first dataset seen for a given position number.
        datasets.setdefault(key, value)
    return datasets

In [41]:
# Load every dataset found in ./train/ into a dict keyed by position number.
datasets = open_files('./train/')

100%|##################################################################################| 75/75 [00:35<00:00,  2.10it/s]


In [67]:
def train_model(data: pd.DataFrame) -> Tuple[int, float, float]:
    """Train one CatBoost regressor on a dataset and score it on a hold-out split.

    Object columns are label-encoded and passed to CatBoost as categorical
    features; boolean columns are cast to int. The frame is split 67/33 with
    a fixed random state, and the model is fitted with the RMSE loss.

    Args:
        data: Dataset containing the ``TARGET`` column plus feature columns.
            The caller's frame is not modified.

    Returns:
        ``(training_time, rmse, mape)``: fit duration in whole seconds
        (truncated), and RMSE / MAPE measured on the hold-out split.
    """
    # Work on a copy: the encoding below rewrites columns. Mutating the
    # caller's frame would make a second call see already-encoded integers,
    # find no object columns, and silently train without categorical features.
    data = data.copy()

    categorical_columns = set()
    for col in data.columns[data.dtypes == object]:
        data[col] = LabelEncoder().fit_transform(data[col].values)
        categorical_columns.add(col)
    for col in data.columns[data.dtypes == bool]:
        data[col] = data[col].astype(int)

    x = data.drop(columns=[TARGET])
    y = data[TARGET]
    # Positional indices of the categorical columns within the feature frame.
    cat_idxs = [i for i, col in enumerate(x.columns) if col in categorical_columns]

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.33,
                                                        random_state=42)
    model = CatBoostRegressor(loss_function='RMSE')
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments during a long fit.
    start_time = time.perf_counter()
    # NOTE(review): the hold-out split is also passed as eval_set. If CatBoost
    # uses it to pick the best iteration, the metrics below are measured on
    # data that influenced model selection — confirm this is acceptable.
    model.fit(
        x_train,
        y_train,
        cat_features=cat_idxs,
        eval_set=(x_test, y_test),
        verbose=False,
        plot=False
    )
    training_time = int(time.perf_counter() - start_time)

    test_preds = model.predict(x_test)
    # Take the square root explicitly instead of passing squared=False, which
    # is deprecated and rejected by recent scikit-learn releases.
    rmse = mean_squared_error(y_test, test_preds) ** 0.5
    mape = mean_absolute_percentage_error(y_test, test_preds)
    return training_time, rmse, mape

def get_model_stats(datasets: Dict[int, pd.DataFrame]) -> pd.DataFrame:
    """Train a model per dataset and collect the results into one table.

    Args:
        datasets: Mapping from position number to its dataset.

    Returns:
        DataFrame with one row per dataset and the columns: position number,
        training time (seconds), RMSE, MAPE and dataset row count. The
        column labels are kept in Russian as they are used downstream.
    """
    cols = ["Типовая позиция", "Время обучения", "RMSE", "MAPE", "Размер датасета"]
    # One (position, seconds, rmse, mape, n_rows) record per dataset.
    rows: List[Tuple[int, int, float, float, int]] = []
    for key, dataset in tqdm(datasets.items(), ascii=True):
        training_time, rmse, mape = train_model(dataset)
        rows.append((key, training_time, rmse, mape, dataset.shape[0]))
    # Build the frame once from the collected records.
    return pd.DataFrame(rows, columns=cols)

In [70]:
# Train one model per loaded dataset and collect timing/metric statistics
# (slow: the recorded run took over an hour for 75 datasets).
final_table = get_model_stats(datasets)

100%|################################################################################| 75/75 [1:11:42<00:00, 57.36s/it]


In [71]:
# Preview the first rows of the statistics table.
final_table.head()

Unnamed: 0,Типовая позиция,Время обучения,RMSE,MAPE,Размер датасета
0,102,8,7915.132237,0.17886,6383
1,111,69,11407.967743,0.216463,41603
2,116,58,4032.037188,0.214494,30370
3,119,44,11699.558876,0.174828,5270
4,124,50,8001.137343,0.166153,566


In [75]:
# Order the rows by position number, then write the report to Excel
# without the DataFrame index column.
final_table = final_table.sort_values(by='Типовая позиция')
final_table.to_excel(
    "final_table.xlsx",
    sheet_name="Метаданные по старым датасетам",
    index=False,
)