In [4]:
from my_models_pipline import ModelsPipline
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
import create_dataset_functions as f
import numpy as np
import pandas as pd
from collections import defaultdict
import pickle


class MainNavigation:
    """Orchestrates dataset creation, model selection and submission export.

    The instance keeps three pieces of configuration (model factories,
    hyper-parameter grids and aggregation features) plus a registry
    (``grid_data``) that maps dataset names to the CSV files holding them.
    """

    def __init__(self, models_grid = None, param_grids = None, agg_func = None):
        """Store the configuration, falling back to built-in defaults.

        Args:
            models_grid: mapping ``model name -> zero-argument factory``.
                Defaults to a single ``'CatBoost'`` entry.
            param_grids: mapping ``model name -> parameter grid``.
                Defaults to a grid for ``'CatBoost'`` only.
            agg_func: list of ``(feature name, callable)`` pairs stored as
                ``self.features``. Defaults to 12 descriptive statistics.
        """
        if models_grid is not None:
            self.models_grid = models_grid
        else:
            # Factories (not instances) so that every use gets a fresh model.
            self.models_grid = {'CatBoost': lambda: CatBoostClassifier(),
                                # 'RandomForest' : lambda: RandomForestClassifier()
                                }
        if param_grids is not None:
            self.param_grids = param_grids
        else:
            catboost_param_grid = {
                'depth': [4, 6],
                'learning_rate': [0.01],
                'iterations': [5000],
                'early_stopping_rounds': [100]
                }

            # random_forest_param_grid = {
            #     'n_estimators': [100],
            #     'max_depth': [10, 20],
            #     'min_samples_split': [5]
            #     }

            self.param_grids = {
                            'CatBoost': catboost_param_grid,
                            # 'RandomForest': random_forest_param_grid
                            }

        if agg_func is not None:
            self.features = agg_func
        else:
            # Each aggregator returns NaN for empty input. The callables use
            # Series methods (autocorr/skew/kurt), so x is presumably a
            # pandas Series -- confirm against create_dataset_functions.
            self.features = [
                ("Range", lambda x: (x.max() - x.min()) if len(x) != 0 else np.nan),  # Range (max - min)
                ('Q1', lambda x: np.percentile(x, 25) if len(x) != 0 else np.nan),  # First quartile (Q1)
                ("Q3", lambda x: np.percentile(x, 75) if len(x) != 0 else np.nan),  # Third quartile (Q3)
                ("IQR", lambda x: (np.percentile(x, 75) - np.percentile(x, 25)) if len(x) != 0 else np.nan),  # Interquartile range (IQR)
                ("Autocorr", lambda x: x.autocorr() if len(x) > 1 else np.nan),  # Autocorrelation
                ("Skewness", lambda x: x.skew() if len(x) != 0 else np.nan),  # Skewness
                ("Kurtosis", lambda x: x.kurt() if len(x) != 0 else np.nan),  # Kurtosis
                ("Mean", lambda x: x.mean() if len(x) != 0 else np.nan),  # Mean
                ("Median", lambda x: x.median() if len(x) != 0 else np.nan),  # Median
                ("Std", lambda x: x.std() if len(x) != 0 else np.nan),  # Standard deviation
                ("Min", lambda x: x.min() if len(x) != 0 else np.nan),  # Minimum
                ("Max", lambda x: x.max() if len(x) != 0 else np.nan)  # Maximum
                ]
        # Registry: dataset name -> CSV file path. A plain dict is used because
        # no default factory was ever supplied, so lookups of missing keys
        # raise KeyError exactly as before.
        self.grid_data = {}
        # Model file used by get_submission() until run_model_selection()
        # replaces it with the name of a freshly trained model.
        self.trained_model_name = 'trained_BestModel_catboost_on_rolling_Intervals_12Stats_0.pkl'

    def create_dataset(self, data_type = 'rolling', train_path = None, test_path = None):
        """Build (or register) the train and test feature tables.

        Args:
            data_type: ``'rolling'`` delegates to
                ``f.convert_rolling_time_series``; any other value delegates
                to ``f.convert_time_series``.
            train_path: source of the training data. When ``None`` nothing is
                built and the fixed file ``'intervals_dataframe0.csv'`` is
                registered under ``'rolling_Intervals_12Stats_0'``.
            test_path: source of the test data. When ``None`` nothing is
                built and the fixed file ``'test_data.csv'`` is registered
                under ``'Test'``.

        Side effects:
            Writes CSV files to the working directory for every path that is
            given and updates ``self.grid_data``.
        """
        if data_type == 'rolling':
            convert = f.convert_rolling_time_series
        else:
            convert = f.convert_time_series
        id_convert_ts_train, id_convert_ts_test = convert(train_path=train_path,
                                                          test_path=test_path)

        if train_path is not None:
            train_data = f.create_dataset(self.features, id_ts= id_convert_ts_train)
            # The running size of the registry serves as the file/name counter.
            train_data_file = f'intervals_dataframe{len(self.grid_data)}.csv'
            train_data_name = f'{data_type}_Intervals_12Stats_{len(self.grid_data)}'
            train_data.to_csv(train_data_file)
            self.grid_data[train_data_name] = train_data_file
        else:
            self.grid_data['rolling_Intervals_12Stats_0'] = 'intervals_dataframe0.csv'

        if test_path is not None:
            test_data = f.create_dataset(self.features, id_ts= id_convert_ts_test)
            test_data_file = 'test_data.csv'
            test_data.to_csv(test_data_file)
            self.grid_data['Test'] = test_data_file
        else:
            self.grid_data['Test'] = 'test_data.csv'

    def run_model_selection(self):
        """Run the model search and refit/save the model for the best dataset.

        Returns:
            pandas.DataFrame built from ``model_selector.results`` and sorted
            by ``'score'`` in ascending order.

        Side effects:
            Sets ``self.trained_model_name`` to the value returned by
            ``ModelsPipline.final_model_fit_save``.
        """
        model_selector = ModelsPipline(
                                    models = self.models_grid,
                                    param_grids= self.param_grids,
                                    dataframes_path = self.grid_data)
        print('Start model selection session')
        model_selector.run()
        results = pd.DataFrame(model_selector.results)
        results = results.sort_values(by='score')
        print('Fit and save final model')
        # sort_values keeps the original index labels, so `.loc[0, ...]` would
        # return the row that was first BEFORE sorting, not the best one.
        # Pick the best row explicitly instead.
        # NOTE(review): assumes a higher 'score' is better -- confirm against
        # the metric used inside ModelsPipline.
        best_label = results['score'].idxmax()
        self.trained_model_name = model_selector.final_model_fit_save(results.loc[best_label, 'dataset'])
        return results

    def get_submission(self):
        """Predict on the registered test table and write ``submission.csv``.

        Returns:
            pandas.DataFrame with columns ``'id'`` and ``'score'``.

        Raises:
            KeyError: if no ``'Test'`` entry is registered in ``grid_data``.
        """
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # model files produced by this pipeline.
        with open(self.trained_model_name, 'rb') as file:
            model = pickle.load(file)
        s_data = pd.read_csv(self.grid_data['Test'], index_col= False)
        print('Prediction')
        # create_dataset() saves the frame with its index, which comes back as
        # an 'Unnamed: 0' column; drop it when present.
        s_data = s_data.drop(columns='Unnamed: 0', errors='ignore')
        predictions = model.predict(s_data.drop(columns='id'))
        submission = pd.DataFrame({
            'id': s_data['id'],
            'score': predictions
            })
        submission.to_csv('submission.csv', index=False)
        return submission



In [5]:
# Build the navigator with its default model grid, parameter grid and features.
# NOTE: the variable name `nagivation` (sic) is kept as-is because other cells
# may refer to it.
nagivation = MainNavigation()
# Both paths default to None, so create_dataset() only registers the fixed file
# names 'intervals_dataframe0.csv' and 'test_data.csv' instead of writing new ones.
nagivation.create_dataset()
# Search over the registered datasets, then predict with the saved model.
result = nagivation.run_model_selection()
submission = nagivation.get_submission()

Start model selection session
run() function


  0%|          | 0/2 [00:00<?, ?it/s]

rolling_Intervals_12Stats_0
intervals_dataframe0.csv




0:	learn: 0.6889418	total: 26.6ms	remaining: 2m 13s
1:	learn: 0.6848577	total: 54.4ms	remaining: 2m 16s
2:	learn: 0.6808734	total: 81.1ms	remaining: 2m 15s
3:	learn: 0.6769875	total: 107ms	remaining: 2m 13s
4:	learn: 0.6730886	total: 133ms	remaining: 2m 12s
5:	learn: 0.6692477	total: 164ms	remaining: 2m 16s
6:	learn: 0.6654212	total: 194ms	remaining: 2m 18s
7:	learn: 0.6618530	total: 222ms	remaining: 2m 18s
8:	learn: 0.6583873	total: 251ms	remaining: 2m 18s
9:	learn: 0.6548464	total: 282ms	remaining: 2m 20s
10:	learn: 0.6514145	total: 315ms	remaining: 2m 22s
11:	learn: 0.6480361	total: 343ms	remaining: 2m 22s
12:	learn: 0.6447166	total: 370ms	remaining: 2m 22s
13:	learn: 0.6415295	total: 399ms	remaining: 2m 22s
14:	learn: 0.6384593	total: 427ms	remaining: 2m 21s
15:	learn: 0.6353839	total: 454ms	remaining: 2m 21s
16:	learn: 0.6323607	total: 487ms	remaining: 2m 22s
17:	learn: 0.6294286	total: 518ms	remaining: 2m 23s
18:	learn: 0.6265715	total: 545ms	remaining: 2m 22s
19:	learn: 0.623659

100%|██████████| 1/1 [17:43<00:00, 1063.63s/it]
100%|██████████| 2/2 [17:44<00:00, 532.41s/it] 


Fit and save final model
rolling_Intervals_12Stats_0
intervals_dataframe0.csv
final_model_fit_save
0:	learn: 0.6891192	total: 37.3ms	remaining: 3m 6s
1:	learn: 0.6850069	total: 72.3ms	remaining: 3m
2:	learn: 0.6810003	total: 104ms	remaining: 2m 52s
3:	learn: 0.6771284	total: 135ms	remaining: 2m 48s
4:	learn: 0.6732808	total: 168ms	remaining: 2m 47s
5:	learn: 0.6693948	total: 202ms	remaining: 2m 48s
6:	learn: 0.6656579	total: 236ms	remaining: 2m 48s
7:	learn: 0.6620890	total: 275ms	remaining: 2m 51s
8:	learn: 0.6584789	total: 308ms	remaining: 2m 50s
9:	learn: 0.6550060	total: 340ms	remaining: 2m 49s
10:	learn: 0.6516392	total: 372ms	remaining: 2m 48s
11:	learn: 0.6483305	total: 402ms	remaining: 2m 47s
12:	learn: 0.6449540	total: 434ms	remaining: 2m 46s
13:	learn: 0.6417054	total: 465ms	remaining: 2m 45s
14:	learn: 0.6384021	total: 503ms	remaining: 2m 47s
15:	learn: 0.6352595	total: 539ms	remaining: 2m 47s
16:	learn: 0.6320526	total: 573ms	remaining: 2m 47s
17:	learn: 0.6292581	total: 60

In [6]:
# Display the results table returned by run_model_selection().
result

Unnamed: 0,dataset,model,score
0,rolling_Intervals_12Stats_0,"(CatBoost, trained_CatBoost_on_rolling_Interva...",0.768714
