In [81]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler

from catboost import *
from xgboost import XGBClassifier
#from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [84]:
class Processing():
    
    def __init__(self, filename = 'train_csv', df_type = 'classification', target = 'target', mode = 'train'):
        
        df = pd.read_csv(filename)
        self.mode = mode
        
        if mode == 'train':
            self.target = df[[target]]
            df = df.drop(tagret, axis=1)
        
        self.df = df
        
        cols = df.columns
        num_columns = list(self.df._get_numeric_data().columns)
        cat_columns = list(set(cols) - set(num_cols))
            
        self.num_columns = num_columns
        self.cat_columns = cat_columns
        
        print('numerical columns: ', len(num_columns))
        print('categorical columns: ', len(cat_columns))
        
    
    def feature_encoding(self):
        
        def feature_transform(self):
            pass
        
        pass
        
    
    def fix_missed_values(self, method = 'pro', nn = 3):
        
        def missings(self):
            return self.df.isna().sum().any()
        
        if not missings:
            print('no missings here')
            return 
        
        match method:
            case "pro":
                #imputing numerical data
                imputer = KNNImputer(n_neighbors=nn, weights="uniform")
                self.df[self.num_columns] = imputer.fit_transform(self.df[self.num_columns])
                
                #imputing categorical data, good only if count of categories not very big
                dummies = pd.get_dummies(self.df)
                imputer = KNNImputer(n_neighbors=nn, weights="uniform")
                dummies[:] = imputer.fit_transform(dummies)
                data = pd.from_dummies(dummies, sep="_")
                self.df = data
                
                print('KNN imputing complete')
            
            case "base":
 
                print('simple imputing complete')
                
            case "mediana":
                
                print('Missed data filling by mediana values complete')
                
            case "mean":
                
                print('Missed data filling by mean values complete')
                
            case "mode":
                
                print('Missed data filling by mode values complete')
                
            case _:
                
                print("incorrect method")
        

    def data_scale(self):
        
        scaler = MinMaxScaler()
        self.df = pd.DataFrame(scaler.fit_transform(self.df), columns = self.df.columns)
        
    
    def get_noisy_features(self):
        
        def create_models():
            
            cb = CatBoostClassifier(
                iterations = 100,
                max_depth = 7,
                learning_rate = 0.07,
                random_strength = 1,
                l2_leaf_reg = 5,
                random_state = 63,
                verbose = False
            )
            
            rf = RandomForestClassifier(
                n_estimators = 300,
                max_depth = 24,
                criterion = 'gini',
                n_jobs = -1,
                random_state = 63,
            )

            lr = LogisticRegression(
                solver = 'saga',
                C = 0.5,
            )

            knn = KNeighborsClassifier(
                n_neighbors = 15,
                metric = 'euclidean',
                weights = 'distance',
            )

            pc = MLPClassifier(
                hidden_layer_sizes = 100,
                activation = 'relu',
                learning_rate = 'adaptive',
                solver = 'adam',
            )
            
            return [cb, rf, lr, knn, pc,]
            
        """
        generate noise to check features with importance lower than noises
        df_with_noise
        """
        
        importances = dict()
        
        for model in create_models():
            model.fit(df_with_noise, self.target)
            importances[type(model)] = model.feature_importance()
        
        for model, f_importances in importances.items():
            """
            check features with low importance
            """
        
        
        return features_to_delete
    
    
    def delete_features():
        pass
        


In [None]:
class Metamodel():
    """Stacking ensemble: several base classifiers feeding a meta-classifier ("head").

    The base models are fitted with ``fit_models``; the head is then fitted on
    their stacked class predictions with ``fit_head`` and applied to new data
    with ``meta_test``.

    Attributes:
        xgb, cb, lgbm, rf, lr, knn, pc: the individual base classifiers
            (``lgbm`` is None when the lightgbm package is not installed).
        models: list of the base classifiers actually used.
        head: meta-classifier trained on the base models' predictions.
    """

    def __init__(self,
                 xg_estimators = 50,
                 cb_estimators = 100,
                 lgbm_estimators = 50,
                 rf_estimators = 300,
                 max_depth_xg = 3,
                 max_depth_cb = 3,
                 max_depth_lgbm = 3,
                 max_depth_rf = 3,
                 lr_xg = 0.1,
                 lr_cb = 0.1,
                 lr_lgbm = 0.1,
                 knneighs = 7,
                 RS=63,
                ):
        """Create the base models and the default logistic-regression head.

        Args:
            xg_estimators, cb_estimators, lgbm_estimators, rf_estimators:
                number of trees/iterations for XGBoost, CatBoost, LightGBM
                and the random forest.
            max_depth_xg, max_depth_cb, max_depth_lgbm, max_depth_rf:
                maximum tree depth of the respective model.
            lr_xg, lr_cb, lr_lgbm: learning rates of the boosting models.
            knneighs: number of neighbours for the KNN classifier.
            RS: random seed shared by the models that accept one.
        """
        self.xgb = XGBClassifier(
            n_estimators = xg_estimators,
            max_depth = max_depth_xg,
            colsample_bytree=0.5,
            learning_rate=lr_xg,
            reg_alpha = 0.6,
            random_state=RS,
            # `verbosity` is the constructor parameter; `verbose` is not one.
            verbosity = 0
        )

        self.cb = CatBoostClassifier(
            iterations = cb_estimators,
            max_depth = max_depth_cb,
            learning_rate = lr_cb,
            random_strength = 1,
            l2_leaf_reg = 3,
            random_state = RS,
            verbose = False
        )

        # The module-level lightgbm import is commented out in this notebook,
        # so resolve the class here and degrade gracefully when it is missing.
        try:
            from lightgbm import LGBMClassifier
        except ImportError:
            import warnings
            warnings.warn("lightgbm is not installed; Metamodel runs without the LightGBM base model")
            self.lgbm = None
        else:
            self.lgbm = LGBMClassifier(
                n_estimators = lgbm_estimators,
                objective = 'binary',
                max_depth = max_depth_lgbm,
                learning_rate = lr_lgbm,
                reg_lambda = 0.3,
                random_state = RS,
                verbose = -1
            )

        self.rf = RandomForestClassifier(
            n_estimators = rf_estimators,
            max_depth = max_depth_rf,
            criterion = 'gini',
            n_jobs = -1,
            random_state = RS,
        )

        self.lr = LogisticRegression(
            solver = 'saga',
            C = 0.5,
        )

        self.knn = KNeighborsClassifier(
            n_neighbors = knneighs,
            metric = 'euclidean',
            weights = 'distance',
        )

        self.pc = MLPClassifier(
            hidden_layer_sizes = (150,),
            activation = 'relu',
            learning_rate = 'adaptive',
            solver = 'adam',
        )

        candidates = [self.xgb, self.cb, self.lgbm, self.rf, self.lr, self.knn, self.pc,]
        self.models = [model for model in candidates if model is not None]

        self.head = LogisticRegression(
            solver = 'saga',
            C = 0.5,
        )


    def new_head(self, model):
        """Replace the meta-classifier with ``model`` (any estimator with fit/predict)."""
        self.head = model


    def fit_models(self, X, y):
        """Fit every base model on ``X`` and ``y``.

        ``y`` is flattened to 1-D, so a single-column DataFrame is accepted.
        """
        labels = np.ravel(y)
        for model in self.models:
            model.fit(X, labels)


    def _stack_predictions(self, X):
        """Return an (n_samples, n_models) array of the base models' predictions."""
        # ravel() normalises predictions returned as (n, 1) column vectors.
        columns = [np.ravel(model.predict(X)) for model in self.models]
        return np.column_stack(columns)


    def fit_head(self, X, y):
        """Fit the head on the base models' predictions for ``X``.

        The base models must already be fitted. To limit overfitting of the
        head, ``X`` should preferably be data the base models were not
        trained on.

        Returns:
            The head's predictions for ``X``.
        """
        # predict could be swapped for predict_proba, which might let the
        # meta-model perform better, though that is not certain.
        model_predicts = self._stack_predictions(X)

        self.head.fit(model_predicts, np.ravel(y))

        return self.head.predict(model_predicts)


    def meta_test(self, X_test):
        """Predict labels for ``X_test`` with the fitted base models and head."""
        model_predict = self._stack_predictions(X_test)

        return np.array(self.head.predict(model_predict))