In [107]:
import pandas as pd
import numpy as np

In [108]:
dataset = pd.read_excel('check.xlsx')

In [7]:
X = pd.DataFrame(
    {'fea1':[0,0,0,2,2,2,1,1,1],
     'fea2':['a',np.nan,'b','b','b','b',np.nan,'c','c'],
     'fea3':['Магазин','Магазин1','Магази','Что','ЧТОт','ЧТК','ЧАШК','ЧАШКК','чаш']})
Y = pd.DataFrame({'target':[1,1,1,1,0,1,0,0,0]})

In [8]:
X_test = pd.DataFrame(
    {'fea1':[0,0,1,2,5],
     'fea2':['a',np.nan,'b',np.nan,'c'],
     'fea3':['1','1','1','1','1']})
Y_test = pd.DataFrame({'target':[1,1,1,1,0]})

In [145]:
from sklearn.inspection import permutation_importance
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OrdinalEncoder
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from gensim.models import Word2Vec
from typing import List, Callable
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from tpot import TPOTRegressor
import pandas as pd
import numpy as np
import pymorphy2
import shelve
import time
import shap
import os 

snowball = SnowballStemmer(language="russian")
morph = pymorphy2.MorphAnalyzer()
stop_words = stopwords.words("russian")
############################################################################################################

def most_frequency(x:List[List[int]]):
    x_ = [i[0] for i in x]
    return np.argmax(np.bincount(x_))

def lead_time(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result =  func(*args, **kwargs)
        print('lead time {} = {:.3f}'.format(func.__name__, time.time() - start_time))
        return result 
    return wrapper
def shelve_save(save_data:object, key:str, path:str = 'model_property/data'):
    if not os.path.exists("/".join(path.split('/'))):
        os.makedirs("/".join(path.split('/')))
    with shelve.open(path) as save:
        save[key] = save_data

def shelve_load(key:str, path:str = 'model_property/data'):
    with shelve.open(path) as load:
        return load[key]
############################################################################################################
from sklearn.metrics import roc_auc_score
class Conveyor:
    """ Подобие sklearn.Pipeline, адаптированный под простоту и добавленный функционал

    Parameters
    ----------
    *block : object
        Объекты классов, что будут использоваться при обработке, и моделирование

    """
    ################################################################
    def __init__(self, *blocks, **params):
        self.blocks = list(blocks)
    
    @lead_time
    def fit(self, X:pd.DataFrame, Y:pd.DataFrame or pd.Series):
        X_, Y_  = (X.copy(), Y.copy())
        for block in self.blocks[:-1]:
            block.fit(X_, Y_)
            X_, Y_ = self._transform(block, X_, Y_)
        self.blocks[-1].fit(X_, Y_)
        return X_, Y_

    @lead_time
    def fit_transform(self, X:pd.DataFrame, Y:pd.DataFrame or pd.Series):
        X_, Y_  = (X.copy(), Y.copy())
        for block in self.blocks:
            block.fit(X_, Y_)
            X_, Y_ = self._transform(block, X_, Y_)
        return X_, Y_
    ################################################################
    @lead_time
    def transform(self, X:pd.DataFrame, Y:pd.DataFrame or pd.Series = pd.DataFrame()):
        X_, Y_  = (X.copy(), Y.copy())
        for block in self.blocks[:-1]:
            X_, Y_ = self._transform(block, X_, Y_)
        return X_, Y_

    def _transform(self, block, X:pd.DataFrame, Y:pd.DataFrame or pd.Series = pd.DataFrame()):
        X = block.transform(X)
        if not Y.empty and 'target_transform' in dir(block):
            Y = block.target_transform(Y)
        return X, Y

    ################################################################

    @lead_time
    def predict(self, X:pd.DataFrame):
        return self.blocks[-1].predict(self.transform(X.copy())[0])

    ################################################################
    @lead_time
    def score(self,
                X:pd.DataFrame,
                Y:pd.DataFrame or pd.Series,
                sklearn_function:List[str] = ['roc_auc_score', 'r2_score', 'accuracy_score'],
                precision_function:List[Callable] = []):

        X_, Y_ = self.transform(X.copy(), Y.copy())
        result = self.blocks[-1].predict(X_)

        for func in sklearn_function:
            try:
                exec('from sklearn.metrics import ' + func)
                print("function - {} = ".format(func), eval("{}(result, Y_)".format(func)))
            except Exception as e:
                print("function - {} = ERROR: {}".format(func, e))
        for func in precision_function:
            try:
                print("function - {} = ".format(func.__name__), func(result, Y_))
            except Exception as e:
                print("function - {} = ERROR: {}".format(func.__name__, e))
    @lead_time
    def feature_importances(self,
                            X:pd.DataFrame,
                            Y:pd.DataFrame or pd.Series, show:str = 'all'): # all, sklearn, shap
                            
        X_, Y_ = self.transform(X.copy(), Y.copy())
        estimator = self.blocks[-1][-1] if type(self.blocks[-1]) == Pipeline else self.blocks[-1]

        if show == 'all' or show == 'shap':
            explainer = shap.Explainer(estimator)
            shap_values = explainer(X_)
            shap.plots.bar(shap_values[0])

        if show == "all" or show == "sklearn":
            try:
                result = permutation_importance(estimator, X_, Y_, n_repeats=2, random_state=42)
                index = X_.columns if type(X_) == pd.DataFrame else X.columns
                forest_importances = pd.Series(result.importances_mean, index=index)
                fig, ax = plt.subplots(figsize=(20, 10))
                forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
                ax.set_title("Feature importances using permutation on full model")
                ax.set_ylabel("Mean accuracy decrease")
                fig.tight_layout()
                plt.show()
            except Exception as e:
                print('Sklearn plot - ERROR: ', e)
    ################################################################
    @lead_time
    def fit_model(self, 
                    X:pd.DataFrame, Y:pd.DataFrame or pd.Series,
                    type_model:str = 'regressor', estimator:bool = False,
                    generations:int = 5, population_size:int = 50, n_jobs:int = -1):

        tpot = TPOTRegressor(generations=1, population_size=20, n_jobs = -1, random_state=42)
        if not estimator:
            X_, Y_ = self.fit_transform(X, Y)
        else:
            X_, Y_ = self.fit(X, Y)
        # X_, Y_ = self.fit_transform(X, Y) if not estimator else self.fit(X, Y)
        tpot.fit(X_, Y_)
        make_pipe, import_libs = tpot.export('', get_pipeline=True)

        exec(import_libs)
        tpot_model = eval(make_pipe)
        tpot_model = tpot_model if (type(tpot_model) == Pipeline) else make_pipeline(tpot_model)

        if estimator:
            del self.blocks[-1]
        
        for step in tpot_model:
            self.blocks.append(step)
            self.blocks[-1].fit(X_, Y_)
            if step != tpot_model[-1]:
                X_, Y_ = self._transform(self.blocks[-1], X_, Y_)
            
        self.blocks[-1].fit(X_, Y_)
        print(self.blocks)
    ################################################################
    @lead_time
    def export(self):
        pass

class CategoricalEncoder():
    """ Класс кодирования категориальных данных, с заполнение пропусков на некоторое значение определенное сратегией

    Parameters
    ----------
    columns : List[str]
        Названия столбцов, которые будут подвегнуты обработке

    straegy : str
        Строка указывающая на используемую стратегию заполнения пропусков
    
    fill_value : float or str
        Заполнитель, которым будут заполняться пропущенные значения,
        при использование стратегии const
        
    """
    encoder = {}

    def __init__(self, columns:List[str], strategy:str='mean', fill_value:float or str = np.nan): # strategy in mean, median, most_frequency, const, iterative inputer?
        self.columns = columns
        self.fill_value = {'mean':np.mean, 'median':np.median, 'most_freq':most_frequency, 'const':(lambda x:fill_value)}
        self.fill_value = self.fill_value[strategy]

    def fit(self, X:pd.DataFrame, Y:pd.DataFrame or pd.Series):
        for column in self.columns:
            self.encoder[column] = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)
            X_fit = pd.DataFrame(X[column].loc[~X[column].isnull()])
            self.encoder[column].fit(X_fit)
            X_transform = self.encoder[column].transform(pd.DataFrame(X_fit))
            self.encoder[column].unknown_value = self.fill_value(X_transform)
        shelve_save(self.encoder, 'CategoricalEncoder')
        return self
    def transform(self, X:pd.DataFrame, Y:pd.DataFrame or pd.Series = None):
        self.encoder = shelve_load('CategoricalEncoder')
        for column in self.columns:
            X[column] = self.encoder[column].transform(pd.DataFrame(X[column].fillna('NAN')))
        return X

class Word2Vectorization():

    len_sentence = {} 
    mean_word = {} 
    word2 = {} 

    def __init__(self, columns:List[str], level_formatting:int = 0):
        self.columns = columns

    def fit(self, X:pd.DataFrame, Y:pd.DataFrame or pd.Series):
        for column in self.columns: 
            filtered = [self.refactor_string(str(i)) for i in X[column]]
            self.word2[column] = Word2Vec(sentences=filtered, epochs=5000, 
                                    min_count=1, window=5, vector_size=1,
                                    sg=1, cbow_mean=1, alpha=0.1,
                                    seed=self.seed)

            word_vac = [self.word2[column].wv[i] for i in self.word2[column].wv.key_to_index.values()]
            self.len_sentence[column] = max([len(sentence) for sentence in filtered]) 
            self.mean_word[column] = np.mean(word_vac)                              
        return self
    def transform(self, X:pd.DataFrame, Y:pd.DataFrame or pd.Series = None):
        
        return X

    def refactor_string(self, string:str)->List[str]:
        string = string if str(string) != 'nan' else ""                          # Проверка на NAN
        string = word_tokenize(str(string).lower())                              # Нижний регистр и токенизация
        if self.level_formatting > 0:
            string = [i for i in string if i.isalpha()]                              # Избавления от знаков пунктуации
            if self.level_formatting > 1:
                string = [i for i in string if not i in stop_words]                      # Избавления от стоп слов
                if self.level_formatting > 2:
                    string = [snowball.stem(morph.parse(i)[0].normal_form) for i in string]  # СТЭММИНГ и ЛЕММАТИЗАЦИЯ
        return string

    def mean_word2vec(self, sentence:str, column:str) ->List[float]:
        vector = self.refactor_string(sentence)
        vector = [self.word2[token] for token in vector if token in self.word2.wv.key_to_index.keys()]
        return np.mean(vector) if vector != [] else 0
################################################################################################
    ## Выборка слов из датасета, и подача их на вход 
    def fit(self, X:pd.DataFrame, y:pd.Series or List[float]):
        for column in self.columns: 
            filtered = [self.refactor_string(str(i)) for i in X[column]]
            self.word2[column] = Word2Vec(sentences=filtered, epochs=5000, 
                                    min_count=1, window=5, vector_size=1,
                                    sg=1, cbow_mean=1, alpha=0.1,
                                    seed=self.seed)

            word_vac = [self.word2[column].wv[i] for i in self.word2[column].wv.key_to_index.values()]
            self.len_sentence[column] = max([len(sentence) for sentence in filtered]) 
            self.mean_word[column] = np.mean(word_vac)                                  

        if not os.path.exists('model_new_property'):
            os.makedirs('model_new_property')
        with open('model_new_property/len_sentence', 'wb') as file:
            pickle.dump(self.len_sentence, file)
        with open('model_new_property/mean_word', 'wb') as file:
            pickle.dump(self.mean_word, file)
        with open('model_new_property/word2', 'wb') as file:
            pickle.dump(self.word2, file)

        return self

    def transform(self, X:pd.DataFrame, y = None)->pd.Series:
        with open('model_new_property/len_sentence', 'rb') as file:
            self.len_sentence = pickle.load(file)
        with open('model_new_property/mean_word', 'rb') as file:
            self.mean_word = pickle.load(file)
        with open('model_new_property/word2', 'rb') as file:
            self.word2 = pickle.load(file)

        for column in self.columns:
            X[column] = list([self.get_vector_sentence(val, column) for val in X[column]])
        return X

# class Imputer():
class user_transform():
    def __init__(self):
        pass
    def fit(self, X:pd.DataFrame, Y:pd.DataFrame or pd.Series):
        return self
    def transform(self, X:pd.DataFrame, y = None) -> pd.DataFrame:
        X = X[['closed_credits_count', 'ubki_week_queries','loan_amount','loan_days',
        'ubki_email_deltatime','ubki_phone_deltatime','ubki_maxnowexp','ubki_expyear',
        'marital_status_id']]
        X = X.fillna(0)
        return X

    def target_transform(self, dataset:pd.DataFrame) -> pd.Series or List[float or int]:
        y = dataset['user_id']
        return y
    # def fit(self, X:pd.DataFrame, Y:pd.DataFrame or pd.Series):
    #     return self

    # def transform(self, X:pd.DataFrame):
    #     return X
    
    # def target_transform(self, Y:pd.DataFrame or pd.Series):
    #     Y['target'] = [Y.loc[i, 'target']*-1 for i in range(len(Y['target']))]
    #     return Y


In [148]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.pipeline import make_pipeline
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.kernel_approximation import RBFSampler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive 

pipe1 = make_pipeline(StandardScaler())

model = Conveyor(user_transform(),
                CategoricalEncoder(columns=['marital_status_id']),
                # pipe1,
                # pipe2
                RandomForestRegressor()
                 )

In [149]:
# model.fit(X, Y)
model.fit(dataset[:60], dataset[:60])
print()

lead time fit = 0.222



In [156]:
model.fit_model(dataset[:60], dataset[:60], estimator = True)

lead time fit = 0.156
[<__main__.user_transform object at 0x000002255BF0B130>, <__main__.CategoricalEncoder object at 0x000002255BF0B4F0>, Binarizer(threshold=0.65), StackingEstimator(estimator=ElasticNetCV(l1_ratio=0.8500000000000001, tol=0.1)), RidgeCV(alphas=array([ 0.1,  1. , 10. ]))]
lead time fit_model = 6.999


In [152]:
model.blocks

[<__main__.user_transform at 0x2255bf0b130>,
 <__main__.CategoricalEncoder at 0x2255bf0b4f0>,
 Binarizer(threshold=0.65),
 RidgeCV(alphas=array([ 0.1,  1. , 10. ]))]

In [153]:
model.feature_importances(dataset[40:], dataset[40:])

lead time transform = 0.007


Exception: The passed model is not callable and cannot be analyzed directly with the given masker! Model: RidgeCV(alphas=array([ 0.1,  1. , 10. ]))

In [105]:
import pickle
with open('model_' , 'wb') as save_model:
    pickle.dump(model, save_model)