In [4]:
import sys
import pandas as pd
import pickle
from AMLpp.transformers import *
from AMLpp.conveyor import *
from AMLpp.architect import *

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\analytic6\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\analytic6\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Load the data set; the cells below pass this single frame both as features and as target source.
df = pd.read_excel('test.xlsx')

In [6]:
import sys
sys.path.insert(0,'C:\\Users\\analytic6\\Desktop\\Work Space Analitic 6 (Asir)')
sys.path.insert(0,'C:\\Users\\User\\Desktop\\work')

from AMLpp.conveyor import Conveyor

from typing import List
import pandas as pd
import pickle 
import os

class Experimenter():
    """Keeps a model, its description and its test results together on disk.

    Everything is stored in ``experiments/<experiment>`` relative to the
    current working directory:

    * ``model``    - the pickled model,
    * ``desc.txt`` - the description followed by the score report of every test run,
    * ``<testset_name>.xlsx`` - target and prediction of a test run.

    Parameters
    ----------
    experiment : str
        Name of the experiment (used as the directory name). If the directory
        already exists, the stored model - when there is one - is loaded.
    """

    def __init__(self, experiment:str):
        self.path_experiment = "experiments/" + experiment
        if not os.path.exists(self.path_experiment):
            os.makedirs(self.path_experiment)
            self.model = None
        else:
            self.model = self._load_model()
            # Compare with None: a model object may legitimately be falsy
            # (anything defining __len__ / __bool__).
            print("load model successful!" if self.model is not None else "model not found!")

    def create_experiment(self, 
                            model:'Conveyor', 
                            description:str,
                            trainset:str, 
                            X_test:pd.DataFrame = None, 
                            Y_test:pd.DataFrame = None,
                            testset_name:str = "",
                            feature_importances:bool = True,
                            X_test_features:List[str] = None):
        """Store ``model`` and its description, then optionally run a first test.

        Parameters
        ----------
        model : Conveyor
            Model to pickle into ``<experiment>/model``.
        description : str
            Free text; the train set name and ``repr(model)`` are appended and
            the result overwrites ``desc.txt``.
        trainset : str
            Name of the train set, recorded in the description.
        X_test, Y_test, testset_name, feature_importances, X_test_features
            Forwarded to :meth:`make_experiment`, which is only called when
            ``X_test`` is a DataFrame.
        """
        with open(self.path_experiment + "/model", 'wb') as file:
            pickle.dump(model, file)
        self.model = model
        description += "\ntrainset = {}".format(trainset)
        description +=  "\n" + repr(self.model)
        self.add_description(description, 'w')
        if isinstance(X_test, pd.DataFrame):
            self.make_experiment(X_test, Y_test, testset_name,
                        feature_importances = feature_importances, X_test_features = X_test_features)

    def make_experiment(self, 
                            X_test:pd.DataFrame,
                            Y_test:pd.DataFrame = None, 
                            testset_name:str = "", 
                            add_description:str = "", 
                            feature_importances:bool = True,
                            X_test_features:List[str] = None):
        """Score the stored model on a test set and persist the outcome.

        The score report is appended to ``desc.txt`` and printed, the target
        and the prediction (plus the ``X_test_features`` columns, if given) are
        written to ``<testset_name>.xlsx`` and, when ``feature_importances`` is
        true, ``model.feature_importances`` is asked to save its plots using
        ``<experiment>/<testset_name>.jpeg`` as ``name_plot``.

        Parameters
        ----------
        X_test : pd.DataFrame
            Test features passed to ``model.score``.
        Y_test : pd.DataFrame, optional
            Test target passed to ``model.score``.
        testset_name : str
            Label of the test set; also the stem of the generated files.
        add_description : str
            Extra text appended after the score report.
        feature_importances : bool
            Whether to produce the feature-importance plots.
        X_test_features : list of str, optional
            Columns of ``X_test`` copied into the result sheet.
        """
        if self.model is None:
            print("You need to start to the experiment !")
            print("Connect to existing experimnet or create experiment !")
            return

        score, pred, Y = self.model.score(X_test, Y_test, _return = True) 
        description =  '\n' +"*"*60
        description += "\ntestset = " + testset_name
        description += "\n" + score
        description += add_description
        self.add_description(description)
        print(description)

        # .copy(): columns are added below, so work on an independent frame
        # instead of a selection of the caller's X_test.
        result_data = X_test[X_test_features].copy() if X_test_features else pd.DataFrame()
        result_data['target'] = Y
        result_data['result'] = pred
        result_data.to_excel(self.path_experiment + "/{}.xlsx".format(testset_name))

        if feature_importances:
            plot_path = self.path_experiment + "/{}.jpeg".format(testset_name)
            self.model.feature_importances(X_test, Y_test, save = True, name_plot = plot_path)

    def add_description(self, add_description:str, mod:str = "a"):
        """Write ``add_description`` to ``desc.txt``; ``mod`` is the file mode ('a' appends, 'w' overwrites)."""
        with open(self.path_experiment + "/desc.txt", mod, encoding="utf-8") as file:
            file.write(add_description)

    def _load_model(self) -> 'Conveyor':
        """Return the unpickled ``<experiment>/model`` or None when the file does not exist."""
        path_model = self.path_experiment + "/model"
        if not os.path.exists(path_model):
            return None
        # SECURITY: pickle.load executes arbitrary code from the file - only
        # open experiment directories that come from a trusted source.
        with open(path_model, 'rb') as file:
            return pickle.load(file)

In [37]:
from sklearn.inspection import permutation_importance
from sklearn.pipeline import Pipeline, make_pipeline

import sys
sys.path.insert(0,'C:\\Users\\analytic6\\Desktop\\Work Space Analitic 6 (Asir)')
sys.path.insert(0,'C:\\Users\\User\\Desktop\\work')

from typing import List, Callable

import matplotlib.pyplot as plt

from tpot import TPOTRegressor

from matplotlib.pyplot import figure

from datetime import datetime
import pandas as pd
import warnings
import pickle
import shap

import tqdm 

##############################################################################
class Conveyor:
    """A sklearn.Pipeline look-alike, kept simple and extended with extra functionality.

    ``transform``, ``predict`` and ``score`` treat every block except the last
    one as a transformer (``fit`` / ``transform`` and, optionally,
    ``target_transform``) and the last block as the estimator
    (``fit`` / ``predict``).

    Parameters
    ----------
    *blocks : object
        Instances of the classes used for preprocessing and modelling, in the
        order in which they are applied.
    **params
        Accepted but not used.
    """
    ##############################################################################
    def __init__(self, *blocks, **params):
        self.blocks = list(blocks) # transformers followed by the estimator
        self.iter = 0              # cursor used by __next__
        # NOTE: this silences warnings for the whole process, not only for this object.
        warnings.filterwarnings('ignore')
    
    def __repr__(self):
        header = self.__class__.__name__ + "= (\n"
        indent = " " * (len(header) - 1)
        body = ", \n".join(indent + repr(block) for block in self.blocks)
        return "{}{}\n{} )".format(header, body, indent)

    def __iter__(self):
        """Iterate over the blocks in execution order."""
        return iter(self.blocks)

    def __next__(self):
        """Return the block under the internal cursor and advance it.

        Raises StopIteration (and rewinds the cursor) once all blocks were returned.
        """
        if self.iter < len(self.blocks):
            block = self.blocks[self.iter]
            self.iter += 1
            return block
        self.iter = 0
        raise StopIteration

    def __getitem__(self, key):
        """Return one block for an integer key or a new conveyor for a slice."""
        if isinstance(key, slice):
            # unpack: the constructor takes the blocks as separate arguments
            return self.__class__(*self.blocks[key])
        return self.blocks[key]
    ##############################################################################
    def fit(self, X:pd.DataFrame,
                  Y:pd.DataFrame or pd.Series,
                  feature_importances:str = False):
        """Fit all blocks; optionally plot feature importances on the train data.

        Parameters
        ----------
        X, Y : features and target of the train set.
        feature_importances : truthy to call :meth:`feature_importances`
            on the already transformed train data.
        """
        X_, Y_ = self._fit(X, Y)
        if feature_importances:
            # _fit returns the data exactly as the estimator saw it, so it must
            # not be pushed through the transformers a second time.
            self.feature_importances(X_, Y_, transform = False)

    def fit_transform(self, X:pd.DataFrame, Y:pd.DataFrame or pd.Series):
        """Fit and apply *every* block (including the last one); return the transformed ``(X, Y)``."""
        X_, Y_  = (X.copy(), Y.copy())
        for block in self.blocks:
            block.fit(X_, Y_)
            X_, Y_ = self._transform(block, X_, Y_)
        return X_, Y_

    def _fit(self, X:pd.DataFrame, Y:pd.DataFrame or pd.Series):
        """Fit/apply all blocks but the last, fit the last one; return the data it was fitted on."""
        X_, Y_  = (X.copy(), Y.copy())

        pbar = tqdm.tqdm(self.blocks[:-1])
        for block in pbar:
            pbar.set_postfix({'transform': block.__class__.__name__})
            block.fit(X_, Y_)
            X_, Y_ = self._transform(block, X_, Y_)
        pbar.close()
        
        self.blocks[-1].fit(X_, Y_)
        return X_, Y_
    ##############################################################################
    def transform(self,
                        X:pd.DataFrame,
                        Y:pd.DataFrame or pd.Series = pd.DataFrame()):
        """Apply all blocks but the last to copies of ``X`` and ``Y``.

        ``Y`` may be omitted or None; it is then an empty DataFrame and no
        ``target_transform`` is called.
        """
        if Y is None:
            Y = pd.DataFrame()
        X_, Y_  = (X.copy(), Y.copy())
        for block in self.blocks[:-1]:
            X_, Y_ = self._transform(block, X_, Y_)
        return X_, Y_

    def _transform(self, 
                        block:Callable,
                        X:pd.DataFrame,
                        Y:pd.DataFrame or pd.Series = pd.DataFrame()):
        """Apply one block: ``transform`` on X and, for a non-empty Y, ``target_transform`` if the block has it."""
        X = block.transform(X)
        if not Y.empty and hasattr(block, 'target_transform'):
            Y = block.target_transform(Y)
        return X, Y
    ##############################################################################
    def predict(self, X:pd.DataFrame):
        """Transform ``X`` and return the prediction of the last block."""
        X_, _ = self.transform(X)
        return self.blocks[-1].predict(X_)
    ##############################################################################
    # @lead_time
    def score(self,
                X:pd.DataFrame,
                Y:pd.DataFrame or pd.Series,
                sklearn_function:List[str] = ['roc_auc_score', 'r2_score', 'accuracy_score'],
                precision_function:List[Callable] = [],
                _return:bool = False):
        """Evaluate the conveyor and build a text report, one line per metric.

        Parameters
        ----------
        X : pd.DataFrame
            Features; transformed by all blocks but the last.
        Y : pd.DataFrame or pd.Series
            Target; transformed by the blocks that define ``target_transform``.
        sklearn_function : list of str
            Names of functions in ``sklearn.metrics``, called as ``f(Y, prediction)``.
        precision_function : list of callables
            Extra metrics, called as ``f(Y, prediction)``.
        _return : bool
            If true return ``(report, prediction, transformed Y)``, otherwise
            print the report and return None.

        A metric that raises is reported as ``ERROR: <message>`` instead of
        aborting the evaluation. The default lists are never mutated.
        """
        import importlib  # local import: only needed to resolve metric names

        X_, Y_ = self.transform(X, Y)
        result = self.blocks[-1].predict(X_)
        score = ""
        for func in sklearn_function:
            try:
                # Look the metric up by name instead of exec/eval: that relied on
                # exec writing into the function's locals, which Python 3.13
                # (PEP 667) no longer does.
                metric = getattr(importlib.import_module('sklearn.metrics'), func)
                score += "function - {} = {}\n".format(func, metric(Y_, result))
            except Exception as e:
                score += "function - {} = ERROR: {}\n".format(func, e)
        for func in precision_function:
            try:
                # += (not =): keep the lines gathered so far
                score += "function - {} = {}\n".format(func.__name__, func(Y_, result))
            except Exception as e:
                score += "function - {} = ERROR: {}\n".format(func.__name__, e)

        if _return:
            return score, result, Y_
        else:
            print(score)

    def feature_importances(self,
                            X:pd.DataFrame,
                            Y:pd.DataFrame or pd.Series, 
                            show:str = 'all', # all, sklearn, shap
                            save:bool = True,
                            name_plot:str = "",
                            transform = True): 
        """Plot SHAP and/or permutation feature importances of the estimator.

        Parameters
        ----------
        X, Y : data to explain.
        show : 'all', 'shap' or 'sklearn' - which plots to draw.
        save : save the plots as ``<name_plot>_shap.jpeg`` / ``<name_plot>_sklearn.jpeg``.
        name_plot : file name prefix; a timestamp is used when empty.
        transform : if true ``X``/``Y`` are first passed through :meth:`transform`;
            pass False when they are already in the form the estimator expects.

        Each plot is best-effort: a failure is printed and does not raise.
        """
        if transform:
            X_, Y_ = self.transform(X, Y)
        else:
            X_, Y_ = X, Y
        # When the last block is itself a sklearn Pipeline, its final step is explained.
        last_block = self.blocks[-1]
        estimator = last_block[-1] if isinstance(last_block, Pipeline) else last_block

        if save and name_plot == "":
            name_plot = datetime.now().strftime("%Y-%m-%d_%M")

        if show == 'all' or show == 'shap':
            try:
                explainer = shap.Explainer(estimator)
                shap_values = explainer(X_)
                
                shap.plots.bar(shap_values[0], show = False)
                if save:
                    plt.savefig('{}_shap.jpeg'.format(name_plot), dpi = 150,  pad_inches=0)
                plt.show()
            except Exception as e:
                print('shap plot - ERROR: ', e)

        if show == "all" or show == "sklearn":
            try:
                result = permutation_importance(estimator, X_, Y_, n_repeats=2, random_state=42)
                index = X_.columns if isinstance(X_, pd.DataFrame) else X.columns
                forest_importances = pd.Series(result.importances_mean, index=index)
                fig, ax = plt.subplots(figsize=(20, 10))
                forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
                ax.set_title("Feature importances using permutation on full model")
                ax.set_ylabel("Mean accuracy decrease")
                fig.tight_layout()
                if save:
                    plt.savefig('{}_sklearn.jpeg'.format(name_plot))
                plt.show()
            except Exception as e:
                print('Sklearn plot - ERROR: ', e)
    ##############################################################################
    def fit_model(self, 
                    X:pd.DataFrame, Y:pd.DataFrame or pd.Series,
                    type_model:str = 'regressor',
                    estimator:bool = True,
                    only_estimator:bool = True,
                    export_model:str = "default",
                    show_best_estimators:bool = True,
                    generations:int = 5, population_size:int = 50, n_jobs:int = -1):
        """Search a model with TPOT and install it as the tail of the conveyor.

        Parameters
        ----------
        X, Y : train data.
        type_model : currently unused; a ``TPOTRegressor`` is always created.
        estimator : True when the conveyor already ends with an estimator; it is
            fitted via ``_fit`` and then replaced. False when all current blocks
            are transformers (``fit_transform`` is used).
        only_estimator : append only the last step of the TPOT pipeline instead
            of all its steps.
        export_model : file to pickle the conveyor to; "default" builds a
            timestamped name, "" disables the export.
        show_best_estimators : print the TPOT pipeline and the resulting conveyor.
        generations, population_size, n_jobs : forwarded to ``TPOTRegressor``.
        """
        tpot = TPOTRegressor(generations=generations, 
                             population_size=population_size,
                             n_jobs = n_jobs,
                             random_state=42)
                            
        X_, Y_ = self.fit_transform(X, Y) if not estimator else self._fit(X, Y)
        print('start fit model !!!!')
        tpot.fit(X_, Y_)
        # NOTE(review): export(..., get_pipeline=True) returning (code, imports) is
        # not part of the stock TPOT API - presumably a patched version; confirm.
        make_pipe, import_libs = tpot.export('', get_pipeline=True)

        # SECURITY: the strings come from TPOT and are executed as code.
        # Run them in one explicit namespace (seeded with the module globals):
        # exec() inside a function cannot reliably create locals for a later
        # eval(), and not at all since Python 3.13 (PEP 667).
        namespace = dict(globals())
        exec(import_libs, namespace)
        tpot_model = eval(make_pipe, namespace)
        if not isinstance(tpot_model, Pipeline):
            tpot_model = make_pipeline(tpot_model)
        if show_best_estimators:
            print('BEST TPOT:\n' + str(tpot_model))

        if estimator:
            del self.blocks[-1]
        
        final_step = tpot_model[-1]
        for step in tpot_model if not only_estimator else tpot_model[-1:]:
            self.blocks.append(step)
            step.fit(X_, Y_)
            if step is not final_step:
                X_, Y_ = self._transform(step, X_, Y_)
        # The final step was fitted inside the loop on exactly this data, so it
        # is not fitted a second time here.

        if show_best_estimators:
            print('RESULT CONVEYOR:\n'  + str(self))
        if export_model != "":
            if export_model == "default":
                export_model = "model_" + datetime.now().strftime("%Y_%m_%d_m%M")
            with open(export_model, 'wb') as save_file:
                pickle.dump(self, save_file)

In [38]:
from experiments.exp1.user_transform import UserTransform
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Columns handed to CategoricalEncoder below. The last row ('country_det' ... 'brand')
# names columns that UserTransform.transform creates from 'detections' and 'user_agent'.
categorical_columns = ['organization_type_other', 'position_other', 'email', 'ceduc', 'family', 'sstate', 'cgrag', 'income_source_id',
                      'income_frequency_id', 'has_prior_employment', 'empoyees_count_id', 'organization_branch_id', 'organization_type_id', 'position_id',
                      'employment_type_id', 'has_movables', 'has_immovables', 'fact_addr_owner_type_id', 'fact_addr_region_id', 'fact_addr_same', 'addr_owner_type_id',
                      'addr_region_id', 'education_id', 'children_count_id', 'marital_status_id', 'gender_id', 
                      'country_det','city_det', 'region_det', 'isp', 'browser', 'system', 'brand'
                      ]

# Blocks are applied in the listed order; the last one is the estimator.
# NOTE(review): Word2Vectorization, CategoricalEncoder and ImputerIterative are not
# imported by name anywhere in view - presumably they come from the AMLpp star imports; confirm.
model = Conveyor (
                  UserTransform(),
                  Word2Vectorization(columns=['purpose_other'], epochs = 100),
                  CategoricalEncoder(columns=categorical_columns),
                  ImputerIterative(),
                  RandomForestRegressor(random_state=1)
                  )

In [40]:
# Display the conveyor through Conveyor.__repr__.
# NOTE(review): the recorded output ends with ElasticNetCV although the cell above
# builds the conveyor with RandomForestRegressor - the output presumably stems from
# an out-of-order run (this cell is In [40], the fit cell below is In [39]); re-run to confirm.
model

Conveyor= (
           <experiments.exp1.user_transform.UserTransform object at 0x000001C2E0BD0190>, 
           Word2Vectorization(columns=['purpose_other'], level_formatting=1, epochs=100, min_count=1, window=5, vector_size=20), 
           CategoricalEncoder(columns=['organization_type_other', 'position_other', 'email', 'ceduc', 'family', 'sstate', 'cgrag', 'income_source_id', 'income_frequency_id', 'has_prior_employment', 'empoyees_count_id', 'organization_branch_id', 'organization_type_id', 'position_id', 'employment_type_id', 'has_movables', 'has_immovables', 'fact_addr_owner_type_id', 'fact_addr_region_id', 'fact_addr_same', 'addr_owner_type_id', 'addr_region_id', 'education_id', 'children_count_id', 'marital_status_id', 'gender_id', 'country_det', 'city_det', 'region_det', 'isp', 'browser', 'system', 'brand']), 
           ImputerIterative(columns=None, max_iter=10, initial_strategy=mean, missing_values=nan), 
           ElasticNetCV(l1_ratio=0.05, tol=0.1)
            )

In [39]:
# The same frame is passed as X and Y: UserTransform.transform selects the feature
# columns and UserTransform.target_transform derives the target from
# 'overdue_days' / 'status_id'.
# NOTE(review): the recorded output ("start fit model !!!!", "BEST TPOT") is printed
# by Conveyor.fit_model, not by fit - the output is presumably stale; confirm.
model.fit(df, df)

100%|██████████| 4/4 [00:01<00:00,  3.45it/s, transform=ImputerIterative]


start fit model !!!!
BEST TPOT:
Pipeline(steps=[('stackingestimator',
                 StackingEstimator(estimator=GradientBoostingRegressor(alpha=0.85,
                                                                       learning_rate=0.5,
                                                                       loss='lad',
                                                                       max_depth=1,
                                                                       max_features=0.5,
                                                                       min_samples_leaf=12,
                                                                       min_samples_split=18,
                                                                       subsample=0.2))),
                ('elasticnetcv', ElasticNetCV(l1_ratio=0.05, tol=0.1))])
RESULT CONVEYOR:
Conveyor= (
           <experiments.exp1.user_transform.UserTransform object at 0x000001C2E0BD0190>, 
           Word2Vectorization(colum

In [35]:
# Attach to experiments/exp1: the directory is created when missing, otherwise the
# pickled model stored there (if any) is loaded.
experiment = Experimenter("exp1")

load model successful!


In [36]:
# Free-text description stored in desc.txt (kept in the author's language).
description = \
"""
Иморт user_transform из __main__
Расширенная модель для новых пользователей, использует только статус 5 и 6.
Расширения подразумевает исользование user_agent, detection.
"""
# testset_name labels the run and is the stem of the generated result files.
testset_name = 'testset_2_5k_y2021_m6_new'
# Columns of the test frame copied next to target/prediction in the result sheet.
X_test_features = ['backend_application_id', 'overdue_days','status_id']
# Pickle the model, write the description and score it on df (used as both X_test and Y_test).
experiment.create_experiment(model, description, 'test', df, df, testset_name, X_test_features = X_test_features, feature_importances = False)


************************************************************
testset = testset_2_5k_y2021_m6_new
function - roc_auc_score = 0.6094999999999999
function - r2_score = 0.030458245523406147
function - accuracy_score = ERROR: Classification metrics can't handle a mix of binary and continuous targets



In [42]:
# NOTE(review): X_test and y_test are not defined in any cell visible in this
# notebook - presumably left over from a deleted cell or an earlier kernel state;
# define them explicitly so the notebook survives Restart & Run All.
experiment.make_experiment(X_test, y_test, testset_name, X_test_features = X_test_features)


************************************************************
testset = testset_2_5k_y2021_m6_new
function - roc_auc_score = 0.8713586751975914
function - r2_score = -0.38745845517811994
function - accuracy_score = ERROR: Classification metrics can't handle a mix of binary and continuous targets



In [None]:
import pandas as pd
import numpy as np
from ast import literal_eval
from typing import List
import json

# Reference tables already read from disk, keyed by path, so the JSON file is
# parsed once per session instead of once per parsed row.
_USER_AGENT_TABLES = {}

# Substring looked for in the parenthesised part of the user agent -> (system, brand).
# Checked in this order; the first key found in any token wins.
_PLATFORM_RULES = {'windows':('windows', np.nan), 'x11':('linux',np.nan), 
                   'iphone':('iphone', 'apple'), 'ipad':('ipad', 'apple'), 'macintosh':('macos', 'apple')}

# Substring -> phone brand, only consulted for Android user agents.
# Checked in this order; the first key found in any token wins.
_ANDROID_BRANDS = {'samsung':'samsung', 'xiaomi':'xiaomi', 'huawei':'huawei', 'lenovo':'lenovo',
                   'motorola':'motorola', 'nokia':'nokia', 'sony':'sony', 'honor':'huawei', 
                   'tecno':'tecno', 'asus':'asus', 'meizu':'meizu', 'vivo':'vivo', 'neffos':'neffos',
                   'ulefone':'ulefone', 'htc ':'htc', 'pocophone':'poco', 'pixel':'google',
                   'lg':'lg', 'sm':'samsung', 'redmi':'xiaomi', 'oneplus':'huawei', 'htc':'htc',
                   'zte':'zte', 'mi':'xiaomi', 'm200':'xiaomi', 'cph':'oppo', 'moto':'motorola',
                   'rmx':'realme', 'jsn':'huawei','-lx':'huawei', 'yal-':'huawei', 'eml-':'huawei',
                   '-l21':'huawei', '-l29':'huawei', '-l22':'huawei', '-l31':'huawei','psp':'prestigio',
                   '-l09':'huawei', '-l19':'huawei', 'pra-':'huawei', '-l41':'huawei', '-u29':'huawei', 
                   'mz':'meizu', 'u10':'meizu', 'm5':'xiaomi','m6':'xiaomi', 'note':'xiaomi',
                   }

def _load_user_agent_table(path:str = 'user_agent.json'):
    """Return the reference table at ``path`` as a DataFrame, or None if it cannot be used."""
    if path not in _USER_AGENT_TABLES:
        try:
            with open(path, 'r') as load_file:
                table = pd.DataFrame(json.load(load_file))
            table['useragent']  # the lookup below needs this column
        except (OSError, ValueError, KeyError, TypeError):
            # Missing or malformed reference file: fall back to parsing the
            # string. Failures are not cached, so a file added later is picked up.
            return None
        _USER_AGENT_TABLES[path] = table
    return _USER_AGENT_TABLES[path]

def pars_user_agent(user_agent:str):
    """Extract ``[browser, system, brand]`` from a user-agent string.

    A user agent listed in ``user_agent.json`` (column ``useragent``) is
    resolved from that table's ``system`` column. Otherwise the text between
    the first '(' and the first ')' is split on ';' and searched for known
    platform names and, for Android, phone-brand markers.

    Parameters
    ----------
    user_agent : str
        Raw user-agent header; non-string values (e.g. NaN) are accepted.

    Returns
    -------
    list
        ``[browser, system, brand]``; every part that could not be determined
        is ``np.nan``. ``browser`` is only known for agents found in the table.
    """
    browser, system, brand = np.nan, np.nan, np.nan

    if not isinstance(user_agent, str):
        return [browser, system, brand]

    table = _load_user_agent_table()
    if table is not None and user_agent in table['useragent'].values:
        tokens = table[table['useragent'] == user_agent]['system'].values[0].lower().split(' ')
        browser = tokens[0]
        system = 'windows' if tokens[2].find('win') != -1 else tokens[2]
        brand = 'apple' if (system == 'macos') else np.nan
        return [browser, system, brand]

    start, end = user_agent.find('('), user_agent.find(')')
    if start == -1 or end == -1:
        # no parenthesised platform section to inspect
        return [browser, system, brand]
    tokens = user_agent[start + 1:end].lower().split(';')

    for key, (platform, platform_brand) in _PLATFORM_RULES.items():
        if any(key in token for token in tokens):
            return [browser, platform, platform_brand]

    if any('android' in token for token in tokens):
        system = 'android'
        for key, phone_brand in _ANDROID_BRANDS.items():
            if any(key in token for token in tokens):
                brand = phone_brand
                break

    # Unknown platform: system stays NaN (never the lookup table itself).
    return [browser, system, brand]

def pars_detections(detections:str):
    """Extract geo information from the textual ``detections`` field.

    ``detections`` is expected to be the literal of a dict with a ``'geo'``
    entry holding ``'isp'``, ``'country'``, ``'city'`` and ``'region'``.
    Parsing is best-effort: it stops at the first problem (unparsable text,
    missing key, non-numeric region, NaN input) and keeps what was read so far.

    Returns
    -------
    list
        ``[country, city, region, isp]`` with ``np.nan`` for every value that
        could not be read; ``region`` is converted to int.
    """
    country, region, city, isp = np.nan, np.nan, np.nan, np.nan
    try:
        geo = literal_eval(detections)['geo']
        isp = geo['isp']
        country = geo['country']
        city = geo['city']
        region = int(geo['region'])
    except Exception:
        # Deliberately tolerant, but - unlike a `return` inside `finally` -
        # KeyboardInterrupt / SystemExit still propagate.
        pass
    return [country, city, region, isp]

class UserTransform():
    """Project-specific preprocessing block for the conveyor.

    ``transform`` selects and derives the feature columns, ``target_transform``
    builds the binary target from ``overdue_days`` and ``status_id``.
    The block is stateless: ``fit`` does nothing.
    """

    __name__ = 'pars'

    # Columns of the raw frame that transform() keeps as input.
    LEAVE_COLUMNS = ['loan_amount', 'loan_days', 'gender_id', 'marital_status_id', 'children_count_id', 'education_id', 'addr_region_id',
                     'addr_owner_type_id', 'fact_addr_same', 'fact_addr_region_id', 'fact_addr_owner_type_id', 'has_immovables', 'has_movables',
                     'employment_type_id', 'position_id', 'organization_type_id', 'organization_branch_id', 'empoyees_count_id', 'seniority_years',
                     'has_prior_employment', 'monthly_income', 'income_frequency_id', 'income_source_id', 'monthly_expenses', 'other_loans_about_current', 
                     'other_loans_about_monthly', 'product_dpr', 'product_amount_from', 'product_amount_to', 'product_overdue_dpr', 'product_interest_min', 
                     'median_day_credit', 'mean_credit_summ', 'mean_credit_debt', 'last_cdolgn', 'last_wdohod', 'last_wstag', 'cgrag', 'sstate', 'family', 
                     'ceduc', 'ubki_balance_value', 'ubki_score', 'ubki_scorelast', 'ubki_scorelevel', 'ubki_all_credits', 'ubki_open_credits', 
                     'ubki_closed_credits', 'ubki_expyear', 'ubki_maxnowexp', 'ubki_phone_deltatime', 'ubki_email_deltatime', 'ubki_week_queries',
                     'rejected_applications_count', 'mean_loans', 'applied_at', 'purpose_other', 'birth_date', 'passport_date', 'email', 'position_other', 
                     'organization_type_other', 'detections', 'user_agent']

    def __init__(self):
        pass

    def fit(self, X:pd.DataFrame, Y:pd.DataFrame or pd.Series):
        """Nothing to learn; returns self."""
        return self
        
    def transform(self, X:pd.DataFrame, y = None) -> pd.DataFrame:
        """Build the feature frame.

        Steps: keep ``LEAVE_COLUMNS``; turn the literal string ``'[]'`` into
        NaN; reduce ``email`` to the part after the first '@'; replace the
        date columns by year / day / weekday / hour features; expand
        ``detections`` and ``user_agent`` into ``country_det``, ``city_det``,
        ``region_det``, ``isp``, ``browser``, ``system`` and ``brand``.

        ``y`` is accepted but not used. The input frame is not modified.
        """
        X = X[self.LEAVE_COLUMNS]

        X = X.replace('[]', np.nan, regex=False)
        # .str[1] instead of split(expand=True)[1]: the expanded frame has no
        # column 1 (KeyError) when no e-mail in the batch contains '@'.
        X['email'] = X['email'].str.split('@').str[1]

        X['passport_year'] = pd.to_datetime(X['passport_date'], format='%Y-%m-%d', errors='coerce').dt.year
        X['birth_year'] = pd.to_datetime(X['birth_date'], format='%Y-%m-%d', errors='coerce').dt.year

        # NOTE(review): values that do not match '%Y-%m-%d %H' exactly become NaT
        # (errors='coerce') - confirm the source really stores hour precision only.
        X['applied_at'] = pd.to_datetime(X['applied_at'], format='%Y-%m-%d %H', errors='coerce')
        X['applied_day'] = X['applied_at'].dt.day
        X['applied_weekday'] = X['applied_at'].dt.weekday
        X['applied_hour'] = X['applied_at'].dt.hour
        
        X = X.drop(['passport_date', 'birth_date', 'applied_at'], axis = 1)

        X[['country_det', 'city_det', 'region_det', 'isp']] = [pars_detections(val) for val in X['detections']]
        X[['browser', 'system', 'brand']] = [pars_user_agent(val) for val in X['user_agent']]
        
        X = X.drop(['detections', 'user_agent'], axis = 1)
        return X

    def target_transform(self, Y:pd.DataFrame) -> pd.DataFrame or pd.Series or List[float or int]:
        """Derive the binary target from ``overdue_days`` and ``status_id``.

        A row is marked 0 when ``overdue_days`` is positive or ``status_id``
        equals 2, and 1 when ``overdue_days`` is zero or missing (and
        ``status_id`` is not 2). Other values of ``overdue_days`` (negative
        ones) are passed through unchanged.

        Returns a Series; the input frame is not modified.
        """
        # Work on an explicit copy and assign through .loc on the frame: chained
        # `Y[col].loc[mask] = v` triggers SettingWithCopyWarning and is a silent
        # no-op when pandas Copy-on-Write is active.
        Y = Y[['overdue_days', 'status_id']].copy()
        Y['overdue_days'] = Y['overdue_days'].fillna(0)
        Y.loc[Y['overdue_days'] > 0, 'overdue_days'] = 1
        Y.loc[Y['status_id'] == 2, 'overdue_days'] = 1
        # invert the flag built above (1 -> 0, 0 -> 1)
        return Y['overdue_days'].replace({0: 1, 1: 0})