In [1]:
        import pandas as pd
        import numpy as np
        import time
        import seaborn as sns
        import statsmodels.api as sm
        import matplotlib.pyplot as plt
        from pymystem3 import Mystem
        import datasist as ds
        import nltk
        import re
        from scipy import stats
        import shap
        from sklearn.metrics import plot_precision_recall_curve
        from sklearn.metrics import plot_confusion_matrix
        from sklearn import metrics
        from sklearn.preprocessing import StandardScaler
        from sklearn.preprocessing import MinMaxScaler
        from sklearn.preprocessing import Normalizer
        from sklearn.preprocessing import OrdinalEncoder
        from sklearn.preprocessing import OneHotEncoder
        from sklearn.preprocessing import Binarizer
        import category_encoders as ce

In [2]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.svm import SVR,SVC
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.dummy import DummyRegressor, DummyClassifier
import shap


In [3]:
class pipe():
    """
    Namespace of small helpers for data exploration, preprocessing, plotting,
    text cleaning and baseline modelling.

    All helpers are static methods grouped in nested classes and are called
    through the class, e.g. ``pipe.info.getInfo(df)``.

    info

    getColWithNaN - Get all columns from dataframe with NaN values.
    getCatCol - Get categorical (object / category) columns of dataframe
    getNumCol - Get numeric columns of dataframe
    benchmark - returns duration of execution and function result
    getInfo   - short info with columns (datatype, non-null count, null count, missing rate,
                unique values count, useless property)
    getQuantile - get the 1.5 * IQR range, clipped to the observed min / max

    preprocessing

    downcastNumCol - downcast the given numeric columns
    fillnaByGroup - fill NA values with the per-group median / mean according to kind param
    fillnaCat - fill missing values by word 'missing' and replace rare values with 'rare'
    drop_outliers - keep only rows whose value lies inside the 1.5 * IQR range (or a given range)
    drop_useless - drop useless columns like index, id, constants etc.
    convert_dtype - automatically type-cast features that are not represented in their right types
    create_balanced_data - creates a balanced data set from an imbalanced one. This function is strictly used in a classification task
    scale - transform numeric data by params 'no': Normalizer(),
                                             'ss': StandardScaler(),
                                             'mm': MinMaxScaler(),
                                             'bc': Box-Cox with fitted lambda,
                                             'ln': Box-Cox with lambda = 0 (natural log)
    encoder - encode columns by params 'oe': OrdinalEncoder,
                                       'oh': one-hot (pandas.get_dummies),
                                       'bi': Binarizer,
                                       'cb': CatBoostEncoder,
                                       'pn': PolynomialEncoder,
                                       'he': HashingEncoder
    graph

    lineplot - draw sns lineplot
    boxplot - draw sns boxplot
    hist    - draw sns hist
    kdeplot    - draw sns kdeplot
    corrmatrix - draw correlation matrix for numeric columns
    pie     - draw pie plot
    jointplot - draw jointplot
    qqplot   - draw qqplot
    numcolgraph - draw row of plots (kdeplot, boxplot, qqplot, statistic table (modified describe))

    text

    getLemmas - tokenize with nltk.tokenize.word_tokenize, stem with SnowballStemmer
                or lemmatize with Mystem (russian only)
    clear_text_cyr - clear text, left only cyrillic letters
    clear_text_lat - clear text, left only latin letters
    clear_text_cyr_lat - clear text, left only cyrillic and latin letters

    models

    _get_classification_model_list - create list of popular classification models with basic params
    _get_regression_model_list     - create list of popular regression models with basic params
    compare_model - This function takes as argument multiple machine learning models and returns a plot of a comparative metric.
                    This can be used to pick a base model and also to compare models side by side.
                    The compare model returns a tuple of the trained models and their score
    plot_feature_importance - plot feature importance

    analysis

    shap - draw SHAP summary plot for a tree model
    confusion_matrix - draw confusion matrix
    precision_recall_curve - draw precision recall curve
    plotROC - draw AUC ROC plot
    """

    def __init__(self):
        pass

    class info:
        """Read-only helpers that describe a DataFrame or a Series."""

        def __init__(self):
            pass

        @staticmethod
        def _checkforUseless(series):
            """Return True when `series` is constant or a plain 0..n-1 / 1..n counter.

            An empty series is reported as not useless (there is nothing to judge).
            """
            size = len(series)
            if size == 0:
                # iloc[0] below would raise IndexError on an empty series
                return False
            n_distinct = series.nunique()
            if n_distinct == 1:
                return True
            if n_distinct == size:
                # all values distinct: could be an index starting at 0 or at 1
                first = series.iloc[0]
                for start in (0, 1):
                    if first == start and (series == np.arange(start, start + size)).all():
                        return True
            return False

        @staticmethod
        def getQuantile(series):
            """Return (low, high): the 1.5 * IQR fences clipped to the observed min / max.

            The range is also printed.
            """
            q1 = series.quantile(0.25)
            q3 = series.quantile(0.75)
            iqr = q3 - q1
            low = max(q1 - 1.5 * iqr, series.min())
            # the upper fence is measured from q3 (the original tested q1 here)
            high = min(q3 + 1.5 * iqr, series.max())
            print('1.5 interquantile distance: {:.0f}-{:.0f}'.format(low, high))
            return low, high

        @staticmethod
        def benchmark(func, *args, **kwargs):
            """Call `func(*args, **kwargs)` and return (elapsed_seconds, result)."""
            started = time.perf_counter()
            result = func(*args, **kwargs)
            return time.perf_counter() - started, result

        @staticmethod
        def getColWithNaN(df):
            """Return the labels of the columns that contain at least one null."""
            return df.columns[df.isnull().any()]

        @staticmethod
        def getCatCol(df):
            """Return the labels of the columns with dtype object or category."""
            dtypes = df.dtypes
            return df.columns[(dtypes == 'object') | (dtypes == 'category')]

        @staticmethod
        def getNumCol(df):
            """Return the labels of the numeric columns.

            Category, datetime and bool columns are not numeric (the original
            `dtype != 'object'` test reported them as numeric).
            """
            return df.select_dtypes(include='number').columns

        @staticmethod
        def getInfo(df):
            """Return a per-column summary table indexed by column name.

            Columns: DataType, Values (non-null count), Null, Missing Rate
            (nulls / number of rows, formatted as a percentage), Unique, Useless.
            """
            null_mask = df.isnull()
            info = pd.DataFrame(
                {
                    'DataType': df.dtypes,
                    'Values': df.count(),
                    'Null': null_mask.sum(),
                    # share of ALL rows that are null, not nulls / non-nulls
                    'Missing Rate': null_mask.mean().map('{:.2%}'.format),
                    'Unique': df.nunique(),
                    'Useless': [pipe.info._checkforUseless(df[col]) for col in df.columns],
                },
                index=df.columns,
            )
            return info

    class preprocessing():
        """Helpers that clean or transform a DataFrame."""

        def __init__(self):
            pass

        @staticmethod
        def downcastNumCol(df, cols):
            """Downcast the int / float columns listed in `cols` in place; return `df`."""
            for col in cols:
                dtype_name = str(df[col].dtype)
                if 'int' in dtype_name:
                    target = "integer"
                elif 'float' in dtype_name:
                    target = "float"
                else:
                    continue
                df[col] = pd.to_numeric(df[col], downcast=target)
            return df

        @staticmethod
        def fillnaByGroup(df, col_to_fill, group, kind='median'):
            """Fill nulls of `col_to_fill` with the per-`group` median or mean.

            `df` is modified in place and returned.

            Raises:
                ValueError: if `col_to_fill` is not numeric or `kind` is not
                    'median' / 'mean'.
            """
            if col_to_fill not in pipe.info.getNumCol(df):
                raise ValueError('col_to_fill must be numeric')
            if kind not in ('median', 'mean'):
                raise ValueError("Only kind in ('median','mean') available")
            group_values = df.groupby(group)[col_to_fill].transform(kind)
            # write back into the requested column (the original targeted a
            # hard-coded "distance" column)
            df[col_to_fill] = df[col_to_fill].fillna(group_values)
            return df

        @staticmethod
        def fillnaCat(df, thr=0.005):
            """Replace nulls with 'missing' and values with share <= `thr` with 'rare'.

            Applies to every object / category column; those columns end up
            with dtype object. `df` is modified in place and returned.
            """
            for col in pipe.info.getCatCol(df):
                values = df[col].astype('object')
                values = values.where(values.notna(), 'missing')
                share = values.map(values.value_counts(normalize=True))
                df[col] = values.where(share > thr, 'rare')
            return df

        @staticmethod
        def drop_outliers(df, col, min_max_range=None):
            """Return the rows of `df` whose `col` value lies inside the allowed range.

            With `min_max_range=None` the range comes from `pipe.info.getQuantile`
            and is inclusive: those bounds are clipped to the observed min / max,
            so an exclusive test would throw away the legitimate extreme rows.
            An explicit `(low, high)` range keeps its original exclusive meaning.
            """
            if min_max_range is None:
                low, high = pipe.info.getQuantile(df[col])
                keep = (df[col] >= low) & (df[col] <= high)
            else:
                low, high = min_max_range[0], min_max_range[1]
                keep = (df[col] > low) & (df[col] < high)
            return df[keep]

        @staticmethod
        def drop_useless(df):
            """Return `df` without the columns flagged by `pipe.info._checkforUseless`."""
            useless = [col for col in df.columns if pipe.info._checkforUseless(df[col])]
            return df.drop(columns=useless)

        @staticmethod
        def convert_dtype(df):
            """Delegate to `datasist.feature_engineering.convert_dtype`."""
            return ds.feature_engineering.convert_dtype(df)

        @staticmethod
        def create_balanced_data(df, target, categories, class_sizes):
            """Delegate to `datasist.feature_engineering.create_balanced_data`."""
            return ds.feature_engineering.create_balanced_data(
                data=df, target=target, categories=categories, class_sizes=class_sizes)

        @staticmethod
        def scale(df, cols=None, kind='ss'):
            """Scale `cols` of `df` in place and return `df`.

            kind: 'no' Normalizer, 'ss' StandardScaler, 'mm' MinMaxScaler,
                  'bc' Box-Cox with a fitted lambda, 'ln' Box-Cox with lambda = 0.
            An unknown `kind` prints a message and returns None. An empty
            `cols` returns `df` untouched.
            """
            if cols is None:
                cols = []
            elif isinstance(cols, str):
                cols = [cols]
            else:
                cols = list(cols)

            class _BoxCox():
                """Column-wise scipy Box-Cox with the scaler-like fit_transform API."""

                def __init__(self, lmbda=None):
                    self.lmbda = lmbda

                def _transform_column(self, column):
                    values = np.asarray(column, dtype=float)
                    if self.lmbda is None:
                        # without lmbda scipy returns (transformed, fitted_lambda)
                        transformed = stats.boxcox(values)[0]
                    else:
                        transformed = stats.boxcox(values, lmbda=self.lmbda)
                    return pd.Series(transformed, index=column.index)

                def fit_transform(self, frame):
                    return frame.apply(self._transform_column)

            # factories are resolved lazily, only the requested one is built
            factories = {'no': lambda: Normalizer(),
                         'ss': lambda: StandardScaler(),
                         'mm': lambda: MinMaxScaler(),
                         'bc': lambda: _BoxCox(),
                         'ln': lambda: _BoxCox(lmbda=0),
                         }
            make_scaler = factories.get(kind)
            if make_scaler is None:
                print('Params error {}- wrong kind'.format(kind))
                return None
            if not cols:
                return df
            df[cols] = make_scaler().fit_transform(df[cols])
            return df

        @staticmethod
        def encoder(df, cols=None, kind='oh', **kwargs):
            """Encode `cols` of `df` and return the resulting frame.

            kind: 'oe' OrdinalEncoder, 'oh' pandas.get_dummies (accepts
                  `drop_first`), 'bi' Binarizer, 'cb' CatBoostEncoder,
                  'pn' PolynomialEncoder, 'he' HashingEncoder.
            `kwargs` are passed to the encoder constructor.

            When the encoder returns exactly the columns `cols`, they are
            overwritten in `df` (in place); otherwise a new frame is returned
            in which `cols` are replaced by the encoded columns.
            An unknown `kind` prints a message and returns None.
            """
            if cols is None:
                cols = []
            elif isinstance(cols, str):
                cols = [cols]
            else:
                cols = list(cols)

            class _OneHot():
                """pandas.get_dummies behind the encoder-like fit_transform API."""

                def __init__(self, **kwargs):
                    self.drop_first = kwargs.get('drop_first', False)

                def fit_transform(self, frame):
                    return pd.get_dummies(frame, drop_first=self.drop_first)

            # factories are resolved lazily, only the requested one is built
            factories = {'oe': lambda: OrdinalEncoder(**kwargs),
                         'oh': lambda: _OneHot(**kwargs),
                         'bi': lambda: Binarizer(**kwargs),
                         'cb': lambda: ce.CatBoostEncoder(**kwargs),
                         'pn': lambda: ce.PolynomialEncoder(**kwargs),
                         'he': lambda: ce.HashingEncoder(**kwargs),
                         }
            make_encoder = factories.get(kind)
            if make_encoder is None:
                print('Params error {}- wrong kind'.format(kind))
                return None

            # NOTE(review): the integer cast truncates real-valued encodings
            # (e.g. target-style encoders) - confirm this is intended.
            res = make_encoder().fit_transform(df[cols]).astype('int')
            if not isinstance(res, pd.DataFrame):
                # array output: give it labels so both branches below work
                res = np.asarray(res)
                if res.shape[1] == len(cols):
                    names = cols
                else:
                    names = ['{}_{}'.format(kind, i) for i in range(res.shape[1])]
                res = pd.DataFrame(res, index=df.index, columns=names)

            # compare the produced columns with the requested ones (the
            # original compared the number of columns with the number of rows)
            if list(res.columns) == cols:
                df[cols] = res
                return df
            return pd.concat([df.drop(columns=cols), res], axis=1)

    class graph():
        """Thin plotting wrappers around seaborn / matplotlib / statsmodels."""

        def __init__(self):
            pass

        @staticmethod
        def lineplot(df, **kwargs):
            """Draw `sns.lineplot(data=df, **kwargs)` on a new 15x5 figure."""
            plt.figure(figsize=(15, 5))
            sns.set_theme(style="whitegrid")
            sns.lineplot(data=df, **kwargs)
            return None

        @staticmethod
        def boxplot(df, **kwargs):
            """Draw `sns.boxplot(data=df, **kwargs)` on a new 15x5 figure."""
            plt.figure(figsize=(15, 5))
            sns.set_theme(style="whitegrid")
            sns.boxplot(data=df, **kwargs)
            return None

        @staticmethod
        def kdeplot(df, **kwargs):
            """Draw `sns.kdeplot(data=df, **kwargs)` on a new 15x5 figure."""
            plt.figure(figsize=(15, 5))
            sns.set_theme(style="whitegrid")
            sns.kdeplot(data=df, **kwargs)
            return None

        @staticmethod
        def jointplot(df, **kwargs):
            """Draw `sns.jointplot(data=df, **kwargs)`.

            jointplot is figure-level and creates its own figure, so no
            `plt.figure` call is made (it would only leave an empty figure).
            """
            sns.set_theme(style="whitegrid")
            sns.jointplot(data=df, **kwargs)
            return None

        @staticmethod
        def qqplot(series):
            """Show a Q-Q plot of `series` against a fitted normal with a 45-degree line."""
            sm.qqplot(series, fit=True, line="45")
            plt.show()
            return None

        @staticmethod
        def hist(df, x, bins=10):
            """Draw a histogram with KDE of column `x`; prints a notice if `x` is not numeric."""
            if x not in pipe.info.getNumCol(df):
                print('X must be numeric')
            sns.set_theme(style="darkgrid")
            # displot is figure-level: size it through height / aspect (15x5)
            sns.displot(data=df, x=x, bins=bins, kde=True, height=5, aspect=3)
            return None

        @staticmethod
        def corrmatrix(df):
            """Draw an annotated correlation heatmap of the numeric / bool columns."""
            plt.figure(figsize=(15, 5))
            sns.set(font_scale=2)
            # non-numeric columns make DataFrame.corr raise on recent pandas
            numeric = df.select_dtypes(include=['number', 'bool'])
            sns.heatmap(numeric.corr(), cmap='coolwarm', annot=True, annot_kws={"fontsize": 'medium'})
            return None

        @staticmethod
        def pie(df, category, val):
            """Draw a pie chart of the non-null count of `val` per `category`."""
            df.groupby(category)[val].count().plot(kind='pie', autopct='%1.2f%%')
            return None

        @staticmethod
        def numcolgraph(series):
            """Draw KDE, boxplot, Q-Q plot and a describe()+median table in one row."""
            PLOTS = 4
            PLOT_HEIGHT = 4
            fig, axes = plt.subplots(nrows=1, ncols=PLOTS, gridspec_kw={'hspace': 0, 'wspace': 0.4})
            fig.set_figheight(PLOT_HEIGHT)
            fig.set_figwidth(PLOT_HEIGHT * PLOTS)
            sns.set_theme(style="whitegrid")
            sns.kdeplot(x=series, ax=axes[0])
            sns.boxplot(y=series, ax=axes[1])
            sm.qqplot(series, fit=True, line="45", ax=axes[2])
            # Series.append was removed in pandas 2.0; concat works everywhere
            descr = pd.concat([series.describe(),
                               pd.Series(series.median(), index=['median'])]).round(4)
            tbl = axes[3].table(cellText=[[value] for value in descr.values],
                                rowLabels=list(descr.index), colWidths=[0.8], loc='upper center')
            tbl.scale(1, 2)
            tbl.auto_set_font_size(False)
            tbl.set_fontsize(16)
            axes[3].grid(False)
            axes[3].axis('off')

    class text():
        """Text normalisation helpers."""

        def __init__(self):
            pass

        @staticmethod
        def getLemmas(series, kind='nltk', language='english'):
            """Normalise every text of `series`; return a series of space-joined tokens.

            kind: 'nltk'     - tokenize with nltk.tokenize.word_tokenize,
                  'snowball' - stem each space-separated word with SnowballStemmer,
                  'mystem'   - lemmatize with Mystem.
            `language` is passed to the nltk tokenizer / stemmer.

            Raises:
                ValueError: if `kind` is not one of the values above.
            """
            if kind == 'nltk':
                nltk.download('punkt')

                def normalise(text):
                    # word_tokenize returns a list of tokens for the whole text
                    return ' '.join(nltk.tokenize.word_tokenize(text, language=language))
            elif kind == 'snowball':
                stemmer = nltk.stem.SnowballStemmer(language=language)

                def normalise(text):
                    return ' '.join(stemmer.stem(word) for word in text.split(' '))
            elif kind == 'mystem':
                analyzer = Mystem()

                def normalise(text):
                    return ' '.join(analyzer.lemmatize(text))
            else:
                raise ValueError("kind must be one of 'nltk', 'snowball', 'mystem'")
            return series.apply(normalise)

        @staticmethod
        def clear_text_cyr(text):
            """Keep only cyrillic letters; collapse everything else to single spaces."""
            cleaned = re.sub(r'[^а-яА-ЯёЁ]', ' ', text)
            return ' '.join(cleaned.split())

        @staticmethod
        def clear_text_lat(text):
            """Keep only latin letters; collapse everything else to single spaces."""
            cleaned = re.sub(r'[^a-zA-Z]', ' ', text)
            return ' '.join(cleaned.split())

        @staticmethod
        def clear_text_cyr_lat(text):
            """Keep only cyrillic and latin letters; collapse everything else to single spaces."""
            cleaned = re.sub(r'[^а-яА-ЯёЁa-zA-Z]', ' ', text)
            return ' '.join(cleaned.split())

    class models():
        """Baseline model lists and model-comparison helpers."""

        # default random_state of the model factories below
        RANDOM = 5888

        def __init__(self):
            pass

        @staticmethod
        def _get_classification_model_list(random_state=RANDOM, class_weight=None):
            """Return a list of unfitted classifiers with basic params."""
            return [DummyClassifier(strategy='stratified', random_state=random_state),
                    LogisticRegression(random_state=random_state, class_weight=class_weight),
                    DecisionTreeClassifier(random_state=random_state, class_weight=class_weight),
                    LinearDiscriminantAnalysis(),
                    KNeighborsClassifier(),
                    RandomForestClassifier(random_state=random_state, class_weight=class_weight),
                    SVC(random_state=random_state, class_weight=class_weight),
                    GradientBoostingClassifier(random_state=random_state)]

        @staticmethod
        def _get_regression_model_list(random_state=RANDOM):
            """Return a list of unfitted regressors with basic params."""
            return [DummyRegressor(strategy='median'),
                    LinearRegression(),
                    DecisionTreeRegressor(random_state=random_state),
                    KNeighborsRegressor(),
                    RandomForestRegressor(random_state=random_state),
                    SVR(),
                    GradientBoostingRegressor(random_state=random_state)]

        @staticmethod
        def compare_model(model_list, X_train, y_train, scoring, **kwargs):
            """Delegate to `datasist.model.compare_model`."""
            return ds.model.compare_model(models_list=model_list, x_train=X_train, y_train=y_train,
                                          scoring_metric=scoring, **kwargs)

        @staticmethod
        def plot_feature_importance(model, features):
            """Draw a SHAP violin summary plot (up to 25 features) for `model` on `features`."""
            shap_values = shap.Explainer(model, features).shap_values(features)
            shap.summary_plot(shap_values, features, plot_type='violin', max_display=25)
            return None

    class analysis():
        """Model diagnostics plots."""

        def __init__(self):
            pass

        @staticmethod
        def shap(df, model):
            """Draw a SHAP summary plot (up to 25 features) using TreeExplainer."""
            # inside the body `shap` is the imported module: class attributes
            # are not part of a function's name lookup
            shap_values = shap.TreeExplainer(model).shap_values(df)
            shap.summary_plot(shap_values, df,
                              max_display=25, auto_size_plot=True)
            return None

        @staticmethod
        def confusion_matrix(model, X, y):
            """Draw the confusion matrix of the fitted `model` on (X, y)."""
            display_cls = getattr(metrics, 'ConfusionMatrixDisplay', None)
            if display_cls is not None and hasattr(display_cls, 'from_estimator'):
                # plot_confusion_matrix was removed from recent scikit-learn
                display_cls.from_estimator(model, X, y)
            else:
                plot_confusion_matrix(model, X, y)
            return None

        @staticmethod
        def precision_recall_curve(model, X, y):
            """Draw the precision-recall curve of the fitted `model` on (X, y)."""
            display_cls = getattr(metrics, 'PrecisionRecallDisplay', None)
            if display_cls is not None and hasattr(display_cls, 'from_estimator'):
                # plot_precision_recall_curve was removed from recent scikit-learn
                display_cls.from_estimator(model, X, y)
            else:
                plot_precision_recall_curve(estimator=model, X=X, y=y)
            return None

        @staticmethod
        def plotROC(y_test, probs, titl=''):
            """Plot the ROC curve of `probs` against `y_test` with the AUC in the legend."""
            suffix = ' (' + titl + ')' if titl != '' else titl
            fpr, tpr, _thresholds = metrics.roc_curve(y_test, probs)
            roc_auc = metrics.auc(fpr, tpr)
            plt.title('Receiver Operating Characteristic' + suffix)
            plt.plot(fpr, tpr, 'b', label='AUC = %0.5f' % roc_auc)
            plt.legend(loc='lower right')
            plt.plot([0, 1], [0, 1], 'r--')
            plt.xlim([0, 1])
            plt.ylim([0, 1])
            plt.ylabel('True Positive Rate')
            plt.xlabel('False Positive Rate')
            plt.show()
            return None