In [1]:
import os
import gc
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.impute import SimpleImputer

import scipy.stats as ss
from statsmodels.api import ProbPlot

# Install a global 'ignore' filter: no category or module is given, so every
# warning raised after this line is suppressed.
# NOTE(review): this also hides deprecation warnings from the imported
# libraries - confirm that silencing everything is intended.
warnings.filterwarnings(action = 'ignore')

In [3]:
def Missing_Impute_MICE(data = None, random_state = None):
    """
    impute the nan values of dataset with MICE (sklearn IterativeImputer driven by LinearRegression)

    @data         : 2-D numeric dataset (e.g. a DataFrame) that may contain nan values
    @random_state : optional seed forwarded to IterativeImputer; the columns are visited in
                    random order, so pass a seed when the result has to be reproducible
    @return       : the imputed dataset exactly as produced by IterativeImputer.fit_transform
                    (the input object itself is not modified by this function)
    """
    imputer = IterativeImputer(estimator = LinearRegression(),
                               max_iter = 30,
                               imputation_order = 'random',
                               random_state = random_state)
    # the imputed values must be handed back to the caller: rebinding the local
    # name `data` inside the function is invisible from outside
    return imputer.fit_transform(data)

In [1]:
def _annotate_bar_shares(axis):
    """
    write on top of every bar of `axis` its share of the summed bar heights of that axis
    """
    bars = list(axis.patches)
    # ignore bars whose height is nan so they can neither poison the total nor get a label
    heights = [bar.get_height() for bar in bars]
    total = sum(height for height in heights if not np.isnan(height))
    if not total:
        # nothing was drawn (or only empty bars): avoid dividing by zero
        return

    for bar, height in zip(bars, heights):
        if np.isnan(height):
            continue
        axis.text(bar.get_x() + bar.get_width() / 2., height, '{0:.0%}'.format(height / total), ha = "center")


def Missing_Detect_Diff_Distribution(data = None, col_A = None, col_B = None, col_C = None, method = 'count'):
    """
    visualize the distribution of missing subset and not missing subset to find the difference

    the rows of `data` are split on whether `col_A` is null; the left panel always shows the
    rows where `col_A` is missing and the right panel the rows where it is present

    @data   : DataFrame to inspect
    @col_A  : column whose missingness defines the two subsets
    @col_B  : column drawn on the x axis of both panels
    @col_C  : column drawn on the y axis, only used when method == 'scatter'
    @method : 'count'   -> countplot of col_B, every bar labelled with its share of the panel
              'scatter' -> scatterplot of col_B against col_C
              any other value draws nothing
    @return : None, the figure is shown and then closed
    """
    is_missing = data[col_A].isnull()
    missing_subset = data[is_missing]
    notmissing_subset = data[~is_missing]

    if method == 'count':
        fig, ax = plt.subplots(1, 2, figsize = (15, 8))
        # subset of rows where col_A is missing
        sns.countplot(data = missing_subset, x = col_B, ax = ax[0])
        ax[0].set_title(label = 'missing data set of {}'.format(col_A))

        # subset of rows where col_A is not missing
        sns.countplot(data = notmissing_subset, x = col_B, ax = ax[1])
        ax[1].set_title(label = 'not missing data set of {}'.format(col_A))

        # no hue is used, so each bar is compared against the whole panel rather than
        # against an arbitrary partner bar
        for axis in ax:
            _annotate_bar_shares(axis)
    elif method == 'scatter':
        fig, ax = plt.subplots(1, 2, figsize = (15, 8))
        # subset of rows where col_A is missing
        sns.scatterplot(data = missing_subset, x = col_B, y = col_C, ax = ax[0])
        ax[0].set_title(label = "missing subset of {}".format(col_A))
        # subset of rows where col_A is not missing
        sns.scatterplot(data = notmissing_subset, x = col_B, y = col_C, ax = ax[1])
        ax[1].set_title(label = 'not missing subset of {}'.format(col_A))

    plt.show()
    plt.close()

In [None]:
def Missing_Imputer(data = None, col = None, method = "mean", fill_value = None):
    """
    impute the missing value of data[col] with these methods of mean median mode and so on

    @data       : DataFrame holding the column to impute
    @col        : name of the column to impute
    @method     : forwarded as the `strategy` of sklearn SimpleImputer
    @fill_value : replacement value, only forwarded when method == 'constant'
    @return     : result of SimpleImputer.fit_transform on the single-column frame data[[col]];
                  `data` itself is not modified
    """
    if method == 'constant':
        # pass the caller's fill_value through (the name `value` does not exist here)
        imputer = SimpleImputer(strategy = method, fill_value = fill_value)
    else:
        imputer = SimpleImputer(strategy = method)
    # double brackets keep the input 2-D, which is what the imputer is fed
    return imputer.fit_transform(data[[col]])

In [None]:
def Missing_Compute_Corr(data = None, col_A = None, col_B = None):
    """
    print the pearson and spearman correlation of col_A and col_B

    only the rows where both columns are present take part in the computation;
    the scipy result objects are printed as they are and nothing is returned
    """
    # keep a row only when neither of the two columns is null
    both_present = data[col_A].notnull() & data[col_B].notnull()
    complete_rows = data[both_present]
    values_a = complete_rows[col_A]
    values_b = complete_rows[col_B]

    # (message template, scipy function) pairs, evaluated and reported in this order
    reports = (
        ("the pearsonr corr between {} and {} is: {}", ss.pearsonr),
        ("the spearmanr corr between {} and {} is: {}", ss.spearmanr),
    )
    for template, corr_func in reports:
        print(template.format(col_A, col_B, corr_func(values_a, values_b)))
    print('\n')
    
    

In [5]:
def _iqr_outlier_frames(values, center, iqr):
    """
    build the bounds center +/- 1.5 * iqr and collect the values of `values` lying outside them
    returns (upper_bound, lower_bound, frame_above, frame_below)
    """
    upper_bound = center + 1.5 * iqr
    lower_bound = center - 1.5 * iqr
    # nan compares False on both sides, so missing values never count as outliers
    frame_above = pd.DataFrame(data = values[values > upper_bound].tolist(),
                               columns = ['outlier_above'])
    frame_below = pd.DataFrame(data = values[values < lower_bound].tolist(),
                               columns = ['outlier_below'])
    return upper_bound, lower_bound, frame_above, frame_below


def Outlier_Detect_IQR(data = None,  col = None):

    """
    detect whether there are some outliers in a feature

    two reports are printed, each using bounds of the form center +/- 1.5 * IQR:
    first with the median of data[col] as center, then with its mean.
    afterwards a histogram, a QQ plot and a boxplot of the column are shown.

    @data   : DataFrame holding the feature
    @col    : name of the numeric column to inspect
    @return : None, everything is printed / plotted
    """
    values = data[col]
    # one describe() call provides Q1, Q3, mean and median
    stats = values.describe()
    Q1 = stats['25%']
    Q3 = stats['75%']
    mean_value = stats['mean']
    median_value = stats['50%']
    #calculate IQR
    IQR = Q3 - Q1

    #1. calculate bound values  when median as the foundment value
    upper_bound, lower_bound, potential_outliers_upper, potential_outliers_lower = \
        _iqr_outlier_frames(values, median_value, IQR)

    print("1. when median as the foundment value\n")
    #above 1.5IQR specifical information
    print("1.1 the upper bound value is {}".format(upper_bound))
    print("1.2 the potential outliers above 1.5IQR amount is : {}".format(potential_outliers_upper.shape[0]))
    print("1.3 the potential outliers above 1.5IQR values distribution is: \n", potential_outliers_upper.value_counts())

    #below 1.5IQR specifical information
    print("1.4 the lower bound value is {}".format(lower_bound))
    print("1.5 the potential outliers below 1.5IQR amount is : {}".format(potential_outliers_lower.shape[0]))
    print("1.6 the potential outliers below 1.5IQR values distribution is: \n", potential_outliers_lower.value_counts())

    #2. calculate bound values  when mean as the foundment value
    upper_bound, lower_bound, potential_outliers_upper, potential_outliers_lower = \
        _iqr_outlier_frames(values, mean_value, IQR)

    print(100  * '-')
    print("2. when mean as the foundment value\n")
    #above 1.5IQR specifical information
    print("2.1 the upper bound value is {}".format(upper_bound))
    print("2.2 the potential outliers above 1.5IQR amount is : {}".format(potential_outliers_upper.shape[0]))
    print("2.3 the potential outliers above 1.5IQR values distribution is: \n", potential_outliers_upper.value_counts())

    #below 1.5IQR specifical information
    print("2.4 the lower bound is {}".format(lower_bound))
    print("2.5 the potential outliers below 1.5IQR amount is : {}".format(potential_outliers_lower.shape[0]))
    print("2.6 the potential outliers below 1.5IQR values distribution is: \n", potential_outliers_lower.value_counts())

    fig,ax = plt.subplots(1,3, figsize = (18,5))
    sns.histplot(data = data, x = col, ax = ax[0])
    ax[0].set_title(label = 'histogram of {}'.format(col))

    # the statistics above skip nan values; drop them here as well so the sample
    # mean / std behind the standardized reference line (line='s') are not nan
    qqplot = ProbPlot(values.dropna())
    qqplot.qqplot(line='s', ax = ax[1])
    ax[1].set_title(label = 'QQ plot of {}'.format(col))

    sns.boxplot(data = data,y = col, ax = ax[2])
    ax[2].set_title(label = 'boxplot of {}'.format(col))
    plt.show()
    plt.close()


In [1]:
def Outliers_Handle_Discretization(data = None, col = None, method  = 'median', value = -999999):
    """
    handle Outliers by discretization method: every outlier of data[col] is replaced by `value`

    a value is an outlier when it lies outside center +/- 1.5 * IQR, where IQR = Q3 - Q1 of the
    column and center is the foundment value selected by `method`

    @data   : DataFrame holding the feature; data[col] is overwritten in place
    @col    : name of the numeric column to process
    @method : foundment value of the IQR detection, 'mean' uses the mean of the column,
              anything else uses the median (nan values are skipped in both cases)
    @value  : the single value all outliers are collapsed into, eg: 999999, -999999
    @return : data[col] after the replacement (nan entries are left untouched)
    """
    column = data[col]

    # Series.mean / Series.median skip nan, unlike np.mean / np.median
    if method == 'mean':
        value_f = column.mean()
    else:
        value_f = column.median()

    Q1 = column.quantile(q = 0.25)
    Q3 = column.quantile(q = 0.75)
    IQR = Q3 - Q1
    upper_bound = value_f + 1.5 * IQR
    lower_bound = value_f - 1.5 * IQR

    # keep what lies inside [lower_bound, upper_bound]; nan is neither above nor below
    # the bounds, so it is kept as well
    keep = column.between(lower_bound, upper_bound) | column.isna()

    #discretize outliers into a single value,eg:999999,-999999
    data[col] = column.where(keep, value)

    return data[col]

In [1]:
def Rare_Value_Handle(data = None, col = None, method = 'mode', t = 100, other_value = 9999):
    """
    feature may has some rare values
    according to the need, rare values are either replaced by the mode of the feature or
    collapsed into one dedicated 'other' class

    @data        : DataFrame holding the feature; data[col] is overwritten in place
    @col         : name of the column to process
    @method      : 'mode' replaces rare values with the mode of the column,
                   anything else replaces them with `other_value`
    @t           : threshold of rare value: a value is rare when it occurs fewer than t times
                   (default 100)
    @other_value : value standing for the 'other' class when method is not 'mode' (default 9999)
    @return      : data[col] after the replacement
    """
    counts = data[col].value_counts()
    # select only the values whose count is below the threshold; taking .index of the
    # boolean comparison itself would return every distinct value
    rare_values = set(counts[counts < t].index)
    if not rare_values:
        # nothing to replace (this also covers an empty column, which has no mode)
        return data[col]

    if method == "mode":
        #mode value may not be unique, so ,we choose the first value as mode
        replacement = data[col].mode()[0]
    else:
        replacement = other_value

    # element-wise replacement keeps working when the replacement has another type
    # than the column (eg: 9999 inside a column of strings)
    data[col] = data[col].apply(lambda x : replacement if x in rare_values else x)

    return data[col]