In [4]:
import os
import gc
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.impute import SimpleImputer
import scipy.stats as ss


# Install a global "ignore" filter so that no warning of any category is shown
# for the rest of the session.
# NOTE(review): this presumably also hides deprecation and convergence warnings
# from the libraries imported above — consider narrowing the filter.
warnings.filterwarnings(action = 'ignore')

In [4]:
def Missing_Impute_MICE(data = None, random_state = None):
    """
    Impute the NaN values of a dataset with MICE (IterativeImputer driven by
    a LinearRegression estimator, up to 30 rounds, random feature order).

    Parameters
    ----------
    data : pandas.DataFrame or 2-D array-like
        Dataset containing the missing values; it is not modified.
    random_state : int, RandomState instance or None, default None
        Forwarded to IterativeImputer. Pass an int to make the random
        imputation order reproducible between runs.

    Returns
    -------
    pandas.DataFrame or numpy.ndarray
        The imputed data. When `data` is a DataFrame and the imputer kept
        every column, the result is a DataFrame with the same index and
        columns; otherwise the imputer's raw output is returned.

    Raises
    ------
    ValueError
        If `data` is None.
    """
    if data is None:
        raise ValueError("data must be provided")

    imputer = IterativeImputer(
        estimator = LinearRegression(),
        max_iter = 30,
        imputation_order = 'random',
        random_state = random_state,
    )
    imputer.fit(data)
    imputed = imputer.transform(data)

    # The original assigned the result to the local name `data` and returned
    # nothing, so callers never received the imputed values.
    # The shape check covers the case where the imputer dropped a column,
    # which would make the original column labels no longer line up.
    if isinstance(data, pd.DataFrame) and imputed.shape[1] == data.shape[1]:
        return pd.DataFrame(imputed, index = data.index, columns = data.columns)
    return imputed

In [1]:
def Missing_Detect_Diff_Distribution(data = None, col_A = None, col_B = None, col_C = None, method = 'count'):
    """
    Visualize the rows where col_A is missing next to the rows where it is
    present, so that differences between the two subsets can be spotted.

    Two side-by-side panels are drawn: the left one uses the rows where
    col_A is null, the right one the rows where col_A is not null.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset to inspect.
    col_A : column label
        Column whose missingness splits `data` into the two subsets.
    col_B : column label
        Column placed on the x axis of both panels.
    col_C : column label, optional
        Column placed on the y axis; only used when method == 'scatter'.
    method : {'count', 'scatter'}, default 'count'
        'count' draws a count plot of col_B and labels every bar with its
        share of the rows in that panel; 'scatter' draws col_B against col_C.

    Raises
    ------
    ValueError
        If `method` is neither 'count' nor 'scatter'.
    """
    if method not in ('count', 'scatter'):
        # Previously an unknown method silently produced no figure at all.
        raise ValueError("method must be 'count' or 'scatter', got {!r}".format(method))

    is_missing = data[col_A].isnull()
    missing_subset = data[is_missing]
    notmissing_subset = data[~is_missing]

    fig, ax = plt.subplots(1, 2, figsize = (15, 8))

    if method == 'count':
        # Subset of rows where col_A is missing
        sns.countplot(data = missing_subset, x = col_B, ax = ax[0])
        ax[0].set_title(label = 'missing data set of {}'.format(col_A))

        # Subset of rows where col_A is not missing
        sns.countplot(data = notmissing_subset, x = col_B, ax = ax[1])
        ax[1].set_title(label = 'not missing data set of {}'.format(col_A))

        # Label each bar with its share of the panel total. The earlier
        # left-half/right-half pairing only gave correct percentages when
        # col_B had exactly two categories, and it skipped bars otherwise.
        for panel in ax:
            # `> 0` drops empty and NaN-height patches, which also keeps the
            # division below safe when a subset has no rows.
            bars = [bar for bar in panel.patches if bar.get_height() > 0]
            total = sum(bar.get_height() for bar in bars)
            for bar in bars:
                height = bar.get_height()
                panel.text(bar.get_x() + bar.get_width() / 2., height, '{0:.0%}'.format(height / total), ha="center")
    else:
        # Missing subset
        sns.scatterplot(data = missing_subset, x = col_B, y = col_C, ax = ax[0])
        ax[0].set_title(label = "missing subset of {}".format(col_A))
        # Not-missing subset
        sns.scatterplot(data = notmissing_subset, x = col_B, y = col_C, ax = ax[1])
        ax[1].set_title(label = 'not missing subset of {}'.format(col_A))

    plt.show()
    # Close this specific figure so repeated calls do not accumulate figures.
    plt.close(fig)
    del missing_subset, notmissing_subset
    gc.collect()

In [None]:
def Missing_Imputer(data = None, col = None, method = "mean", fill_value = None):
    """
    Impute the missing values of data[col] with a SimpleImputer.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset holding the column to impute; it is not modified.
    col : column label
        Column whose missing values are filled.
    method : str, default "mean"
        SimpleImputer strategy, e.g. "mean", "median", "most_frequent" or
        "constant". "mode" is accepted as an alias for "most_frequent".
    fill_value : scalar, optional
        Replacement value; only used when method == "constant".

    Returns
    -------
    The output of SimpleImputer.fit_transform on the single-column frame
    data[[col]] (by default a 2-D array with one column).
    """
    # SimpleImputer has no "mode" strategy; its name for it is "most_frequent".
    strategy = 'most_frequent' if method == 'mode' else method

    if strategy == 'constant':
        # The original passed the undefined name `value` here, which raised
        # NameError whenever the constant strategy was requested.
        imputer = SimpleImputer(strategy = strategy, fill_value = fill_value)
    else:
        imputer = SimpleImputer(strategy = strategy)
    return imputer.fit_transform(data[[col]])

In [None]:
def Missing_Compute_Corr(data = None, col_A = None, col_B = None):
    """
    Print the Pearson and Spearman correlation between col_A and col_B,
    computed only on the rows where both columns are present.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset holding both columns.
    col_A, col_B : column labels
        The two columns to correlate.

    Returns
    -------
    None. The two results are written to stdout, followed by a blank gap.
    """
    # Drop rows lacking col_A first, then rows lacking col_B.
    complete_rows = data[data[col_A].notnull()]
    complete_rows = complete_rows[complete_rows[col_B].notnull()]

    reports = (
        ("the pearsonr corr between {} and {} is: {}", ss.pearsonr),
        ("the spearmanr corr between {} and {} is: {}", ss.spearmanr),
    )
    # Each statistic is computed and reported before the next one starts.
    for template, corr_func in reports:
        outcome = corr_func(complete_rows[col_A], complete_rows[col_B])
        print(template.format(col_A, col_B, outcome))
    print('\n')