In [1]:
import os
import gc 
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Globally ignore every warning raised from this point on: no category or
# module filter is passed, so nothing is exempt.
# NOTE(review): this also hides deprecation notices from the imported
# libraries -- consider narrowing the filter to specific categories.
warnings.filterwarnings(action = 'ignore')

In [None]:
def Value_Count_Description(data = None, col = None):
    """
    Print outlier counts and summary statistics for one numeric column.

    Five sections are written to stdout, in this order:
      1. number of values beyond the fences ``Q1 - 1.5*IQR`` / ``Q3 + 1.5*IQR``
      2. number of values more than 3 standard deviations from the mean
      3. number of values more than 3 standard deviations from the median
      4. normalized value counts of the column (missing values included)
      5. the ``describe()`` summary of the column

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that holds the column.
    col : hashable
        Label of the column to analyse. The quartile, mean and std entries
        are read from ``describe()``, so a column whose summary lacks them
        raises ``KeyError``.

    Returns
    -------
    None
        Everything is reported through ``print``.
    """
    series = data[col]
    # describe() is evaluated once and reused for all thresholds and for the
    # final printout.
    description = series.describe()

    def count_outside(lower, upper):
        """Return (count above ``upper``, count below ``lower``)."""
        # Vectorized comparison instead of a Python loop over every value;
        # missing values are never counted as outliers.
        return int((series > upper).sum()), int((series < lower).sum())

    q1 = description['25%']
    q3 = description['75%']
    iqr = q3 - q1
    over_iqr, below_iqr = count_outside(q1 - 1.5 * iqr, q3 + 1.5 * iqr)

    print("\n1.IQR method:")
    print("the num over Q3+1.5IQR is:{}".format(over_iqr))
    print("the num below Q1-1.5IQR is:{}".format(below_iqr))

    mean = description['mean']
    std = description['std']
    over_mean, below_mean = count_outside(mean - 3 * std, mean + 3 * std)

    print("\n2.mean_std method:")
    print("the num over mean3std is:{}".format(over_mean))
    print("the num below mean3std is:{}".format(below_mean))

    # Same 3-std band, but centred on the median.
    median = description['50%']
    over_median, below_median = count_outside(median - 3 * std, median + 3 * std)

    print("\n3.median_std method:")
    print("the num over median3std is:{}".format(over_median))
    print("the num below median3std is:{}".format(below_median))

    print("\n4.value counts of {}".format(col))
    print(series.value_counts(dropna = False, normalize = True))

    print("\n5.description of {}".format(col))
    print(description)
    print("--------------------------------------------------------------------------------------------------------")


In [2]:
def Plot_Feature_Nan_Rate(data = None, col = None, label = None):
    """Report the missing values of ``col`` and plot ``label`` among them.

    When ``col`` has no missing value a single message is printed and nothing
    is drawn. Otherwise the number and percentage of rows with a missing
    ``col`` are printed, and a count plot of ``label`` restricted to those
    rows is shown, each bar annotated with its percentage of the
    missing-value rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both columns.
    col : hashable
        Column whose missing values are inspected.
    label : hashable
        Column plotted on the x axis for the rows where ``col`` is missing.

    Returns
    -------
    None
    """
    nan_mask = data[col].isnull()
    if not nan_mask.any():
        print("There is no Nan value in {}".format(col))
        return

    # Filter once and reuse both the frame and its length; boolean indexing
    # already returns a new frame, so no explicit copy is needed.
    nan_rows = data[nan_mask]
    nan_count = len(nan_rows)
    print("The total Nan amount of {} is {}".format(col, nan_count))
    print("The Nan rate of {0} is {1:.3f}%".format(col, nan_count / data.shape[0] * 100))

    fig, ax = plt.subplots(figsize = (5, 3))
    sns.countplot(data = nan_rows, x = label, ax = ax)
    ax.set_title(label = "the {} rate in {} Nan values".format(label, col), fontsize = 14)

    # Write each bar's share of the missing-value rows slightly above the bar.
    for p in ax.patches:
        height = p.get_height()
        ax.text(p.get_x() + p.get_width() / 2.,
                height + 3,
                '{0:.3f}%'.format(height / nan_count * 100),
                ha = "center", fontsize = 8)

    plt.show()
    plt.close(fig)
    # Release the filtered frame right away; it can be large.
    del nan_rows
    gc.collect()
    
    

In [2]:
def Plot_Cate_Feature(data = None,col = None, corr_col = None):
    """
    Plot a categorical column on two side-by-side panels.

    Left panel: count of every ``col`` value; each bar is annotated with its
    percentage of all rows in ``data``.
    Right panel: the same counts split by ``corr_col``; when ``corr_col`` is
    given, each bar is annotated with its share of the bars drawn for the
    same ``col`` value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns.
    col : hashable
        Categorical column shown on the x axis of both panels.
    corr_col : hashable, optional
        Column used as ``hue`` on the right panel. Any number of levels is
        supported. With ``None`` the right panel is drawn without hue and
        without share annotations.

    Returns
    -------
    None
        The figure is shown and then closed.
    """
    fig, ax = plt.subplots(1, 2, figsize = (22, 8))

    sns.countplot(data = data, x = col, ax = ax[0])
    ax[0].set_title(label = 'Percentage of {} values'.format(col))
    ax[0].tick_params(axis = 'x', rotation = 90, labelsize = 12)

    # Annotate every bar with its share of all rows.
    n_rows = len(data)
    for p in ax[0].patches:
        height = p.get_height()
        ax[0].text(p.get_x() + p.get_width() / 2.,
                   height + 3,
                   '{0:.3f}%'.format(height / n_rows * 100),
                   ha = "center", fontsize = 12)

    sns.countplot(data = data, x = col, hue = corr_col, ax = ax[1])
    ax[1].set_title(label = "Rates of {} in different {} value".format(corr_col, col))
    ax[1].tick_params(axis = 'x', rotation = 90, labelsize = 12)

    if corr_col is not None:
        # Group the dodged bars by the x category they belong to. Categories
        # sit on integer positions and the bars of one category stay within
        # +/-0.5 of it, so rounding a bar's centre yields its category.
        # Only bars held in the axes' bar containers are used, so patches
        # that are not data bars are ignored.
        bars_per_category = {}
        for container in ax[1].containers:
            for bar in container:
                height = bar.get_height()
                if np.isnan(height):
                    # A combination with no rows can be drawn as a NaN bar.
                    continue
                center = bar.get_x() + bar.get_width() / 2.
                bars_per_category.setdefault(int(round(center)), []).append((center, height))

        for bars in bars_per_category.values():
            total = sum(height for _, height in bars)
            if total <= 0:
                continue
            for center, height in bars:
                ax[1].text(center, height, '{0:.2%}'.format(height / total), ha = "center")

    plt.show()
    plt.close(fig)


In [4]:
def Plot_Cont_Feature(data = None,col = None):
    """
    Draw the plots for a numeric feature.

    A histogram of ``col`` goes on the left axes and a box plot of the same
    column on the right axes; the figure is then shown and closed.
    """
    figure, (hist_ax, box_ax) = plt.subplots(nrows = 1, ncols = 2, figsize = (15, 5))

    # Left: distribution of the values.
    hist_title = 'distribution of {}'.format(col)
    sns.histplot(data = data, x = col, ax = hist_ax)
    hist_ax.set_title(label = hist_title)

    # Right: box plot of the same column.
    box_title = 'Box plot of {}'.format(col)
    sns.boxplot(data = data, y = col, ax = box_ax)
    box_ax.set_title(label = box_title)

    plt.show()
    plt.close()

In [3]:
def Plot_Cate_Correlation(data, col_A = None, hue_B = None, is_percentage = True):
    """
    Visualize the relation between two categorical features.

    Draws a count plot of ``col_A`` split by ``hue_B``. When
    ``is_percentage`` is true and ``hue_B`` is given, each bar is annotated
    with its share of the bars drawn for the same ``col_A`` value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both columns.
    col_A : hashable
        Categorical column shown on the x axis.
    hue_B : hashable, optional
        Column used as ``hue``. Any number of levels is supported.
    is_percentage : bool, default True
        Whether to write the share annotations on the bars.

    Returns
    -------
    None
        The figure is shown and then closed.
    """
    fig, ax = plt.subplots(figsize = (15, 8))
    sns.countplot(data = data, x = col_A, hue = hue_B, ax = ax)
    ax.tick_params(rotation = -90)

    if is_percentage and hue_B is not None:
        # Group the dodged bars by the x category they belong to. Categories
        # sit on integer positions and the bars of one category stay within
        # +/-0.5 of it, so rounding a bar's centre yields its category.
        # Only bars held in the axes' bar containers are used, so patches
        # that are not data bars are ignored.
        bars_per_category = {}
        for container in ax.containers:
            for bar in container:
                height = bar.get_height()
                if np.isnan(height):
                    # A combination with no rows can be drawn as a NaN bar.
                    continue
                center = bar.get_x() + bar.get_width() / 2.
                bars_per_category.setdefault(int(round(center)), []).append((center, height))

        for bars in bars_per_category.values():
            total = sum(height for _, height in bars)
            if total <= 0:
                continue
            for center, height in bars:
                ax.text(center, height, '{0:.0%}'.format(height / total), ha = "center")

    plt.show()
    plt.close(fig)

In [5]:
def Plot_Cont_Correlation(data = None, col_A = None, col_B = None):
    """
    Plot the relation between the continuous feature col_A and col_B.

    One box plot of ``col_A`` (y axis) is drawn for each ``col_B`` value
    (x axis); the figure is then shown and closed.
    """
    figure, axes = plt.subplots(figsize = (15, 8))
    box_mapping = dict(x = col_B, y = col_A)
    sns.boxplot(data = data, ax = axes, **box_mapping)
    plt.show()
    plt.close()
    

SyntaxError: unexpected EOF while parsing (4187520414.py, line 1)