In [1]:
import os
import gc 
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress every warning category for the rest of the session (no category or
# module filter is given, so nothing is exempt).
# NOTE(review): this also hides deprecation/future warnings raised by the
# libraries imported above - confirm that a blanket filter is intended.
warnings.filterwarnings(action = 'ignore')

In [None]:
def Value_Count_Description(data = None, col = None):
    """
    Print descriptive statistics and simple outlier counts for one column.

    Sections printed, in order:
      1. number of values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
      2. number of values outside mean   +/- 3 * std
      3. number of values outside median +/- 3 * std
      4. normalized value counts (missing values included)
      5. the ``describe()`` summary of the column

    Sections 1-3 need quantiles, mean and std. When ``describe()`` does not
    provide them (e.g. object, category or bool columns) they are skipped
    with a short notice instead of raising ``KeyError``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains ``col``.
    col : hashable
        Label of the column to summarise.

    Returns
    -------
    None
        All results are written to stdout.
    """
    series = data[col]
    description = series.describe()

    def _count_outside(low, high):
        # Vectorised comparison; missing values compare as not-outside and
        # are therefore never counted as outliers.
        return int((series > high).sum()), int((series < low).sum())

    needed = ('25%', '50%', '75%', 'mean', 'std')
    if all(key in description.index for key in needed):
        Q1 = description['25%']
        Q3 = description['75%']
        IQR = Q3 - Q1
        n_over, n_below = _count_outside(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
        print("\n1.IQR method:")
        print("the num over Q3+1.5IQR is:{}".format(n_over))
        print("the num below Q1-1.5IQR is:{}".format(n_below))

        std = description['std']

        mean = description['mean']
        n_over, n_below = _count_outside(mean - 3 * std, mean + 3 * std)
        print("\n2.mean_std method:")
        print("the num over mean3std is:{}".format(n_over))
        print("the num below mean3std is:{}".format(n_below))

        median = description['50%']
        n_over, n_below = _count_outside(median - 3 * std, median + 3 * std)
        print("\n3.median_std method:")
        print("the num over median3std is:{}".format(n_over))
        print("the num below median3std is:{}".format(n_below))
    else:
        print("\n1-3.outlier counts skipped: {} has no numeric summary".format(col))

    print("\n4.value counts of {}".format(col))
    print(series.value_counts(dropna = False, normalize = True))

    print("\n5.description of {}".format(col))
    print(description)
    print("--------------------------------------------------------------------------------------------------------")


In [None]:
def Plot_Cate_Feature(data = None,col = None):
    """
    Plot a categorical feature.

    The left panel is a count plot of ``col`` with x tick labels rotated by
    45 degrees. The right panel gets a box plot of ``col`` only when the
    column's dtype name starts with ``int`` or ``float``; otherwise it is
    left empty.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains ``col``.
    col : hashable
        Label of the column to plot.
    """
    figure, (count_ax, box_ax) = plt.subplots(1, 2, figsize = (18, 8))

    # Left panel: frequency of every level.
    sns.countplot(data = data, x = col, ax = count_ax)
    count_ax.set_title(label = 'Counts of {}'.format(col))
    count_ax.tick_params(axis = 'x', rotation = 45)

    # Right panel: only meaningful when the levels are numbers.
    dtype_name = str(data[col].dtype)
    if dtype_name.startswith(('int', 'float')):
        sns.boxplot(data = data, y = col, ax = box_ax)
        box_ax.set_title(label = 'Box plot of {}'.format(col))

    plt.show()
    plt.close()


In [4]:
def Plot_Cont_Feature(data = None,col = None, is_percentage = False):
    """
    Plot a numeric feature: histogram on the left, box plot on the right.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains ``col``.
    col : hashable
        Label of the numeric column to plot.
    is_percentage : bool, default False
        When True, every non-empty histogram bar is labelled with its share
        of the summed bar heights (i.e. of all plotted observations).
    """
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize = (15, 8))

    sns.histplot(data = data, x = col, ax = hist_ax)
    hist_ax.set_title(label = 'distribution of {}'.format(col))

    sns.boxplot(data = data, y = col, ax = box_ax)
    box_ax.set_title(label = 'Box plot of {}'.format(col))

    if is_percentage:
        # Only the histogram axes carries bars; `height > 0` also drops
        # empty bins and NaN heights, so the division below is safe.
        bars = [bar for bar in hist_ax.patches if bar.get_height() > 0]
        total = sum(bar.get_height() for bar in bars)
        for bar in bars:
            height = bar.get_height()
            hist_ax.text(bar.get_x() + bar.get_width() / 2., height,
                         '{0:.0%}'.format(height / total),
                         ha = "center", va = "bottom")

    plt.show()
    plt.close(fig)

In [3]:
def Plot_Cate_Correlation(data, col_A = None, col_B = None, is_percentage = False):
    """
    Visualise how two categorical features relate.

    Draws a count plot of ``col_A`` split by ``col_B`` (hue) with the x tick
    labels rotated by -90 degrees.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains both columns.
    col_A : hashable
        Column shown on the x axis.
    col_B : hashable
        Column used as hue.
    is_percentage : bool, default False
        When True, every bar is labelled with its share of the total count
        of its ``col_A`` category. Works for any number of ``col_B`` levels.
    """
    fig, ax = plt.subplots(figsize = (15, 8))
    sns.countplot(data = data, x = col_A, hue = col_B,  ax = ax)
    # Rotate only the category labels; the count axis stays horizontal.
    ax.tick_params(axis = 'x', rotation = -90)

    if is_percentage:
        # Group bars by the category slot they sit in rather than by their
        # order in ax.patches, so missing combinations or extra hue levels
        # cannot misalign the labels.
        # Assumes the categorical axis puts category i at x == i with the
        # dodged hue bars inside (i - 0.5, i + 0.5) - TODO confirm if the
        # plot is ever switched to a native/numeric x scale.
        by_category = {}
        for bar in ax.patches:
            height = bar.get_height()
            if not height > 0:
                # Skips empty/NaN bars and zero-size helper patches.
                continue
            centre = bar.get_x() + bar.get_width() / 2.
            by_category.setdefault(int(round(centre)), []).append((centre, height))

        for members in by_category.values():
            total = sum(height for _, height in members)
            for centre, height in members:
                ax.text(centre, height, '{0:.0%}'.format(height / total), ha = "center")

    #plot the figure
    plt.show()
    plt.close(fig)

In [5]:
def Plot_Cont_Correlation(data = None, col_A = None, col_B = None):
    """
    Show how a continuous feature is distributed across the levels of
    another feature.

    Draws one box plot of ``col_A`` (y axis) per value of ``col_B`` (x axis).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains both columns.
    col_A : hashable
        Continuous column placed on the y axis.
    col_B : hashable
        Grouping column placed on the x axis.
    """
    figure, axis = plt.subplots(figsize = (15, 8))
    box_columns = dict(x = col_B, y = col_A)
    sns.boxplot(data = data, ax = axis, **box_columns)
    plt.show()
    plt.close()
    

SyntaxError: unexpected EOF while parsing (4187520414.py, line 1)