# In [None]:
import os
import gc


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



def Reduce_Memory_Usage(data):
    """
    Reduce a DataFrame's memory footprint by downcasting numeric columns.

    Every column whose dtype is a signed NumPy integer (int8..int64) or a
    NumPy float (float16..float64) is cast, in place, to the narrowest dtype
    of the same kind whose representable range covers the column's minimum
    and maximum. The percentage of memory saved is printed afterwards.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to compact. It is modified in place.

    Returns
    -------
    None

    Notes
    -----
    The float branch checks range only, not precision: a column that fits in
    float16's range is stored as float16 and its values are rounded
    accordingly.
    """
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32, np.float64)
    int_names = ('int8', 'int16', 'int32', 'int64')
    float_names = ('float16', 'float32', 'float64')

    # Memory occupied before any conversion, in MiB.
    origin_mem_usage = data.memory_usage().sum() / 1024 ** 2

    for col in data.columns:
        # Dispatch on the column's dtype, not on the column's name: the kind
        # of the target dtype must match the kind of the stored values.
        dtype_name = str(data[col].dtype)
        if dtype_name in int_names:
            candidates, limits_of = int_candidates, np.iinfo
        elif dtype_name in float_names:
            candidates, limits_of = float_candidates, np.finfo
        else:
            continue

        # Column extremes decide which dtype is wide enough.
        min_value = np.min(data[col])
        max_value = np.max(data[col])

        # Candidates are ordered narrowest first, so the first fit wins.
        # NaN or infinite extremes fail every comparison and leave the
        # column untouched.
        for candidate in candidates:
            limits = limits_of(candidate)
            if min_value >= limits.min and max_value <= limits.max:
                data[col] = data[col].astype(candidate)
                break

    reduced_mem_usage = data.memory_usage().sum() / 1024 ** 2

    # Avoid dividing by zero when the frame reports no memory at all.
    if origin_mem_usage:
        reduction = 100 * (origin_mem_usage - reduced_mem_usage) / origin_mem_usage
    else:
        reduction = 0.0
    print("Mem. usage decreased  ({:.1f}% reduction)".format(reduction))

    gc.collect()

def Plot_num(data, col=None):
    """
    Plot the distribution of a continuous variable.

    Draws a histogram (left) and a box plot (right) of column ``col`` of
    ``data`` in a single 15x6 figure, shows the figure and then closes it.
    """
    _, (hist_axis, box_axis) = plt.subplots(1, 2, figsize=(15, 6))

    # Left panel: histogram of the column.
    sns.histplot(data=data, x=col, ax=hist_axis)
    hist_axis.set_title(label=f'Distribution of {col}')

    # Right panel: box plot of the same column.
    sns.boxplot(data=data, y=col, ax=box_axis)
    box_axis.set_title(label=f'boxplot of {col}')

    plt.show()
    plt.close()

def Plot_cate(data, col=None):
    """
    Plot the distribution of a categorical variable.

    Prints the normalised value counts of column ``col`` (missing values
    included), then draws a count plot of the column in a 15x6 figure,
    shows the figure and closes it.
    """
    print('value counts:\n')
    proportions = data[col].value_counts(dropna=False, normalize=True)
    print(proportions)

    _, axis = plt.subplots(figsize=(15, 6))

    sns.countplot(data=data, x=col, ax=axis)
    axis.set_title(label=f"Counts of {col}")

    plt.show()
    plt.close()

def Value_Counts_Distribution(data, col = None):
    """
    Print frequencies, summary statistics and outlier counts for a column.

    The report contains, in order:

    1. the normalised value counts of ``data[col]`` (missing values included);
    2. the output of ``describe()`` for the column;
    3. how many values lie above ``Q3 + 1.5 * IQR`` or below
       ``Q1 - 1.5 * IQR``;
    4. how many values lie more than three standard deviations above or
       below the mean (mean and std taken from ``describe()``).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to inspect.
    col : hashable, optional
        Label of the column. It must select a column for which
        ``describe()`` reports quartiles, mean and std.

    Returns
    -------
    None
    """
    print('value counts:\n')
    series = data[col]
    print(series.value_counts(dropna = False, normalize = True))

    # Compute the summary once and reuse it for every statistic below.
    stats = series.describe()
    print('value description\n')
    print(stats)

    # IQR fences.
    q1 = stats['25%']
    q3 = stats['75%']
    iqr = q3 - q1
    high = q3 + 1.5 * iqr
    low = q1 - 1.5 * iqr
    # Vectorised comparisons; missing values compare False and are not counted.
    over_high_count = int((series > high).sum())
    below_low_count = int((series < low).sum())
    print("\nIQR detective method:")
    print("total num of over_high value is {}".format(over_high_count))

    print("total num of below_low value is {}".format(below_low_count))

    # Mean +/- 3 standard deviations.
    mean_col = stats['mean']
    std_col = stats['std']
    over_m_3std_count = int((series > mean_col + 3 * std_col).sum())
    low_m_3std_count = int((series < mean_col - 3 * std_col).sum())

    print("\nMean_Std detective method:")
    print("total num of over_m_3std value is {}".format(over_m_3std_count))

    print("total num of below_m_3std value is {}".format(low_m_3std_count))
    



