In [1]:
import os
import gc 
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Install a catch-all "ignore" filter: every warning raised after this line is
# silenced for the whole session (including deprecation and numeric warnings).
warnings.filterwarnings(action = 'ignore')

In [None]:
def Reduce_Memory_Usage(data = None, use_float16 = True):
    """
    Downcast the numeric columns of ``data`` in place and print a memory report.

    For every column whose dtype is one of int8/16/32/64 or float16/32/64, the
    column is cast to the narrowest dtype of the same family whose
    representable range covers the column's observed minimum and maximum.
    Columns of any other dtype are left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to shrink. It is modified in place.
    use_float16 : bool, default True
        When True (the historical behaviour) float columns may be narrowed all
        the way to float16. The decision looks at the value range only, so
        precision can be lost (float16 keeps about 3 significant decimal
        digits). Pass False to stop narrowing at float32.

    Returns
    -------
    None
        The result is the mutated ``data`` plus three printed lines: original
        usage (MiB), reduced usage (MiB) and the percentage saved.

    Raises
    ------
    TypeError
        If ``data`` is None.
    """
    if data is None:
        raise TypeError("data must be a pandas DataFrame, got None")

    int_names = ('int8', 'int16', 'int32', 'int64')
    float_names = ('float16', 'float32', 'float64')
    # Candidate targets, ordered from narrowest to widest.
    float_targets = float_names if use_float16 else float_names[1:]

    origin_mem_usage = data.memory_usage().sum()
    for col in data.columns:
        current_dtype = data[col].dtype
        dtype_name = str(current_dtype)
        if dtype_name in int_names:
            targets, limits_of = int_names, np.iinfo
        elif dtype_name in float_names:
            targets, limits_of = float_targets, np.finfo
        else:
            continue

        # Series.min/max skip NaN. An empty or all-NaN column yields NaN, which
        # fails every range comparison below, so such a column keeps its dtype.
        max_value = data[col].max()
        min_value = data[col].min()

        for target in targets:
            # Only strictly narrower types can save memory; never widen and
            # never re-cast a column to the width it already has.
            if np.dtype(target).itemsize >= current_dtype.itemsize:
                break
            limits = limits_of(target)
            if max_value <= limits.max and min_value >= limits.min:
                data[col] = data[col].astype(target)
                break

    reduced_memory_usage = data.memory_usage().sum()
    reduced_sum = origin_mem_usage - reduced_memory_usage
    # Guard the percentage against a zero-byte baseline.
    reduced_pct = reduced_sum * 100 / origin_mem_usage if origin_mem_usage else 0.0
    print("origin memory usage: {}".format(origin_mem_usage / 1024**2))
    print("reduced memory usage: {}".format(reduced_memory_usage/1024**2))
    print("memory reduce ({:.1f}%)".format(reduced_pct))
    gc.collect()

In [None]:
def Plot_Num_Feature(data = None,col = None):
    """
    Show a numeric feature as two side-by-side panels.

    The left panel is a seaborn histogram of ``data[col]``; the right panel is
    a seaborn box plot of the same column. The figure is displayed and then
    closed.

    Parameters
    ----------
    data : tabular data accepted by seaborn (e.g. a pandas DataFrame)
    col : name of the column to plot
    """
    _, panels = plt.subplots(1, 2, figsize = (15, 6))
    hist_panel, box_panel = panels

    # Left: distribution of the values.
    sns.histplot(data = data, x = col, ax = hist_panel)
    hist_panel.set_title(label = 'distribution of {}'.format(col))

    # Right: spread and outliers.
    sns.boxplot(data = data, y = col, ax = box_panel)
    box_panel.set_title(label = 'Box plot of {}'.format(col))

    plt.show()
    plt.close()

In [None]:
def Plot_Cate_Feature(data = None,col = None):
    """
    Plot a categorical feature as two side-by-side panels.

    The left panel is a seaborn count plot of ``data[col]``. When the column's
    dtype name starts with ``int`` or ``float`` the right panel shows a box
    plot of the same column; otherwise the right panel is hidden. The figure
    is displayed and then closed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column (its ``dtype`` is inspected).
    col : name of the column to plot
    """
    fig,ax = plt.subplots(1,2,figsize = (15,6))
    sns.countplot(data = data, x = col, ax = ax[0])
    ax[0].set_title(label = 'Counts of {}'.format(col))
    # Axes has no ``xtick`` method, so the previous ``ax[0].xtick()`` call
    # raised AttributeError on every invocation. Tilt the category labels
    # instead so long or numerous names stay legible.
    ax[0].tick_params(axis = 'x', labelrotation = 45)

    if str(data[col].dtype).startswith(('int', 'float')):
        sns.boxplot(data = data, y = col, ax = ax[1])
        ax[1].set_title(label = 'Box plot of {}'.format(col))
    else:
        # No box plot for non-numeric data: hide the otherwise empty frame.
        ax[1].set_axis_off()

    plt.show()
    plt.close(fig)

In [None]:
def Value_Count_Description(data = None, col = None):
    """
    Print summary statistics and simple outlier counts for one column.

    Printed sections:
      1. IQR method: number of values above Q3 + 1.5*IQR and below Q1 - 1.5*IQR.
      2. mean/std method: number of values above mean + 3*std and below
         mean - 3*std.
      3. Normalised value counts (NaN included).
      4. The output of ``describe()`` for the column.

    Sections 1 and 2 need the quartiles, mean and std reported by
    ``describe()``. When the column does not provide them (e.g. text columns)
    those two sections are skipped with a notice instead of raising KeyError.

    Parameters
    ----------
    data : pandas.DataFrame
    col : name of the column to describe

    Returns
    -------
    None
    """
    series = data[col]
    # Computed once and reused for the thresholds and for section 4.
    description = series.describe()

    if {'25%', '75%', 'mean', 'std'}.issubset(description.index):
        Q1 = description['25%']
        Q3 = description['75%']
        IQR = Q3 - Q1

        # Vectorised comparisons; missing values compare as False and are
        # therefore never counted as outliers.
        over_high = int((series > 1.5 * IQR + Q3).sum())
        below_low = int((series < Q1 - 1.5 * IQR).sum())

        print("\n1.IQR method:")
        print("the num over Q3+1.5IQR is:{}".format(over_high))

        print("the num below Q1-1.5IQR is:{}".format(below_low))

        mean = description['mean']
        std = description['std']

        over_3std = int((series > mean + 3 * std).sum())
        below_3std = int((series < mean - 3 * std).sum())

        print("\n2.mean_std method:")
        print("the num over 3std is:{}".format(over_3std))
        print("the num below 3std is:{}".format(below_3std))
    else:
        print("\n{} has no quartile/mean/std statistics, outlier checks skipped".format(col))

    print("\n3.value counts of {}".format(col))
    print(series.value_counts(dropna = False, normalize = True))

    print("\n4.description of {}".format(col))
    print(description)