In [2]:
import os
import sys
import warnings
import gc
import random
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier


# Run the Page_Style.ipynb notebook through IPython's %run magic. What it sets
# up is not visible here -- presumably notebook display styling; TODO confirm.
%run Page_Style.ipynb
# Install an "ignore" filter so that warnings are suppressed from here on.
warnings.filterwarnings("ignore")

In [4]:
def Missing_Col_Detect(data,col = None):
    """
    Report the missing values of a single column.

    Prints the column name, the number of null entries and the null ratio
    expressed as a percentage, followed by a heading line, and then returns
    the ``id`` values of the affected rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect. It must contain ``col`` and a column named ``id``.
    col : label
        Column whose null entries are reported.

    Returns
    -------
    list
        Values of ``data['id']`` for every row where ``data[col]`` is null.
    """
    # One boolean mask drives the count, the ratio and the row selection.
    null_mask = data[col].isnull()
    null_ids = list(data[null_mask]['id'])

    report = (
        ('特征：{}', col),
        ("缺失总数：{}", null_mask.sum()),
        ("缺失比例：{}%", null_mask.mean() * 100),
    )
    for template, shown in report:
        print(template.format(shown))
    print("缺失值所在列id，如下：")
    return null_ids
    
    

In [3]:
def Missing_All_Col_Detect(data,col = None):
    """
    Summarise the missing values of every column in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.
    col : optional
        Accepted but not used by this function.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` (row labels converted with ``str``)
        and two columns: ``missing_total`` (number of null entries) and
        ``missing_per`` (fraction of null entries).
    """
    null_mask = data.isnull()
    # Concatenating two unnamed Series side by side yields columns 0 and 1.
    per_feature = pd.concat([null_mask.sum(), null_mask.mean()], axis = 1)
    labelled = per_feature.rename(columns = {0:'missing_total', 1:'missing_per'})
    return labelled.rename(index = str)
    
    

In [5]:
def Missing_Imputer_Medthod(data, col = None, method = None,fill_value = None):
    """
    Fill the missing values of one column with the requested method.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to fill. It is not modified here.
    col : label
        Column to fill.
    method : str
        ``'random'`` replaces each missing cell with a value drawn by
        ``random.choice`` from the distinct non-null values of the column.
        Any other value is passed to ``SimpleImputer`` as its ``strategy``;
        ``'constant'`` additionally forwards ``fill_value``.
    fill_value : optional
        Only forwarded when ``method == 'constant'``.

    Returns
    -------
    For ``'random'`` the result of ``Series.apply`` on the column; otherwise
    whatever ``SimpleImputer.fit_transform`` returns for the one-column frame
    ``data[[col]]`` (presumably a 2-D array -- confirm against sklearn), so
    the two paths do not return the same kind of object.
    """
    if method == 'random':
        candidates = data[col].value_counts().index.to_list()

        def _fill_cell(cell):
            # Only missing cells consume a random draw.
            if pd.isna(cell):
                return random.choice(candidates)
            return cell

        return data[col].apply(_fill_cell)

    if method == 'constant':
        imputer = SimpleImputer(strategy = method, fill_value=fill_value)
    else:
        imputer = SimpleImputer(strategy = method)
    return imputer.fit_transform(data[[col]])

In [7]:
def Outliers_Detect_Plot(data,col = None):
    """
    Box-plot one feature and count its outliers with the 1.5 * IQR rule.

    Draws a seaborn box plot of ``data[col]`` on a 15 x 10 figure, prints the
    column's ``describe()`` summary, the upper and lower fences
    (Q3 + 1.5 * IQR and Q1 - 1.5 * IQR) and how many values lie strictly
    above the upper fence or strictly below the lower fence.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature.
    col : label
        Column to inspect; its ``describe()`` output must contain the
        ``'25%'`` and ``'75%'`` entries.

    Returns
    -------
    None
        Everything is reported through the plot and printed text.
    """
    plt.figure(figsize = (15, 10))
    sns.boxplot(data=data,y = col)
    plt.show()
    plt.close()

    summary = data[col].describe()
    print(summary,'\n')

    lower_quartile, upper_quartile = summary['25%'], summary['75%']
    spread = upper_quartile - lower_quartile
    high = upper_quartile + spread * 1.5
    low = lower_quartile - spread * 1.5
    for template, fence in (("四分位high: {}", high), ("四分位low: {}", low)):
        print(template.format(fence))

    column_values = data[col].values
    n_over = sum(1 for x in column_values if x > high)
    n_below = sum(1 for x in column_values if x < low)

    print("over_high总数：{}, below_low总数：{}".format(n_over, n_below))

In [9]:
def Outliers_Handle_IQR(data, col = None):
    """
    Move the outliers of one column just outside the 1.5 * IQR fences.

    Values strictly above Q3 + 1.5 * IQR become ``high + 1`` and values
    strictly below Q1 - 1.5 * IQR become ``low - 1``; everything else is
    kept as it is. The quartiles come from ``data[col].describe()``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column. ``data[col]`` is overwritten in place.
    col : label
        Column to process.

    Returns
    -------
    pandas.Series
        ``data[col]`` after the replacement.
    """
    summary = data[col].describe()
    spread = summary['75%'] - summary['25%']
    high = summary['75%'] + spread * 1.5
    low = summary['25%'] - spread * 1.5

    def _push_past_fence(x):
        # The fences satisfy low <= high, so a value moved to high + 1 can
        # never also fall below the lower fence.
        if x > high:
            return high + 1
        if x < low:
            return low - 1
        return x

    data[col] = data[col].apply(_push_past_fence)
    return data[col]

In [1]:
def Rare_Value_Impute(data, col= None, num = 1,method = 'extreme', value = -1):
    """
    Replace the rarely occurring values of one column.

    A value is treated as rare when it occurs at most ``num`` times in
    ``data[col]`` (null entries are not counted and are never replaced).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column. ``data[col]`` is overwritten in place.
    col : label
        Column to process.
    num : int, default 1
        Largest occurrence count that still marks a value as rare.
    method : {'extreme', 'mode', 'median', 'mean'}, default 'extreme'
        ``'extreme'`` writes ``value``; the other three write the mode
        (the first one when several values tie), the median or the mean of
        the column, computed before any replacement. Any other string leaves
        the column untouched.
    value : scalar, default -1
        Replacement used by ``method='extreme'``.

    Returns
    -------
    pandas.Series
        ``data[col]`` after the replacement.
    """
    counts = data[col].value_counts()
    rare_values = counts[counts <= num].index
    rare_mask = data[col].isin(rare_values)

    # Nothing to replace: also avoids taking the mode of an all-null column.
    if not rare_mask.any():
        return data[col]

    if method == 'extreme':
        replacement = value
    elif method == 'mode':
        # mode() returns a Series; assigning it directly would align on the
        # index and write NaN, so take the first mode as a scalar.
        replacement = data[col].mode().iloc[0]
    elif method == 'median':
        replacement = data[col].median()
    elif method == 'mean':
        replacement = data[col].mean()
    else:
        return data[col]

    # Build the new column and assign it as a whole: a chained
    # ``data[col].loc[...] = ...`` is not guaranteed to reach ``data``, and
    # ``where`` upcasts the dtype when the replacement requires it.
    data[col] = data[col].where(~rare_mask, replacement)
    return data[col]

In [3]:
def Rare_Get_Value_Counts(data, col = None, num = 1):
    """
    Print the value distribution of ``data[col]`` to help spot rare values.

    First prints the full ``value_counts()`` of the column, then a table of
    the values that occur at most ``num`` times, with their occurrence count
    in a column named ``counts``, ordered from most to least frequent.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column.
    col : label
        Column to inspect.
    num : int, default 1
        Largest occurrence count that is still listed in the second table.

    Returns
    -------
    None
        The result is only printed.
    """
    counts = data[col].value_counts()
    print("完整的数值分布：\n")
    print(counts)

    # Name the column explicitly: the name value_counts() gives its result
    # differs between pandas versions, so renaming from ``col`` is unreliable.
    rare = counts[counts <= num].to_frame(name = 'counts')
    # sort_values returns a new frame; keep it. A stable sort preserves the
    # value_counts order among ties.
    rare = rare.sort_values(by = 'counts', ascending = False, kind = 'mergesort')
    print("取值<={}\n".format(num))
    print(rare)