In [2]:
import pandas as pd 
import numpy as np


# Sample to be screened, plus the critical-value tables used by the two tests.
df = pd.read_csv('./OutlierDetection/OutlierDetection.csv')
df_Nair = pd.read_excel('./OutlierDetection/奈尔(Nair)检验的临界值表.xlsx')
df_Grubbs = pd.read_excel('./OutlierDetection/格拉布斯(Grubbs)检验的临界值表.xlsx')

# Relabel the table columns: the first becomes 'n', every other header is
# replaced by the float parsed from its text after the first six characters.
# NOTE(review): presumably each header carries a fixed 6-character prefix in
# front of the numeric level -- verify against the spreadsheets.
df_Nair.columns = ['n', *(float(header[6:]) for header in df_Nair.columns[1:])]
df_Grubbs.columns = ['n', *(float(header[6:]) for header in df_Grubbs.columns[1:])]

In [3]:
def _find_level_column(table, level):
    """Return the column label of ``table`` that stands for confidence ``level``."""
    # Exact hit first, so every label that already matched keeps matching.
    if level in table.columns:
        return level
    # ``1 - alpha`` is computed in binary floating point and may differ from
    # the parsed header in the last bits, so fall back to a tolerant match.
    for label in table.columns:
        if isinstance(label, str):
            continue
        try:
            if np.isclose(label, level, rtol=0.0, atol=1e-9):
                return label
        except (TypeError, ValueError):
            continue
    raise KeyError(
        f'no column for confidence level {level!r} in the critical-value table'
    )


def _critical_value(table, n, column):
    """Return the single critical value tabulated for sample size ``n``."""
    matches = table.loc[table['n'] == n, column]
    if len(matches) != 1:
        raise ValueError(
            f'critical-value table must contain exactly one row for n={n}, '
            f'found {len(matches)}'
        )
    return matches.item()


def _report(value, statistic, reject_threshold):
    """Print whether a detected value is a statistical outlier or a straggler."""
    if statistic > reject_threshold:
        print(f'{value}为统计离群值')
    else:
        print(f'{value}为歧离值')


def outlier_detection(df, alpha, alpha_star, sigma, df_method, side='both'):
    """
    Iteratively detect and remove outliers using a table of critical values.

    All values of ``df`` are flattened and sorted. On each pass the extreme
    value(s) are compared with the critical values tabulated for the current
    sample size; a value exceeding the detection threshold is printed and
    removed, and the test is repeated on the remaining data until nothing
    more is detected.

    Parameters
    ----------
    df : DataFrame holding the sample to be tested.
    alpha : detection level. A value whose statistic exceeds the critical
        value at this level is reported and removed.
    alpha_star : rejection level. A detected value whose statistic also
        exceeds the critical value at this level is reported as a statistical
        outlier ('统计离群值'); otherwise it is reported as a straggler
        ('歧离值').
    sigma : known standard deviation used to scale the statistic (the Nair
        setting). Pass ``None`` to scale by the sample standard deviation
        (ddof=1) of the data still under test instead (the Grubbs setting).
    df_method : critical-value table with a column ``'n'`` (sample size) and
        one column per confidence level, labelled by that level as a float.
    side : one of ``'upper'``, ``'lower'``, ``'both'`` (default ``'both'``).
        One-sided tests read the columns ``1 - alpha`` / ``1 - alpha_star``;
        the two-sided test reads ``1 - alpha / 2`` / ``1 - alpha_star / 2``.

    Returns
    -------
    DataFrame with the remaining values in ascending order, using the column
    labels of ``df``.

    Raises
    ------
    ValueError
        If ``side`` is not one of the three accepted values, if ``sigma`` is
        not positive, or if the table does not contain exactly one row for
        the current sample size (including an empty sample).
    KeyError
        If the table has no column for a required confidence level.
    """
    if side not in ('both', 'upper', 'lower'):
        raise ValueError(f"side must be 'upper', 'lower' or 'both', got {side!r}")
    if sigma is not None and not sigma > 0:
        raise ValueError(f'sigma must be positive (or None), got {sigma!r}')

    # The two-sided test splits each level between the two tails.
    two_sided = side == 'both'
    detect_col = _find_level_column(
        df_method, 1 - alpha / 2 if two_sided else 1 - alpha)
    reject_col = _find_level_column(
        df_method, 1 - alpha_star / 2 if two_sided else 1 - alpha_star)

    data = sorted(df.values.flatten())
    while True:
        n = len(data)
        R_critical = _critical_value(df_method, n, detect_col)
        R_critical_star = _critical_value(df_method, n, reject_col)

        x_mean = np.mean(data)
        scale = sigma if sigma is not None else np.std(data, ddof=1)
        if not scale > 0:
            # Zero (or undefined) sample spread: no value can stand out.
            return pd.DataFrame(data, columns=df.columns)

        # A tail that is not under test gets -inf so it can never be selected.
        R_upper = (data[-1] - x_mean) / scale if side != 'lower' else -np.inf
        R_lower = (x_mean - data[0]) / scale if side != 'upper' else -np.inf

        # Only the more extreme end is removed per pass; on an exact tie in
        # the two-sided test both ends are removed together.
        drop_upper = R_upper > R_critical and R_upper >= R_lower
        drop_lower = R_lower > R_critical and R_lower >= R_upper
        if not (drop_upper or drop_lower):
            return pd.DataFrame(data, columns=df.columns)

        # Report before slicing so both ends are still addressable.
        if drop_upper:
            _report(data[-1], R_upper, R_critical_star)
        if drop_lower:
            _report(data[0], R_lower, R_critical_star)
        if drop_upper:
            data = data[:-1]
        if drop_lower:
            data = data[1:]

## 使用Nair检验分别进行下侧、上侧以及双侧检验

In [4]:
# Run the Nair table against the same sample three times (lower-, upper- and
# two-sided), each with detection level 0.05, rejection level 0.01 and
# sigma=0.65. The message printed after each run is a hand-written summary.
print('lower:')
df_nair_lower = outlier_detection(df=df, alpha=0.05, alpha_star=0.01, sigma=0.65, df_method=df_Nair, side='lower')
print('下侧情形Nair检验检测出了两个正态样本离群值，一个为统计离群值，一个为歧离值')
print('upper:')
df_nair_upper = outlier_detection(df=df, alpha=0.05, alpha_star=0.01, sigma=0.65, df_method=df_Nair, side='upper')
# Fixed label: this summary belongs to the upper-sided run, not the lower one.
print('上侧情形Nair检验没有检测出正态样本离群值')
print('both:')
df_nair_both = outlier_detection(df=df, alpha=0.05, alpha_star=0.01, sigma=0.65, df_method=df_Nair, side='both')
print('双侧情形Nair检验检测出了一个歧离值')

lower:
3.13为统计离群值
3.49为歧离值
下侧情形Nair检验检测出了两个正态样本离群值，一个为统计离群值，一个为歧离值
upper:
下侧情形Nair检验没有检测出正态样本离群值
both:
3.13为歧离值
双侧情形Nair检验检测出了一个歧离值


## 使用Grubbs检验分别进行下侧、上侧以及双侧检验

In [5]:
# Run the Grubbs table against the same sample three times (lower-, upper-
# and two-sided), each with detection level 0.05 and rejection level 0.01.
# NOTE(review): Grubbs' statistic is normally scaled by the sample standard
# deviation, but this cell passes the fixed sigma=0.65 -- confirm that this
# is intended.
print('lower:')
df_grubbs_lower = outlier_detection(df=df, alpha=0.05, alpha_star=0.01, sigma=0.65, df_method=df_Grubbs, side='lower')
print('下侧情形Grubbs检验检测出了两个正态样本离群值，一个为统计离群值，一个为歧离值')
print('upper:')
df_grubbs_upper = outlier_detection(df=df, alpha=0.05, alpha_star=0.01, sigma=0.65, df_method=df_Grubbs, side='upper')
# Fixed label: this summary belongs to the upper-sided run, not the lower one.
print('上侧情形Grubbs检验没有检测出正态样本离群值')
print('both:')
df_grubbs_both = outlier_detection(df=df, alpha=0.05, alpha_star=0.01, sigma=0.65, df_method=df_Grubbs, side='both')
print('双侧情形Grubbs检验检测出了两个正态样本离群值，一个为统计离群值，一个为歧离值')

lower:
3.13为统计离群值
3.49为歧离值
下侧情形Grubbs检验检测出了两个正态样本离群值，一个为统计离群值，一个为歧离值
upper:
下侧情形Grubbs检验没有检测出正态样本离群值
both:
3.13为统计离群值
3.49为歧离值
双侧情形Grubbs检验检测出了两个正态样本离群值，一个为统计离群值，一个为歧离值
