# 2-高维异常检测
## 案例：高维数据的异常检测
### 数据集：ex8data2.mat

In [1]:
import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt

In [3]:
# Load the dataset and pull out the training matrix plus the labelled
# validation split; the last line echoes their shapes in the notebook.
mat = sio.loadmat('ex8data2.mat')
X = mat['X']
Xval = mat['Xval']
yval = mat['yval']
X.shape,Xval.shape,yval.shape

((1000, 11), (100, 11), (100, 1))

In [4]:
def estimateGaussian(X,isCovariance):
    """Estimate Gaussian parameters from the rows of X.

    Args:
        X: 2-D array with one sample per row and one feature per column.
        isCovariance: if truthy, return the full covariance matrix;
            otherwise return only the per-feature variances.

    Returns:
        Tuple ``(means, sigma2)``: the column-wise mean, and either the
        covariance matrix or the variance vector. Both are normalised by
        the number of samples (no Bessel correction).
    """
    means = np.mean(X,axis=0)

    if not isCovariance:
        # Per-feature variance only.
        return means,np.var(X,axis=0)

    # Full covariance from the mean-centred samples.
    centered = X - means
    return means,centered.T @ centered / len(X)

In [5]:
def gaussian(X,means,sigma2):
    """Evaluate the multivariate Gaussian density at every row of X.

    Args:
        X: (m, n) array, one sample per row.
        means: length-n mean vector.
        sigma2: either a length-n vector of per-feature variances (used as a
            diagonal covariance) or a full (n, n) covariance matrix.

    Returns:
        (m, 1) column vector holding the density of each sample.

    Raises:
        numpy.linalg.LinAlgError: if the covariance is singular or its
            determinant is not positive.
    """
    sigma2 = np.asarray(sigma2)
    if np.ndim(sigma2) == 1:
        sigma2 = np.diag(sigma2)

    X = X - means
    n = X.shape[1]

    # Work with log|Sigma| so the normaliser neither underflows nor overflows
    # when n is large and the determinant leaves the float64 range.
    sign,log_det = np.linalg.slogdet(sigma2)
    if sign <= 0:
        raise np.linalg.LinAlgError('sigma2 must have a positive determinant')

    # Row-wise quadratic form x^T Sigma^-1 x. Summing the element-wise product
    # avoids building the (m, m) matrix X Sigma^-1 X^T just to read its diagonal.
    quad = np.sum(X * np.linalg.solve(sigma2,X.T).T,axis=1)

    log_p = -0.5 * (n * np.log(2*np.pi) + log_det + quad)
    p = np.exp(log_p).reshape(-1,1)

    return p

In [7]:
def plotGaussian(X,means,sigma2):
    """Scatter the first two columns of X and overlay density contours.

    The density is evaluated with ``gaussian`` on a regular grid covering
    [0, 30) in steps of 0.5 along both axes, and contour lines are drawn at
    the levels 10**-20, 10**-17, ..., 10**-2.

    Args:
        X: samples; only columns 0 and 1 are plotted.
        means: mean vector passed through to ``gaussian``.
        sigma2: variance vector or covariance matrix passed to ``gaussian``.
    """
    ticks = np.arange(0,30,0.5)
    grid_x,grid_y = np.meshgrid(ticks,ticks)

    # Flatten the grid into a list of 2-D points, score them, and fold the
    # densities back into the grid shape for contouring.
    grid_points = np.c_[grid_x.ravel(),grid_y.ravel()]
    density = gaussian(grid_points,means,sigma2).reshape(grid_x.shape)

    plt.plot(X[:,0],X[:,1],'bx')
    levels = [10**exponent for exponent in range(-20,0,3)]
    plt.contour(grid_x,grid_y,density,levels)

In [12]:
# Fit a full-covariance Gaussian on the training set, then evaluate the
# density of every validation sample under that model.
means,sigma2 = estimateGaussian(X,True)
pval = gaussian(Xval,means,sigma2)

In [13]:
def selectThreshold(yval,p):
    """Pick the density threshold that maximises F1 on labelled data.

    A sample is flagged as anomalous when its density is strictly below the
    threshold. 1000 evenly spaced candidates between the smallest and the
    largest density are tried, and the first one reaching the best F1 wins.

    Args:
        yval: ground-truth labels (1 = anomaly, 0 = normal); any shape with
            one entry per sample, e.g. (m,) or (m, 1).
        p: densities of the same samples; any shape with one entry each.

    Returns:
        Tuple ``(bestEpsilon, bestF1)``. Both are 0 when no candidate yields
        a positive F1.
    """
    # Flatten both inputs so a column vector paired with a 1-D vector cannot
    # broadcast to an (m, m) mask and silently corrupt the counts.
    yval = np.ravel(yval)
    p = np.ravel(p)

    # Label masks do not depend on the candidate threshold.
    is_anomaly = yval == 1
    is_normal = yval == 0

    bestEpsilon = 0
    bestF1 = 0

    for e in np.linspace(np.min(p),np.max(p),1000):
        flagged = p < e
        tp = np.sum(is_anomaly & flagged)
        fp = np.sum(is_normal & flagged)
        fn = np.sum(is_anomaly & ~flagged)

        # Guard every ratio against an empty denominator.
        prec = tp / (tp+fp) if (tp+fp) else 0
        rec = tp / (tp+fn) if (tp+fn) else 0
        F1_e = 2 * prec * rec / (prec + rec) if (prec + rec) else 0

        if F1_e > bestF1:
            bestF1 = F1_e
            bestEpsilon = e

    return bestEpsilon,bestF1

In [14]:
# Tune the threshold on the validation densities, score the training set,
# and collect every training sample whose density falls below the threshold.
bestEpsilon,bestF1 = selectThreshold(yval,pval)
p = gaussian(X,means,sigma2)
anoms = [sample for sample,density in zip(X,p) if density < bestEpsilon]
len(anoms)

122