In [None]:
import matplotlib.pyplot as plt
import numpy as np
from helpers import make_state_trace, identify_system, random_input, plot_eigenvalues
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from sklearn.linear_model import OrthogonalMatchingPursuit
%matplotlib notebook

# Code written by Kuan-Yun Lee

# Outlier Removal via OMP

This notebook will guide you through the process of removing outliers for problems of the form
$$ y \approx Xw. $$

To begin, we generate data for our problem. All the values in our code can be changed, so you can play around with the numbers if you want. We model $w$ as a random vector whose entries are uniform random variables in $[2,20]$, and the data points $X$ as a matrix with uniform entries in $[0,10]$. We observe corrupted versions of the true observations, $y = Xw + Z$, where $Z$ is a multivariate Gaussian random vector with a diagonal covariance whose entries depend on whether the corresponding observation is an outlier. In particular, we choose $num\_outliers$ of the observations uniformly at random to be outliers; for these the noise variance is set to $100$, while for all other points it is set to $0.1$. To exaggerate the outliers so that the effect of our algorithm is easy to see, we also add $40$ to each outlier.

In [None]:
# --------------------------------------------------------------------

def generate_data(n, d, num_outliers):
    """
    Draws a random linear-regression problem whose observations contain
    outliers.

    args:
      n: number of samples
      d: dimension of samples
      num_outliers: how many of the n observations are turned into
        outliers (noise variance 100 instead of 0.1, plus a shift of 40)

    returns:
      data: the nxd data matrix X, entries uniform in [0, 10]
      w_star: the underlying dx1 weight vector, entries uniform in [2, 20]
      observations: length-n vector of noisy observations with outliers

    """

    w_star = np.random.uniform(2.0, 20.0, (d, 1))
    data = np.random.uniform(0.0, 10.0, (n, d))
    clean_targets = data.dot(w_star).flatten()

    # Distinct sample indices (drawn without replacement) that become outliers.
    outlier_rows = np.random.choice(np.arange(n), num_outliers, False)

    # Per-sample noise variance: small everywhere, large on the outliers.
    variances = np.full((n, 1), 0.1)
    variances[outlier_rows] = 100.000

    observations = np.random.multivariate_normal(
        clean_targets, np.diagflat(variances))

    # Shift the outliers so they stand out clearly in the plots.
    observations[outlier_rows] += 40.000
    return data, w_star, observations

# --------------------------------------------------------------------

# Greedy OMP

We implement our OMP with scikit-learn's `OrthogonalMatchingPursuit` estimator.

In [None]:
# --------------------------------------------------------------------

def greedy_OMP(X_aug, y, k, w_star):
    """
    Performs orthogonal matching pursuit on the augmented problem
    y ~ X_aug @ coef (no intercept).

    args:
      X_aug: augmented data matrix, i.e., X_aug = [X I]
      y: noisy observations with outliers
      k: how many non-zero entries in solution of OMP; is equal to
        the number of iterations in the OMP algorithm
      w_star: the true dx1 weight vector; the first d columns of X_aug
        are taken to be the data matrix X

    returns:
      idx: 1-based indices of the columns of X_aug whose coefficient is
        non-zero (magnitude above 1e-7)
      plot_points: the predictions for a row of zeros and for a row whose
        first entry is 10.0 (i.e. x = 0 and x = 10.0 when d = 1); for
        use of plotting
      remaining_residual: sum of squared residuals of the OMP fit
      err_vec: nx1 array of zeros by default (placeholder); the log of
        the squared residuals when the alternate return below is used
      est_err_vec: nx1 array with the log of the squared distance
        between the OMP prediction and the true fit X w_star

    """

    coef_tol = 0.0000001

    reg = OrthogonalMatchingPursuit(n_nonzero_coefs=k, fit_intercept=False).fit(X_aug, y)
    idx = [col for col, coef in enumerate(np.ravel(reg.coef_), start=1)
           if abs(coef) > coef_tol]

    # Two probe rows: all zeros, and 10.0 in the first feature only.
    test_mat = np.zeros((2, X_aug.shape[1]))
    test_mat[1, 0] = 10.0
    plot_points = reg.predict(test_mat)

    n_samples = y.shape[0]
    y = np.reshape(y, (n_samples, 1))
    fitted = np.reshape(reg.predict(X_aug), (n_samples, 1))

    err_vec = (y - fitted)**2
    remaining_residual = np.sum(err_vec)

    # True fit X w_star using all d data columns, so this also works when
    # d > 1 (multiplying only column 0 by w_star fails to reshape for d > 1).
    w_col = np.reshape(np.asarray(w_star, dtype=float), (-1, 1))
    true_fit = np.reshape(np.dot(np.asarray(X_aug)[:, :w_col.shape[0]], w_col),
                          (n_samples, 1))
    estimate_residual = (fitted - true_fit)**2

    #Comment below to plot two lines, and uncomment the next line 
    return idx, plot_points, remaining_residual, np.zeros(np.asarray(err_vec).shape), np.log(np.asarray(estimate_residual))
    #return idx, plot_points, remaining_residual, np.log(np.asarray(err_vec)), np.log(np.asarray(estimate_residual))

# Plotting the graph

For each iteration of the algorithm, we plot the remaining points, the estimated line, and the remaining residual.

In [None]:
def plot_func(X_aug, X_true, y, idxes, errs, est_plot, err_vec=None, est_err_vec=None):
    """
    Plots the state of the outlier removal after one OMP iteration.

    Nothing is drawn unless the notebook-level variable d equals 1. Besides
    its arguments, the function reads the notebook-level variables d, n,
    num and w_star.

    args:
      X_aug: augmented data matrix, i.e., X_aug = [X_true I] (not used
        by the plots; kept for interface compatibility)
      X_true: the true data matrix
      y: noisy observations with outliers
      idxes: 1-based columns of X_aug chosen by OMP; column 1 is the data
        feature and column a > 1 marks sample a-2 as removed
      errs: list with the remaining residual of every iteration so far
      est_plot: estimated values at x=0 and x=10 for plotting of line
      err_vec: per-sample log distances to the current fit (optional)
      est_err_vec: per-sample log distances to the true fit (optional)

    The inputs are never modified; all bookkeeping happens on copies.

    """
    # Private float copies: removed entries are blanked with NaN so that
    # matplotlib skips them, and the caller's arrays stay untouched.
    X_remaining = np.array(X_true, dtype=float)
    y_remaining = np.array(y, dtype=float)
    err_remaining = np.array([] if err_vec is None else err_vec, dtype=float).ravel()
    est_remaining = np.array([] if est_err_vec is None else est_err_vec, dtype=float).ravel()
    have_residuals = err_remaining.size != 0 and est_remaining.size != 0

    err_removed = np.full(err_remaining.shape, np.nan)
    est_removed = np.full(est_remaining.shape, np.nan)
    X_deleted = []
    Y_deleted = []

    for a in idxes: # Separate the removed outliers from the data
        if a == 1:
            continue
        row = a - 2
        X_deleted.append(X_remaining[row, 0])
        Y_deleted.append(y_remaining[row])
        X_remaining[row] = np.nan
        y_remaining[row] = np.nan

        if have_residuals:
            err_removed[row] = err_remaining[row]
            est_removed[row] = est_remaining[row]
            err_remaining[row] = np.nan
            est_remaining[row] = np.nan

    if d==1: # Plot the linear fit and remaining points
        fig, ax = plt.subplots(1,1,figsize = [10,10])

        # w_star is a 1x1 array here; take the scalar so the end points of
        # the true line form a regular (non-ragged) array.
        true_slope = float(np.ravel(w_star)[0])

        ax.plot(X_remaining[:,0], y_remaining, 'ro', label = 'Remaining points')
        ax.plot(X_deleted, Y_deleted, 'gx', label = 'Removed points')
        ax.plot(np.array([0,10.0]), est_plot, label = 'Linear fit')
        ax.plot(np.array([0,10.0]), np.array([0,10.0*true_slope]), label = 'True fit')
        ax.set_title('Points and linear fit at iteration {}'.format(n-num+1))
        ax.legend()

        # Inset (lower right): residual history, current iteration in red.
        err_ax = inset_axes(ax, width="30%", height="30%", loc=4, borderpad=2)
        previous_errs = np.array(list(errs) + [np.nan]*(num-1), dtype=float)
        current_err = np.full(n, np.nan)
        current_err[n-num] = previous_errs[n-num]
        previous_errs[n-num] = np.nan

        err_ax.set_title('Error over remaining points')
        err_ax.set_xlim(0,n+1)
        err_ax.set_ylim(-5,errs[0]+5)
        err_ax.plot(np.arange(1,n+1,1), previous_errs, 'b.', label = 'Previous error')
        err_ax.plot(np.arange(1,n+1,1), current_err, 'r.', label = 'Current error')
        err_ax.legend()

        if have_residuals:
            # Inset (upper right): per-sample log distances on a single axis.
            dist_ax = inset_axes(ax, width="70%", height="10%", loc=1, borderpad=2)
            dist_ax.set_yticklabels([])
            dist_ax.set_ylim(-0.5,0.5)

            # Uncomment the following lines to plot both (this also needs the
            # alternate return statement in greedy_OMP)
            #dist_ax.plot(err_remaining, 0.2*np.ones(err_remaining.size), 'r,')
            #dist_ax.plot(err_removed, 0.2*np.ones(err_removed.size), 'gx')
            #dist_ax.plot(est_remaining, -0.2*np.ones(est_remaining.size), 'r,')
            #dist_ax.plot(est_removed, -0.2*np.ones(est_removed.size), 'gx')
            #dist_ax.set_title("Top: Distances to Current Fit, Bottom: Distances to True Fit")
            #dist_ax.set_xlabel("Log of distances")

            # Use the following lines to plot just the distance to true fit;
            # if you wish to plot both lines, comment these and uncomment the above part
            dist_ax.set_xlim(-5,10)
            dist_ax.plot(est_remaining, np.zeros(est_remaining.size), 'r,')
            dist_ax.plot(est_removed, np.zeros(est_removed.size), 'gx')
            dist_ax.set_title("Distances to True Fit")
            dist_ax.set_xlabel("Log of distances")

        plt.show()
        
        
        
        

# Running the procedure

First we generate our data. 

In [None]:
# Problem size. The plots will only work for dimension d = 1.
n, d, num_outliers = 50, 1, 20

# X: nxd data matrix, w_star: dx1 true weights, y: noisy observations.
X, w_star, y = generate_data(n, d, num_outliers)

Next, we plot the initial points and the initial linear fit.

In [None]:
idxes = []
errs = []
X_true = X.copy()
X_aug = np.concatenate((X,np.eye(X.shape[0])),axis =1)

# plot the initial points and linear fit
if d==1:
    idxes, est_plot, _, _x, _y = greedy_OMP(X_aug,y, 1, w_star)
    %matplotlib inline
    fig, ax = plt.subplots(1,1,figsize = [10,10])
    plt.plot(X,y,'ro')
    plt.plot(np.array([0,10.0]), est_plot, label = 'Linear fit')
    plt.plot(np.array([0,10.0]), np.array([0,10.0*w_star]), label = 'Optimal fit')
    
    plt.title('Initial points and linear fit')
    plt.legend()
    plt.show()

Then, we start the OMP procedure, and see what happens during each iteration. Note that here we let our procedure run until there are only two points left, or until the OMP algorithm cannot distinguish between the remaining points. This situation can happen when we have removed all the outliers and all the remaining points are on a line already.

In [None]:
num = n
idxes_size = -1
# Start from an empty history so that re-running this cell does not append
# to the residuals collected by an earlier run (plot_func relies on
# len(errs) matching the iteration count).
errs = []

while num > 1:
    # Iteration n-num+1 asks OMP for n-num+2 non-zero coefficients.
    idxes, est_plot, err, err_vec, est_err_vec = greedy_OMP(X_aug, y, n-num+2, w_star)

    # Stop once OMP no longer selects an additional column.
    if idxes_size == len(idxes):
        break
    idxes_size = len(idxes)
    errs.append(err)

    plot_func(X_aug, X_true, y, idxes, errs, est_plot, err_vec, est_err_vec)

    num = num - 1

fig, ax = plt.subplots(1,1,figsize = [10,10])
ax.plot(np.arange(1,len(errs)+1,1), errs)
ax.set_title("Error vs number of augmented bases")
ax.set_xlabel("Iteration")
ax.set_ylabel("Remaining residual (sum of squared errors)")
plt.show()


# Example for system identification with outliers

Although using ML libraries such as sklearn is helpful, in some cases we still need to do a bit more work as engineers to fix issues. In this example, we will see how outliers for systemID will affect our estimations, how our vanilla OMP will not always work, and how we can do some extra work to make it work.

Now we consider Example 1 from the system ID Demo code. In particular, we'll use
\begin{align}
A &= \begin{bmatrix} 0 & 1 \\ 0.3 & 0.2 \end{bmatrix}, & B &= \begin{bmatrix} 0 \\ 1 \end{bmatrix}.
\end{align}
For more details, see Example 1 of the DemoSystemID code.

Below is the exact same code from Example 1. As we can see, when there are no significant outliers, the algorithm there performs perfectly.

In [None]:
# True system matrices of Example 1.
a_known = np.matrix([[0, 1], [0.3, 0.2]])
b_known = np.matrix([[0], [1]])

# Number of time steps and the matching time index.
k = 100
t = np.arange(k)

# Random input, the resulting state trace, and the system identified from them.
u_trace = random_input(t, 1)
x_trace = make_state_trace(a_known, b_known, u_trace)
a_identified, b_identified = identify_system(x_trace, u_trace)

# Distance of each estimate from the truth.
a_error = np.linalg.norm(a_identified-a_known, ord='fro', axis=None, keepdims=False)
b_error = np.linalg.norm(b_identified-b_known, ord='fro', axis=None, keepdims=False)

print('identified A matrix:')
print(a_identified)
print('identified B matrix:')
print(b_identified)
print('Frobenius norm of matrix A and true A:', a_error)
print('Frobenius norm of matrix B and true B:', b_error)


## Effect of adding k outliers

Now, we simply choose $k$ random points from the state trace $\vec{x}$, and add $2000$ to both coordinates of each. Running the exact same code as above, there is already visible error in our estimates of $A$ and $B$.

In [None]:
k = 5

# Draw k distinct time indices and shift every coordinate of those states by 2000.
corrupt_idx = np.random.choice(np.arange(0,x_trace.shape[0],1), size = (k,1), replace=False)
x_trace_cor = x_trace.copy()
x_trace_cor[corrupt_idx[:, 0]] += 2000

# Same identification as before, now on the corrupted trace.
a_identified_cor, b_identified_cor = identify_system(np.asarray(x_trace_cor), u_trace)

a_error_cor = np.linalg.norm(a_identified_cor-a_known, ord='fro', axis=None, keepdims=False)
b_error_cor = np.linalg.norm(b_identified_cor-b_known, ord='fro', axis=None, keepdims=False)

print('identified A matrix:')
print(a_identified_cor)
print('identified B matrix:')
print(b_identified_cor)
print('Frobenius norm of matrix A and true A:', a_error_cor)
print('Frobenius norm of matrix B and true B:', b_error_cor)
plot_eigenvalues([a_known, a_identified_cor], ['true eigenvalues', 'estimated eigenvalues'])

## Effect of running OMP to remove the outlier

Next, let's use our OMP algorithm to try to remove the outliers. As we can see, this doesn't work in this case (you can play with some examples to see when the algorithm still works).

In [None]:
n_steps = u_trace.shape[0]

# Regression problem: row t holds [x[t], u[t], e_t] and the target is x[t+1].
# The identity block e_t gives OMP one column per row to absorb an outlier.
X_aug = np.concatenate((x_trace_cor[0:n_steps,:], u_trace), axis = 1)
X_aug = np.concatenate((X_aug, np.eye(n_steps)), axis = 1)
y = x_trace_cor[1:n_steps+1,:]

# One OMP fit per state component.
reg1 = OrthogonalMatchingPursuit(n_nonzero_coefs=6, fit_intercept=False).fit(X_aug,y[:,0])
reg2 = OrthogonalMatchingPursuit(n_nonzero_coefs=6, fit_intercept=False).fit(X_aug,y[:,1])

# Without an intercept, the first three coefficients of each fit are the
# weights on [x[t], u[t]]; the remaining ones belong to the identity block.
a1 = np.reshape(np.ravel(reg1.coef_)[0:3],(1,3))
a2 = np.reshape(np.ravel(reg2.coef_)[0:3],(1,3))
a_mat = np.concatenate((a1, a2), axis = 0)

# 1-based indices of the columns of X_aug that OMP selected.
idx1 = [ col for col, coef in enumerate(np.ravel(reg1.coef_), start=1)
                       if abs(coef) > 0.0000001 ]
idx2 = [ col for col, coef in enumerate(np.ravel(reg2.coef_), start=1)
                       if abs(coef) > 0.0000001 ]

# Row i of a_mat predicts state component i, so it is row i of [A B]:
# A is the leading 2x2 block as-is (transposing it would report A^T).
new_identified_a = a_mat[0:2,0:2]
new_identified_b = np.reshape(a_mat[:,2],(2,1))
print('identified A matrix:')
print(new_identified_a)
print('identified B matrix:')
print(new_identified_b)
print('Frobenius norm of matrix A and true A:',  
      np.linalg.norm(new_identified_a-a_known, axis=None, keepdims=False))
print('Frobenius norm of matrix B and true B:',  
      np.linalg.norm(new_identified_b-b_known, axis=None, keepdims=False))
plot_eigenvalues([a_known, new_identified_a], ['true eigenvalues', 'OMP-estimated eigenvalues'])



And we see that removing the outliers with OMP doesn't work as well as we hoped.

## Fixing the issue
To solve this issue, we implement a neighboring-OMP algorithm. The reason is that in system identification with state observations, outliers do not appear only in the observations (the target vector) --- they also appear in the data (the features). To handle this, we modify the algorithm so that whenever a basis $e_j$ is chosen, we also choose its neighboring bases $e_{j-1}$ and $e_{j+1}$, to cover all the possible effects of the corrupted points.

In [None]:
def _neighbor_rows(selected_cols, n_rows, n_regressors=3):
    """Return the sorted, unique row indices i-1, i, i+1 for every selected
    column index i beyond the first n_regressors columns, keeping only
    indices that are valid rows (0 <= row < n_rows)."""
    rows = set()
    for i in selected_cols:
        if i <= n_regressors:
            continue  # a regressor column (state or input), not an identity column
        rows.update(r for r in (i-1, i, i+1) if 0 <= r < n_rows)
    return np.array(sorted(rows), dtype=int)


def _least_squares(features, targets):
    """Solve min ||features @ w - targets|| and return w as a 1-D array."""
    if features.shape[0] < features.shape[1]:
        raise ValueError(
            'need at least {} rows for the fit, got {}'.format(
                features.shape[1], features.shape[0]))
    solution = np.linalg.lstsq(features, targets, rcond=None)[0]
    return np.ravel(solution)


# NOTE(review): the column index i of X_aug is used directly as a row index,
# although identity column i (1-based) belongs to row i-4, and the rows
# collected here are the ones the fit is run ON rather than the ones left
# out. This matches the previous behaviour of this cell; please confirm it
# is the intended selection.
n_rows = X_aug.shape[0]
new_idxes1 = _neighbor_rows(idx1, n_rows)
new_idxes2 = _neighbor_rows(idx2, n_rows)

# Plain 2-D arrays, so the result shapes do not depend on whether the traces
# are ndarrays or np.matrix objects.
X1 = np.asarray(X_aug)[new_idxes1, 0:3]
X2 = np.asarray(X_aug)[new_idxes2, 0:3]

# One least-squares fit per state component; each gives one row of [A B].
Ab1 = _least_squares(X1, np.ravel(np.asarray(y)[new_idxes1, 0]))
Ab2 = _least_squares(X2, np.ravel(np.asarray(y)[new_idxes2, 1]))
Ab = np.vstack((Ab1, Ab2))
estimated_A_ours = Ab[:,0:2]
estimated_b_ours = Ab[:,2:3]  # kept as a 2x1 column so it lines up with b_known

print('identified A matrix:')
print(estimated_A_ours)
print('identified B matrix:')
print(estimated_b_ours)
print('Frobenius norm of matrix A and true A:',  
      np.linalg.norm(estimated_A_ours-a_known, axis=None, keepdims=False))
print('Frobenius norm of matrix B and true B:',  
      np.linalg.norm(estimated_b_ours-b_known, axis=None, keepdims=False))
plot_eigenvalues([a_known, estimated_A_ours], ['true eigenvalues', 'our estimated eigenvalues'])


And we see that our method works nicely!