In [1]:
import numpy as np
import pandas as pd

In [5]:
## Build a synthetic population of 10,000 values drawn from a Gaussian
# distribution with a mean of 50 and a standard deviation of 5.
# randn() samples the standard normal (mean 0, standard deviation 1), so each
# draw is scaled by the target standard deviation and then shifted by the
# target mean to land in the preferred range.

## Fix the seed so that every run of this cell reproduces the same sample.
np.random.seed(1)

standard_draws = np.random.randn(10000)
data = 5 * standard_draws + 50

sample_mean = np.mean(data)
sample_std = np.std(data)
print("Mean = %.3f and Std. Dev. = %.3f" % (sample_mean, sample_std))

Mean = 50.049 and Std. Dev. = 4.994


### Detecting Outliers using Standard Deviation Method

In [None]:
# Standard-deviation rule: summarise the sample by its mean and standard
# deviation, then treat any value lying more than 3 standard deviations away
# from the mean as an outlier.

# summary statistics of the sample
data_mean = np.mean(data)
data_std_dev = np.std(data)

# distance from the mean beyond which a value counts as an outlier
cut_off = 3 * data_std_dev

# bounds of the accepted range
lower = data_mean - cut_off
upper = data_mean + cut_off

In [13]:
## Outliers are the values that do not fit inside the range decided above:
# anything above the upper bound or below the lower bound.
outliers = [value for value in data if value > upper or value < lower]
print("Total outliers detected :", len(outliers))

Total outliers detected : 29


In [15]:
## We can also remove the outliers with the same bounds,
# i.e. by keeping every value that lies inside the accepted range.
# The comparison is inclusive so that this filter is the exact complement of
# the outlier test `x < lower or x > upper`: a value equal to one of the bounds
# is not flagged as an outlier, so it must not be dropped here either.

outliers_removed = [x for x in data if lower <= x <= upper]
print("Total number of observations after removing outliers:", len(outliers_removed))

Total number of observations after removing outliers: 9971


### Interquartile Range Method

#### The IQR is calculated as the difference between the 75th and the 25th percentiles of the data and defines the box in a box and whisker plot.

In [17]:
# Interquartile range: the spread between the 25th and the 75th percentiles.
# Both percentiles are requested in one call and unpacked in the order given.
q25, q75 = np.percentile(data, [25, 75])
iqr = q75 - q25

In [18]:
# Outlier cut-off: 1.5 times the interquartile range, measured outwards from
# the 25th percentile (lower bound) and from the 75th percentile (upper bound).
cut_off = 1.5 * iqr
lower = q25 - cut_off
upper = q75 + cut_off

In [21]:
# Report the quartiles and the interquartile range computed above.
print(
    "Percentiles 25th = %.3f ,75th = %.3f, Interquartile range = %.3f "
    % (q25, q75, iqr)
)

## Outliers are the values that do not fit inside the range decided above:
# anything above the upper bound or below the lower bound.
outliers = [value for value in data if value > upper or value < lower]
print("Total outliers detected :", len(outliers))


## We can also remove the outliers with the same bounds,
# i.e. by keeping every value that lies inside the accepted range.
# The comparison is inclusive so that this filter is the exact complement of
# the outlier test `x < lower or x > upper`: a value equal to one of the bounds
# is not flagged as an outlier, so it must not be dropped here either.

outliers_removed = [x for x in data if lower <= x <= upper]
print("Total number of observations after removing outliers:", len(outliers_removed))

Percentiles 25th = 46.685 ,75th = 53.359, Interquartile range =6.674 
Total outliers detected : 81
Total number of observations after removing outliers: 9919


### Automatic Outlier Detection

##### In machine learning, an approach to tackling the problem of outlier detection is one-class classification.

##### One-Class Classification, or OCC for short, involves fitting a model on the “normal” data and predicting whether new data is normal or an outlier/anomaly.

##### The scikit-learn library provides an implementation of this approach in the LocalOutlierFactor class.

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import mean_absolute_error

In [26]:
## Load the housing dataset and split it into training and testing sets.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'
# header=None: no row of the file is used for column names
df = pd.read_csv(url, header=None)

# Convert the whole frame to a NumPy array.
# NOTE: this rebinds `data`, which the earlier cells use for the synthetic
# sample; re-run the notebook from the top before revisiting those cells.
data = df.to_numpy()

# every column except the last goes into X; the last column is the target y
X, y = data[:, :-1], data[:, -1]

print("Dimensions of X and y ", X.shape, y.shape)

# hold out 33% of the rows for testing; random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)

print(
    "Summarizing the shapes of the train and test datasets :\n",
    X_train.shape,
    X_test.shape,
    y_train.shape,
    y_test.shape,
)



Dimensions of X and y  (506, 13) (506,)
Summarizing the shapes of the train and test datasets :
 (339, 13) (167, 13) (339,) (167,)


In [29]:
# Baseline: fit a linear regression on the training dataset, make predictions
# for the test dataset, and evaluate those predictions with the mean absolute
# error (MAE).

lin_model = LinearRegression().fit(X_train, y_train)
pred = lin_model.predict(X_test)
baseline_mae = round(mean_absolute_error(y_test, pred), 3)
print("Mean absolute error before removing outliers :", baseline_mae)

Mean absolute error before removing outliers : 3.417


In [30]:
## Identify outliers among the training rows.
# fit_predict returns one label per row of X_train; the next cell treats the
# label -1 as "outlier".
lof = LocalOutlierFactor()
preds = lof.fit_predict(X=X_train)

In [33]:
# Keep only the rows that were not flagged as outliers (label -1).
# NOTE: X_train and y_train are overwritten here, so re-run the train/test
# split cell before executing this cell a second time.
mask = preds != -1
X_train = X_train[mask]
y_train = y_train[mask]

In [34]:
# Refit the linear regression on the filtered training rows and score it on
# the same, untouched test set so the two MAE values are comparable.

lin_model = LinearRegression().fit(X_train, y_train)
pred = lin_model.predict(X_test)
cleaned_mae = round(mean_absolute_error(y_test, pred), 3)
print("Mean absolute error after removing outliers :", cleaned_mae)

Mean absolute error after removing outliers : 3.356


#### Reference : https://machinelearningmastery.com/how-to-use-statistics-to-identify-outliers-in-data/