## Anomaly Detection with Python

In [1]:
# import library

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


In [2]:
# Load the CSV and move its 'Date' column into the row index in one step;
# set_index removes the column from the data, so no separate drop is needed.
stocks = pd.read_csv('Prac8-data-stocks.csv').set_index('Date')
# Preview the first 5 rows
stocks.head()

Unnamed: 0_level_0,MSFT,F,BAC
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/3/2007,29.860001,7.51,53.330002
1/4/2007,29.809999,7.7,53.669998
1/5/2007,29.639999,7.62,53.240002
1/8/2007,29.93,7.73,53.450001
1/9/2007,29.959999,7.79,53.5


Compute the day-to-day percentage change in the closing price of each stock.

In [3]:
# Shape of the price table: N rows (time points) and D columns (stock tickers)
N, D = stocks.shape

# Line each day up against the day before it:
# prev_prices holds rows 0..N-2, curr_prices holds rows 1..N-1.
price_values = stocks.values
prev_prices = price_values[:N-1, :]
curr_prices = price_values[1:, :]

# Element-wise percentage change: 100 * (current - previous) / previous
pct_values = 100 * np.divide(curr_prices - prev_prices, prev_prices)

# The first date has no previous day, so the result is labelled from the
# second date onwards and keeps the original ticker columns.
delta = pd.DataFrame(pct_values,
                     columns=stocks.columns,
                     index=stocks.iloc[1:].index)

# Show the first 5 rows of the percentage changes
delta.head()

Unnamed: 0_level_0,MSFT,F,BAC
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/4/2007,-0.167455,2.52996,0.637532
1/5/2007,-0.570278,-1.038961,-0.801185
1/8/2007,0.978411,1.44357,0.394438
1/9/2007,0.100231,0.776197,0.093543
1/10/2007,-1.001332,-0.770218,0.149536


Alternatively, use pandas' built-in `.pct_change()` method to calculate the percentage change. It automatically handles edge cases (such as the first row, which has no previous value to compare against) and improves code readability.

In [4]:
# Same percentage change via pandas, written as a single method chain:
#   pct_change() -> fractional change relative to the previous row
#   mul(100)     -> convert the fraction to a percentage
#   dropna()     -> remove rows containing NaN (the first row has no
#                   previous value, so its change is NaN)
delta_best_practices = stocks.pct_change().mul(100).dropna()


- Use the k-nearest neighbours (KNN) algorithm to detect anomalies in the stocks' daily closing prices, based on the percentage change.
- A normal instance is expected to have a small distance to its k-th nearest neighbour, whereas an anomaly is likely to have a large distance to its k-th nearest neighbour.

In [5]:
# NearestNeighbors performs the neighbour search.
# `distance` is no longer used in this cell; the import is kept in case
# other cells rely on it.
from sklearn.neighbors import NearestNeighbors
from scipy.spatial import distance

# Number of nearest neighbours used for the anomaly score
knn = 4

# Percentage changes as a plain NumPy array (one row per date)
delta_values = delta.to_numpy()

# Fit the neighbour index on the percentage changes.
# The metric is passed by name ('euclidean') instead of as the Python callable
# distance.euclidean: scikit-learn documents callable metrics as less efficient
# than the named metric, and the distances are the same Euclidean distances.
nbrs = NearestNeighbors(n_neighbors=knn, metric='euclidean').fit(delta_values)

# Query the k nearest neighbours of every row:
#   distances[i, j] - distance from row i to its j-th returned neighbour
#   indices[i, j]   - row position of that neighbour
# NOTE(review): the query rows are the same rows the model was fitted on, so
# each row is normally returned as its own nearest neighbour at distance 0.
# Column knn-1 is therefore the distance to the (knn-1)-th *other* row.
# Calling nbrs.kneighbors() with no argument excludes the query row itself,
# but that would change the scores, so the original behaviour is kept here.
distances, indices = nbrs.kneighbors(delta_values)

# Anomaly score = distance to the last (knn-th) returned neighbour
anomaly_score = distances[:, knn-1]

# Wrap the scores in a DataFrame aligned with delta's dates
anom = pd.DataFrame(anomaly_score, index=delta.index, columns=['anomaly_score'])

# Put the percentage changes and their anomaly score side by side
result = pd.concat([delta, anom], axis=1)

# Display the 5 rows with the largest anomaly scores
result.nlargest(5, 'anomaly_score')


Unnamed: 0_level_0,MSFT,F,BAC,anomaly_score
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10/13/2008,18.604651,20.100503,9.199808,15.642827
11/26/2008,2.501251,29.518072,4.256757,14.212749
10/7/2008,-6.744279,-20.867209,-26.225949,13.751302
11/28/2008,-1.317721,25.116279,5.314323,13.139586
9/30/2008,6.717317,24.70024,15.702479,12.599739
