In [1]:
import pandas as pd
import numpy as np
import scipy.stats
from sklearn.datasets import fetch_openml
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import jaccard_score
import warnings
from scipy.stats import SpearmanRConstantInputWarning

import sys
sys.path.insert(0, "..")
from outliers_test import DopingOutliersTest

In [2]:
# Silence SciPy's constant-input warning: spearmanr is called below on score
# columns that can be constant for some datasets.
# NOTE(review): newer SciPy releases appear to expose this warning as
# ConstantInputWarning -- confirm the import above resolves on the installed version.
warnings.simplefilter('ignore', category=SpearmanRConstantInputWarning)

In [3]:
# Widen pandas' display limits so the wide per-dataset result tables
# are shown in full rather than truncated.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_colwidth", 1000)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.width", 10000)

### Define a random, large set of datasets from OpenML

In [4]:
# Names of the OpenML datasets to evaluate. Each name is passed to
# fetch_openml(name, version=1) by the evaluation loop further down, and
# the results are reported in the order listed here.
real_files = [
    'soybean',
    'micro-mass',
    'mfeat-karhunen',
    'Amazon_employee_access',
    'abalone',
    'cnae-9',
    'semeion',
    'vehicle',
    'satimage',
    'analcatdata_authorship',
    'breast-w',
    'SpeedDating',
    'eucalyptus',
    'isolet',
    'bioresponse',
    'vowel',
    'wall-robot-navigation',
    'credit-approval',
    'artificial-characters',
    'splice',
    'har',
    'cmc',
    'segment',
    'JapaneseVowels',
    'jm1',
    'gas-drift',
    'mushroom',
    'irish',
    'profb',
    'adult',
    'anneal',
    'credit-g',
    'blood-transfusion-service-center',
    'monks-problems-2',
    'tic-tac-toe',
    'qsar-biodeg',
    'wdbc',
    'phoneme',
    'diabetes',
    'ozone-level-8hr',
    'hill-valley',
    'kc2',
    'eeg-eye-state',
    'climate-model-simulation-crashes',
    'spambase',
    'ilpd',
    'one-hundred-plants-margin',
    'banknote-authentication',
    'mozilla4',
    'electricity',
    'madelon',
    'scene',
    'musk',
    'nomao',
    'MagicTelescope',
    'PhishingWebsites',
    'nursery',
    'page-blocks',
    'hypothyroid',
    'yeast',
    'kropt',
    'CreditCardSubset',
    'shuttle',
    'Satellite',
    'baseball',
    'mc1',
    'pc1',
    'cardiotocography',
    'kr-vs-k',
    'volcanoes-a1',
    'wine-quality-white',
    'car-evaluation',
    'solar-flare',
    'allbp',
    'allrep',
    'dis',
    'car',
    'steel-plates-fault'
]

In [5]:
# Count Encoding is used to encode non-numeric values, required by
# IsolationForest and Local Outlier Factor

def get_count_encoding(df):
    """Return a copy of `df` with every non-numeric column count-encoded.

    Columns whose dtype name is 'str', 'category' or 'object' are converted
    to strings, missing values are imputed with the column's most frequent
    value, and each value is then replaced by the number of times it occurs
    in the column. All other columns are passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` in which the encoded columns hold numeric counts
        (rather than strings), so the frame can be handed to numeric-only
        estimators and any unmapped value stays a real NaN that a
        subsequent `fillna` can handle.
    """
    df = df.copy()
    for col_name in df.columns:
        if df[col_name].dtype.name not in ['str', 'category', 'object']:
            continue

        col = df[col_name].astype(str)

        # Frequencies are taken before imputation, so imputed cells receive
        # the count of the most frequent value as observed in the raw column.
        counts = col.value_counts(dropna=False)

        # mode() is empty for an empty column; skip imputation in that case
        # instead of failing on the lookup of its first element.
        modes = col.mode()
        if len(modes) > 0:
            col = col.replace([None, np.nan, 'nan'], modes.iloc[0])

        # Keep the counts numeric: casting them back to str would turn the
        # encoding into text and hide unmapped values as the string 'nan'.
        df[col_name] = col.map(counts.to_dict())
    return df

### Quick example with a single file

In [6]:
# Quick example on the OpenML vehicle dataset. Isolation Forest (IF) scores
# every row before and after the dataset is doped, and we check whether IF
# gives the modified rows higher scores after doping than before.
#
# Two measures are reported:
#  - Spearman correlation: does the increase in IF score have a rank-order
#    correlation with the outlier scores estimated by the doping process?
#  - Jaccard similarity: binary flags indicate whether IF flagged each row
#    and whether the row was modified by the doping process; the score
#    measures the overlap between the two flags.
#
# In this example, both scores perform well, though this is based on a
# cut-off for IF scores of 0.5, which will not work for all datasets and
# is done here for simplicity.


def _zero_at_or_below(cutoff):
    """Build a scalar function that maps values <= cutoff to 0 and keeps the rest."""
    def _clean(value):
        return 0 if value <= cutoff else value
    return _clean


# Collect a single dataset from OpenML
data = fetch_openml('vehicle', version=1)
df = pd.DataFrame(data.data, columns=data.feature_names)

# Create a doped version of the dataset
data_modifier = DopingOutliersTest()
df_modified, outlier_scores = data_modifier.transform(df, random_state=0, verbose=False)

# Encode the original data in a format usable by IF and score it
df_encoded = get_count_encoding(df).fillna(0).replace([np.nan, -np.nan], 0)
det = IsolationForest(random_state=0).fit(df_encoded)
y_orig_if = det.score_samples(df_encoded)

# Encode the doped data the same way and score it with a fresh IF
df_modified_encoded = get_count_encoding(df_modified).fillna(0).replace([np.nan, -np.nan], 0)
det = IsolationForest(random_state=0).fit(df_modified_encoded)
y_modified_if = det.score_samples(df_modified_encoded)

# Store the IF results. Scores are flipped to a larger-is-more-anomalous
# format and any low score (<= 0.5) is set to zero. The gain is the
# difference between the cleaned scores on the doped and original data.
# Ideally, the doped rows will be flagged as being more anomalous than
# their original form.
drop_low_scores = _zero_at_or_below(0.5)
for score_col, cleaned_col, raw_scores in (
    ('IF Orig Score', 'IF Orig Cleaned', y_orig_if),
    ('IF Modified Score', 'IF Modified Cleaned', y_modified_if),
):
    df_modified[score_col] = raw_scores * (-1)
    df_modified[cleaned_col] = df_modified[score_col].apply(drop_low_scores)

df_modified['IF Gain'] = df_modified['IF Modified Cleaned'] - df_modified['IF Orig Cleaned']
df_modified['IF Gain Cleaned'] = df_modified['IF Gain'].apply(_zero_at_or_below(0.0))

# Store the outlier score estimated by the doping tool
df_modified['OUTLIER SCORE'] = outlier_scores

# Binary columns: did IF / the doping process identify the row with any score?
df_modified['IF Flagged'] = df_modified['IF Gain Cleaned'] > 0
df_modified['Doping Flagged'] = df_modified['OUTLIER SCORE'] > 0

# Display the results. This just shows the 10 rows that were modified as
# well as 5 other rows.
summary_columns = [
    'IF Orig Score',
    'IF Orig Cleaned',
    'IF Modified Score',
    'IF Modified Cleaned',
    'IF Gain',
    'IF Gain Cleaned',
    'OUTLIER SCORE',
    'IF Flagged',
    'Doping Flagged',
]
summary = df_modified[summary_columns].sort_values(['OUTLIER SCORE'], ascending=False)
display(summary.head(15))

# Print the correlations
spearman_result = scipy.stats.spearmanr(df_modified['IF Gain Cleaned'], df_modified['OUTLIER SCORE'])
print("Spearman Correlation: Gain in IF scores to: Estimated Scores: ",
      spearman_result[0])

flag_overlap = jaccard_score(df_modified['IF Flagged'], df_modified['Doping Flagged'])
print("Jaccard Similarity: Gain in IF scores to: Estimated Scores: ", 
      flag_overlap)

# Print the number of rows where there is an increase in IF scores. Ideally
# this will be close to 10, the actual number modified.
num_if_flagged = df_modified['IF Flagged'].sum()
print(f"Number rows with increase in IF scores: {num_if_flagged}")

Unnamed: 0,IF Orig Score,IF Orig Cleaned,IF Modified Score,IF Modified Cleaned,IF Gain,IF Gain Cleaned,OUTLIER SCORE,IF Flagged,Doping Flagged
559,0.459975,0.0,0.655302,0.655302,0.655302,0.655302,15,True,True
707,0.51729,0.51729,0.625524,0.625524,0.108234,0.108234,8,True,True
723,0.455054,0.0,0.579018,0.579018,0.579018,0.579018,7,True,True
835,0.619549,0.619549,0.676282,0.676282,0.056734,0.056734,6,True,True
684,0.442317,0.0,0.520008,0.520008,0.520008,0.520008,6,True,True
763,0.44589,0.0,0.546312,0.546312,0.546312,0.546312,6,True,True
629,0.47175,0.0,0.504285,0.504285,0.504285,0.504285,5,True,True
192,0.412765,0.0,0.505419,0.505419,0.505419,0.505419,4,True,True
359,0.466335,0.0,0.522281,0.522281,0.522281,0.522281,3,True,True
9,0.456063,0.0,0.448085,0.0,0.0,0.0,1,False,True


Spearman Correlation: Gain in IF scores to: Estimated Scores:  0.791260814871002
Jaccard Similarity: Gain in IF scores to: Estimated Scores:  0.6428571428571429
Number rows with increase in IF scores: 13


### Test Isolation Forest and Local Outlier Factor given a doped version of each OpenML dataset

In [7]:
# This tests on all OpenML datasets listed above. In this case, we use
# Isolation Forest (IF) as well as Local Outlier Factor (LOF).

# As well as examining the spearman correlation and jaccard score
# (for non-zero scores), we evaluate the detectors in two other ways.

# 1) We take the top 10 scores for each detector on each dataset.
# In this case, we know the doping process modified 10 rows, but this
# information will not typically be available. As it can be difficult
# with outlier detectors to determine the best cut-off, this test
# is included to simulate where there is a reasonable guess as to the
# number of outliers. It demonstrates that the detectors rank the scores
# well such that those modified by the doping process tend to have the
# highest scores, even if the ideal cutoff remains elusive.

# 2) We use a common technique in outlier detection to determine a cutoff,
# testing the set of outlier scores for extreme values, and taking any
# unusually high scores as outliers. For this we calculate the
# interquartile range, and use a coefficient of 2.2, which is standard
# for IQR tests, on the IF scores. The LOF scores, however, are more
# dispersed, and a coefficient of 22.0 was used instead.

# This demonstrates that IF and LOF are both generally able to give higher
# scores to modified rows after the doping process than before, and not
# give higher scores to unmodified rows.

# The numbers of rows flagged (meaning they received a higher score
# after doping vs before) by both detectors is displayed. In some cases
# the count is very high, but the actual increases in scores are trivial,
# and so a good threshold is simply needed to establish a cutoff for
# meaningful increases in score.

# Where the number of outliers can be estimated, both detectors tend
# to do quite well, as seen in the high jaccard scores between
# the top ten flagged rows and the actual modified rows flag. In many
# cases IF does well and LOF poorly or vice versa, a common theme in outlier
# detection where different detectors use different algorithms and are
# able to identify different types of outliers.


# Number of rows the doping process modifies; used for the "top 10" cutoff.
N_DOPED_ROWS = 10

# IQR coefficients and minimum score gain for a row to count as flagged.
IF_IQR_COEF = 2.2
LOF_IQR_COEF = 22.0
IF_MIN_GAIN = 0
LOF_MIN_GAIN = 0.01


def _encode_for_detectors(frame):
    """Return a count-encoded copy of `frame` with missing values set to 0."""
    return get_count_encoding(frame).fillna(0)


def _outlier_scores(detector, frame):
    """Fit `detector` on `frame` and score the same rows (larger = more anomalous)."""
    detector.fit(frame)
    return detector.score_samples(frame) * (-1)


def _evaluate_detector(df_scored, name, orig_scores, modified_scores, iqr_coef, min_gain):
    """Store one detector's scores in `df_scored` and compare them to the doping flags.

    `df_scored` must already contain the 'OUTLIER SCORE' and 'Doping Flagged'
    columns. Columns prefixed with `name` are added to it in place.

    Returns a tuple: (spearman correlation of gain vs. doping score,
    jaccard of any-gain flag, number of rows flagged, jaccard of the top-N
    gains, jaccard of the IQR-based flag), each jaccard being taken against
    'Doping Flagged'.
    """
    df_scored[f'{name} Orig Score'] = orig_scores
    df_scored[f'{name} Modified Score'] = modified_scores
    df_scored[f'{name} Gain'] = df_scored[f'{name} Modified Score'] - df_scored[f'{name} Orig Score']
    gain = df_scored[f'{name} Gain']

    # Rows strictly above the (N+1)-th largest gain, i.e. the top N when
    # there are no ties at the cutoff.
    top_cutoff = sorted(gain, reverse=True)[N_DOPED_ROWS]
    df_scored[f'{name} Gain Top {N_DOPED_ROWS}'] = gain > top_cutoff

    # Rows whose gain is unusually high relative to the interquartile range.
    q1 = gain.quantile(0.25)
    q3 = gain.quantile(0.75)
    df_scored[f'{name} Gain High IQR'] = gain > q3 + iqr_coef * (q3 - q1)

    # Rows with any gain above the detector-specific minimum.
    df_scored[f'{name} Flagged'] = gain > min_gain

    doped = df_scored['Doping Flagged']

    # spearmanr yields NaN when an input is constant; report that as 0.0.
    spearman_corr = scipy.stats.spearmanr(gain, df_scored['OUTLIER SCORE'])[0]
    if np.isnan(spearman_corr):
        spearman_corr = 0.0

    return (
        spearman_corr,
        jaccard_score(df_scored[f'{name} Flagged'], doped),
        df_scored[f'{name} Flagged'].sum(),
        jaccard_score(df_scored[f'{name} Gain Top {N_DOPED_ROWS}'], doped),
        jaccard_score(df_scored[f'{name} Gain High IQR'], doped),
    )


# Datasets that were actually evaluated. Skipped datasets are left out so
# this list always has the same length as the metric lists below.
evaluated_files = []

if_spearman_corr_arr = []
if_jaccard_scores_arr = []
if_num_flagged_arr = []
if_jaccard_top_ten_arr = []
if_jaccard_iqr_arr = []

lof_spearman_corr_arr = []
lof_jaccard_scores_arr = []
lof_num_flagged_arr = []
lof_jaccard_top_ten_arr = []
lof_jaccard_iqr_arr = []

# Same order as the tuple returned by _evaluate_detector.
if_metric_arrs = (
    if_spearman_corr_arr,
    if_jaccard_scores_arr,
    if_num_flagged_arr,
    if_jaccard_top_ten_arr,
    if_jaccard_iqr_arr,
)
lof_metric_arrs = (
    lof_spearman_corr_arr,
    lof_jaccard_scores_arr,
    lof_num_flagged_arr,
    lof_jaccard_top_ten_arr,
    lof_jaccard_iqr_arr,
)

for filename in real_files:
    print("Evaluating", filename)
    data = fetch_openml(filename, version=1)
    df = pd.DataFrame(data.data, columns=data.feature_names)
    data_modifier = DopingOutliersTest()
    df_modified, outlier_scores = data_modifier.transform(df, random_state=0, verbose=False)
    if df_modified is None:
        # The doping process may return None if there are invalid parameters or too few
        # columns remaining after removing high-cardinality categorical columns from the
        # doping process.
        print("Doping process returned None. Skipping this file.")
        continue

    # Encode both versions before any result columns are added to df_modified.
    df_encoded = _encode_for_detectors(df)
    df_modified_encoded = _encode_for_detectors(df_modified)

    # Score the original data, then the doped data, with fresh detectors.
    # NOTE(review): LOF is fit with novelty=True and then scores its own
    # training rows; the scikit-learn docs advise using score_samples in that
    # mode only on unseen data. Left as-is because the LOF thresholds above
    # were chosen for these scores -- confirm this is intended.
    if_orig_scores = _outlier_scores(IsolationForest(random_state=0), df_encoded)
    lof_orig_scores = _outlier_scores(LocalOutlierFactor(novelty=True), df_encoded)
    if_modified_scores = _outlier_scores(IsolationForest(random_state=0), df_modified_encoded)
    lof_modified_scores = _outlier_scores(LocalOutlierFactor(novelty=True), df_modified_encoded)

    # Store the outlier score estimated by the doping tool and flag doped rows
    df_modified['OUTLIER SCORE'] = outlier_scores
    df_modified['Doping Flagged'] = df_modified['OUTLIER SCORE'] > 0

    if_metrics = _evaluate_detector(
        df_modified, 'IF', if_orig_scores, if_modified_scores, IF_IQR_COEF, IF_MIN_GAIN)
    lof_metrics = _evaluate_detector(
        df_modified, 'LOF', lof_orig_scores, lof_modified_scores, LOF_IQR_COEF, LOF_MIN_GAIN)

    # Record everything for this dataset in one place so all lists stay aligned.
    evaluated_files.append(filename)
    for metric_arr, metric_value in zip(if_metric_arrs, if_metrics):
        metric_arr.append(metric_value)
    for metric_arr, metric_value in zip(lof_metric_arrs, lof_metrics):
        metric_arr.append(metric_value)

results_df = pd.DataFrame({
    "Dataset": evaluated_files,

    "IF Spearman Correlation": if_spearman_corr_arr,
    "IF Jaccard Similarity": if_jaccard_scores_arr,
    "IF Number Flagged": if_num_flagged_arr,
    "IF Jaccard Similarity to top 10": if_jaccard_top_ten_arr,
    "IF Jaccard Similarity Given IQR": if_jaccard_iqr_arr,

    "LOF Spearman Correlation": lof_spearman_corr_arr,
    "LOF Jaccard Similarity": lof_jaccard_scores_arr,
    "LOF Number Flagged": lof_num_flagged_arr,
    "LOF Jaccard Similarity to top 10": lof_jaccard_top_ten_arr,
    "LOF Jaccard Similarity Given IQR": lof_jaccard_iqr_arr,
})
display(results_df)

Evaluating soybean
Evaluating micro-mass
Evaluating mfeat-karhunen
Evaluating Amazon_employee_access
Evaluating abalone
Evaluating cnae-9
Evaluating semeion
Evaluating vehicle
Evaluating satimage
Evaluating analcatdata_authorship
Evaluating breast-w
Evaluating SpeedDating
Evaluating eucalyptus
Evaluating isolet
Evaluating bioresponse
Evaluating vowel


  warn("Version {} of dataset {} is inactive, meaning that issues have "


Evaluating wall-robot-navigation
Evaluating credit-approval
Evaluating artificial-characters
Evaluating splice
Evaluating har
Evaluating cmc
Evaluating segment
Evaluating JapaneseVowels
Evaluating jm1
Evaluating gas-drift
Evaluating mushroom
Evaluating irish
Evaluating profb
Evaluating adult
Evaluating anneal
Evaluating credit-g
Evaluating blood-transfusion-service-center
Evaluating monks-problems-2
Evaluating tic-tac-toe
Evaluating qsar-biodeg
Evaluating wdbc
Evaluating phoneme
Evaluating diabetes
Evaluating ozone-level-8hr
Evaluating hill-valley
Evaluating kc2
Evaluating eeg-eye-state
Evaluating climate-model-simulation-crashes
Evaluating spambase
Evaluating ilpd
Evaluating one-hundred-plants-margin
Evaluating banknote-authentication
Evaluating mozilla4
Evaluating electricity
Evaluating madelon
Evaluating scene
Evaluating musk
Evaluating nomao
Evaluating MagicTelescope
Evaluating PhishingWebsites
Evaluating nursery
Evaluating page-blocks
Evaluating hypothyroid


  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Evaluating yeast
Evaluating kropt
Evaluating CreditCardSubset
Evaluating shuttle
Evaluating Satellite
Evaluating baseball
Evaluating mc1
Evaluating pc1
Evaluating cardiotocography
Evaluating kr-vs-k
Evaluating volcanoes-a1
Evaluating wine-quality-white
Evaluating car-evaluation
Evaluating solar-flare


  warn("Version {} of dataset {} is inactive, meaning that issues have "


Evaluating allbp
Evaluating allrep
Evaluating dis
Evaluating car


  warn("Version {} of dataset {} is inactive, meaning that issues have "


Evaluating steel-plates-fault


Unnamed: 0,Dataset,IF Spearman Correlation,IF Jaccard Similarity,IF Number Flagged,IF Jaccard Similarity to top 10,IF Jaccard Similarity Given IQR,LOF Spearman Correlation,LOF Jaccard Similarity,LOF Number Flagged,LOF Jaccard Similarity to top 10,LOF Jaccard Similarity Given IQR
0,soybean,0.19275,0.048387,185,0.818182,0.8,0.16808,0.123288,72,0.666667,0.529412
1,micro-mass,0.227214,0.07438,120,0.538462,0.538462,0.211286,0.2,20,0.333333,0.5
2,mfeat-karhunen,0.121628,0.009756,1025,0.666667,0.7,0.12673,0.714286,14,1.0,0.625
3,Amazon_employee_access,0.027099,0.002167,3689,0.052632,0.040541,0.023831,0.000583,15441,0.176471,0.166667
4,abalone,0.084646,0.047847,209,1.0,1.0,0.098596,0.238095,42,1.0,0.045872
5,cnae-9,0.142895,0.020455,439,0.666667,0.45,0.165895,0.059172,169,1.0,1.0
6,semeion,0.12318,0.01996,501,0.333333,0.3,0.119277,0.363636,5,0.25,0.2
7,vehicle,0.179932,0.375,23,0.818182,0.9,0.183321,0.236842,37,0.818182,0.818182
8,satimage,0.068252,0.006173,1620,1.0,0.333333,0.086692,0.263158,38,1.0,0.030211
9,analcatdata_authorship,0.187523,0.084746,118,0.666667,0.8,0.187748,0.833333,12,1.0,1.0


In [8]:
# Mean, over all evaluated datasets, of IF's top-10 Jaccard similarity.
avg_if_top_ten = results_df['IF Jaccard Similarity to top 10'].mean()
print("Average IF Jaccard Similarity to top 10:", avg_if_top_ten)

Average IF Jaccard Similarity to top 10: 0.6944388303342818


In [9]:
# Mean, over all evaluated datasets, of LOF's top-10 Jaccard similarity.
avg_lof_top_ten = results_df['LOF Jaccard Similarity to top 10'].mean()
print("Average LOF Jaccard Similarity to top 10:", avg_lof_top_ten)

Average LOF Jaccard Similarity to top 10: 0.6421536103171612


In [10]:
# Mean, over all evaluated datasets, of IF's Jaccard similarity with the IQR cutoff.
avg_if_iqr = results_df['IF Jaccard Similarity Given IQR'].mean()
print("Average IF Jaccard Similarity using IQR:", avg_if_iqr)

Average IF Jaccard Similarity using IQR: 0.5609408440365509


In [11]:
# Mean, over all evaluated datasets, of LOF's Jaccard similarity with the IQR cutoff.
avg_lof_iqr = results_df['LOF Jaccard Similarity Given IQR'].mean()
print("Average LOF Jaccard Similarity using IQR:", avg_lof_iqr)

Average LOF Jaccard Similarity using IQR: 0.32877780387880173
