In [1]:
import pandas as pd
import numpy as np
import scipy.stats
import statistics
from sklearn.datasets import fetch_openml
import sys
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.insert(0, '..\\CountsOutlierDetector')
from counts_outlier_detector import CountsOutlierDetector

In [2]:
# This notebook examines how many outliers are identified in each 
# dimensionality, from 1 to 6, over a large, random set of datasets
# from OpenML.

In [3]:
# OpenML dataset names evaluated by this notebook. Each name is passed to
# fetch_openml() in the evaluation loop; that loop selects a non-default
# version for 'vowel', 'car' and 'solar-flare'.
real_files = [
    'soybean', 'micro-mass', 'mfeat-karhunen', 'Amazon_employee_access',
    'abalone', 'cnae-9', 'semeion', 'vehicle', 'satimage',
    'analcatdata_authorship', 'breast-w', 'SpeedDating', 'eucalyptus',
    'isolet', 'bioresponse', 'vowel', 'wall-robot-navigation',
    'credit-approval', 'artificial-characters', 'splice', 'har', 'cmc',
    'segment', 'JapaneseVowels', 'jm1', 'gas-drift', 'mushroom', 'irish',
    'profb', 'adult', 'anneal', 'credit-g',
    'blood-transfusion-service-center', 'monks-problems-2', 'tic-tac-toe',
    'qsar-biodeg', 'wdbc', 'phoneme', 'diabetes', 'ozone-level-8hr',
    'hill-valley', 'kc2', 'eeg-eye-state',
    'climate-model-simulation-crashes', 'spambase', 'ilpd',
    'one-hundred-plants-margin', 'banknote-authentication', 'mozilla4',
    'electricity', 'madelon', 'scene', 'musk', 'nomao', 'MagicTelescope',
    'nursery', 'page-blocks', 'hypothyroid', 'yeast', 'kropt',
    'CreditCardSubset', 'shuttle', 'Satellite', 'baseball', 'mc1', 'pc1',
    'cardiotocography', 'kr-vs-k', 'volcanoes-a1', 'wine-quality-white',
    'car-evaluation', 'solar-flare', 'allbp', 'allrep', 'dis', 'car',
    'steel-plates-fault',
]

In [None]:
# One accumulator per dimensionality. Each holds, per dataset, the value of
# the 'Percent Flagged as <N>d' column from the detector's flagged summary.
percents_flagged_at_1d_arr = []
percents_flagged_at_2d_arr = []
percents_flagged_at_3d_arr = []
percents_flagged_at_4d_arr = []
percents_flagged_at_5d_arr = []
percents_flagged_at_6d_arr = []

# Dimension -> accumulator, so the per-dimension handling below is one loop.
accumulators_by_dim = {
    1: percents_flagged_at_1d_arr,
    2: percents_flagged_at_2d_arr,
    3: percents_flagged_at_3d_arr,
    4: percents_flagged_at_4d_arr,
    5: percents_flagged_at_5d_arr,
    6: percents_flagged_at_6d_arr,
}

for filename in real_files:

    # Collect the dataset. A few datasets need a specific OpenML version.
    print("Evaluating", filename)
    if filename in ['solar-flare']:
        version = 'active'
    elif filename in ['vowel', 'car']:
        version = 2
    else:
        version = 1
    data = fetch_openml(filename, version=version)
    df = pd.DataFrame(data.data, columns=data.feature_names)

    # Set max_combinations very high, so that max_dimensions may be respected in
    # most cases. It was not set to infinite, in order to ensure the test may
    # be repeated in a manageable timeframe, so in some cases, where there are
    # many features, the detector may not examine the full six dimensions.
    det = CountsOutlierDetector(max_dimensions=6, max_num_combinations=10_000_000)
    results = det.predict(df)
    flagged_summary_df = results['Flagged Summary']

    # Read every summary value up front (first row of each column) before
    # touching any accumulator.
    checked_by_dim = {
        dim: flagged_summary_df[f'Checked_{dim}d'].values[0]
        for dim in range(2, 7)
    }
    percent_by_dim = {
        dim: flagged_summary_df[f'Percent Flagged as {dim}d'].values[0]
        for dim in range(1, 7)
    }

    # The 1d percentage is always recorded; higher dimensions are recorded
    # only when the summary's 'Checked_<N>d' value is truthy.
    for dim, accumulator in accumulators_by_dim.items():
        if dim == 1 or checked_by_dim[dim]:
            accumulator.append(percent_by_dim[dim])

Evaluating soybean
Evaluating micro-mass
Evaluating mfeat-karhunen
Evaluating Amazon_employee_access
Evaluating abalone
Evaluating cnae-9
Evaluating semeion
Evaluating vehicle
Evaluating satimage
Evaluating analcatdata_authorship
Evaluating breast-w
Evaluating SpeedDating
Evaluating eucalyptus
Evaluating isolet


In [None]:
# Dump the raw per-dataset percentages, one line per dimensionality (1d to 6d).
for flagged_percents in (
    percents_flagged_at_1d_arr,
    percents_flagged_at_2d_arr,
    percents_flagged_at_3d_arr,
    percents_flagged_at_4d_arr,
    percents_flagged_at_5d_arr,
    percents_flagged_at_6d_arr,
):
    print(flagged_percents)

In [None]:
def _mean_or_nan(values):
    """Return the arithmetic mean of `values`, or NaN when `values` is empty.

    statistics.mean() raises StatisticsError on empty input. The 2d-6d lists
    are only appended to when a dataset was actually checked at that
    dimensionality, so any of them can legitimately be empty.
    """
    if len(values) == 0:
        return float('nan')
    return statistics.mean(values)


# Average percent flagged per dimensionality; index 0 is 1d, index 5 is 6d.
# A dimensionality that no dataset reached is reported as NaN instead of
# aborting the cell.
percents_flagged_arr = [
    _mean_or_nan(per_dataset_percents)
    for per_dataset_percents in (
        percents_flagged_at_1d_arr,
        percents_flagged_at_2d_arr,
        percents_flagged_at_3d_arr,
        percents_flagged_at_4d_arr,
        percents_flagged_at_5d_arr,
        percents_flagged_at_6d_arr,
    )
]

for i in range(1, 7):
    print(f"Average percent flagged at {i}d over all datasets: {percents_flagged_arr[i-1]}" )

In [None]:
# Bar chart of the averaged percentages: one bar per dimensionality, 1 to 6.
dimensions = list(range(1, 7))
ax = sns.barplot(x=dimensions, y=percents_flagged_arr)
ax.set_title("Average percent of rows flagged by dimension tested")
ax.set_xlabel("Dimension")
ax.set_ylabel("Percent Flagged")
plt.show()