In [1]:
import pandas as pd
import numpy as np
import scipy.stats
from sklearn.datasets import fetch_openml
from sklearn.ensemble import IsolationForest
from scipy.stats import SpearmanRConstantInputWarning
from sklearn.metrics import jaccard_score
import warnings
import sys

sys.path.insert(0, '..\\CountsOutlierDetector')
from counts_outlier_detector import CountsOutlierDetector

sys.path.insert(0, '..\\..\\Doping_project')
from outliers_test import DopingOutliersTest

In [2]:
warnings.filterwarnings(action='ignore', category=SpearmanRConstantInputWarning)

In [3]:
pd.options.display.max_columns = 1000
pd.options.display.max_colwidth = 1000
pd.options.display.max_rows = 1000
pd.options.display.width = 10000

In [4]:
# We use the same set of datasets from OpenML as were used in the DopingOutliersTest
# project to compare IsolationForest to Local Outlier Factor. This is an entirely
# separate set of files as were used to test the hyperparameters for 
# CountsOutlierDetector

real_files = [
    'soybean',
    'micro-mass',
    'mfeat-karhunen',
    'Amazon_employee_access',
    'abalone',
    'cnae-9',
    'semeion',
    'vehicle',
    'satimage',
    'analcatdata_authorship',
    'breast-w',
    'SpeedDating',
    'eucalyptus',    
    'isolet',
    'bioresponse',
    'vowel',
    'wall-robot-navigation',
    'credit-approval',
    'artificial-characters',
    'splice',
    'har',
    'cmc',
    'segment',
    'JapaneseVowels',
    'jm1',
    'gas-drift',
    'mushroom',
    'irish',
    'profb',
    'adult',    
    'anneal',
    'credit-g',
    'blood-transfusion-service-center',
    'monks-problems-2',
    'tic-tac-toe',
    'qsar-biodeg',
    'wdbc',
    'phoneme',    
    'diabetes',
    'ozone-level-8hr',
    'hill-valley',
    'kc2',
    'eeg-eye-state',
    'climate-model-simulation-crashes',
    'spambase',
    'ilpd',
    'one-hundred-plants-margin',
    'banknote-authentication',
    'mozilla4',
    'electricity',
    'madelon',
    'scene',
    'musk',
    'nomao',
    'MagicTelescope',
    'PhishingWebsites',
    'nursery',
    'page-blocks',   
    'hypothyroid',
    'yeast',
    'kropt',
    'CreditCardSubset',
    'shuttle',
    'Satellite',
    'baseball',
    'mc1',
    'pc1',
    'cardiotocography',
    'kr-vs-k',
    'volcanoes-a1',
    'wine-quality-white',
    'car-evaluation',
    'solar-flare',
    'allbp',
    'allrep',
    'dis',
    'car',
    'steel-plates-fault'
]

In [5]:
# Count Encoding is used to encode non-numeric values, required by 
# IsolationForest (IF), though not by CountsOutlierDetector. 

def get_count_encoding(df):
    df = df.copy()
    for col_name in df.columns:
        if df[col_name].dtype.name in ['str', 'category', 'object', 'car']:
            df[col_name] = df[col_name].astype(str)
            vc = df[col_name].value_counts(dropna=False)
            df[col_name] = df[col_name].replace([None, np.nan, -np.nan, 'nan'], df[col_name].mode()[0])
            map_dict = {x: y for x, y in zip(vc.index, vc.values)}
            df[col_name] = df[col_name].map(map_dict)
            df[col_name] = df[col_name].astype(str)
    return df

In [6]:
if_spearman_corr_arr = []
if_jaccard_scores_arr = []
if_jaccard_iqr_arr = []

cod_spearman_corr_arr = []
cod_jaccard_scores_arr = []
cod_jaccard_iqr_arr = []

for filename in real_files:
    
    # Collect the dataset
    print("Evaluating", filename)    
    version = 1
    if filename in ['vowel', 'car']:
        version = 2   
    if filename in ['solar-flare']:
        version = 'active'
    data = fetch_openml(filename, version=version)
    df = pd.DataFrame(data.data, columns=data.feature_names)
    
    # Create a modifid copy of the dataset
    data_modifier = DopingOutliersTest()
    df_modified, outlier_scores = data_modifier.transform(df, random_state=0, verbose=False)
    if df_modified is None:
        # The doping process may return None if there are invalid parameters or too few
        # columns remaining after removing high-cardinality categorical columns from the
        # doping process. 
        print("Doping process returned None. Skipping this file.")
        continue

    # Encode the original data in a format IF can accept. This is not necessary for
    # CountsOutlierDetector
    df_encoded = get_count_encoding(df)
    df_encoded = df_encoded.fillna(0)
    df_encoded = df_encoded.replace([np.nan, -np.nan], 0)
    
    # Get IF scores on original data
    det = IsolationForest(random_state=0)
    det.fit(df_encoded)
    y_orig_if = det.score_samples(df_encoded)

    # Get COD scores on original data   
    det = CountsOutlierDetector()
    y_orig_cod = det.fit_predict(df)['Scores']
    
    # Encode the modified data for IF
    df_modified_encoded = get_count_encoding(df_modified)
    df_modified_encoded = df_modified_encoded.fillna(0)
    df_modified_encoded = df_modified_encoded.replace([np.nan, -np.nan], 0)
    
    # Get IF scores on the modified dataset
    det = IsolationForest(random_state=0)
    det.fit(df_modified_encoded)
    y_modified_if = det.score_samples(df_modified_encoded)

    # Get COD scores on the modified data   
    det = CountsOutlierDetector()
    y_modified_cod = det.fit_predict(df_modified_encoded)['Scores']
    
    # Store the IF results
    df_modified['IF Orig Score'] = y_orig_if * (-1)
    df_modified['IF Modified Score'] = y_modified_if * (-1)
    df_modified['IF Gain'] = df_modified['IF Modified Score'] - df_modified['IF Orig Score']
    
    # Store the COD results
    df_modified['COD Orig Score'] = y_orig_cod
    df_modified['COD Modified Score'] = y_modified_cod
    df_modified['COD Gain'] = df_modified['COD Modified Score'] - df_modified['COD Orig Score']
   
    # Get the IF scores with high IQR values
    q1 = df_modified['IF Gain'].quantile(0.25)
    q3 = df_modified['IF Gain'].quantile(0.75)
    iqr_threshold = q3 + 2.2 * (q3 - q1)
    df_modified['IF Gain High IQR'] = df_modified['IF Gain'] > iqr_threshold
    
    # Get the COD scores with high IQR values
    q1 = df_modified['COD Gain'].quantile(0.25)
    q3 = df_modified['COD Gain'].quantile(0.75)
    iqr_threshold = q3 + 22.0 * (q3 - q1)
    df_modified['COD Gain High IQR'] = df_modified['COD Gain'] > iqr_threshold

    # Store the outlier score estimated by the doping tool
    df_modified['OUTLIER SCORE'] = outlier_scores

    # Add binary columns indicating if IF and the Doping process identified the rows
    # with any score
    df_modified['IF Flagged'] = df_modified['IF Gain'] > 0
    df_modified['COD Flagged'] = df_modified['COD Gain'] > 0.01
    df_modified['Doping Flagged'] = df_modified['OUTLIER SCORE'] > 0    
        
    # IF 
    spearman_corr = scipy.stats.spearmanr(df_modified['IF Gain'], df_modified['OUTLIER SCORE'])[0]
    if spearman_corr !=  spearman_corr:
        spearman_corr = 0.0
    if_spearman_corr_arr.append(spearman_corr)
    
    jaccard_sim = jaccard_score(df_modified['IF Flagged'], df_modified['Doping Flagged'])
    if_jaccard_scores_arr.append(jaccard_sim)
    
    jaccard_sim = jaccard_score(df_modified['IF Gain High IQR'], df_modified['Doping Flagged'])
    if_jaccard_iqr_arr.append(jaccard_sim)
    
    # COD
    spearman_corr = scipy.stats.spearmanr(df_modified['COD Gain'], df_modified['OUTLIER SCORE'])[0]
    if spearman_corr !=  spearman_corr:
        spearman_corr = 0.0
    cod_spearman_corr_arr.append(spearman_corr)
    
    jaccard_sim = jaccard_score(df_modified['COD Flagged'], df_modified['Doping Flagged'])
    cod_jaccard_scores_arr.append(jaccard_sim)
    
    jaccard_sim = jaccard_score(df_modified['COD Gain High IQR'], df_modified['Doping Flagged'])
    cod_jaccard_iqr_arr.append(jaccard_sim)
    
results_df = pd.DataFrame({
    "Dataset": real_files, 
    
    "IF Spearman Correlation": if_spearman_corr_arr,
    "IF Jaccard Similarity": if_jaccard_scores_arr,
    "IF Jaccard Similarity Given IQR": if_jaccard_iqr_arr,

    "COD Spearman Correlation": cod_spearman_corr_arr,
    "COD Jaccard Similarity": cod_jaccard_scores_arr,
    "COD Jaccard Similarity Given IQR": cod_jaccard_iqr_arr,    
})
display(results_df)

Evaluating soybean
Evaluating micro-mass
Evaluating mfeat-karhunen
Evaluating Amazon_employee_access
Evaluating abalone
Evaluating cnae-9
Evaluating semeion
Evaluating vehicle
Evaluating satimage
Evaluating analcatdata_authorship
Evaluating breast-w
Evaluating SpeedDating
Evaluating eucalyptus
Evaluating isolet
Evaluating bioresponse
Evaluating vowel
Evaluating wall-robot-navigation
Evaluating credit-approval
Evaluating artificial-characters
Evaluating splice
Evaluating har
Evaluating cmc
Evaluating segment
Evaluating JapaneseVowels
Evaluating jm1
Evaluating gas-drift
Evaluating mushroom
Evaluating irish
Evaluating profb
Evaluating adult
Evaluating anneal
Evaluating credit-g
Evaluating blood-transfusion-service-center
Evaluating monks-problems-2
Evaluating tic-tac-toe
Evaluating qsar-biodeg
Evaluating wdbc
Evaluating phoneme
Evaluating diabetes
Evaluating ozone-level-8hr
Evaluating hill-valley
Evaluating kc2
Evaluating eeg-eye-state
Evaluating climate-model-simulation-crashes
Evaluatin

Unnamed: 0,Dataset,IF Spearman Correlation,IF Jaccard Similarity,IF Jaccard Similarity Given IQR,COD Spearman Correlation,COD Jaccard Similarity,COD Jaccard Similarity Given IQR
0,soybean,0.19275,0.048387,0.8,0.242157,0.064516,0.064516
1,micro-mass,0.227214,0.07438,0.538462,0.408746,0.555556,0.555556
2,mfeat-karhunen,0.121628,0.009756,0.7,0.104522,0.034884,0.0
3,Amazon_employee_access,0.027099,0.002167,0.040541,0.0877,0.043668,0.043668
4,abalone,0.084646,0.047847,1.0,0.273306,0.714286,0.714286
5,cnae-9,0.142895,0.020455,0.45,1.0,1.0,1.0
6,semeion,0.12318,0.01996,0.3,1.0,1.0,1.0
7,vehicle,0.179932,0.375,0.9,0.343208,0.173077,0.173077
8,satimage,0.068252,0.006173,0.333333,0.073158,0.030822,0.030822
9,analcatdata_authorship,0.187523,0.084746,0.8,0.340089,0.333333,0.333333


In [7]:
# Here COD out-performs IF

print("IF average Spearmann Correlation over all rows:")
print("IF: ", results_df['IF Spearman Correlation'].mean())
print("COD:", results_df['COD Spearman Correlation'].mean())

IF average Spearmann Correlation over all rows:
IF:  0.10602394209026701
COD: 0.3391159910019315


In [8]:
# Here COD out-performs IF

print("IF average Jaccard Similarity (taking all positive vs non-positive gains):") 
print("IF: ", results_df['IF Jaccard Similarity'].mean())
print("COD:", results_df['COD Jaccard Similarity'].mean())

IF average Jaccard Similarity (taking all positive vs non-positive gains):
IF:  0.14941931388616683
COD: 0.46110036330844634


In [9]:
# Here IF out-performs COD

print("IF average Jaccard Similarity (taking as positive any large gains based on IQR):") 
print("IF: ", results_df['IF Jaccard Similarity Given IQR'].mean())
print("COD:", results_df['COD Jaccard Similarity Given IQR'].mean())

IF average Jaccard Similarity (taking as positive any large gains based on IQR):
IF:  0.5618458214121164
COD: 0.4411353889061057
