# Outlier Detection

# 1. Libraries

In [1]:
#data organizing
from os import getcwd
from os.path import join as joinPath #OS-independent path building

import numpy as np #data-type conversion
import pandas #storage

#preprocessing
from sklearn.model_selection import train_test_split #to split the data

#outlier/novelty detection
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest

# 2. Dataset - Importing

In [2]:
def cvDefPay(prediction):
    """Translate a 0/1 flag into a boolean.

    Returns False for 0, True for 1, and None for any other value.
    """
    flagToBool = {1: True, 0: False}
    try:
        return flagToBool[prediction]
    except KeyError:
        # Anything other than 0 or 1 has no boolean counterpart.
        return None

def cvIsOutlier(prediction):
    """Translate a detector label into an "is outlier" boolean.

    Returns True for -1, False for 1, and None for any other value.
    """
    return {1: False, -1: True}.get(prediction, None)

In [3]:
# Build the workbook path with os.path.join so the separator matches the host
# OS; a hard-coded '\\' only resolves on Windows.
url = joinPath(getcwd(), 'default of credit card clients.xls')

# Columns that are read as 32-bit integers.
int32Columns = ['LIMIT_BAL', 'AGE'] \
    + ['BILL_AMT' + str(month) for month in range(1, 7)] \
    + ['PAY_AMT' + str(month) for month in range(1, 7)]

ccd = pandas.read_excel(
    io = url,
    sheet_name = 'Data',
    header = 1,     # column names are taken from the second row of the sheet
    index_col = 0,  # the first column becomes the row index
    dtype = dict.fromkeys(int32Columns, np.int32),
    # The label column is converted through cvDefPay while it is being read.
    converters = {'default payment next month': cvDefPay},
)

In [4]:
# Relabel two columns in place: PAY_0 becomes PAY_1, and the label column
# swaps its spaces for underscores.
ccd.rename(
    columns = {
        'PAY_0': 'PAY_1',
        'default payment next month': 'default_payment_next_month',
    },
    inplace = True,
)

# 3. Splitting the dataset

In [5]:
# Separate the label column (kept as a one-column DataFrame) from the
# remaining columns.
ccdX = ccd.drop(columns = ['default_payment_next_month'])
ccdY = ccd['default_payment_next_month'].to_frame()

In [6]:
# Hold out 25% of the rows for testing, stratified on the label frame, with a
# fixed seed so the partition is reproducible.
trainX, testX, trainY, testY = train_test_split(
    ccdX,
    ccdY,
    test_size = 0.25,
    stratify = ccdY,
    random_state = 44,
)

In [7]:
# Put the label column back next to the other columns of each partition.
trainDF = [trainX, trainY]
testDF = [testX, testY]
train = pandas.concat(trainDF, axis = 1)
test = pandas.concat(testDF, axis = 1)

# Independent (deep) copy of the test partition that receives the detector
# flags, leaving `test` itself unchanged.
testResult = test.copy()

# 4.a. Outlier Detection: IsolationForest

In [22]:
# Isolation forest with 100 trees, max_samples of 0.2 and a contamination
# setting of 1e-5; all CPU cores are used and the seed is fixed.
isolationForest = IsolationForest(
    n_estimators = 100,
    max_samples = 0.2,
    contamination = 1e-5,
    n_jobs = -1,
    random_state = 39,
)

In [23]:
# Fit the forest on the training partition. `train` carries the label column
# alongside the other columns, so the label is part of what the model is fit on.
isolationForest.fit(train)

IsolationForest(behaviour='deprecated', bootstrap=False, contamination=1e-05,
                max_features=1.0, max_samples=0.2, n_estimators=100, n_jobs=-1,
                random_state=39, verbose=0, warm_start=False)

In [24]:
# Label every row of the test partition with the fitted forest; the labels are
# translated to booleans by cvIsOutlier (-1 -> True, 1 -> False).
testLabels = isolationForest.predict(test)

In [25]:
# Store the forest's verdict per test row as a boolean flag column.
testResult['IsOutlier'] = [cvIsOutlier(label) for label in testLabels]

In [26]:
# Number of test rows flagged (True) versus not flagged (False).
testResult['IsOutlier'].value_counts()

False    7499
True        1
Name: IsOutlier, dtype: int64

In [27]:
# Same breakdown expressed as percentages, rounded to two decimals and
# rendered as text with a ' %' suffix.
(
    testResult['IsOutlier']
    .value_counts(normalize = True)
    .mul(100)
    .round(2)
    .astype('str')
    + ' %'
)

False    99.99 %
True      0.01 %
Name: IsOutlier, dtype: object

In [28]:
# Show the test rows whose flag is True.
test.loc[testResult['IsOutlier'].eq(True)]

Unnamed: 0_level_0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default_payment_next_month
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2198,1000000,2,1,1,47,0,0,0,-1,0,...,891586,927171,961664,50784,50723,896040,50000,50000,50256,False


# 4.b. Outlier Detection: OneClassSVM

In [57]:
# One-class SVM with an RBF kernel, gamma set to 'scale', nu = 0.001 and a
# stopping tolerance of 1e-5. max_iter = -1 places no cap on solver iterations.
oneClassSVM = OneClassSVM(
    kernel = 'rbf',
    gamma = 'scale',
    nu = 0.001,
    tol = 1e-5,
    shrinking = True,
    cache_size = 500,
    max_iter = -1,
)

In [58]:
# Fit the SVM on the training partition. `train` carries the label column
# alongside the other columns, so the label is part of what the model is fit on.
oneClassSVM.fit(train)

OneClassSVM(cache_size=500, coef0=0.0, degree=3, gamma='scale', kernel='rbf',
            max_iter=-1, nu=0.001, shrinking=True, tol=1e-05, verbose=False)

In [59]:
# Label every row of the test partition with the fitted SVM. This rebinds
# `testLabels`, which previously held the IsolationForest labels.
testLabels = oneClassSVM.predict(test)

In [60]:
# Store the SVM's verdict per test row. This overwrites the 'IsOutlier' column
# that was filled from the IsolationForest labels earlier in the notebook.
testResult['IsOutlier'] = [cvIsOutlier(label) for label in testLabels]

In [61]:
# Number of test rows flagged (True) versus not flagged (False) by the SVM.
testResult['IsOutlier'].value_counts()

False    7465
True       35
Name: IsOutlier, dtype: int64

In [62]:
# Same breakdown expressed as percentages, rounded to two decimals and
# rendered as text with a ' %' suffix.
(
    testResult['IsOutlier']
    .value_counts(normalize = True)
    .mul(100)
    .round(2)
    .astype('str')
    + ' %'
)

False    99.53 %
True      0.47 %
Name: IsOutlier, dtype: object

In [63]:
# Show the test rows whose flag is True.
test.loc[testResult['IsOutlier'].eq(True)]

Unnamed: 0_level_0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default_payment_next_month
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
29821,400000,1,2,2,37,1,-1,0,0,0,...,385947,305542,271385,423903,15100,14600,13000,9500,9500,False
24921,430000,1,1,1,39,-1,-1,0,0,-1,...,325463,38290,21800,50942,325470,20003,39068,21800,351282,False
2593,480000,1,1,1,49,1,-1,-1,-1,-1,...,33146,37337,33777,39149,344261,33312,37511,31383,10633,False
5401,500000,2,2,2,49,-1,-1,-1,0,0,...,391047,0,150,8982,385228,7821,0,150,363,False
28278,260000,2,3,2,49,-2,-2,-2,-2,-2,...,2735,316,305,217773,200304,2759,316,305,2596,False
3184,260000,1,3,1,48,0,0,0,0,-1,...,-170000,171696,174151,8812,9158,0,497000,10000,7000,False
24479,280000,1,1,2,31,2,0,0,-1,0,...,189841,193313,192742,6996,0,189841,6022,6078,5300,True
1262,380000,1,2,1,50,0,0,0,0,0,...,154283,35270,332270,12020,9009,6109,3000,332000,12000,True
28716,780000,2,2,2,41,-2,-2,-2,-2,-2,...,21482,72628,182792,62819,39558,22204,82097,184322,25695,False
20893,550000,1,1,2,35,2,2,2,2,2,...,572805,823540,501370,23000,23000,18000,0,18373,18159,False
