# Find anomalous CPU HS06 figures and outlier CPUs (overclocking, bad PSU)

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest

import pickle

### Get datasets

In [2]:
# Read the per-job records, then index them by their 'pandaid' column.
jobs = pd.read_csv('job.csv')
jobs = jobs.set_index('pandaid')
jobs.head()

Unnamed: 0_level_0,cputime,walltime,hs06,processingtype,nevents,cpueff,wallPerEvent,cpuPerEvent
pandaid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3326375042,29513,11863,96,reprocessing,2338,2.487819,5.073995,12.623182
3327803396,22947,3502,77,merge,64062,6.552541,0.054666,0.3582
3330689156,7696,2433,90,merge,22779,3.163173,0.106809,0.337855
3335362910,3260,4759,10,reprocessing,12652,0.685018,0.376146,0.257667
3334380752,650,2313,10,reprocessing,2152,0.28102,1.074814,0.302045


In [3]:
# Registry of fitted label encoders, keyed by the column they encode.
le = {'processingtype': preprocessing.LabelEncoder()}

# Replace the textual processing type with its integer label in place.
jobs['processingtype'] = le['processingtype'].fit_transform(jobs['processingtype'])
jobs.head()

Unnamed: 0_level_0,cputime,walltime,hs06,processingtype,nevents,cpueff,wallPerEvent,cpuPerEvent
pandaid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3326375042,29513,11863,96,7,2338,2.487819,5.073995,12.623182
3327803396,22947,3502,77,2,64062,6.552541,0.054666,0.3582
3330689156,7696,2433,90,2,22779,3.163173,0.106809,0.337855
3335362910,3260,4759,10,7,12652,0.685018,0.376146,0.257667
3334380752,650,2313,10,7,2152,0.28102,1.074814,0.302045


In [4]:
# Read the benchmark records, then index them by their 'pandaid' column.
benchmarks = pd.read_csv('benchmark.csv')
benchmarks = benchmarks.set_index('pandaid')

# Fit one label encoder per categorical column and store it in `le` under
# the column name; the column is overwritten with its integer labels.
for column in ('cpuname', 'ip', 'site'):
    le[column] = preprocessing.LabelEncoder()
    benchmarks[column] = le[column].fit_transform(benchmarks[column])
benchmarks.head()

Unnamed: 0_level_0,bogomips,ip,site,mpnum,cpuname,meminfo,fastBmk,whetstone
pandaid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3322806223,5333.52,8583,242,1,134,2044632.0,9.223674,3195.99
3322633663,5000.0,4829,249,1,48,16330708.0,12.12938,2749.34
3322488785,4988.06,8191,188,1,117,65840380.0,23.731048,3599.7
3322806277,5333.16,6920,63,1,75,37140112.0,12.240734,3426.18
3322875248,5332.58,3617,204,1,49,32878000.0,12.349914,2911.33


### join dataframes and filter out bad rows

In [5]:
# Keep only the ids present in both frames (inner join on the index).
benchmarks = benchmarks.join(jobs, how='inner')

# A row is kept only when nevents, hs06 and cpuPerEvent are all
# strictly positive.
keep = (
    (benchmarks['nevents'] > 0)
    & (benchmarks['hs06'] > 0)
    & (benchmarks['cpuPerEvent'] > 0)
)
benchmarks = benchmarks[keep]
benchmarks.head()

Unnamed: 0_level_0,bogomips,ip,site,mpnum,cpuname,meminfo,fastBmk,whetstone,cputime,walltime,hs06,processingtype,nevents,cpueff,wallPerEvent,cpuPerEvent
pandaid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
3323688554,5199.3,11338,265,1,104,65931600.0,15.404365,3616.23,22418,22634,12,1,200,0.990457,113.17,112.09
3323737831,5199.28,13473,229,1,104,65936052.0,14.863749,3612.79,73644,74922,10,8,1000,0.982942,74.922,73.644
3323782011,5199.25,13458,229,1,104,65936052.0,14.869888,3628.96,30822,31400,10,1,200,0.981592,157.0,154.11
3323782017,4189.98,13408,229,1,91,65853344.0,18.13602,3306.22,31856,32341,10,1,200,0.985004,161.705,159.28
3323788151,5199.24,13370,229,1,101,65855220.0,20.642202,3495.29,33442,33895,10,1,200,0.986635,169.475,167.21


### Split into features and target

In [7]:
# Target: CPU time per event, as a plain NumPy array.
Y = benchmarks['cpuPerEvent'].values

# Features: everything except the target and the listed job-level columns.
# drop() returns a new frame, so `benchmarks` itself is left untouched.
X = benchmarks.drop(
    [
        'cpuPerEvent',
        'cputime',
        'walltime',
        'nevents',
        'meminfo',
        'wallPerEvent',
        'cpueff',
    ],
    axis=1,
)
X.head()


Unnamed: 0,site,cpuname,ip,processingtype,bogomips,mpnum,fastBmk,whetstone,hs06
0,0,102,12373,8,4799.31,1.0,13.412817,3400.13,10.0
1,0,102,12377,1,4799.32,1.0,16.431713,3401.555,10.0
2,0,102,12378,1,4799.31,1.0,13.157895,3416.0,10.0
3,0,102,12512,1,4799.3,1.0,16.869728,3569.77,10.0
4,0,102,12513,1,4799.29,1.0,13.99689,3525.48,10.0


In [None]:
# Fit several unsupervised outlier detectors on the feature frame X.
# outliers_fraction is passed to EllipticEnvelope and IsolationForest as
# `contamination`, and also enters the One-Class SVM `nu` expression below.
outliers_fraction = 0.05
classifiers = {
    # With outliers_fraction = 0.05, nu evaluates to 0.95 * 0.05 + 0.05 = 0.0975.
    "One-Class SVM": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05, kernel="rbf", gamma=0.1),
    "Robust covariance": EllipticEnvelope(contamination=outliers_fraction),
    # random_state is pinned to 42.
    "Isolation Forest": IsolationForest( contamination=outliers_fraction, random_state=42)
}

# NOTE(review): in the code visible here the loop index `i` is never used and
# scores_pred is overwritten on every iteration, so only the last classifier's
# scores survive the loop -- confirm whether per-classifier results should be
# stored (e.g. in a dict keyed by clf_name).
# NOTE(review): X appears to contain label-encoded categorical columns and
# unscaled numeric columns; verify this is intended for the RBF One-Class SVM.
for i, (clf_name, clf) in enumerate(classifiers.items()):
    # fit the data and tag outliers
    clf.fit(X)
    scores_pred = clf.decision_function(X)
    