[Reference](https://medium.com/analytics-vidhya/introduction-to-anomaly-detection-using-pycarat-519a13be24c5)

In [1]:
pip install pycaret

Collecting pycaret
  Downloading pycaret-2.3.10-py3-none-any.whl (320 kB)
[K     |████████████████████████████████| 320 kB 4.3 MB/s 
Collecting scikit-learn==0.23.2
  Downloading scikit_learn-0.23.2-cp37-cp37m-manylinux1_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 47.5 MB/s 
[?25hCollecting Boruta
  Downloading Boruta-0.3-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 3.5 MB/s 
Collecting pyod
  Downloading pyod-0.9.9.tar.gz (116 kB)
[K     |████████████████████████████████| 116 kB 50.2 MB/s 
[?25hCollecting imbalanced-learn==0.7.0
  Downloading imbalanced_learn-0.7.0-py3-none-any.whl (167 kB)
[K     |████████████████████████████████| 167 kB 46.4 MB/s 
[?25hCollecting kmodes>=0.10.1
  Downloading kmodes-0.12.0-py2.py3-none-any.whl (20 kB)
Collecting scikit-plot
  Downloading scikit_plot-0.3.7-py3-none-any.whl (33 kB)
Collecting umap-learn
  Downloading umap-learn-0.5.2.tar.gz (86 kB)
[K     |████████████████████████████████| 8

# Getting the Data

In [21]:
# Switch PyCaret into Colab mode (the call prints "Colab mode enabled.").
# NOTE(review): this cell's execution count (In [21]) is out of sequence with the
# cells around it, i.e. it was executed after the cells that follow — confirm it
# is meant to run here so the notebook survives Restart & Run All.
from pycaret.utils import enable_colab
enable_colab()

Colab mode enabled.


In [2]:
# Load the 'mice' sample dataset through PyCaret's dataset helper.
# The call also displays a preview of the first rows (see the recorded output).
from pycaret.datasets import get_data
dataset = get_data('mice')

Unnamed: 0,MouseID,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,...,pCFOS_N,SYP_N,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,Genotype,Treatment,Behavior,class
0,309_1,0.503644,0.747193,0.430175,2.816329,5.990152,0.21883,0.177565,2.373744,0.232224,...,0.108336,0.427099,0.114783,0.13179,0.128186,1.675652,Control,Memantine,C/S,c-CS-m
1,309_2,0.514617,0.689064,0.41177,2.789514,5.685038,0.211636,0.172817,2.29215,0.226972,...,0.104315,0.441581,0.111974,0.135103,0.131119,1.74361,Control,Memantine,C/S,c-CS-m
2,309_3,0.509183,0.730247,0.418309,2.687201,5.622059,0.209011,0.175722,2.283337,0.230247,...,0.106219,0.435777,0.111883,0.133362,0.127431,1.926427,Control,Memantine,C/S,c-CS-m
3,309_4,0.442107,0.617076,0.358626,2.466947,4.979503,0.222886,0.176463,2.152301,0.207004,...,0.111262,0.391691,0.130405,0.147444,0.146901,1.700563,Control,Memantine,C/S,c-CS-m
4,309_5,0.43494,0.61743,0.358802,2.365785,4.718679,0.213106,0.173627,2.134014,0.192158,...,0.110694,0.434154,0.118481,0.140314,0.14838,1.83973,Control,Memantine,C/S,c-CS-m


In [3]:
# Check the dimensions of the loaded dataset as a (rows, columns) tuple.
dataset.shape

(1080, 82)

In [4]:
# Split the dataset: a reproducible 95% random sample is used for modeling and
# the remaining 5% is held out as "unseen" data for the prediction demo later on.
data = dataset.sample(frac=0.95, random_state=786)

# Build the hold-out set BEFORE resetting `data`'s index: the drop relies on the
# sampled rows still carrying their original index labels.
data_unseen = dataset.drop(data.index).reset_index(drop=True)

# Rebind instead of mutating in place so the cell is safe to re-run.
data = data.reset_index(drop=True)

# Sanity check: the two parts must partition the original dataset exactly.
assert len(data) + len(data_unseen) == len(dataset), "split lost or duplicated rows"

print(f'Data for Modeling: {data.shape}')
print(f'Unseen Data For Predictions: {data_unseen.shape}')

Data for Modeling: (1026, 82)
Unseen Data For Predictions: (54, 82)


# Setting up Environment in PyCaret

In [5]:
# Import only the anomaly-module functions this notebook actually calls, rather
# than a wildcard import, so it is clear where each name comes from and nothing
# else is silently pulled into the namespace.
from pycaret.anomaly import (
    assign_model,
    create_model,
    load_model,
    models,
    plot_model,
    predict_model,
    save_model,
    setup,
)

# Initialise the anomaly-detection experiment on the modeling data:
#   - normalize=True       : request feature normalization during preprocessing
#   - ignore_features      : exclude the 'MouseID' identifier column from modeling
#   - session_id=123       : fixed session id for reproducibility (the IForest
#                            printed later reports random_state=123)
exp_ano101 = setup(
    data,
    normalize=True,
    ignore_features=['MouseID'],
    session_id=123,
)

Unnamed: 0,Description,Value
0,session_id,123
1,Original Data,"(1026, 82)"
2,Missing Values,True
3,Numeric Features,77
4,Categorical Features,4
5,Ordinal Features,False
6,High Cardinality Features,False
7,High Cardinality Method,
8,Transformed Data,"(1026, 91)"
9,CPU Jobs,-1


# Create a Model

In [6]:
# Create an Isolation Forest detector ('iforest' in the models() listing) with
# default settings; the printed model reports contamination=0.05.
iforest = create_model('iforest')

In [7]:
# Print the model object to inspect its hyperparameters.
print(iforest)

IForest(behaviour='new', bootstrap=False, contamination=0.05,
    max_features=1.0, max_samples='auto', n_estimators=100, n_jobs=-1,
    random_state=123, verbose=0)


In [8]:
# Create a One-class SVM detector with a non-default `fraction`; the value shows
# up as contamination=0.025 in the printed estimator.
svm = create_model('svm', fraction = 0.025)
print(svm)

OCSVM(cache_size=200, coef0=0.0, contamination=0.025, degree=3, gamma='auto',
   kernel='rbf', max_iter=-1, nu=0.5, shrinking=True, tol=0.001,
   verbose=False)


In [9]:
# List the available anomaly-detection models: the ID to pass to create_model,
# a human-readable name, and the backing pyod class.
models()

Unnamed: 0_level_0,Name,Reference
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
abod,Angle-base Outlier Detection,pyod.models.abod.ABOD
cluster,Clustering-Based Local Outlier,pyod.models.cblof.CBLOF
cof,Connectivity-Based Local Outlier,pyod.models.cof.COF
iforest,Isolation Forest,pyod.models.iforest.IForest
histogram,Histogram-based Outlier Detection,pyod.models.hbos.HBOS
knn,K-Nearest Neighbors Detector,pyod.models.knn.KNN
lof,Local Outlier Factor,pyod.models.lof.LOF
svm,One-class SVM detector,pyod.models.ocsvm.OCSVM
pca,Principal Component Analysis,pyod.models.pca.PCA
mcd,Minimum Covariance Determinant,pyod.models.mcd.MCD


# Assign a Model

In [10]:
# Attach the iforest results to the modeling data: the returned frame carries
# the original columns plus `Anomaly` and `Anomaly_Score`.
iforest_results = assign_model(iforest)
iforest_results.head()

Unnamed: 0,MouseID,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,...,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,Genotype,Treatment,Behavior,class,Anomaly,Anomaly_Score
0,3501_12,0.34493,0.626194,0.383583,2.534561,4.097317,0.303547,0.222829,4.592769,0.239427,...,0.2527,0.218868,0.249187,1.139493,Ts65Dn,Memantine,S/C,t-SC-m,0,-0.014462
1,3520_5,0.630001,0.839187,0.357777,2.651229,4.261675,0.253184,0.185257,3.816673,0.20494,...,0.155008,0.153219,,1.642886,Control,Memantine,C/S,c-CS-m,0,-0.070193
2,3414_13,0.555122,0.726229,0.278319,2.097249,2.897553,0.222222,0.174356,1.86788,0.203379,...,0.136109,0.15553,0.185484,1.65767,Ts65Dn,Memantine,C/S,t-CS-m,0,-0.070143
3,3488_8,0.275849,0.430764,0.285166,2.265254,3.250091,0.189258,0.157837,2.917611,0.202594,...,0.127944,0.207671,0.175357,0.893598,Control,Saline,S/C,c-SC-s,0,-0.080521
4,3501_7,0.304788,0.617299,0.335164,2.638236,4.876609,0.28059,0.199417,4.835421,0.236314,...,0.245277,0.202171,0.240372,0.795637,Ts65Dn,Memantine,S/C,t-SC-m,0,-0.064749


# Plot a Model

In [11]:
# Visualise the iforest model using plot_model's default plot type.
plot_model(iforest)

In [23]:
# Optional alternative view: a UMAP plot of the same model. The call is left
# commented out, so this cell does nothing when executed.
# NOTE(review): the reason it was disabled is not recorded — confirm or remove.
# plot_model(iforest, plot = 'umap')

# Predict on Unseen Data

In [15]:
# Score the held-out rows with the iforest model; the result keeps the input
# columns and adds `Anomaly` and `Anomaly_Score`.
unseen_predictions = predict_model(iforest, data=data_unseen)
unseen_predictions.head()

Unnamed: 0,MouseID,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,...,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,Genotype,Treatment,Behavior,class,Anomaly,Anomaly_Score
0,309_6,0.447506,0.628176,0.367388,2.385939,4.807635,0.218578,0.176233,2.141282,0.195188,...,0.116657,0.140766,0.14218,1.816389,Control,Memantine,C/S,c-CS-m,0,-0.077131
1,311_3,0.704633,0.802537,0.35011,2.467733,5.5484,0.205323,0.165058,2.107281,0.171401,...,0.111089,0.157731,0.158543,1.404481,Control,Memantine,C/S,c-CS-m,0,-0.060165
2,321_4,0.505093,0.695549,0.376029,2.915585,5.917957,0.226734,0.174271,2.663039,0.190038,...,0.131515,0.188391,,1.69926,Control,Memantine,C/S,c-CS-m,0,-0.052132
3,3415_12,0.429133,0.563175,0.258429,2.028151,3.542553,0.214075,0.176759,3.165139,0.16743,...,0.118223,0.171071,0.173702,1.405727,Control,Memantine,C/S,c-CS-m,0,-0.09161
4,3415_15,0.373648,0.471165,0.257909,1.860032,2.938526,0.218262,0.15038,2.610132,0.142571,...,0.086785,0.126537,0.11269,0.790975,Control,Memantine,C/S,c-CS-m,1,0.037436


In [16]:
# Score the modeling data itself with predict_model; the previewed rows match
# the assign_model(iforest) preview shown earlier in this notebook.
data_predictions = predict_model(iforest, data = data)
data_predictions.head()

Unnamed: 0,MouseID,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,...,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,Genotype,Treatment,Behavior,class,Anomaly,Anomaly_Score
0,3501_12,0.34493,0.626194,0.383583,2.534561,4.097317,0.303547,0.222829,4.592769,0.239427,...,0.2527,0.218868,0.249187,1.139493,Ts65Dn,Memantine,S/C,t-SC-m,0,-0.014462
1,3520_5,0.630001,0.839187,0.357777,2.651229,4.261675,0.253184,0.185257,3.816673,0.20494,...,0.155008,0.153219,,1.642886,Control,Memantine,C/S,c-CS-m,0,-0.070193
2,3414_13,0.555122,0.726229,0.278319,2.097249,2.897553,0.222222,0.174356,1.86788,0.203379,...,0.136109,0.15553,0.185484,1.65767,Ts65Dn,Memantine,C/S,t-CS-m,0,-0.070143
3,3488_8,0.275849,0.430764,0.285166,2.265254,3.250091,0.189258,0.157837,2.917611,0.202594,...,0.127944,0.207671,0.175357,0.893598,Control,Saline,S/C,c-SC-s,0,-0.080521
4,3501_7,0.304788,0.617299,0.335164,2.638236,4.876609,0.28059,0.199417,4.835421,0.236314,...,0.245277,0.202171,0.240372,0.795637,Ts65Dn,Memantine,S/C,t-SC-m,0,-0.064749


# Saving the Model

In [17]:
# Persist the transformation pipeline together with the model under the given
# name (the call reports "Transformation Pipeline and Model Successfully Saved").
save_model(iforest,'Final IForest Model 25Nov2020')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True,
                                       features_todrop=['MouseID'],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[],
                                       target='UNSUPERVISED_DUMMY_TARGET',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='most frequent',
                                 fill_value_categorical=None,
                                 fill_value_numer...
                 ('fix_perfect', 'passthrough'),
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
                 ('dfs', 'passthrough'), ('pca', 'passthrough'),
                 

# Loading the Saved Model

In [18]:
# Reload the saved pipeline and model, using the same name passed to save_model.
saved_iforest = load_model('Final IForest Model 25Nov2020')

Transformation Pipeline and Model Successfully Loaded


In [19]:
# Score the held-out data again, this time with the model reloaded from disk.
new_prediction = predict_model(saved_iforest, data=data_unseen)

In [20]:
# Preview the reloaded model's predictions; the displayed rows match those
# produced by the in-memory iforest model on data_unseen.
new_prediction.head()

Unnamed: 0,MouseID,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,...,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,Genotype,Treatment,Behavior,class,Anomaly,Anomaly_Score
0,309_6,0.447506,0.628176,0.367388,2.385939,4.807635,0.218578,0.176233,2.141282,0.195188,...,0.116657,0.140766,0.14218,1.816389,Control,Memantine,C/S,c-CS-m,0,-0.077131
1,311_3,0.704633,0.802537,0.35011,2.467733,5.5484,0.205323,0.165058,2.107281,0.171401,...,0.111089,0.157731,0.158543,1.404481,Control,Memantine,C/S,c-CS-m,0,-0.060165
2,321_4,0.505093,0.695549,0.376029,2.915585,5.917957,0.226734,0.174271,2.663039,0.190038,...,0.131515,0.188391,,1.69926,Control,Memantine,C/S,c-CS-m,0,-0.052132
3,3415_12,0.429133,0.563175,0.258429,2.028151,3.542553,0.214075,0.176759,3.165139,0.16743,...,0.118223,0.171071,0.173702,1.405727,Control,Memantine,C/S,c-CS-m,0,-0.09161
4,3415_15,0.373648,0.471165,0.257909,1.860032,2.938526,0.218262,0.15038,2.610132,0.142571,...,0.086785,0.126537,0.11269,0.790975,Control,Memantine,C/S,c-CS-m,1,0.037436
