In [2]:
from sklearn.ensemble import IsolationForest
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils import resample
import pandas as pd

In [3]:
breast_cancer = load_breast_cancer()
df = pd.DataFrame(data=breast_cancer.data, columns=breast_cancer.feature_names)
df["benign"] = breast_cancer.target

In [4]:
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,benign
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


For our use case, we will assume that a malignant label is anomalous. The dataset contains a relatively high number of malignant tumors. Thus, we make use of downsampling.

In [5]:
majority_df = df[df["benign"] == 1]
minority_df = df[df["benign"] == 0]
minority_downsampled_df = resample(minority_df, replace=True, n_samples=30, random_state=42)
downsampled_df = pd.concat([majority_df, minority_downsampled_df])

In [6]:
downsampled_df["benign"].value_counts()

1    357
0     30
Name: benign, dtype: int64

We save the features and target as separate variables.

In [167]:
y = downsampled_df["benign"]
X = downsampled_df.drop("benign", axis=1)

In [168]:
X

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
19,13.540,14.36,87.46,566.3,0.09779,0.08129,0.06664,0.047810,0.1885,0.05766,...,15.110,19.26,99.70,711.2,0.14400,0.17730,0.23900,0.12880,0.2977,0.07259
20,13.080,15.71,85.63,520.0,0.10750,0.12700,0.04568,0.031100,0.1967,0.06811,...,14.500,20.49,96.09,630.5,0.13120,0.27760,0.18900,0.07283,0.3184,0.08183
21,9.504,12.44,60.34,273.9,0.10240,0.06492,0.02956,0.020760,0.1815,0.06905,...,10.230,15.66,65.13,314.9,0.13240,0.11480,0.08867,0.06227,0.2450,0.07773
37,13.030,18.42,82.61,523.8,0.08983,0.03766,0.02562,0.029230,0.1467,0.05863,...,13.300,22.81,84.46,545.9,0.09701,0.04619,0.04833,0.05013,0.1987,0.06169
46,8.196,16.84,51.71,201.9,0.08600,0.05943,0.01588,0.005917,0.1769,0.06503,...,8.964,21.96,57.26,242.2,0.12970,0.13570,0.06880,0.02564,0.3105,0.07409
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256,19.550,28.77,133.60,1207.0,0.09260,0.20630,0.17840,0.114400,0.1893,0.06232,...,25.050,36.27,178.60,1926.0,0.12810,0.53290,0.42510,0.19410,0.2818,0.10050
489,16.690,20.20,107.10,857.6,0.07497,0.07112,0.03649,0.023070,0.1846,0.05325,...,19.180,26.56,127.30,1084.0,0.10090,0.29200,0.24770,0.08737,0.4677,0.07623
461,27.420,26.27,186.90,2501.0,0.10840,0.19880,0.36350,0.168900,0.2061,0.05623,...,36.040,31.37,251.20,4254.0,0.13570,0.42560,0.68330,0.26250,0.2641,0.07427
23,21.160,23.04,137.20,1404.0,0.09428,0.10220,0.10970,0.086320,0.1769,0.05278,...,29.170,35.59,188.00,2615.0,0.14010,0.26000,0.31550,0.20090,0.2822,0.07526


In [178]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=23)

## Load and fit the model

In [184]:
model=IsolationForest(n_estimators=70, max_samples='auto', contamination=float(0.1),max_features=1.0, n_jobs=2)
model.fit(X_train)



**Number of estimators:**  ***n_estimators*** refers to the number of base estimators or trees in the ensemble

**Max samples:** ***max_samples*** is the number of samples to be drawn to train each base estimator. If max_samples is more than the number of samples provided, all samples will be used for all trees. The default value of max_samples is 'auto'. If 'auto', then max_samples=min(256, n_samples)

**Contamination:** This is a parameter that the algorithm is quite sensitive to; it refers to the expected proportion of outliers in the data set. This is used when fitting to define the threshold on the scores of the samples. The default value is 'auto'. If ‘auto’, the threshold value will be determined as in the original paper of Isolation Forest.

**Max features:** All the base estimators are not trained with all the features available in the dataset. It is the number of features to draw from the total features to train each base estimator or tree.The default value of max features is one.

**bootstrapbool:**, default=False
   If True, individual trees are fit on random subsets of the training data sampled with replacement. If False, sampling without replacement is performed.
   
   **n_jobs=int**: default=None. The number of jobs to run in parallel for both fit and predict. None means 1 unless in a joblib.parallel_backend context. -1 means using all processors. 
   
   **random_stateint:**, RandomState instance or None, default=None
Controls the pseudo-randomness of the selection of the feature and split values for each branching step and each tree in the forest. Pass an int for reproducible results across multiple function calls.

***ref***: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html

***ref**: https://blog.paperspace.com/anomaly-detection-isolation-forest/


## Make a prediction, add the results, and a column of anomalies.

Let's find the scores and anomaly column. We can find out the values of scores column by calling decision_function() of the trained model and passing the test data.

**decision_function(X)** : Average anomaly score of X of the base classifiers.


**fit_predict(X[, y])** : Perform fit on X and returns labels for X.


**get_params([deep])** : Get parameters for this estimator.


**predict(X) :** Predict if a particular sample is an outlier or not.

**score_samples(X):** Opposite of the anomaly score defined in the original paper.

**Ref:** https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html 

In [185]:
df1=X_test.copy()
df1['score']=model.decision_function(X_test)

df1['anomaly']=model.predict(X_test)
df1.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,score,anomaly
486,14.64,16.85,94.21,666.0,0.08641,0.06698,0.05192,0.02791,0.1409,0.05355,...,106.0,831.0,0.1142,0.207,0.2437,0.07828,0.2455,0.06596,0.088019,1
237,20.48,21.46,132.5,1306.0,0.08355,0.08348,0.09042,0.06022,0.1467,0.05177,...,161.7,1750.0,0.1228,0.2311,0.3158,0.1445,0.2238,0.07127,-0.027668,-1
332,11.22,19.86,71.94,387.3,0.1054,0.06779,0.005006,0.007583,0.194,0.06028,...,76.91,436.1,0.1424,0.09669,0.01335,0.02022,0.3292,0.06522,0.06313,1
157,16.84,19.46,108.4,880.2,0.07445,0.07223,0.0515,0.02771,0.1844,0.05268,...,120.3,1032.0,0.08774,0.171,0.1882,0.08436,0.2527,0.05972,0.057955,1
445,11.99,24.89,77.61,441.3,0.103,0.09218,0.05441,0.04274,0.182,0.0685,...,84.48,513.9,0.1311,0.1822,0.1609,0.1202,0.2599,0.08251,0.112689,1


In [186]:
df1[['score','anomaly']].head()

Unnamed: 0,score,anomaly
486,0.088019,1
237,-0.027668,-1
332,0.06313,1
157,0.057955,1
445,0.112689,1


# Evaluating the model

In [187]:
y_pred = df1['anomaly'].copy()
y_pred.values[y_pred.values == -1] = 0

In [188]:
confusion_matrix(y_test, y_pred)

array([[ 1,  5],
       [ 8, 83]])

# Print numbers of anomalies

In [175]:
anomaly=df1.loc[df1['anomaly']==-1]
anomaly_index=list(anomaly.index)
abs(sum(anomaly['anomaly']))

10

# AUROC 