### Import Libraries

In [19]:
import pandas as pd
from sklearn.ensemble import IsolationForest

### Import Data

In [20]:
# Read the iris table from the CSV file located in the working directory.
iris_data = pd.read_csv(filepath_or_buffer='iris.csv')

### Data Understanding

In [21]:
# Size of the freshly loaded frame as a (rows, columns) tuple.
iris_data.shape

(150, 6)

In [22]:
# Per-column count of missing entries (isnull is the pandas alias of isna).
iris_data.isnull().sum()

Unnamed: 0      0
Sepal.Length    0
Sepal.Width     0
Petal.Length    0
Petal.Width     0
Species         0
dtype: int64

In [23]:
# Print the column names, non-null counts, dtypes and memory footprint.
iris_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    150 non-null    int64  
 1   Sepal.Length  150 non-null    float64
 2   Sepal.Width   150 non-null    float64
 3   Petal.Length  150 non-null    float64
 4   Petal.Width   150 non-null    float64
 5   Species       150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [24]:
# How many rows belong to each species label.
iris_data['Species'].value_counts()

Species
setosa        50
versicolor    50
virginica     50
Name: count, dtype: int64

In [25]:
# Remove the 'Unnamed: 0' column (presumably a row index that was written into
# the CSV -- confirm against the file). drop(..., errors='ignore') makes the
# cell safe to re-run: the original `del` raises KeyError on a second run
# because the column is already gone.
iris_data = iris_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [26]:
# Preview the first five rows after the column removal.
iris_data.head(n=5)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [38]:
# One-hot encode the frame: the text column Species becomes one indicator
# column per species, while the numeric measurement columns pass through.
iris_data_encoded = pd.get_dummies(data=iris_data)

In [39]:
# Preview the first five rows of the encoded frame.
iris_data_encoded.head(n=5)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species_setosa,Species_versicolor,Species_virginica
0,5.1,3.5,1.4,0.2,True,False,False
1,4.9,3.0,1.4,0.2,True,False,False
2,4.7,3.2,1.3,0.2,True,False,False
3,4.6,3.1,1.5,0.2,True,False,False
4,5.0,3.6,1.4,0.2,True,False,False


### Model Building

In [40]:
# Isolation forest configured to flag 10% of the rows as outliers; the seed is
# fixed so that repeated runs give the same trees.
iso_forest = IsolationForest(
    contamination=0.1,
    random_state=12,
)

### Model Training

In [41]:
# Fit on the feature columns only. A later cell adds 'scores' and 'anomaly' to
# iris_data_encoded in place, so re-running this cell afterwards would train
# the model on its own outputs. Dropping them (ignored when absent) keeps the
# cell idempotent; a top-to-bottom run is unaffected.
iso_forest.fit(
    iris_data_encoded.drop(columns=['scores', 'anomaly'], errors='ignore')
)

### Outlier Prediction

In [42]:
# Label every row as 1 or -1. Only the feature columns are passed: a later
# cell adds 'scores' and 'anomaly' to iris_data_encoded in place, and
# re-running this cell with those extra columns would not match the columns
# the model was trained on. errors='ignore' makes the drop a no-op on the
# first run.
y_pred_outliers = iso_forest.predict(
    iris_data_encoded.drop(columns=['scores', 'anomaly'], errors='ignore')
)

In [43]:
# Show the raw label array: one entry per row, each either 1 or -1.
y_pred_outliers

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,
        1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1, -1, -1,
       -1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [44]:
# Re-check the first five rows of the encoded frame before scoring
# (same view as the earlier preview cell).
iris_data_encoded.head(n=5)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species_setosa,Species_versicolor,Species_virginica
0,5.1,3.5,1.4,0.2,True,False,False
1,4.9,3.0,1.4,0.2,True,False,False
2,4.7,3.2,1.3,0.2,True,False,False
3,4.6,3.1,1.5,0.2,True,False,False
4,5.0,3.6,1.4,0.2,True,False,False


In [45]:
# Continuous anomaly score per row (per scikit-learn, negative values
# correspond to the -1 label). Only the feature columns are passed: a later
# cell adds 'scores' and 'anomaly' to iris_data_encoded in place, so re-running
# this cell would otherwise feed those extra columns to the model.
# errors='ignore' makes the drop a no-op on the first run.
scores = iso_forest.decision_function(
    iris_data_encoded.drop(columns=['scores', 'anomaly'], errors='ignore')
)

In [46]:
# Attach the per-row score and the 1 / -1 label next to the features.
# NOTE: this adds columns in place to the same frame the model cells operate
# on; anything that treats iris_data_encoded as the feature matrix after this
# point must exclude 'scores' and 'anomaly'.
iris_data_encoded['scores'] = scores
iris_data_encoded['anomaly'] = y_pred_outliers

In [47]:
# First fifteen rows, now including the score and label columns.
iris_data_encoded.head(n=15)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species_setosa,Species_versicolor,Species_virginica,scores,anomaly
0,5.1,3.5,1.4,0.2,True,False,False,0.136834,1
1,4.9,3.0,1.4,0.2,True,False,False,0.119556,1
2,4.7,3.2,1.3,0.2,True,False,False,0.110908,1
3,4.6,3.1,1.5,0.2,True,False,False,0.100089,1
4,5.0,3.6,1.4,0.2,True,False,False,0.123304,1
5,5.4,3.9,1.7,0.4,True,False,False,0.036103,1
6,4.6,3.4,1.4,0.3,True,False,False,0.081023,1
7,5.0,3.4,1.5,0.2,True,False,False,0.145503,1
8,4.4,2.9,1.4,0.2,True,False,False,0.044251,1
9,4.9,3.1,1.5,0.1,True,False,False,0.066888,1


In [48]:
# Number of rows carrying each label (1 vs -1).
iris_data_encoded['anomaly'].value_counts()

anomaly
 1    135
-1     15
Name: count, dtype: int64

In [49]:
# Rows whose label is 1.
# NOTE(review): with contamination=0.1 the 15 rows labelled -1 are the flagged
# outliers, so this view lists the 135 inliers -- compare against -1 instead
# if the intent was to inspect the outliers.
iris_data_encoded.loc[iris_data_encoded['anomaly'] == 1]

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species_setosa,Species_versicolor,Species_virginica,scores,anomaly
0,5.1,3.5,1.4,0.2,True,False,False,0.136834,1
1,4.9,3.0,1.4,0.2,True,False,False,0.119556,1
2,4.7,3.2,1.3,0.2,True,False,False,0.110908,1
3,4.6,3.1,1.5,0.2,True,False,False,0.100089,1
4,5.0,3.6,1.4,0.2,True,False,False,0.123304,1
...,...,...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,False,False,True,0.089993,1
146,6.3,2.5,5.0,1.9,False,False,True,0.071922,1
147,6.5,3.0,5.2,2.0,False,False,True,0.114254,1
148,6.2,3.4,5.4,2.3,False,False,True,0.061634,1


# Predictive Power Score

In [50]:
pip install ppscore

Collecting ppscore
  Downloading ppscore-1.3.0.tar.gz (17 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting pandas<2.0.0,>=1.0.0 (from ppscore)
  Downloading pandas-1.5.3.tar.gz (5.2 MB)
     ---------------------------------------- 0.0/5.2 MB ? eta -:--:--
     -- ------------------------------------- 0.3/5.2 MB ? eta -:--:--
     -------------- ------------------------- 1.8/5.2 MB 5.0 MB/s eta 0:00:01
     ---------------------------- ----------- 3.7/5.2 MB 6.8 MB/s eta 0:00:01
     ------------------------------------ --- 4.7/5.2 MB 6.1 MB/s eta 0:00:01
     ---------------------------------------- 5.2/5.2 MB 5.6 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: still running...
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata

  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
lifelines 0.30.0 requires pandas>=2.1, but you have pandas 1.5.3 which is incompatible.
visions 0.7.6 requires pandas>=2.0.0, but you have pandas 1.5.3 which is incompatible.


In [51]:
import ppscore as pps

In [52]:
# Reminder of the un-encoded frame that the predictive power score runs on.
iris_data.head(n=5)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [54]:
# Predictive power of Sepal.Length (feature x) for Petal.Length (target y).
pps.score(df=iris_data, x='Sepal.Length', y='Petal.Length')

{'x': 'Sepal.Length',
 'y': 'Petal.Length',
 'ppscore': 0.550422595049248,
 'case': 'regression',
 'is_valid_score': True,
 'metric': 'mean absolute error',
 'baseline_score': 1.4886666666666668,
 'model_score': 0.6692708968366863,
 'model': DecisionTreeRegressor()}

In [56]:
# Predictive power score for every ordered (x, y) pair of columns.
matrix = pps.matrix(df=iris_data)

In [61]:
# Display the full score table: one row per (x, y) column pair.
matrix

Unnamed: 0,x,y,ppscore,case,is_valid_score,metric,baseline_score,model_score,model
0,Sepal.Length,Sepal.Length,1.0,predict_itself,True,,0.0,1.0,
1,Sepal.Length,Sepal.Width,0.0,regression,True,mean absolute error,0.330667,0.364704,DecisionTreeRegressor()
2,Sepal.Length,Petal.Length,0.550423,regression,True,mean absolute error,1.488667,0.669271,DecisionTreeRegressor()
3,Sepal.Length,Petal.Width,0.431739,regression,True,mean absolute error,0.644667,0.366339,DecisionTreeRegressor()
4,Sepal.Length,Species,0.471649,classification,True,weighted F1,0.353333,0.658333,DecisionTreeClassifier()
5,Sepal.Width,Sepal.Length,0.006966,regression,True,mean absolute error,0.684667,0.679897,DecisionTreeRegressor()
6,Sepal.Width,Sepal.Width,1.0,predict_itself,True,,0.0,1.0,
7,Sepal.Width,Petal.Length,0.172375,regression,True,mean absolute error,1.488667,1.232058,DecisionTreeRegressor()
8,Sepal.Width,Petal.Width,0.132858,regression,True,mean absolute error,0.644667,0.559017,DecisionTreeRegressor()
9,Sepal.Width,Species,0.156915,classification,True,weighted F1,0.353333,0.454805,DecisionTreeClassifier()


In [60]:
# Keep the rows where Species is the target and rank the predictors from the
# strongest score to the weakest.
matrix.loc[matrix['y'] == 'Species'].sort_values(by='ppscore', ascending=False)

Unnamed: 0,x,y,ppscore,case,is_valid_score,metric,baseline_score,model_score,model
24,Species,Species,1.0,predict_itself,True,,0.0,1.0,
19,Petal.Width,Species,0.927652,classification,True,weighted F1,0.353333,0.953215,DecisionTreeClassifier()
14,Petal.Length,Species,0.884812,classification,True,weighted F1,0.353333,0.925512,DecisionTreeClassifier()
4,Sepal.Length,Species,0.471649,classification,True,weighted F1,0.353333,0.658333,DecisionTreeClassifier()
9,Sepal.Width,Species,0.156915,classification,True,weighted F1,0.353333,0.454805,DecisionTreeClassifier()
