In [None]:
# install necessary packages
!pip install PyNomaly

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Mount Google Drive at /content/drive so the CSV files under
# /content/drive/MyDrive can be read and written (requires the google.colab module).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the training table from Google Drive.
# Later cells read its `loss` and `label` columns and treat every
# remaining column as a feature.
import pandas as pd
train_data_csv = pd.read_csv("/content/drive/MyDrive/Filters_colorectal/train_30_noise.csv")

In [None]:
# library imports
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import DBSCAN
from PyNomaly import loop

In [None]:
# per-row loss values of the training table as a NumPy array
t_losses = train_data_csv["loss"].to_numpy()

In [None]:
# Show the range of the training loss values.
print(f"Max loss: {t_losses.max()}")
print(f"Min loss: {t_losses.min()}")

Max loss: 9.223382949829102
Min loss: 0.0


In [None]:
# Summary of the training frame: index range, column count, dtypes and memory usage.
train_data_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Columns: 1026 entries, f_1 to label
dtypes: float64(1025), int64(1)
memory usage: 31.3 MB


In [None]:
# drop loss and label columns --> can we keep loss as well ?
train_data = pd.DataFrame(train_data_csv.drop(['loss', 'label'], axis=1))
train_data.head()

Unnamed: 0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,...,f_1015,f_1016,f_1017,f_1018,f_1019,f_1020,f_1021,f_1022,f_1023,f_1024
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.087052,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.16911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.027428


In [None]:
# Local Outlier Probability (LoOP) on the training features, without cluster labels.
m = loop.LocalOutlierProbability(
    train_data,
    extent=2,
    n_neighbors=20,
    use_numba=True,
    progress_bar=True,
).fit()

# one score per row of train_data
p_scores = m.local_outlier_probabilities
print(f"total probabilities: {len(p_scores)}")



In [None]:
# Attach the LoOP score of every row to the original frame (features + loss + label)
# as a new `plof_scores` column.
# NOTE(review): PyNomaly appears to return the scores as an object-dtype array;
# pd.to_numeric converts them so the column is numeric (it is a no-op if they
# are already numeric).
train_data_wo_clust = train_data_csv.assign(plof_scores=pd.to_numeric(p_scores))
train_data_wo_clust.head()

Unnamed: 0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,...,f_1018,f_1019,f_1020,f_1021,f_1022,f_1023,f_1024,loss,label,plof_scores
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.091452,4,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.074872,2,0.001962
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000101,5,0.371803
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.3e-05,6,0.000142
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.027428,0.002919,4,0.011295


In [None]:
# Save the full training frame with the plof_scores column, in its original
# (unsorted) row order; index=False leaves the row index out of the CSV.
train_data_wo_clust.to_csv(r'/content/drive/MyDrive/Filters_colorectal/30_all_train_features_with_plof.csv', index=False)

In [None]:
# Show the value ranges of the two columns used for sorting below.
print(f"minimum loss: {train_data_wo_clust['loss'].min()}")
print(f"maximum loss: {train_data_wo_clust['loss'].max()}")
print(f"minimum pLOF score: {train_data_wo_clust['plof_scores'].min()}")
print(f"maximum pLOF score: {train_data_wo_clust['plof_scores'].max()}")

minimum loss: 0.0
maximum loss: 9.223382949829102
minimum pLOF score: 0.0
maximum pLOF score: 1.0


In [None]:
# New frame ordered by outlier score first and by loss second, both ascending
# (ascending order along the rows is the sort_values default).
sorted_train_data_wo = train_data_wo_clust.sort_values(by=['plof_scores', 'loss'])
sorted_train_data_wo.head(10)

Unnamed: 0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,...,f_1018,f_1019,f_1020,f_1021,f_1022,f_1023,f_1024,loss,label,plof_scores
30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,8.831419,0.0,3,0.0
39,0.0,0.0,0.0,0.0,0.0,3.273771,0.0,0.0,0.0,0.728039,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
54,0.0,0.0,0.0,0.0,0.0,3.260179,0.0,0.0,0.0,0.361833,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
84,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9.061059,0.0,3,0.0
125,0.0,0.0,0.0,0.0,0.0,2.801856,0.0,0.0,0.0,0.465698,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
341,0.0,0.0,0.0,0.0,0.0,3.154172,0.0,0.0,0.0,0.746262,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
369,0.0,0.0,0.0,0.0,0.0,2.527183,0.0,0.0,0.0,0.218023,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
386,0.0,0.0,0.0,0.0,0.0,5.619728,0.0,0.0,0.0,0.979364,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9.959064,0.0,3,0.0
449,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,11.183116,0.0,3,0.0


In [None]:
# Keep the first 70% of the sorted rows, i.e. the rows with the lowest
# plof_scores (ties ordered by lowest loss).
percentage = int(round((len(sorted_train_data_wo) / 100) * 70))  # a row count, despite the name
print(f"data to be selected: {percentage}")

# the retained rows form the final dataset for this variant
final_train_woc = sorted_train_data_wo.iloc[:percentage]
final_train_woc.head()

data to be selected: 2800


Unnamed: 0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,...,f_1018,f_1019,f_1020,f_1021,f_1022,f_1023,f_1024,loss,label,plof_scores
30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,8.831419,0.0,3,0.0
39,0.0,0.0,0.0,0.0,0.0,3.273771,0.0,0.0,0.0,0.728039,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
54,0.0,0.0,0.0,0.0,0.0,3.260179,0.0,0.0,0.0,0.361833,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
84,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9.061059,0.0,3,0.0
125,0.0,0.0,0.0,0.0,0.0,2.801856,0.0,0.0,0.0,0.465698,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0


In [None]:
# Save the selected training rows (LoOP without cluster labels); the row index is not written.
final_train_woc.to_csv(r'/content/drive/MyDrive/Filters_colorectal/30_selected_train_features_with_plof_wo_cluster.csv', index=False)

In [None]:
# LoOP with cluster labels. Any clustering algorithm can supply the labels;
# DBSCAN is used here.
db = DBSCAN(eps=0.9, min_samples=10).fit(train_data)

# NOTE(review): n_neighbors is not passed here, while the run without cluster
# labels uses n_neighbors=20, so the library default applies — confirm this
# difference is intended.
# NOTE(review): DBSCAN labels noise points as -1; presumably LoOP treats them
# as one more cluster — confirm.
m_clust = loop.LocalOutlierProbability(
    train_data,
    extent=2,
    cluster_labels=list(db.labels_),
    use_numba=True,
    progress_bar=True,
).fit()
p_scores_clust = m_clust.local_outlier_probabilities
print(" ")
# Report the number of scores, matching the label, instead of dumping the whole array.
print(f"total probabilities: {len(p_scores_clust)}")

total probabilities: [0.19124835522798278 0.0 0.5002996944517496 ... 0.3057721586860246
 0.4289951745764994 0.0]


In [None]:
# Attach the cluster-aware LoOP scores to the original frame as `plof_scores`.
# The scores print as mixed Python float reprs, which indicates an object-dtype
# array; pd.to_numeric converts them so the column is numeric.
train_data_clust = train_data_csv.assign(plof_scores=pd.to_numeric(p_scores_clust))
train_data_clust.head()

Unnamed: 0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,...,f_1018,f_1019,f_1020,f_1021,f_1022,f_1023,f_1024,loss,label,plof_scores
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.091452,4,0.191248
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.074872,2,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000101,5,0.5003
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.3e-05,6,0.037311
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.027428,0.002919,4,0.360721


In [None]:
# Save the full training frame with the cluster-aware plof_scores column,
# in its original (unsorted) row order; index=False leaves the row index out.
train_data_clust.to_csv(r'/content/drive/MyDrive/Filters_colorectal/30_all_train_features_with_plof_cluster.csv', index=False)

In [None]:
# Show the value ranges of the two columns used for sorting below.
print(f"minimum loss: {train_data_clust['loss'].min()}")
print(f"maximum loss: {train_data_clust['loss'].max()}")
print(f"minimum pLOF score: {train_data_clust['plof_scores'].min()}")
print(f"maximum pLOF score: {train_data_clust['plof_scores'].max()}")

minimum loss: 0.0
maximum loss: 9.223382949829102
minimum pLOF score: 0.0
maximum pLOF score: 0.9999425809144615


In [None]:
# New frame ordered by outlier score first and by loss second, both ascending
# (ascending order along the rows is the sort_values default).
sorted_train_data_clust = train_data_clust.sort_values(by=['plof_scores', 'loss'])
sorted_train_data_clust.head(10)

Unnamed: 0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,...,f_1018,f_1019,f_1020,f_1021,f_1022,f_1023,f_1024,loss,label,plof_scores
30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,8.831419,0.0,3,0.0
39,0.0,0.0,0.0,0.0,0.0,3.273771,0.0,0.0,0.0,0.728039,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
54,0.0,0.0,0.0,0.0,0.0,3.260179,0.0,0.0,0.0,0.361833,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
67,0.0,0.0,0.0,0.0,0.0,4.334775,0.0,0.0,0.0,0.679997,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
125,0.0,0.0,0.0,0.0,0.0,2.801856,0.0,0.0,0.0,0.465698,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
369,0.0,0.0,0.0,0.0,0.0,2.527183,0.0,0.0,0.0,0.218023,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
383,0.0,0.0,0.0,0.0,0.0,3.170602,0.0,0.0,0.0,1.004887,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
391,0.0,0.0,0.0,0.0,0.0,3.954205,0.0,0.0,0.0,0.639764,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
449,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,11.183116,0.0,3,0.0
490,0.0,0.0,0.0,0.0,0.0,3.3642,0.0,0.0,0.0,0.745108,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0


In [None]:
# Keep the first 70% of the sorted rows, i.e. the rows with the lowest
# plof_scores (ties ordered by lowest loss).
percentage = int(round((len(sorted_train_data_clust) / 100) * 70))  # a row count, despite the name
print(f"data to be selected: {percentage}")

# the retained rows form the final dataset for this variant
final_train_clust = sorted_train_data_clust.iloc[:percentage]
final_train_clust.head()

data to be selected: 2800


Unnamed: 0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,...,f_1018,f_1019,f_1020,f_1021,f_1022,f_1023,f_1024,loss,label,plof_scores
30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,8.831419,0.0,3,0.0
39,0.0,0.0,0.0,0.0,0.0,3.273771,0.0,0.0,0.0,0.728039,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
54,0.0,0.0,0.0,0.0,0.0,3.260179,0.0,0.0,0.0,0.361833,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
67,0.0,0.0,0.0,0.0,0.0,4.334775,0.0,0.0,0.0,0.679997,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
125,0.0,0.0,0.0,0.0,0.0,2.801856,0.0,0.0,0.0,0.465698,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0


In [None]:
# Save the selected training rows (LoOP with DBSCAN cluster labels); the row index is not written.
final_train_clust.to_csv(r'/content/drive/MyDrive/Filters_colorectal/30_selected_train_features_with_plof_cluster.csv', index=False)

In [None]:
# drop the names of the cluster-variant training frames, which are no longer needed
del final_train_clust, sorted_train_data_clust, train_data_clust

**test data**

In [None]:
# Repeat the same process for the test data.
# This produces the final dataset with label noise.
# The resulting train and test data are used in the next experiments: feature selection, classification, etc.

In [None]:
# Load the test table from Google Drive.
# Later cells read its `loss` and `label` columns and treat every
# remaining column as a feature.
import pandas as pd
test_data_csv = pd.read_csv("/content/drive/MyDrive/Filters_colorectal/test_30_noise.csv")

In [None]:
# per-row loss values of the test table as a NumPy array (reuses the name t_losses)
t_losses = test_data_csv["loss"].to_numpy()

In [None]:
# Show the range of the test loss values.
print(f"Max loss: {t_losses.max()}")
print(f"Min loss: {t_losses.min()}")

Max loss: 11.585342407226562
Min loss: 0.0


In [None]:
# Summary of the test frame: index range, column count, dtypes and memory usage.
test_data_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 1026 entries, f_1 to label
dtypes: float64(1025), int64(1)
memory usage: 7.8 MB


In [None]:
# Feature matrix for outlier scoring: every column except `loss` and `label`.
# drop() already returns a new DataFrame, so no extra pd.DataFrame(...) wrap is needed.
# Open question carried over from the original comment: should `loss` be kept as a feature?
test_data = test_data_csv.drop(columns=['loss', 'label'])
test_data.head()

Unnamed: 0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,...,f_1015,f_1016,f_1017,f_1018,f_1019,f_1020,f_1021,f_1022,f_1023,f_1024
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.889932,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,4.487196,0.0,0.0,0.0,1.419771,...,0.472916,0.0,8.289819,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.227223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.067922


In [None]:
# Local Outlier Probability (LoOP) on the test features, without cluster labels.
m = loop.LocalOutlierProbability(
    test_data,
    extent=2,
    n_neighbors=20,
    use_numba=True,
    progress_bar=True,
).fit()

# one score per row of test_data
p_scores = m.local_outlier_probabilities
print(f"total probabilities: {len(p_scores)}")



In [None]:
# Attach the LoOP score of every row to the original frame (features + loss + label)
# as a new `plof_scores` column.
# NOTE(review): PyNomaly appears to return the scores as an object-dtype array;
# pd.to_numeric converts them so the column is numeric (it is a no-op if they
# are already numeric).
test_data_wo_clust = test_data_csv.assign(plof_scores=pd.to_numeric(p_scores))
test_data_wo_clust.head()

Unnamed: 0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,...,f_1018,f_1019,f_1020,f_1021,f_1022,f_1023,f_1024,loss,label,plof_scores
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.091893,7,0.383456
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.519533,2,0.134396
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017773,2,0.133509
3,0.0,0.0,0.0,0.0,0.0,4.487196,0.0,0.0,0.0,1.419771,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.147695
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.067922,0.011317,5,0.366144


In [None]:
# Save the full test frame with the plof_scores column, in its original
# (unsorted) row order; index=False leaves the row index out of the CSV.
test_data_wo_clust.to_csv(r'/content/drive/MyDrive/Filters_colorectal/30_all_test_features_with_plof.csv', index=False)

In [None]:
# Show the value ranges of the two columns used for sorting below.
print(f"minimum loss: {test_data_wo_clust['loss'].min()}")
print(f"maximum loss: {test_data_wo_clust['loss'].max()}")
print(f"minimum pLOF score: {test_data_wo_clust['plof_scores'].min()}")
print(f"maximum pLOF score: {test_data_wo_clust['plof_scores'].max()}")

minimum loss: 0.0
maximum loss: 11.585342407226562
minimum pLOF score: 0.0
maximum pLOF score: 0.9990282724540579


In [None]:
# New frame ordered by outlier score first and by loss second, both ascending
# (ascending order along the rows is the sort_values default).
sorted_test_data_wo = test_data_wo_clust.sort_values(by=['plof_scores', 'loss'])
sorted_test_data_wo.head(10)

Unnamed: 0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,...,f_1018,f_1019,f_1020,f_1021,f_1022,f_1023,f_1024,loss,label,plof_scores
57,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,11.651211,0.0,3,0.0
66,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.133468,0.0,3,0.0
80,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.105808,0.0,3,0.0
159,0.0,0.0,0.0,0.0,0.0,4.163574,0.0,0.0,0.0,0.847536,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
213,0.0,0.0,0.0,0.0,0.0,3.304197,0.0,0.0,0.0,0.600333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
216,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,11.54534,0.0,4,0.0
227,0.0,0.0,0.0,0.0,0.0,4.32656,0.0,0.0,0.0,0.750134,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
247,0.0,0.0,0.0,0.0,0.0,2.346807,0.0,0.0,0.0,0.39473,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
273,0.0,0.0,0.0,0.0,0.0,2.555244,0.0,0.0,0.0,0.593737,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9.100317,0.0,3,0.0


In [None]:
# Keep the first 70% of the sorted rows, i.e. the rows with the lowest
# plof_scores (ties ordered by lowest loss).
percentage = int(round((len(sorted_test_data_wo) / 100) * 70))  # a row count, despite the name
print(f"data to be selected: {percentage}")

# the retained rows form the final dataset for this variant
final_test_woc = sorted_test_data_wo.iloc[:percentage]
final_test_woc.head()

data to be selected: 700


Unnamed: 0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,...,f_1018,f_1019,f_1020,f_1021,f_1022,f_1023,f_1024,loss,label,plof_scores
57,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,11.651211,0.0,3,0.0
66,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.133468,0.0,3,0.0
80,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.105808,0.0,3,0.0
159,0.0,0.0,0.0,0.0,0.0,4.163574,0.0,0.0,0.0,0.847536,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
213,0.0,0.0,0.0,0.0,0.0,3.304197,0.0,0.0,0.0,0.600333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0


In [None]:
# Save the selected test rows (LoOP without cluster labels); the row index is not written.
final_test_woc.to_csv(r'/content/drive/MyDrive/Filters_colorectal/30_selected_test_features_with_plof_wo_cluster.csv', index=False)

In [None]:
# LoOP with cluster labels. Any clustering algorithm can supply the labels;
# DBSCAN is used here.
db = DBSCAN(eps=0.9, min_samples=10).fit(test_data)

# NOTE(review): n_neighbors is not passed here, while the run without cluster
# labels uses n_neighbors=20, so the library default applies — confirm this
# difference is intended.
# NOTE(review): DBSCAN labels noise points as -1; presumably LoOP treats them
# as one more cluster — confirm.
m_clust = loop.LocalOutlierProbability(
    test_data,
    extent=2,
    cluster_labels=list(db.labels_),
    use_numba=True,
    progress_bar=True,
).fit()
p_scores_clust = m_clust.local_outlier_probabilities
print(" ")
# Report the number of scores, matching the label, instead of dumping the whole array.
print(f"total probabilities: {len(p_scores_clust)}")

total probabilities: [0.0 0.05852776813478255 0.0 0.3515699539040839 0.28788167830179234 0.0
 0.05834830309837943 0.0 0.22931629456364833 0.32562926226198174 0.0 0.0
 0.18554070241706083 0.2610084625541716 0.15406770218012805 0.0
 0.32133835149614454 0.31334110579854213 0.41928222919396724
 0.46193208868847563 0.8427118270993538 0.0 0.18001577506440425 0.0
 0.6113289775503936 0.0 0.0 0.0 0.357231315601988 0.0 0.013775872498115826
 0.0 0.0 0.0 0.6021579834168198 0.6332707011029685 0.0349526898421166
 0.8106635221027284 0.0 0.0 0.0 0.0 0.02201820614041981 0.5002109256016728
 0.4773329247153477 0.3423194663777456 0.0 0.0 0.2494192590931445
 0.27343697530036726 0.0 0.8327235015721436 0.20412603405832316
 0.01871001617638668 0.4743733721985172 0.0 0.6694954223338652
 0.2777300084732784 0.21834020765269577 0.0 0.30464646909119647 0.0 0.0
 0.4535851569181812 0.139904474289345 0.09851761639414634
 0.19002654744020353 0.8896664260062154 0.43850215308086904 0.0 0.0 0.0
 0.9567012297815278 0.0 0.

In [None]:
# Attach the cluster-aware LoOP scores to the original frame as `plof_scores`.
# The scores print as mixed Python float reprs, which indicates an object-dtype
# array; pd.to_numeric converts them so the column is numeric.
test_data_clust = test_data_csv.assign(plof_scores=pd.to_numeric(p_scores_clust))
test_data_clust.head()

Unnamed: 0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,...,f_1018,f_1019,f_1020,f_1021,f_1022,f_1023,f_1024,loss,label,plof_scores
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.091893,7,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.519533,2,0.058528
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017773,2,0.0
3,0.0,0.0,0.0,0.0,0.0,4.487196,0.0,0.0,0.0,1.419771,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.35157
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.067922,0.011317,5,0.287882


In [None]:
# Save the full test frame with the cluster-aware plof_scores column,
# in its original (unsorted) row order; index=False leaves the row index out.
test_data_clust.to_csv(r'/content/drive/MyDrive/Filters_colorectal/30_all_test_features_with_plof_cluster.csv', index=False)

In [None]:
# Show the value ranges of the two columns used for sorting below.
print(f"minimum loss: {test_data_clust['loss'].min()}")
print(f"maximum loss: {test_data_clust['loss'].max()}")
print(f"minimum pLOF score: {test_data_clust['plof_scores'].min()}")
print(f"maximum pLOF score: {test_data_clust['plof_scores'].max()}")

minimum loss: 0.0
maximum loss: 11.585342407226562
minimum pLOF score: 0.0
maximum pLOF score: 0.9983153628665917


In [None]:
# New frame ordered by outlier score first and by loss second, both ascending
# (ascending order along the rows is the sort_values default).
sorted_test_data_clust = test_data_clust.sort_values(by=['plof_scores', 'loss'])
sorted_test_data_clust.head(10)

Unnamed: 0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,...,f_1018,f_1019,f_1020,f_1021,f_1022,f_1023,f_1024,loss,label,plof_scores
227,0.0,0.0,0.0,0.0,0.0,4.32656,0.0,0.0,0.0,0.750134,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
247,0.0,0.0,0.0,0.0,0.0,2.346807,0.0,0.0,0.0,0.39473,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
320,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.664624,0.0,4,0.0
324,0.0,0.0,0.0,0.0,0.0,1.986207,0.0,0.0,0.0,0.198001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
461,0.0,0.0,0.0,0.0,0.0,2.289609,0.0,0.0,0.0,0.539653,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
755,0.0,0.0,0.0,0.0,0.0,2.548663,0.0,0.0,0.0,0.307429,...,0.0,0.0,0.0,0.0,0.0,0.070457,0.0,0.0,6,0.0
845,0.0,0.0,0.0,0.0,0.0,2.067249,0.0,0.0,0.0,0.124843,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,0.0
894,0.0,0.0,0.0,0.0,0.0,3.181312,0.0,0.0,0.0,0.049366,...,0.0,0.0,0.0,0.0,0.0,0.242531,0.0,0.0,7,0.0
256,0.0,0.0,0.0,0.0,0.0,2.380625,0.0,0.0,0.0,0.380331,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.192093e-07,7,0.0
400,0.0,0.0,0.0,0.0,0.0,1.795038,0.0,0.0,0.0,0.060358,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.192093e-07,6,0.0


In [None]:
# Keep the first 70% of the sorted rows, i.e. the rows with the lowest
# plof_scores (ties ordered by lowest loss).
percentage = int(round((len(sorted_test_data_clust) / 100) * 70))  # a row count, despite the name
print(f"data to be selected: {percentage}")

# the retained rows form the final dataset for this variant
final_test_clust = sorted_test_data_clust.iloc[:percentage]
final_test_clust.head()

data to be selected: 700


Unnamed: 0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,...,f_1018,f_1019,f_1020,f_1021,f_1022,f_1023,f_1024,loss,label,plof_scores
227,0.0,0.0,0.0,0.0,0.0,4.32656,0.0,0.0,0.0,0.750134,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
247,0.0,0.0,0.0,0.0,0.0,2.346807,0.0,0.0,0.0,0.39473,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
320,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.664624,0.0,4,0.0
324,0.0,0.0,0.0,0.0,0.0,1.986207,0.0,0.0,0.0,0.198001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
461,0.0,0.0,0.0,0.0,0.0,2.289609,0.0,0.0,0.0,0.539653,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0


In [None]:
# Save the selected test rows (LoOP with DBSCAN cluster labels); the row index is not written.
final_test_clust.to_csv(r'/content/drive/MyDrive/Filters_colorectal/30_selected_test_features_with_plof_cluster.csv', index=False)

In [None]:
# drop the names of the cluster-variant test frames, which are no longer needed
del final_test_clust, sorted_test_data_clust, test_data_clust