## Filter dataset based on 5- and 95- percentiles

The classifier still predicts labels that correspond to several outer segments, or that are too small to be considered an outer segment at all.

Here, a filter based on the 5th and 95th percentiles of the rescaled volume of the outer segments is applied.

The two genotypes are handled independently of each other: the dataframe is split into a WT and a Cpfl1 dataset, the 5th and 95th percentiles are calculated for each of the two dataframes, and each dataframe is then filtered with its own percentiles.

At the end, both dataframes will be combined again.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Define the path where the measurements are stored; the input CSV is read
# from this folder and the filtered CSV is written back to it.
path = "../../measurements/cpfl/"

In [3]:
# Read the rescaled measurements table from the measurements folder
measurements = pd.read_csv(f"{path}04-measurements-rescaled.csv")
measurements

Unnamed: 0,label,age,image_id,genotype,maximum,mean,median,minimum,sigma,sum,...,equivalent_spherical_perimeter_rescaled,equivalent_spherical_radius_rescaled,feret_diameter_rescaled,perimeter_2d_rescaled,major_axis_length_2d_rescaled,minor_axis_length_2d_rescaled,surface_area_rescaled,bbox_volume_rescaled,convex_volume_rescaled,volume_rescaled
0,1,8,0,cpfl,250.0,189.793103,184.144531,157.0,20.305487,5504.0,...,14.744113,0.615610,1.480172,2.957153,0.976660,0.904212,3.881768,1.516422,1.078345,0.977250
1,2,8,0,cpfl,334.0,207.273684,202.871094,133.0,42.706679,19691.0,...,32.521386,0.914283,2.624064,4.973873,2.127172,1.027014,7.095704,4.852550,3.572016,3.201335
2,4,8,0,cpfl,262.0,214.904762,215.355469,175.0,27.285353,4513.0,...,11.889571,0.552814,1.370373,2.957153,0.976660,0.904212,5.174290,1.516422,0.943551,0.707664
3,5,8,0,cpfl,311.0,195.171429,190.386719,118.0,36.997206,6831.0,...,16.713421,0.655434,1.678357,3.390218,1.255098,0.902928,7.379101,2.426275,1.381629,1.179439
4,6,8,0,cpfl,355.0,219.354167,215.355469,147.0,47.335750,21058.0,...,32.749208,0.917480,2.068209,5.481241,1.910113,1.514661,11.022045,6.065688,3.774206,3.235034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5845,176,70,39,wt,437.0,228.476298,219.855469,118.0,59.914239,101215.0,...,90.772814,1.527475,5.732681,13.307188,6.005873,1.731845,45.719796,55.197761,20.994020,14.928332
5846,177,70,39,wt,1542.0,485.858939,412.605469,137.0,265.029227,1043625.0,...,260.041601,2.585340,9.158631,32.454379,8.011704,7.033195,160.768595,300.992921,132.366793,72.383878
5847,178,70,39,wt,416.0,224.548673,213.832031,127.0,62.839505,25374.0,...,36.509477,0.968722,2.261000,5.734924,1.901773,1.715924,15.687029,8.491963,4.414473,3.807904
5848,180,70,39,wt,481.0,251.469466,237.925781,145.0,64.019970,65885.0,...,63.956915,1.282154,4.579315,13.771030,4.629315,2.203200,35.317111,35.383180,15.063125,8.828946


### Filter WT dataset

First, the WT dataset will be filtered.

In [4]:
# Keep only the rows whose genotype column equals "wt"
measurements_wt = measurements.loc[measurements["genotype"].eq("wt")]
measurements_wt

Unnamed: 0,label,age,image_id,genotype,maximum,mean,median,minimum,sigma,sum,...,equivalent_spherical_perimeter_rescaled,equivalent_spherical_radius_rescaled,feret_diameter_rescaled,perimeter_2d_rescaled,major_axis_length_2d_rescaled,minor_axis_length_2d_rescaled,surface_area_rescaled,bbox_volume_rescaled,convex_volume_rescaled,volume_rescaled
2187,1,8,19,wt,1946.0,717.187793,580.285156,229.0,482.387745,152761.0,...,55.710841,1.196647,2.995379,7.572263,2.873678,1.764498,17.606808,14.557651,8.357170,7.177731
2188,2,8,19,wt,1972.0,526.794629,385.558594,138.0,420.391645,333461.0,...,115.156676,1.720445,6.752168,25.240878,7.369973,4.681271,70.701020,69.148844,40.168334,21.331003
2189,3,8,19,wt,756.0,382.525974,362.191406,180.0,126.378734,58909.0,...,44.878031,1.074022,2.778551,7.751645,2.681553,2.148312,17.738518,11.322618,6.234179,5.189533
2190,4,8,19,wt,851.0,374.461538,354.402344,195.0,104.215344,219060.0,...,109.259001,1.675811,5.787021,21.163912,6.436273,3.807164,59.762407,51.760538,33.664569,19.713486
2191,5,8,19,wt,810.0,343.696429,299.878906,182.0,148.821348,57741.0,...,47.558283,1.105629,5.248129,12.620439,6.398600,1.416824,27.726266,25.273700,11.356316,5.661309
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5845,176,70,39,wt,437.0,228.476298,219.855469,118.0,59.914239,101215.0,...,90.772814,1.527475,5.732681,13.307188,6.005873,1.731845,45.719796,55.197761,20.994020,14.928332
5846,177,70,39,wt,1542.0,485.858939,412.605469,137.0,265.029227,1043625.0,...,260.041601,2.585340,9.158631,32.454379,8.011704,7.033195,160.768595,300.992921,132.366793,72.383878
5847,178,70,39,wt,416.0,224.548673,213.832031,127.0,62.839505,25374.0,...,36.509477,0.968722,2.261000,5.734924,1.901773,1.715924,15.687029,8.491963,4.414473,3.807904
5848,180,70,39,wt,481.0,251.469466,237.925781,145.0,64.019970,65885.0,...,63.956915,1.282154,4.579315,13.771030,4.629315,2.203200,35.317111,35.383180,15.063125,8.828946


Compute 5- and 95- percentiles of the WT dataset.

In [5]:
# Lower cut-off: 0.05 quantile of the rescaled volume within the WT subset
five_percentile_wt = measurements_wt.loc[:, "volume_rescaled"].quantile(q=0.05)
five_percentile_wt

0.370680937

In [6]:
# Upper cut-off: 0.95 quantile of the rescaled volume within the WT subset
ninety_five_percentile_wt = measurements_wt.loc[:, "volume_rescaled"].quantile(q=0.95)
ninety_five_percentile_wt

77.6509166480998

Now, the dataset will be filtered based on those percentiles and the rescaled volume.

In [7]:
# Keep the WT rows whose rescaled volume lies strictly between the two
# WT cut-offs (rows equal to either cut-off are dropped).
measurements_wt_filtered = measurements_wt.loc[
    measurements_wt["volume_rescaled"].gt(five_percentile_wt)
    & measurements_wt["volume_rescaled"].lt(ninety_five_percentile_wt)
]
measurements_wt_filtered

Unnamed: 0,label,age,image_id,genotype,maximum,mean,median,minimum,sigma,sum,...,equivalent_spherical_perimeter_rescaled,equivalent_spherical_radius_rescaled,feret_diameter_rescaled,perimeter_2d_rescaled,major_axis_length_2d_rescaled,minor_axis_length_2d_rescaled,surface_area_rescaled,bbox_volume_rescaled,convex_volume_rescaled,volume_rescaled
2187,1,8,19,wt,1946.0,717.187793,580.285156,229.0,482.387745,152761.0,...,55.710841,1.196647,2.995379,7.572263,2.873678,1.764498,17.606808,14.557651,8.357170,7.177731
2188,2,8,19,wt,1972.0,526.794629,385.558594,138.0,420.391645,333461.0,...,115.156676,1.720445,6.752168,25.240878,7.369973,4.681271,70.701020,69.148844,40.168334,21.331003
2189,3,8,19,wt,756.0,382.525974,362.191406,180.0,126.378734,58909.0,...,44.878031,1.074022,2.778551,7.751645,2.681553,2.148312,17.738518,11.322618,6.234179,5.189533
2190,4,8,19,wt,851.0,374.461538,354.402344,195.0,104.215344,219060.0,...,109.259001,1.675811,5.787021,21.163912,6.436273,3.807164,59.762407,51.760538,33.664569,19.713486
2191,5,8,19,wt,810.0,343.696429,299.878906,182.0,148.821348,57741.0,...,47.558283,1.105629,5.248129,12.620439,6.398600,1.416824,27.726266,25.273700,11.356316,5.661309
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5845,176,70,39,wt,437.0,228.476298,219.855469,118.0,59.914239,101215.0,...,90.772814,1.527475,5.732681,13.307188,6.005873,1.731845,45.719796,55.197761,20.994020,14.928332
5846,177,70,39,wt,1542.0,485.858939,412.605469,137.0,265.029227,1043625.0,...,260.041601,2.585340,9.158631,32.454379,8.011704,7.033195,160.768595,300.992921,132.366793,72.383878
5847,178,70,39,wt,416.0,224.548673,213.832031,127.0,62.839505,25374.0,...,36.509477,0.968722,2.261000,5.734924,1.901773,1.715924,15.687029,8.491963,4.414473,3.807904
5848,180,70,39,wt,481.0,251.469466,237.925781,145.0,64.019970,65885.0,...,63.956915,1.282154,4.579315,13.771030,4.629315,2.203200,35.317111,35.383180,15.063125,8.828946


### Filter Cpfl1 dataset

Second, the Cpfl1 dataset will be filtered.

In [8]:
# Keep only the rows whose genotype column equals "cpfl"
measurements_cpfl = measurements.loc[measurements["genotype"].eq("cpfl")]
measurements_cpfl

Unnamed: 0,label,age,image_id,genotype,maximum,mean,median,minimum,sigma,sum,...,equivalent_spherical_perimeter_rescaled,equivalent_spherical_radius_rescaled,feret_diameter_rescaled,perimeter_2d_rescaled,major_axis_length_2d_rescaled,minor_axis_length_2d_rescaled,surface_area_rescaled,bbox_volume_rescaled,convex_volume_rescaled,volume_rescaled
0,1,8,0,cpfl,250.0,189.793103,184.144531,157.0,20.305487,5504.0,...,14.744113,0.615610,1.480172,2.957153,0.976660,0.904212,3.881768,1.516422,1.078345,0.977250
1,2,8,0,cpfl,334.0,207.273684,202.871094,133.0,42.706679,19691.0,...,32.521386,0.914283,2.624064,4.973873,2.127172,1.027014,7.095704,4.852550,3.572016,3.201335
2,4,8,0,cpfl,262.0,214.904762,215.355469,175.0,27.285353,4513.0,...,11.889571,0.552814,1.370373,2.957153,0.976660,0.904212,5.174290,1.516422,0.943551,0.707664
3,5,8,0,cpfl,311.0,195.171429,190.386719,118.0,36.997206,6831.0,...,16.713421,0.655434,1.678357,3.390218,1.255098,0.902928,7.379101,2.426275,1.381629,1.179439
4,6,8,0,cpfl,355.0,219.354167,215.355469,147.0,47.335750,21058.0,...,32.749208,0.917480,2.068209,5.481241,1.910113,1.514661,11.022045,6.065688,3.774206,3.235034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2182,82,70,18,cpfl,1226.0,413.224870,364.970703,156.0,181.391424,398762.0,...,152.535708,1.980077,11.076593,24.764288,12.670394,1.954378,67.834520,66.048603,44.346919,32.518828
2183,83,70,18,cpfl,629.0,289.981818,265.083984,151.0,98.478702,31898.0,...,35.860398,0.960072,2.166750,5.227557,1.809555,1.518178,9.973372,5.897197,4.077490,3.706809
2184,84,70,18,cpfl,626.0,302.453125,280.451172,175.0,112.065916,38714.0,...,39.672840,1.009818,2.261000,5.406938,1.904950,1.570862,8.109682,5.897197,4.582964,4.313378
2185,86,70,18,cpfl,1179.0,431.298539,364.970703,152.0,205.767885,619776.0,...,198.910553,2.261128,14.997777,37.428252,15.418214,2.924642,141.693020,185.306770,90.243959,48.424410


Compute 5- and 95- percentile of the Cpfl1 dataset.

In [9]:
# Lower cut-off: 0.05 quantile of the rescaled volume within the Cpfl1 subset
five_percentile_cpfl = measurements_cpfl.loc[:, "volume_rescaled"].quantile(q=0.05)
five_percentile_cpfl

0.370680937

In [10]:
# Upper cut-off: 0.95 quantile of the rescaled volume within the Cpfl1 subset
ninety_five_percentile_cpfl = measurements_cpfl.loc[:, "volume_rescaled"].quantile(q=0.95)
ninety_five_percentile_cpfl

51.635854524099955

Apply the percentile filter to the rescaled volume of the Cpfl1 dataset.

In [11]:
# Keep the Cpfl1 rows whose rescaled volume lies strictly between the two
# Cpfl1 cut-offs (rows equal to either cut-off are dropped).
measurements_cpfl_filtered = measurements_cpfl.loc[
    measurements_cpfl["volume_rescaled"].gt(five_percentile_cpfl)
    & measurements_cpfl["volume_rescaled"].lt(ninety_five_percentile_cpfl)
]
measurements_cpfl_filtered

Unnamed: 0,label,age,image_id,genotype,maximum,mean,median,minimum,sigma,sum,...,equivalent_spherical_perimeter_rescaled,equivalent_spherical_radius_rescaled,feret_diameter_rescaled,perimeter_2d_rescaled,major_axis_length_2d_rescaled,minor_axis_length_2d_rescaled,surface_area_rescaled,bbox_volume_rescaled,convex_volume_rescaled,volume_rescaled
0,1,8,0,cpfl,250.0,189.793103,184.144531,157.0,20.305487,5504.0,...,14.744113,0.615610,1.480172,2.957153,0.976660,0.904212,3.881768,1.516422,1.078345,0.977250
1,2,8,0,cpfl,334.0,207.273684,202.871094,133.0,42.706679,19691.0,...,32.521386,0.914283,2.624064,4.973873,2.127172,1.027014,7.095704,4.852550,3.572016,3.201335
2,4,8,0,cpfl,262.0,214.904762,215.355469,175.0,27.285353,4513.0,...,11.889571,0.552814,1.370373,2.957153,0.976660,0.904212,5.174290,1.516422,0.943551,0.707664
3,5,8,0,cpfl,311.0,195.171429,190.386719,118.0,36.997206,6831.0,...,16.713421,0.655434,1.678357,3.390218,1.255098,0.902928,7.379101,2.426275,1.381629,1.179439
4,6,8,0,cpfl,355.0,219.354167,215.355469,147.0,47.335750,21058.0,...,32.749208,0.917480,2.068209,5.481241,1.910113,1.514661,11.022045,6.065688,3.774206,3.235034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2182,82,70,18,cpfl,1226.0,413.224870,364.970703,156.0,181.391424,398762.0,...,152.535708,1.980077,11.076593,24.764288,12.670394,1.954378,67.834520,66.048603,44.346919,32.518828
2183,83,70,18,cpfl,629.0,289.981818,265.083984,151.0,98.478702,31898.0,...,35.860398,0.960072,2.166750,5.227557,1.809555,1.518178,9.973372,5.897197,4.077490,3.706809
2184,84,70,18,cpfl,626.0,302.453125,280.451172,175.0,112.065916,38714.0,...,39.672840,1.009818,2.261000,5.406938,1.904950,1.570862,8.109682,5.897197,4.582964,4.313378
2185,86,70,18,cpfl,1179.0,431.298539,364.970703,152.0,205.767885,619776.0,...,198.910553,2.261128,14.997777,37.428252,15.418214,2.924642,141.693020,185.306770,90.243959,48.424410


### Combine filtered dataframes

Both dataframes were filtered and can be combined now.

In [12]:
# Stack the filtered Cpfl1 rows on top of the filtered WT rows and
# renumber the index from 0 so the combined table has a continuous index.
measurements_filtered = pd.concat(
    [measurements_cpfl_filtered, measurements_wt_filtered],
    axis=0,
    ignore_index=True,
)
measurements_filtered

Unnamed: 0,label,age,image_id,genotype,maximum,mean,median,minimum,sigma,sum,...,equivalent_spherical_perimeter_rescaled,equivalent_spherical_radius_rescaled,feret_diameter_rescaled,perimeter_2d_rescaled,major_axis_length_2d_rescaled,minor_axis_length_2d_rescaled,surface_area_rescaled,bbox_volume_rescaled,convex_volume_rescaled,volume_rescaled
0,1,8,0,cpfl,250.0,189.793103,184.144531,157.0,20.305487,5504.0,...,14.744113,0.615610,1.480172,2.957153,0.976660,0.904212,3.881768,1.516422,1.078345,0.977250
1,2,8,0,cpfl,334.0,207.273684,202.871094,133.0,42.706679,19691.0,...,32.521386,0.914283,2.624064,4.973873,2.127172,1.027014,7.095704,4.852550,3.572016,3.201335
2,4,8,0,cpfl,262.0,214.904762,215.355469,175.0,27.285353,4513.0,...,11.889571,0.552814,1.370373,2.957153,0.976660,0.904212,5.174290,1.516422,0.943551,0.707664
3,5,8,0,cpfl,311.0,195.171429,190.386719,118.0,36.997206,6831.0,...,16.713421,0.655434,1.678357,3.390218,1.255098,0.902928,7.379101,2.426275,1.381629,1.179439
4,6,8,0,cpfl,355.0,219.354167,215.355469,147.0,47.335750,21058.0,...,32.749208,0.917480,2.068209,5.481241,1.910113,1.514661,11.022045,6.065688,3.774206,3.235034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5226,176,70,39,wt,437.0,228.476298,219.855469,118.0,59.914239,101215.0,...,90.772814,1.527475,5.732681,13.307188,6.005873,1.731845,45.719796,55.197761,20.994020,14.928332
5227,177,70,39,wt,1542.0,485.858939,412.605469,137.0,265.029227,1043625.0,...,260.041601,2.585340,9.158631,32.454379,8.011704,7.033195,160.768595,300.992921,132.366793,72.383878
5228,178,70,39,wt,416.0,224.548673,213.832031,127.0,62.839505,25374.0,...,36.509477,0.968722,2.261000,5.734924,1.901773,1.715924,15.687029,8.491963,4.414473,3.807904
5229,180,70,39,wt,481.0,251.469466,237.925781,145.0,64.019970,65885.0,...,63.956915,1.282154,4.579315,13.771030,4.629315,2.203200,35.317111,35.383180,15.063125,8.828946


In [13]:
# Save the combined, filtered table as CSV without writing the index column
measurements_filtered.to_csv(
    f"{path}05-measurements-filtered-5-and-95-percentiles.csv",
    index=False,
)