# Sixth Exercise Session

In [1]:
# Notebook display setting: with ast_node_interactivity = 'all', IPython shows the
# value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
import pandas as pd
import numpy as np



## Data Quality in Machine Learning

The basic checks that must be done during the data cleaning process in an ML pipeline are:

* delete the columns that contain a single unique value (i.e. the same constant value in every tuple).
* inspect the columns with low entropy, i.e. those whose value is the same for most of the tuples, with only a few exceptions.

In [3]:
# Shell escape: list the files available in the local ./Dataset folder.
!ls './Dataset'

beers.csv	      hospital_account_info.csv   property.csv
beersPoorDQ.csv       hospital_reimbursement.csv  propertyTimeliness.csv
bridges.csv	      iris.csv			  shampoo.csv
chess.csv	      nba.csv			  styles.csv
Datasets17112021.pdf  oil_spill.csv		  TechUSA.csv


In [10]:
# Read the oil-spill CSV. header=None tells pandas not to use any row as the
# header, so the columns are labelled with integers (0, 1, 2, ...).
oil_spill_csv = './Dataset/oil_spill.csv'
data = pd.read_csv(oil_spill_csv, header=None)
# Preview the first rows.
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,1,2558,1506.09,456.63,90,6395000.0,40.88,7.89,29780.0,0.19,...,2850.0,1000.0,763.16,135.46,3.73,0,33243.19,65.74,7.95,1
1,2,22325,79.11,841.03,180,55812500.0,51.11,1.21,61900.0,0.02,...,5750.0,11500.0,9593.48,1648.8,0.6,0,51572.04,65.73,6.26,0
2,3,115,1449.85,608.43,88,287500.0,40.42,7.34,3340.0,0.18,...,1400.0,250.0,150.0,45.13,9.33,1,31692.84,65.81,7.84,1
3,4,1201,1562.53,295.65,66,3002500.0,42.4,7.97,18030.0,0.19,...,6041.52,761.58,453.21,144.97,13.33,1,37696.21,65.67,8.07,1
4,5,312,950.27,440.86,37,780000.0,41.43,7.03,3350.0,0.17,...,1320.04,710.63,512.54,109.16,2.58,0,29038.17,65.66,7.35,0


### Drop columns with unique values

First of all, to detect the columns that are not relevant for machine learning, we should count the unique values in each column and flag the columns that contain only one:

In [86]:
# Report the size of the frame, then show, per column, whether it holds a
# single distinct value (True = constant column).
n_rows, n_cols = data.shape
print('Number of columns: ', n_cols)
print('Number of rows: ', n_rows, '\n')
print('Unique columns: ')
unique_counts = data.nunique()
unique_counts == 1

Number of columns:  50
Number of rows:  937 

Unique columns: 


0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22     True
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
32    False
33    False
34    False
35    False
36    False
37    False
38    False
39    False
40    False
41    False
42    False
43    False
44    False
45    False
46    False
47    False
48    False
49    False
dtype: bool

We have to understand whether a column is useful for our ML algorithm by looking at its number of unique values: as said above, columns with a very limited number of values are usually not useful, since the algorithms need a certain variety in the data to learn something.

Let's drop columns with a unique value:

In [48]:
# Boolean Series indexed by column label: True where the column holds a single
# distinct value.
unique_mask = (data.nunique() == 1)
# Collect the column *labels* of the constant columns. Taking them from the
# mask's index (instead of enumerate() positions) keeps the list valid for
# DataFrame.drop even when the column labels are not 0..n-1.
to_del = unique_mask[unique_mask].index.tolist()
to_del

[22]

In [58]:
# Drop the constant columns collected in `to_del`. The result is re-assigned
# rather than mutated in place, and errors='ignore' lets the cell be re-run
# after the columns are already gone.
data = data.drop(columns=to_del, errors='ignore')
# Verify the drop: selecting a missing column label raises KeyError.
try:
  data[22]
except KeyError:
  print('Column 22 not found')

Column 22 not found


Column number 22 is no longer present in the DataFrame.

### Analyze columns with low variety of values 

In [59]:
# Reload the CSV from disk so this section starts again from the full set of
# columns (header=None: columns are labelled with integers).
oil_spill_csv = './Dataset/oil_spill.csv'
data = pd.read_csv(oil_spill_csv, header=None)
# Preview the first rows.
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,1,2558,1506.09,456.63,90,6395000.0,40.88,7.89,29780.0,0.19,...,2850.0,1000.0,763.16,135.46,3.73,0,33243.19,65.74,7.95,1
1,2,22325,79.11,841.03,180,55812500.0,51.11,1.21,61900.0,0.02,...,5750.0,11500.0,9593.48,1648.8,0.6,0,51572.04,65.73,6.26,0
2,3,115,1449.85,608.43,88,287500.0,40.42,7.34,3340.0,0.18,...,1400.0,250.0,150.0,45.13,9.33,1,31692.84,65.81,7.84,1
3,4,1201,1562.53,295.65,66,3002500.0,42.4,7.97,18030.0,0.19,...,6041.52,761.58,453.21,144.97,13.33,1,37696.21,65.67,8.07,1
4,5,312,950.27,440.86,37,780000.0,41.43,7.03,3350.0,0.17,...,1320.04,710.63,512.54,109.16,2.58,0,29038.17,65.66,7.35,0


To identify the columns with a low variety of values, we compute, for each column, the number of distinct values as a percentage of the number of rows:

In [83]:
# For each column (by position), print the number of distinct values as a
# percentage of the number of rows.
n_rows = data.shape[0]
for position, n_unique in enumerate(data.nunique()):
  pct_unique = (n_unique / n_rows) * 100
  print(position, "{:3.2f}%".format(pct_unique))

0 25.40%
1 31.70%
2 98.93%
3 99.57%
4 19.10%
5 40.02%
6 87.51%
7 65.96%
8 59.87%
9 6.08%
10 61.58%
11 6.30%
12 7.79%
13 11.42%
14 5.66%
15 9.71%
16 95.30%
17 86.45%
18 18.14%
19 5.66%
20 7.26%
21 0.96%
22 0.11%
23 9.82%
24 0.96%
25 0.85%
26 0.96%
27 32.87%
28 47.71%
29 41.84%
30 11.42%
31 4.48%
32 0.43%
33 4.80%
34 15.05%
35 11.74%
36 0.32%
37 80.90%
38 0.96%
39 0.96%
40 41.41%
41 23.48%
42 68.73%
43 69.26%
44 53.26%
45 0.21%
46 100.00%
47 18.04%
48 30.52%
49 0.21%


The columns with a low percentage have a low variety of values. 

The literature says that you do not necessarily have to drop them, but they must be analyzed in order to see whether they are relevant or not for your analysis.

In [94]:
# List the columns whose distinct values are fewer than 1% of the rows,
# together with their distinct-value count.
# data.nunique() is computed once here instead of once per loop iteration.
unique_counts = data.nunique()
n_rows = data.shape[0]
# Iterate over (column label, distinct count) pairs so the printed identifier
# is the real column label and no positional lookup is needed.
for label, n_unique in unique_counts.items():
  perc = (n_unique / n_rows) * 100
  if perc < 1:
    print(label, "{:3.2f}%".format(perc), '\t Unique values: ', n_unique)

21 0.96% 	 Unique values:  9
22 0.11% 	 Unique values:  1
24 0.96% 	 Unique values:  9
25 0.85% 	 Unique values:  8
26 0.96% 	 Unique values:  9
32 0.43% 	 Unique values:  4
36 0.32% 	 Unique values:  3
38 0.96% 	 Unique values:  9
39 0.96% 	 Unique values:  9
45 0.21% 	 Unique values:  2
49 0.21% 	 Unique values:  2


These columns, which have very few unique values, can be deleted as we did before:

In [105]:
# Distinct values per column as a percentage of the number of rows, indexed by
# column label.
unique_pct = data.nunique() / data.shape[0] * 100
# Select the labels of the columns below the 1% threshold. Labels are taken
# from the index (not enumerate() positions) so they can be passed to drop.
# NOTE(review): column 49 is the last column and has only 2 distinct values;
# if it is the target label it should be excluded from this list - confirm.
to_del = unique_pct[unique_pct < 1].index.tolist()
to_del

[21, 22, 24, 25, 26, 32, 36, 38, 39, 45, 49]

In [106]:
# Drop the low-variety columns collected in `to_del`. Re-assigning (instead of
# inplace=True) together with errors='ignore' keeps the cell re-runnable once
# the columns have already been removed.
data = data.drop(columns=to_del, errors='ignore')
# Verify the drop: selecting labels that no longer exist raises KeyError.
try:
  data[to_del]
except KeyError:
  print('Columns not found')

Columns not found


In [108]:
# Show the shape of the frame after the drop and how many columns were removed.
remaining_shape = data.shape
n_deleted = len(to_del)
print(remaining_shape)
print('Deleted columns: ', n_deleted)

(937, 39)
Deleted columns:  11
