# Lab 1:
> Data Preprocessing and Cleaning

## Imports

In [1]:
import pandas as pd
import numpy as np
from IPython.display import display_html
%matplotlib inline

In [2]:
def discretization(data):
    print("Discretization 'Clump Thickness' attribute of the breast cancer dataset")
    print("Visualizing distribution of attribute value")
    print("----value_counts()------")
    print(data["clump_thickness"].value_counts(sort = False))
    print("----cut()------")
    bins = pd.cut(data['clump_thickness'], 4)
    print(bins.value_counts(sort = False))
    print("----qcut()------")
    bins = pd.qcut(data['clump_thickness'], 4)
    print(bins.value_counts(sort = False))

In [3]:
def sampling(data):
    print("Displaying the first five records of the table without sampling")
    display_html(data.head())
    print("A sample of size 3 is randomly selected (without replacement) from the original data.")
    sample = data.sample(n=3)
    display_html(sample)
    print("Randomly select 1% of the data (without replacement) and display the selected sample.")
    sample = data.sample(frac = 0.01, random_state = 1)
    display_html(sample)
    print("A sampling with replacement to create a sample whose size is equal to 1% of the entire data.")
    sample = data.sample(frac = 0.01, replace = True, random_state = 1)
    display_html(sample)

In [4]:
def remove_duplicate(data):
    dups = data.duplicated()
    print(f"Number of duplicate rows = {dups.sum()}")
    data.loc[[11,28]]
    print(f"Number of rows before discarding duplicates = {data.shape[0]}")
    data2 = data.drop_duplicates()
    print(f"Number of rows before discarding duplicates = {data2.shape[0]}")

In [5]:
def outlier(data):
    data2 = data.drop(["class"], axis = 1)
    data2["bare_nuclei"] = pd.to_numeric(data2["bare_nuclei"], errors='coerce')
    Z = (data2 - data2.mean()) / data2.std()
    Z[20:25]
    print(f"Number of rows before discarding outliers = {Z.shape[0]}")
    Z2 = Z.loc[((Z > -3).sum(axis = 1) == 9) & ((Z<=3).sum(axis = 1) == 9),:]
    print(f"Number of rows after discarding missing values = {Z2.shape[0]}")

In [6]:
def remove_missing(data):
    print(f"Number of rows in original data = {data.shape[0]}")
    data = data.dropna()
    print(f"Number of rows after discarding missing values = {data.shape[0]}")

In [7]:
def replace_missing_value_by_median(data):
    data2 = data["bare_nuclei"]
    print("Before replacing missing values:")
    print(data2[20:25])
    data2 = data2.fillna(data2.median())
    print("After replacing missing values by median")
    print(data2[20:25]) 

In [12]:
def noise_handle(data):
    data = data.drop(["sample_code"], axis = 1)
    data = data.replace("?", np.NaN)
    print(f"Number of instances = {data.shape[0]}")
    print(f"Number of attributes = {data.shape[1]}")
    print("Number of missing values")
    for col in data.columns:
        print(f"\t{col}, {data[col].isna().sum()}")
    print('''
     To further preprocess select option
     0. Exit
     1. Replace missing value by median
     2. Remove missing values
     3. Handle Outlier
     4. Remove Duplicate
     5. Sampling
     6. Discretization
    ''')
    option = int(input())
    while(option !=0):
        if option == 1:
            replace_missing_value_by_median(data)
        elif option == 2:
            remove_missing(data)
        elif option == 3:
            outlier(data)
        elif option == 4:
            remove_duplicate(data)
        elif option == 5:
            sampling(data)
        elif option == 6:
            discretization(data)
        else:
            print("Enter correct choice")
        print('Select your option again.')
        option = int(input())

In [9]:
def view(data):
    data.head()
    print(f"Number of instances = {data.shape[0]}")
    print(f"Number of attributes = {data.shape[1]}")
    display_html(data.head())

In [13]:
def main():
    
    data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
                       header = None)
    data.columns = [
    "sample_code", "clump_thickness", "uniformity_of_cell_size", 
    "uniformity_of_cell_shape", "marginal_adhesion", 
    "single_epithelial_cell_size", "bare_nuclei", 
    "bland_chromatin", "normal_nucleolo", "mitoses",
    "class"]
    print("Do you want to view data?")
    response = input()
    if response == 'yes':
        view(data)
    print("Do you want to remove noise and further preprocess data?")
    response = input()
    if response == 'yes':
        noise_handle(data)
    else:
        quit()

In [14]:
main()

Do you want to view data?
yes
Number of instances = 699
Number of attributes = 11


Unnamed: 0,sample_code,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleolo,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


Do you want to remove noise and further preprocess data?
yes
Number of instances = 699
Number of attributes = 10
Number of missing values
	clump_thickness, 0
	uniformity_of_cell_size, 0
	uniformity_of_cell_shape, 0
	marginal_adhesion, 0
	single_epithelial_cell_size, 0
	bare_nuclei, 16
	bland_chromatin, 0
	normal_nucleolo, 0
	mitoses, 0
	class, 0

     To further preprocess select option
     0. Exit
     1. Replace missing value by median
     2. Remove missing values
     3. Handle Outlier
     4. Remove Duplicate
     5. Sampling
     6. Discretization
    
1
Before replacing missing values:
20     10
21      7
22      1
23    NaN
24      1
Name: bare_nuclei, dtype: object
After replacing missing values by median
20     10
21      7
22      1
23    1.0
24      1
Name: bare_nuclei, dtype: object
Select your option again.
2
Number of rows in original data = 699
Number of rows after discarding missing values = 683
Select your option again.
3
Number of rows before discarding outliers = 6

Unnamed: 0,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleolo,mitoses,class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


A sample of size 3 is randomly selected (without replacement) from the original data.


Unnamed: 0,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleolo,mitoses,class
313,1,1,1,1,2,1,1,1,1,2
643,1,1,1,1,2,1,1,1,1,2
288,6,1,3,1,4,5,5,10,1,4


Randomly select 1% of the data (without replacement) and display the selected sample.


Unnamed: 0,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleolo,mitoses,class
584,5,1,1,6,3,1,1,1,1,2
417,1,1,1,1,2,1,2,1,1,2
606,4,1,1,2,2,1,1,1,1,2
349,4,2,3,5,3,8,7,6,1,4
134,3,1,1,1,3,1,2,1,1,2
502,4,1,1,2,2,1,2,1,1,2
117,4,5,5,10,4,10,7,5,8,4


A sampling with replacement to create a sample whose size is equal to 1% of the entire data.


Unnamed: 0,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleolo,mitoses,class
37,6,2,1,1,1,1.0,7,1,1,2
235,3,1,4,1,2,,3,1,1,2
72,1,3,3,2,2,1.0,7,2,1,2
645,3,1,1,1,2,1.0,2,1,1,2
144,2,1,1,1,2,1.0,2,1,1,2
129,1,1,1,1,10,1.0,1,1,1,2
583,3,1,1,1,2,1.0,1,1,1,2


Select your option again.
6
Discretization 'Clump Thickness' attribute of the breast cancer dataset
Visualizing distribution of attribute value
----value_counts()------
5     130
3     108
6      34
4      80
8      46
1     145
2      50
7      23
10     69
9      14
Name: clump_thickness, dtype: int64
----cut()------
(0.991, 3.25]    303
(3.25, 5.5]      210
(5.5, 7.75]       57
(7.75, 10.0]     129
Name: clump_thickness, dtype: int64
----qcut()------
(0.999, 2.0]    195
(2.0, 4.0]      188
(4.0, 6.0]      164
(6.0, 10.0]     152
Name: clump_thickness, dtype: int64
Select your option again.
0
