# Data Preprocessing

## Imports

In [12]:
import os
import sys
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [13]:
sns.set()
%matplotlib inline
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option("expand_frame_repr", False)
pd.set_option('display.float_format', '{:.2f}'.format)

In [4]:
# Put the sibling ../scripts directory on the import path so modules stored
# there can be imported by later cells.
# os.path.join() with a single argument returns it unchanged, so abspath()
# alone yields the same directory as before.
scripts_dir = os.path.abspath('../scripts')
# Register the directory only once: re-running this cell must not pile up
# duplicate entries in sys.path.
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

In [7]:
from df_overview import DfOverview
from df_outlier import DfOutlier
from file_handler import FileHandler

## Data

- ID number
- Diagnosis(M=malignant, B=benign)
- radius(mean of distances from center to points on the perimeter)
- texture(standard deviation of gray-scale values)
- perimeter
- area
- smoothness(local variation in radius lengths)
- compactness(perimeter ^ 2 / area - 1.0)
- concavity(severity of concave portions of the contour)
- concave points(number of concave portions of the contour)
- symmetry
- fractal dimension("coastline approximation" - 1)
- The mean, standard error and "worst" or largest(mean of the three largest values) of these features were computed for each image, resulting in 30 features. For instance, field 3 is Mean Radius, field 13 is Radius SE, field 23 is Worst Radius.
- All feature values are recoded with four significant digits.
- Missing attribute values: none
- Class distribution: 357 benign, 212 malignant


## Data reading

In [16]:
# Read the raw dataset through the project's FileHandler helper.
DATA_PATH = "../data/data.csv"
fh = FileHandler()
df = fh.read_csv(DATA_PATH)

# Preview the first five rows (the default for head()).
df.head()


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.12,0.28,0.3,0.15,0.24,0.08,1.09,0.91,8.59,153.4,0.01,0.05,0.05,0.02,0.03,0.01,25.38,17.33,184.6,2019.0,0.16,0.67,0.71,0.27,0.46,0.12,
1,842517,M,20.57,17.77,132.9,1326.0,0.08,0.08,0.09,0.07,0.18,0.06,0.54,0.73,3.4,74.08,0.01,0.01,0.02,0.01,0.01,0.0,24.99,23.41,158.8,1956.0,0.12,0.19,0.24,0.19,0.28,0.09,
2,84300903,M,19.69,21.25,130.0,1203.0,0.11,0.16,0.2,0.13,0.21,0.06,0.75,0.79,4.58,94.03,0.01,0.04,0.04,0.02,0.02,0.0,23.57,25.53,152.5,1709.0,0.14,0.42,0.45,0.24,0.36,0.09,
3,84348301,M,11.42,20.38,77.58,386.1,0.14,0.28,0.24,0.11,0.26,0.1,0.5,1.16,3.44,27.23,0.01,0.07,0.06,0.02,0.06,0.01,14.91,26.5,98.87,567.7,0.21,0.87,0.69,0.26,0.66,0.17,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1,0.13,0.2,0.1,0.18,0.06,0.76,0.78,5.44,94.44,0.01,0.02,0.06,0.02,0.02,0.01,22.54,16.67,152.2,1575.0,0.14,0.2,0.4,0.16,0.24,0.08,


### Missing values

In [15]:
# Build the per-column summary (counts, missing values, unique values, dtype)
# with the project's DfOverview helper and display it as the cell output.
store_overview = DfOverview(df)
overview_table = store_overview.getOverview()
overview_table

Unnamed: 0_level_0,count,none_count,none_percentage,unique_value_count,unique_percentage,dtype
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
id,569,0,0.0%,569,100.0%,int64
symmetry_worst,569,0,0.0%,500,87.87%,float64
concave points_worst,569,0,0.0%,492,86.47%,float64
concavity_worst,569,0,0.0%,539,94.73%,float64
compactness_worst,569,0,0.0%,529,92.97%,float64
smoothness_worst,569,0,0.0%,411,72.23%,float64
area_worst,569,0,0.0%,544,95.61%,float64
perimeter_worst,569,0,0.0%,514,90.33%,float64
texture_worst,569,0,0.0%,511,89.81%,float64
radius_worst,569,0,0.0%,457,80.32%,float64


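The overview shows no missing values in the listed feature columns, and the only categorical column is `diagnosis`. Note that the trailing `Unnamed: 32` column appears empty in the preview above, so it should be checked separately before modelling.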

In [17]:
# List the distinct labels present in the target column.
diagnosis_labels = df['diagnosis'].unique()
print(diagnosis_labels)

['M' 'B']


In [19]:
# Encode the target column as integers using each label's position in
# `diagnosis`: 'M' -> 0, 'B' -> 1.
diagnosis = ['M', 'B']
diagnosis_codes = {label: code for code, label in enumerate(diagnosis)}

# The column is overwritten in place, so this cell may be re-run on a frame
# that is already encoded; in that case there is nothing left to convert.
already_encoded = (
    len(df) > 0
    and df['diagnosis'].isin(list(diagnosis_codes.values())).all()
)
if not already_encoded:
    # Fail loudly on labels outside `diagnosis` (the same ValueError type that
    # list.index raised) instead of letting Series.map turn them into NaN.
    unexpected = [
        label for label in df['diagnosis'].unique()
        if label not in diagnosis_codes
    ]
    if unexpected:
        raise ValueError(f"unexpected diagnosis labels: {unexpected}")
    # Dictionary lookup via Series.map replaces the per-row list.index() scan.
    df['diagnosis'] = df['diagnosis'].map(diagnosis_codes)

In [None]:
### Outliers
