# Data Engineering

# Feature Adding, Data Cleaning and Standardization

## Importing working Libraries and Scripts

In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
import sys
import os

# Importing Scripts
sys.path.append(os.path.abspath(os.path.join('..')))
from scripts.logger_creator import CreateLogger
from scripts.data_loader import load_df_from_csv
from scripts.data_information import DataInfo
from scripts.data_cleaner import DataCleaner
from scripts.data_manipulation import DataManipulator
from scripts.utilities import calculate_concavity_dispersion


In [2]:
# Configuring Notebook Settings
pd.set_option('max_column', None)
pd.set_option('display.float_format', '{:.6f}'.format)
%matplotlib inline

## Loading Data CSV File

In [3]:
# Declaring Data File-Path
DATAPATH = '../data/data.csv'

In [4]:
# Loading Breast Cancer Data-Set
data_df = load_df_from_csv(DATAPATH, na_values=['none'])

In [5]:
# Extracting Information from the data
# Instantiate DataInfo Object using our dataset dataframe
data_info = DataInfo(data_df, deep=True)

In [6]:
# View Data Details
data_info.get_basic_description()

The DataFrame containes 569 rows and 33 columns.
Current DataFrame Memory Usage:
105962
Current DataFrame Memory Usage of columns is :
DataFrame Information: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    uint32 
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float32
 3   texture_mean             569 non-null    float32
 4   perimeter_mean           569 non-null    float32
 5   area_mean                569 non-null    float32
 6   smoothness_mean          569 non-null    float32
 7   compactness_mean         569 non-null    float32
 8   concavity_mean           569 non-null    float32
 9   concave points_mean      569 non-null    float32
 10  symmetry_mean            569 non-null    float32
 11  fractal_dimension_mean   569 

In [7]:
data_info.get_size()

The DataFrame containes 569 rows and 33 columns.


(569, 33)

In [8]:
data_info.get_total_missing_values()

The total number of missing values is 569
3.03 % missing values.


569

In [9]:
data_info.get_columns_with_missing_values()

['Unnamed: 32']

In [10]:
data_info.get_column_based_missing_values()

Unnamed: 0,missing_count,type
Unnamed: 32,569,float32


In [11]:
# Remove the entire column which is missing the entire data
data_info.df.drop(['Unnamed: 32'], axis=1, inplace=True)


In [12]:
data_info.get_total_entries()

The DataFrame containes 18208 entries.


18208

In [13]:
data_info.get_dispersion_params().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,Mode,Median
id,569.0,30371831.432337,125020585.612224,8670.0,869218.0,906024.0,8813129.0,911320502.0,8670.0,906024.0
radius_mean,569.0,14.127292,3.524049,6.981,11.7,13.37,15.78,28.110001,12.34,13.37
texture_mean,569.0,19.28965,4.301036,9.71,16.17,18.84,21.799999,39.279999,14.93,18.84
perimeter_mean,569.0,91.969032,24.298981,43.790001,75.169998,86.239998,104.099998,188.5,82.610001,86.239998
area_mean,569.0,654.889038,351.914124,143.5,420.299988,551.099976,782.700012,2501.0,512.200012,551.099976
smoothness_mean,569.0,0.09636,0.014064,0.05263,0.08637,0.09587,0.1053,0.1634,0.1007,0.09587
compactness_mean,569.0,0.104341,0.052813,0.01938,0.06492,0.09263,0.1304,0.3454,0.1147,0.09263
concavity_mean,569.0,0.088799,0.07972,0.0,0.02956,0.06154,0.1307,0.4268,0.0,0.06154
concave points_mean,569.0,0.048919,0.038803,0.0,0.02031,0.0335,0.074,0.2012,0.0,0.0335
symmetry_mean,569.0,0.181162,0.027414,0.106,0.1619,0.1792,0.1957,0.304,0.1601,0.1792


In [14]:
len(data_info.get_duplicates())

0

In [15]:
data_info.get_object_columns()

['diagnosis']

In [16]:
# Remaining are all Numeric Types
len(data_info.get_numeric_columns())

30

## Feature Extraction (Adding Features)

In [17]:
# Create A Data Manipulator Class Instance from the dataframe
data_manipulator = DataManipulator(data_info.df, deep=True)

## Concavity Dispersion Feature (Number of Concavity divided by Area of Nuclei)

In [18]:
# Add Concavity Dispersion for mean concave points and mean area
data_manipulator.add_column(
    'concavity_dispersion_mean', 'concave points_mean', 'area_mean', calculate_concavity_dispersion)


In [19]:
# Add Concavity Dispersion for mean concave points and mean area
data_manipulator.add_column(
    'concavity_dispersion_se', 'concave points_se', 'area_se', calculate_concavity_dispersion)


In [20]:
# Add Concavity Dispersion for mean concave points and mean area
data_manipulator.add_column(
    'concavity_dispersion_worst', 'concave points_worst', 'area_worst', calculate_concavity_dispersion)


In [21]:
# Checking if columns have been Added
data_manipulator.df.columns

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst',
       'concavity_dispersion_mean', 'concavity_dispersion_se',
       'concavity_dispersion_worst'],
      dtype='object')

In [22]:
## Encoding Diagnosis where M(Malginancy) is 1 and B(Benign) is 0
data_manipulator.df.diagnosis = data_manipulator.df.diagnosis.apply(
    lambda x: 1 if x == 'M' else 0)
data_manipulator.df.diagnosis.sample(10)

59     0
96     0
39     1
382    0
551    0
369    1
206    0
175    0
37     0
52     0
Name: diagnosis, dtype: int64

## Outlier and standardization Ready Dataframe

In [23]:
curreent_df = data_manipulator.df

## Fixing Outliers

In [24]:
# Create Cleaner object instance from current dataframe
data_cleaner_remove = DataCleaner(curreent_df, deep=True)
data_cleaner_revalue = DataCleaner(curreent_df, deep=True)

## Dropping Outlier Values

In [25]:
# Drop all outlier values
data_cleaner_remove.remove_outliers()


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,concavity_dispersion_mean,concavity_dispersion_se,concavity_dispersion_worst
1,842517,1,20.570000,17.770000,132.899994,1326.000000,0.084740,0.078640,0.086900,0.070170,0.181200,0.056670,0.543500,0.733900,3.398000,74.080002,0.005225,0.013080,0.018600,0.013400,0.013890,0.003532,24.990000,23.410000,158.800003,1956.000000,0.123800,0.186600,0.241600,0.186000,0.275000,0.089020,0.000053,0.000181,0.000095
2,84300903,1,19.690001,21.250000,130.000000,1203.000000,0.109600,0.159900,0.197400,0.127900,0.206900,0.059990,0.745600,0.786900,4.585000,94.029999,0.006150,0.040060,0.038320,0.020580,0.022500,0.004571,23.570000,25.530001,152.500000,1709.000000,0.144400,0.424500,0.450400,0.243000,0.361300,0.087580,0.000106,0.000219,0.000142
4,84358402,1,20.290001,14.340000,135.100006,1297.000000,0.100300,0.132800,0.198000,0.104300,0.180900,0.058830,0.757200,0.781300,5.438000,94.440002,0.011490,0.024610,0.056880,0.018850,0.017560,0.005115,22.540001,16.670000,152.199997,1575.000000,0.137400,0.205000,0.400000,0.162500,0.236400,0.076780,0.000080,0.000200,0.000103
5,843786,1,12.450000,15.700000,82.570000,477.100006,0.127800,0.170000,0.157800,0.080890,0.208700,0.076130,0.334500,0.890200,2.217000,27.190001,0.007510,0.033450,0.036720,0.011370,0.021650,0.005082,15.470000,23.750000,103.400002,741.599976,0.179100,0.524900,0.535500,0.174100,0.398500,0.124400,0.000170,0.000418,0.000235
6,844359,1,18.250000,19.980000,119.599998,1040.000000,0.094630,0.109000,0.112700,0.074000,0.179400,0.057420,0.446700,0.773200,3.180000,53.910000,0.004314,0.013820,0.022540,0.010390,0.013690,0.002179,22.879999,27.660000,153.199997,1606.000000,0.144200,0.257600,0.378400,0.193200,0.306300,0.083680,0.000071,0.000193,0.000120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560,925292,0,14.050000,27.150000,91.379997,600.400024,0.099290,0.112600,0.044620,0.043040,0.153700,0.061710,0.364500,1.492000,2.888000,29.840000,0.007256,0.026780,0.020710,0.016260,0.020800,0.005304,15.300000,33.169998,100.199997,706.700012,0.124100,0.226400,0.132600,0.104800,0.225000,0.083210,0.000072,0.000545,0.000148
563,926125,1,20.920000,25.090000,143.000000,1347.000000,0.109900,0.223600,0.317400,0.147400,0.214900,0.068790,0.962200,1.026000,8.758000,118.800003,0.006399,0.043100,0.078450,0.026240,0.020570,0.006213,24.290001,29.410000,179.100006,1819.000000,0.140700,0.418600,0.659900,0.254200,0.292900,0.098730,0.000109,0.000221,0.000140
564,926424,1,21.559999,22.389999,142.000000,1479.000000,0.111000,0.115900,0.243900,0.138900,0.172600,0.056230,1.176000,1.256000,7.673000,158.699997,0.010300,0.028910,0.051980,0.024540,0.011140,0.004239,25.450001,26.400000,166.100006,2027.000000,0.141000,0.211300,0.410700,0.221600,0.206000,0.071150,0.000094,0.000155,0.000109
565,926682,1,20.129999,28.250000,131.199997,1261.000000,0.097800,0.103400,0.144000,0.097910,0.175200,0.055330,0.765500,2.463000,5.203000,99.040001,0.005769,0.024230,0.039500,0.016780,0.018980,0.002498,23.690001,38.250000,155.000000,1731.000000,0.116600,0.192200,0.321500,0.162800,0.257200,0.066370,0.000078,0.000169,0.000094


In [27]:
# Length of outlier removed dataframe (decreased)
len(data_cleaner_remove.df)


483

## Revaluing Outliers to Median Values

In [26]:
data_cleaner_revalue.fix_outlier_columns(data_cleaner_revalue.df.columns.to_list())

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,concavity_dispersion_mean,concavity_dispersion_se,concavity_dispersion_worst
0,842302.000000,1.000000,17.990000,10.380000,122.800003,1001.000000,0.118400,0.092630,0.061540,0.033500,0.179200,0.061540,0.324200,0.905300,2.287000,24.530001,0.006399,0.049040,0.053730,0.015870,0.030030,0.006193,25.379999,17.330000,97.660004,686.500000,0.162200,0.211900,0.226700,0.099930,0.282200,0.118900,0.000065,0.000103,0.000131
1,842517.000000,1.000000,20.570000,17.770000,132.899994,551.099976,0.084740,0.078640,0.086900,0.070170,0.181200,0.056670,0.543500,0.733900,3.398000,74.080002,0.005225,0.013080,0.018600,0.013400,0.013890,0.003532,24.990000,23.410000,158.800003,1956.000000,0.123800,0.186600,0.241600,0.186000,0.275000,0.089020,0.000053,0.000181,0.000095
2,84300903.000000,1.000000,19.690001,21.250000,130.000000,1203.000000,0.109600,0.159900,0.197400,0.033500,0.206900,0.059990,0.745600,0.786900,4.585000,94.029999,0.006150,0.040060,0.038320,0.020580,0.022500,0.004571,23.570000,25.530001,152.500000,1709.000000,0.144400,0.424500,0.450400,0.099930,0.361300,0.087580,0.000106,0.000219,0.000142
3,84348301.000000,1.000000,11.420000,20.379999,77.580002,386.100006,0.095870,0.092630,0.241400,0.105200,0.179200,0.061540,0.495600,1.156000,3.445000,27.230000,0.009110,0.020450,0.056610,0.018670,0.018730,0.003187,14.910000,26.500000,98.870003,567.700012,0.131300,0.211900,0.226700,0.099930,0.282200,0.080040,0.000065,0.000686,0.000131
4,84358402.000000,1.000000,20.290001,14.340000,135.100006,1297.000000,0.100300,0.132800,0.198000,0.104300,0.180900,0.058830,0.757200,0.781300,5.438000,94.440002,0.011490,0.024610,0.056880,0.018850,0.017560,0.005115,22.540001,16.670000,152.199997,1575.000000,0.137400,0.205000,0.400000,0.162500,0.236400,0.076780,0.000080,0.000200,0.000103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424.000000,1.000000,13.370000,22.389999,86.239998,551.099976,0.111000,0.115900,0.061540,0.033500,0.172600,0.056230,0.324200,1.256000,2.287000,24.530001,0.010300,0.028910,0.051980,0.010930,0.011140,0.004239,25.450001,26.400000,166.100006,686.500000,0.141000,0.211300,0.410700,0.221600,0.206000,0.071150,0.000094,0.000155,0.000109
565,926682.000000,1.000000,20.129999,18.840000,131.199997,1261.000000,0.097800,0.103400,0.144000,0.097910,0.175200,0.055330,0.765500,1.108000,5.203000,99.040001,0.005769,0.024230,0.039500,0.016780,0.018980,0.002498,23.690001,25.410000,155.000000,1731.000000,0.116600,0.192200,0.321500,0.162800,0.257200,0.066370,0.000078,0.000169,0.000094
566,926954.000000,1.000000,16.600000,18.840000,108.300003,858.099976,0.084550,0.102300,0.092510,0.053020,0.159000,0.056480,0.456400,1.075000,3.425000,48.549999,0.005903,0.037310,0.047300,0.015570,0.013180,0.003892,18.980000,34.119999,126.699997,1124.000000,0.113900,0.309400,0.340300,0.141800,0.221800,0.078200,0.000062,0.000321,0.000126
567,927241.000000,1.000000,13.370000,18.840000,86.239998,1265.000000,0.117800,0.092630,0.061540,0.033500,0.179200,0.070160,0.726000,1.595000,5.772000,86.220001,0.006522,0.020450,0.071170,0.016640,0.023240,0.006185,14.970000,25.410000,97.660004,1821.000000,0.165000,0.211900,0.226700,0.099930,0.282200,0.080040,0.000120,0.000193,0.000146


In [28]:
# Length of outlier revalued dataframe (unchanged)
len(data_cleaner_revalue.df)


569

## Standardization

## MIN-MAX

In [37]:
# Create Manipulator object instance from dataframes
data_remove_minmax = DataManipulator(data_cleaner_remove.df, deep=True)
data_revalue_minmax = DataManipulator(data_cleaner_revalue.df, deep=True)

In [38]:
# For removed outlier
# Minmax between 0 and 1
data_remove_minmax.minmax_scale_columns(data_remove_minmax.df.columns.to_list(), (0,1))

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,concavity_dispersion_mean,concavity_dispersion_se,concavity_dispersion_worst
1,0.009066,1.000000,0.834244,0.383627,0.822731,0.766613,0.298029,0.283514,0.273787,0.449232,0.470460,0.225168,0.389733,0.163702,0.325647,0.442916,0.191281,0.149438,0.166966,0.469681,0.161906,0.253450,0.842053,0.384927,0.745000,0.798449,0.388053,0.241722,0.284604,0.688634,0.371706,0.429800,0.294444,0.152101,0.291411
2,0.916509,1.000000,0.780220,0.549262,0.795956,0.686872,0.631318,0.672280,0.621928,0.818822,0.657914,0.336577,0.573678,0.186920,0.472010,0.574254,0.260450,0.521792,0.343986,0.721346,0.393931,0.353304,0.771964,0.456573,0.701698,0.687077,0.575923,0.619353,0.530569,0.899667,0.642409,0.411586,0.588889,0.184034,0.435583
4,0.917134,1.000000,0.817055,0.220371,0.843043,0.747812,0.506636,0.542627,0.623819,0.667734,0.468271,0.297651,0.584236,0.184466,0.577189,0.576953,0.659762,0.308565,0.510592,0.660708,0.260806,0.405586,0.721125,0.157148,0.699636,0.626657,0.512084,0.270929,0.471198,0.601629,0.250627,0.274981,0.444444,0.168067,0.315951
5,0.009080,1.000000,0.335748,0.285102,0.358046,0.216272,0.875318,0.720601,0.497164,0.517862,0.671043,0.878188,0.199509,0.232171,0.180025,0.134222,0.362148,0.430567,0.329623,0.398528,0.371025,0.402414,0.372162,0.396418,0.364218,0.250879,0.892385,0.778723,0.630816,0.644576,0.759097,0.877308,0.944444,0.351261,0.720859
6,0.009086,1.000000,0.691817,0.488815,0.699935,0.581199,0.430621,0.428763,0.355072,0.473752,0.457330,0.250336,0.301629,0.180918,0.298767,0.310129,0.123159,0.159651,0.202334,0.364178,0.156516,0.123419,0.737907,0.528557,0.706509,0.640635,0.574099,0.354424,0.445753,0.715291,0.469887,0.362256,0.394444,0.162185,0.368098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560,0.009966,0.000000,0.433974,0.830081,0.439387,0.296207,0.493096,0.445986,0.140580,0.275544,0.269876,0.394295,0.226813,0.495795,0.262762,0.151668,0.343154,0.338513,0.185907,0.569926,0.348119,0.423750,0.363771,0.714768,0.342223,0.235143,0.390789,0.304899,0.156202,0.388004,0.214868,0.356312,0.400000,0.457983,0.453988
563,0.009975,1.000000,0.855731,0.732032,0.915982,0.780227,0.635340,0.977036,1.000000,0.943662,0.716265,0.631879,0.770820,0.291659,0.986560,0.737324,0.279070,0.563747,0.704219,0.919734,0.341921,0.511110,0.807503,0.587698,0.884528,0.736676,0.542180,0.609988,0.777359,0.941133,0.427854,0.552618,0.605556,0.185714,0.429448
564,0.009979,1.000000,0.895021,0.603522,0.906749,0.865802,0.650087,0.461774,0.768431,0.889245,0.407732,0.210403,0.965414,0.392413,0.852774,1.000000,0.570777,0.367910,0.466607,0.860147,0.087798,0.321397,0.864758,0.485975,0.795175,0.830463,0.544916,0.280930,0.483803,0.820437,0.155270,0.203769,0.522222,0.130252,0.334356
565,0.009982,1.000000,0.807232,0.882437,0.807035,0.724473,0.473120,0.401971,0.453686,0.626825,0.426696,0.180201,0.591790,0.921150,0.548212,0.607236,0.231960,0.303321,0.354578,0.588153,0.299073,0.154077,0.777887,0.886448,0.718881,0.696997,0.322389,0.250611,0.378725,0.602740,0.315872,0.143309,0.433333,0.142017,0.288344


In [39]:
# For revalued outlier
data_revalue_minmax.minmax_scale_columns(data_revalue_minmax.df.columns.to_list(), (0,1))

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,concavity_dispersion_mean,concavity_dispersion_se,concavity_dispersion_worst
0,0.009222,1.000000,0.810140,0.038417,0.859645,0.736368,0.996968,0.386911,0.254613,0.266932,0.588898,0.444189,0.252014,0.296282,0.243941,0.163545,0.429632,0.803255,0.684895,0.695138,0.819446,0.758815,0.988668,0.218969,0.391499,0.274866,0.911850,0.344351,0.332893,0.424151,0.504819,0.996878,0.474453,0.118391,0.513725
1,0.009225,1.000000,1.000000,0.462156,0.969535,0.350021,0.486736,0.313015,0.359537,0.559124,0.604988,0.257384,0.511848,0.203120,0.421078,0.620657,0.321995,0.185895,0.237094,0.586947,0.222288,0.377703,0.966572,0.469691,0.898086,0.970940,0.527196,0.297159,0.354772,0.789474,0.475904,0.530440,0.386861,0.208046,0.372549
2,0.932509,1.000000,0.935242,0.661697,0.937983,0.909833,0.863574,0.742235,0.816715,0.266932,0.811746,0.384733,0.751303,0.231927,0.610332,0.804701,0.406803,0.649087,0.488464,0.901445,0.540847,0.526510,0.886119,0.557113,0.845886,0.835508,0.733547,0.740911,0.661380,0.424151,0.822490,0.507961,0.773723,0.251724,0.556863
3,0.933033,1.000000,0.326661,0.611812,0.367642,0.208330,0.655449,0.386911,0.998759,0.838247,0.588898,0.444189,0.455095,0.432547,0.428571,0.188454,0.678188,0.312423,0.721606,0.817784,0.401362,0.328292,0.395467,0.597113,0.401525,0.209727,0.602324,0.344351,0.332893,0.424151,0.504819,0.390259,0.474453,0.788506,0.513725
4,0.933145,1.000000,0.979395,0.265482,0.993472,0.990554,0.722601,0.599092,0.819197,0.831076,0.602574,0.340238,0.765047,0.228884,0.746333,0.808484,0.896397,0.383842,0.725048,0.825668,0.358073,0.604423,0.827762,0.191753,0.843401,0.762035,0.663428,0.331481,0.587372,0.689728,0.320884,0.339369,0.583942,0.229885,0.403922
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,0.010153,1.000000,0.470160,0.727064,0.461865,0.350021,0.884796,0.509825,0.254613,0.266932,0.535800,0.240507,0.252014,0.486901,0.243941,0.163545,0.787293,0.457664,0.662588,0.478756,0.120542,0.478961,0.992635,0.592990,0.958572,0.274866,0.699489,0.343232,0.603084,0.940577,0.198795,0.251483,0.686131,0.178161,0.427451
565,0.010156,1.000000,0.967621,0.523509,0.951039,0.959639,0.684705,0.443799,0.595780,0.780159,0.556718,0.205984,0.774882,0.406457,0.708865,0.850920,0.371871,0.377318,0.503505,0.734998,0.410611,0.229612,0.892918,0.552165,0.866600,0.847571,0.455074,0.307605,0.472100,0.691002,0.404418,0.176866,0.569343,0.194253,0.368627
566,0.010159,1.000000,0.707852,0.523509,0.701882,0.613654,0.483856,0.437989,0.382747,0.422470,0.426388,0.250096,0.408649,0.388521,0.425383,0.385136,0.384157,0.601875,0.602932,0.681997,0.196019,0.429263,0.626062,0.911340,0.632115,0.514749,0.428028,0.526217,0.499706,0.601868,0.262249,0.361536,0.452555,0.368966,0.494118
567,0.010162,1.000000,0.470160,0.523509,0.461865,0.963074,0.987873,0.386911,0.254613,0.266932,0.588898,0.774837,0.728081,0.671160,0.799586,0.732652,0.440909,0.312423,0.907202,0.728866,0.568226,0.757670,0.398867,0.552165,0.391499,0.896918,0.939898,0.344351,0.332893,0.424151,0.504819,0.390259,0.875912,0.221839,0.572549


## Standard-Scaler

In [40]:
# Create Manipulator object instance from dataframes
data_remove_stdscale = DataManipulator(data_cleaner_remove.df, deep=True)
data_revalue_stdscale = DataManipulator(data_cleaner_revalue.df, deep=True)


In [41]:
# For removed outlier
data_remove_stdscale.standardize_columns(data_remove_stdscale.df.columns.to_list())


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,concavity_dispersion_mean,concavity_dispersion_se,concavity_dispersion_worst
1,-0.425159,1.420827,2.219378,-0.276298,2.080005,2.473312,-0.815128,-0.396691,0.202431,0.823329,0.143194,-0.931297,0.885648,-0.904331,0.603919,1.469501,-0.619391,-0.713349,-0.472199,0.521062,-0.897427,0.124579,2.229291,-0.303464,1.949389,2.528179,-0.333646,-0.357380,0.000487,1.364722,-0.186731,0.520049,-0.369183,-1.087781,-0.715445
2,2.562152,1.420827,1.927391,0.597940,1.939330,2.040475,1.131771,1.573589,1.987720,2.599558,1.263926,-0.326448,1.886958,-0.789932,1.451413,2.202984,-0.243937,1.415542,0.638465,2.017778,0.516963,0.772641,1.882592,0.069084,1.725114,1.977977,0.647725,1.525751,1.215090,2.344510,1.547292,0.420002,1.239724,-0.910460,0.143429
4,2.564210,1.420827,2.126473,-1.137975,2.186725,2.371261,0.403446,0.916505,1.997414,1.873436,0.130111,-0.537780,1.944431,-0.802019,2.060438,2.218059,1.923550,0.196440,1.683794,1.657149,-0.294545,1.111953,1.631113,-1.487885,1.714434,1.679486,0.314249,-0.211732,0.921910,0.960775,-0.962321,-0.330354,0.450449,-0.999121,-0.569253
5,-0.425114,1.420827,-0.474859,-0.796319,-0.361442,-0.513966,2.557096,1.818480,1.347924,1.153160,1.342421,2.613994,-0.149848,-0.566960,-0.239291,-0.254461,0.308082,0.893971,0.548350,0.097896,0.377331,1.091370,-0.095056,-0.243715,-0.022803,-0.176948,2.300812,2.320482,1.710122,1.160170,2.294751,2.978159,3.182555,0.018144,1.842904
6,-0.425093,1.420827,1.449596,0.278893,1.434839,1.466878,-0.040597,0.339437,0.619268,0.941170,0.064699,-0.794659,0.406050,-0.819503,0.448271,0.727929,-0.989163,-0.654958,-0.250291,-0.106391,-0.930281,-0.719336,1.714126,0.443390,1.750033,1.748540,0.638197,0.204630,0.796261,1.488485,0.442179,0.149039,0.177238,-1.031785,-0.258597
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560,-0.422196,-0.703815,0.056025,2.080125,0.065921,-0.080073,0.324349,0.426724,-0.480664,-0.011403,-1.056033,-0.013092,-0.001213,0.732018,0.239789,-0.157031,0.204984,0.367666,-0.353360,1.117247,0.237699,1.229839,-0.136563,1.411662,-0.136721,-0.254689,-0.319354,-0.042338,-0.633573,-0.031045,-1.191381,0.116385,0.207595,0.610770,0.253073
563,-0.422166,1.420827,2.335508,1.562616,2.569945,2.547211,1.155266,3.118098,3.926496,3.199531,1.612792,1.276767,2.960109,-0.273837,4.430851,3.113680,-0.142869,1.655417,2.898653,3.197640,0.199916,1.796816,2.058383,0.750917,2.672051,2.223006,0.471459,1.479049,2.433765,2.537029,0.172933,1.194674,1.330794,-0.901127,0.106882
564,-0.422156,1.420827,2.547862,0.884328,2.521436,3.011718,1.241412,0.506738,2.738996,2.938004,-0.231837,-1.011457,4.019387,0.222615,3.656183,4.580647,1.440534,0.535737,1.407818,2.843264,-1.349177,0.565560,2.341602,0.221969,2.209262,2.686335,0.485751,-0.161864,0.984152,1.976659,-1.573147,-0.721512,0.875443,-1.209107,-0.459610
565,-0.422147,1.420827,2.073384,2.356464,1.997541,2.244577,0.207660,0.203655,1.124965,1.676830,-0.118455,-1.175423,1.985553,2.827910,1.892653,2.387182,-0.398584,0.166456,0.704924,1.225644,-0.061278,-0.520364,1.911891,2.304372,1.814111,2.026982,-0.676650,-0.313053,0.465270,0.965931,-0.544387,-1.053614,0.389735,-1.143778,-0.733719


In [42]:
# For revalued outlier
data_revalue_stdscale.standardize_columns(data_revalue_stdscale.df.columns.to_list())


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,concavity_dispersion_mean,concavity_dispersion_se,concavity_dispersion_worst
0,-0.378327,1.297676,1.518324,-2.360404,1.749816,1.537012,2.016808,-0.099871,-0.243940,-0.309980,0.076964,-0.041339,-0.197574,-0.570849,-0.192154,-0.344344,-0.079236,2.169033,1.594242,1.097549,1.995475,1.960554,2.549768,-1.479690,-0.176104,-0.247174,1.714516,-0.152408,-0.096004,-0.116936,0.032959,2.871652,-0.000628,-1.435861,0.033784
1,-0.378319,1.297676,2.420824,-0.278212,2.266581,-0.188753,-0.848257,-0.442606,0.178757,0.900654,0.168599,-0.959064,1.049534,-1.001176,0.732814,1.765634,-0.663867,-0.766540,-0.514861,0.553020,-0.970525,0.128861,2.448360,-0.298361,2.131283,2.890826,-0.305606,-0.371568,-0.004868,1.405122,-0.128343,0.597266,-0.390637,-1.035087,-0.687826
2,2.997862,1.297676,2.112994,0.702305,2.118204,2.311862,1.267771,1.548146,2.020554,-0.309980,1.346108,-0.333427,2.198828,-0.868111,1.721056,2.615161,-0.203234,1.435956,0.669070,2.135900,0.611709,0.844054,2.079132,0.113550,1.893525,2.280282,0.778105,1.689233,1.272258,-0.116936,1.805044,0.487658,1.331901,-0.839837,0.254275
3,2.999779,1.297676,-0.779900,0.457176,-0.563858,-0.821675,0.099105,-0.099871,2.753939,2.057145,0.076964,-0.041339,0.777138,0.058572,0.771944,-0.229371,1.270794,-0.164895,1.767149,1.714828,-0.081093,-0.108619,-0.172641,0.302018,-0.130439,-0.540829,0.088949,-0.152408,-0.096004,-0.116936,0.032959,-0.086266,-0.000628,1.559673,0.033784
4,3.000188,1.297676,2.322878,-1.244643,2.379144,2.672435,0.476176,0.884236,2.030554,2.027432,0.154853,-0.552023,2.264795,-0.882171,2.431225,2.632621,2.455991,0.174705,1.783359,1.754510,-0.296100,1.218516,1.811312,-1.607926,1.882203,1.949055,0.409854,-0.212179,0.963986,0.989549,-0.993103,-0.334409,0.486882,-0.937462,-0.527468
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,-0.374924,1.297676,-0.097779,1.023509,-0.120771,-0.188753,1.386936,0.470210,-0.243940,-0.309980,-0.225432,-1.041979,-0.197574,0.309638,-0.192154,-0.344344,1.863392,0.525732,1.489177,0.008492,-1.475884,0.615523,2.567970,0.282589,2.406781,-0.247174,0.599240,-0.157605,1.029433,2.034671,-1.674158,-0.762950,0.941892,-1.168678,-0.407200
565,-0.374914,1.297676,2.266908,0.023269,2.179601,2.534343,0.263382,0.163978,1.130491,1.816471,-0.106306,-1.211579,2.311995,-0.061939,2.235574,2.828501,-0.392965,0.143684,0.739914,1.298165,-0.035151,-0.582891,2.110335,0.090234,1.987873,2.334662,-0.684379,-0.323059,0.483841,0.994854,-0.527119,-1.126790,0.421881,-1.096744,-0.707871
566,-0.374903,1.297676,1.032094,0.023269,1.007926,0.988864,-0.864428,0.137030,0.272264,0.334459,-0.848549,-0.994868,0.554217,-0.144791,0.755293,0.678495,-0.326235,1.211462,1.208204,1.031412,-1.101000,0.376666,0.885640,1.782565,0.919848,0.834255,-0.826419,0.692183,0.598831,0.623491,-1.320189,-0.226323,-0.098130,-0.315747,-0.066440
567,-0.374891,1.297676,-0.097779,0.023269,-0.120771,2.549686,1.965737,-0.099871,-0.243940,-0.309980,0.076964,1.583055,2.087368,1.160750,2.709298,2.282590,-0.017984,-0.164895,2.641289,1.267301,0.747697,1.955047,-0.157040,0.090234,-0.176104,2.557127,1.861817,-0.152408,-0.096004,-0.116936,0.032959,-0.086266,1.786911,-0.973429,0.334454


## Save Finalised Datasets

In [44]:
## Outlier Removed MinMax Scaled Dataframe
removed_minmax = DataCleaner(data_remove_minmax.df)
removed_minmax.save_clean_data('../data/out_removed_minmax_scale.csv')

In [45]:
## Outlier Revalued MinMax Scaled Dataframe
revalued_minmax = DataCleaner(data_revalue_minmax.df)
removed_minmax.save_clean_data('../data/out_revalued_minmax_scale.csv')


In [46]:
## Outlier Removed Standard-Scalar Scaled Dataframe
removed_stdscale = DataCleaner(data_remove_stdscale.df)
removed_minmax.save_clean_data('../data/out_removed_std_scale.csv')


In [47]:
## Outlier Revalued Standard-Scalar Scaled Dataframe
revalued_stdscale = DataCleaner(data_revalue_stdscale.df)
removed_minmax.save_clean_data('../data/out_revalued_std_scale.csv')
