# Data Engineering

# Feature Adding, Data Cleaning, Standardization and Normalization

## Importing working Libraries and Scripts

In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
import sys
import os

# Importing Scripts
sys.path.append(os.path.abspath(os.path.join('..')))
from scripts.logger_creator import CreateLogger
from scripts.data_loader import load_df_from_csv
from scripts.data_information import DataInfo
from scripts.data_cleaner import DataCleaner
from scripts.data_manipulation import DataManipulator
from scripts.utilities import calculate_concavity_dispersion


In [2]:
# Configuring Notebook Settings
pd.set_option('max_column', None)
pd.set_option('display.float_format', '{:.6f}'.format)
%matplotlib inline

## Loading Data CSV File

In [3]:
# Declaring Data File-Path
DATAPATH = '../data/data.csv'

In [4]:
# Loading Breast Cancer Data-Set
data_df = load_df_from_csv(DATAPATH, na_values=['none'])

In [5]:
# Extracting Information from the data
# Instantiate DataInfo Object using our dataset dataframe
data_info = DataInfo(data_df, deep=True)

In [6]:
# View Data Details
data_info.get_basic_description()

The DataFrame containes 569 rows and 33 columns.
Current DataFrame Memory Usage:
105962
Current DataFrame Memory Usage of columns is :
DataFrame Information: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    uint32 
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float32
 3   texture_mean             569 non-null    float32
 4   perimeter_mean           569 non-null    float32
 5   area_mean                569 non-null    float32
 6   smoothness_mean          569 non-null    float32
 7   compactness_mean         569 non-null    float32
 8   concavity_mean           569 non-null    float32
 9   concave points_mean      569 non-null    float32
 10  symmetry_mean            569 non-null    float32
 11  fractal_dimension_mean   569 

In [7]:
data_info.get_size()

The DataFrame containes 569 rows and 33 columns.


(569, 33)

In [8]:
data_info.get_total_missing_values()

The total number of missing values is 569
3.03 % missing values.


569

In [9]:
data_info.get_columns_with_missing_values()

['Unnamed: 32']

In [10]:
data_info.get_column_based_missing_values()

Unnamed: 0,missing_count,type
Unnamed: 32,569,float32


In [11]:
# Remove the entire column which is missing the entire data
data_info.df.drop(['Unnamed: 32'], axis=1, inplace=True)


In [12]:
data_info.get_total_entries()

The DataFrame containes 18208 entries.


18208

In [13]:
data_info.get_dispersion_params().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,Mode,Median
id,569.0,30371831.432337,125020585.612224,8670.0,869218.0,906024.0,8813129.0,911320502.0,8670.0,906024.0
radius_mean,569.0,14.127292,3.524049,6.981,11.7,13.37,15.78,28.110001,12.34,13.37
texture_mean,569.0,19.28965,4.301036,9.71,16.17,18.84,21.799999,39.279999,14.93,18.84
perimeter_mean,569.0,91.969032,24.298981,43.790001,75.169998,86.239998,104.099998,188.5,82.610001,86.239998
area_mean,569.0,654.889038,351.914124,143.5,420.299988,551.099976,782.700012,2501.0,512.200012,551.099976
smoothness_mean,569.0,0.09636,0.014064,0.05263,0.08637,0.09587,0.1053,0.1634,0.1007,0.09587
compactness_mean,569.0,0.104341,0.052813,0.01938,0.06492,0.09263,0.1304,0.3454,0.1147,0.09263
concavity_mean,569.0,0.088799,0.07972,0.0,0.02956,0.06154,0.1307,0.4268,0.0,0.06154
concave points_mean,569.0,0.048919,0.038803,0.0,0.02031,0.0335,0.074,0.2012,0.0,0.0335
symmetry_mean,569.0,0.181162,0.027414,0.106,0.1619,0.1792,0.1957,0.304,0.1601,0.1792


In [14]:
len(data_info.get_duplicates())

0

In [15]:
data_info.get_object_columns()

['diagnosis']

In [16]:
# Remaining are all Numeric Types
len(data_info.get_numeric_columns())

30

## Feature Extraction (Adding Features)

In [17]:
# Create A Data Manipulator Class Instance from the dataframe
data_manipulator = DataManipulator(data_info.df, deep=True)

## Concavity Dispersion Feature (Number of Concavity divided by Area of Nuclei)

In [18]:
# Add Concavity Dispersion for mean concave points and mean area
data_manipulator.add_column(
    'concavity_dispersion_mean', 'concave points_mean', 'area_mean', calculate_concavity_dispersion)


In [19]:
# Add Concavity Dispersion for se concave points and se area
data_manipulator.add_column(
    'concavity_dispersion_se', 'concave points_se', 'area_se', calculate_concavity_dispersion)


In [20]:
# Add Concavity Dispersion for worst concave points and worst area
data_manipulator.add_column(
    'concavity_dispersion_worst', 'concave points_worst', 'area_worst', calculate_concavity_dispersion)


In [21]:
# Checking if columns have been Added
data_manipulator.df.columns

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst',
       'concavity_dispersion_mean', 'concavity_dispersion_se',
       'concavity_dispersion_worst'],
      dtype='object')

In [22]:
## Encoding Diagnosis where M(Malginancy) is 1 and B(Benign) is 0
data_manipulator.df.diagnosis = data_manipulator.df.diagnosis.apply(
    lambda x: 1 if x == 'M' else 0)
data_manipulator.df.diagnosis.sample(10)

511    0
114    0
483    0
119    1
411    0
16     1
449    1
363    0
482    0
562    1
Name: diagnosis, dtype: int64

## Outlier and standardization Ready Dataframe

In [23]:
# Current Null value Cleaned and labelled dataframe
current_df = data_manipulator.df

# Identifying Columns to Standardize
columns_to_standardize = current_df.columns.tolist()
columns_to_standardize.remove('id')
columns_to_standardize.remove('diagnosis')

## Fixing Outliers

In [24]:
# Create Cleaner object instance from current dataframe
data_cleaner_remove = DataCleaner(current_df, deep=True)
data_cleaner_revalue = DataCleaner(current_df, deep=True)

## Dropping Outlier Values

In [25]:
# Drop all outlier values
data_cleaner_remove.remove_outliers()


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,concavity_dispersion_mean,concavity_dispersion_se,concavity_dispersion_worst
1,842517,1,20.570000,17.770000,132.899994,1326.000000,0.084740,0.078640,0.086900,0.070170,0.181200,0.056670,0.543500,0.733900,3.398000,74.080002,0.005225,0.013080,0.018600,0.013400,0.013890,0.003532,24.990000,23.410000,158.800003,1956.000000,0.123800,0.186600,0.241600,0.186000,0.275000,0.089020,0.000053,0.000181,0.000095
2,84300903,1,19.690001,21.250000,130.000000,1203.000000,0.109600,0.159900,0.197400,0.127900,0.206900,0.059990,0.745600,0.786900,4.585000,94.029999,0.006150,0.040060,0.038320,0.020580,0.022500,0.004571,23.570000,25.530001,152.500000,1709.000000,0.144400,0.424500,0.450400,0.243000,0.361300,0.087580,0.000106,0.000219,0.000142
4,84358402,1,20.290001,14.340000,135.100006,1297.000000,0.100300,0.132800,0.198000,0.104300,0.180900,0.058830,0.757200,0.781300,5.438000,94.440002,0.011490,0.024610,0.056880,0.018850,0.017560,0.005115,22.540001,16.670000,152.199997,1575.000000,0.137400,0.205000,0.400000,0.162500,0.236400,0.076780,0.000080,0.000200,0.000103
5,843786,1,12.450000,15.700000,82.570000,477.100006,0.127800,0.170000,0.157800,0.080890,0.208700,0.076130,0.334500,0.890200,2.217000,27.190001,0.007510,0.033450,0.036720,0.011370,0.021650,0.005082,15.470000,23.750000,103.400002,741.599976,0.179100,0.524900,0.535500,0.174100,0.398500,0.124400,0.000170,0.000418,0.000235
6,844359,1,18.250000,19.980000,119.599998,1040.000000,0.094630,0.109000,0.112700,0.074000,0.179400,0.057420,0.446700,0.773200,3.180000,53.910000,0.004314,0.013820,0.022540,0.010390,0.013690,0.002179,22.879999,27.660000,153.199997,1606.000000,0.144200,0.257600,0.378400,0.193200,0.306300,0.083680,0.000071,0.000193,0.000120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560,925292,0,14.050000,27.150000,91.379997,600.400024,0.099290,0.112600,0.044620,0.043040,0.153700,0.061710,0.364500,1.492000,2.888000,29.840000,0.007256,0.026780,0.020710,0.016260,0.020800,0.005304,15.300000,33.169998,100.199997,706.700012,0.124100,0.226400,0.132600,0.104800,0.225000,0.083210,0.000072,0.000545,0.000148
563,926125,1,20.920000,25.090000,143.000000,1347.000000,0.109900,0.223600,0.317400,0.147400,0.214900,0.068790,0.962200,1.026000,8.758000,118.800003,0.006399,0.043100,0.078450,0.026240,0.020570,0.006213,24.290001,29.410000,179.100006,1819.000000,0.140700,0.418600,0.659900,0.254200,0.292900,0.098730,0.000109,0.000221,0.000140
564,926424,1,21.559999,22.389999,142.000000,1479.000000,0.111000,0.115900,0.243900,0.138900,0.172600,0.056230,1.176000,1.256000,7.673000,158.699997,0.010300,0.028910,0.051980,0.024540,0.011140,0.004239,25.450001,26.400000,166.100006,2027.000000,0.141000,0.211300,0.410700,0.221600,0.206000,0.071150,0.000094,0.000155,0.000109
565,926682,1,20.129999,28.250000,131.199997,1261.000000,0.097800,0.103400,0.144000,0.097910,0.175200,0.055330,0.765500,2.463000,5.203000,99.040001,0.005769,0.024230,0.039500,0.016780,0.018980,0.002498,23.690001,38.250000,155.000000,1731.000000,0.116600,0.192200,0.321500,0.162800,0.257200,0.066370,0.000078,0.000169,0.000094


In [26]:
# Length of outlier removed dataframe (decreased)
len(data_cleaner_remove.df)


483

## Revaluing Outliers to Median Values

In [27]:
data_cleaner_revalue.fix_outlier_columns(columns_to_standardize)


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,concavity_dispersion_mean,concavity_dispersion_se,concavity_dispersion_worst
0,842302,1,17.990000,10.380000,122.800003,1001.000000,0.118400,0.092630,0.061540,0.033500,0.179200,0.061540,0.324200,0.905300,2.287000,24.530001,0.006399,0.049040,0.053730,0.015870,0.030030,0.006193,25.379999,17.330000,97.660004,686.500000,0.162200,0.211900,0.226700,0.099930,0.282200,0.118900,0.000065,0.000103,0.000131
1,842517,1,20.570000,17.770000,132.899994,551.099976,0.084740,0.078640,0.086900,0.070170,0.181200,0.056670,0.543500,0.733900,3.398000,74.080002,0.005225,0.013080,0.018600,0.013400,0.013890,0.003532,24.990000,23.410000,158.800003,1956.000000,0.123800,0.186600,0.241600,0.186000,0.275000,0.089020,0.000053,0.000181,0.000095
2,84300903,1,19.690001,21.250000,130.000000,1203.000000,0.109600,0.159900,0.197400,0.033500,0.206900,0.059990,0.745600,0.786900,4.585000,94.029999,0.006150,0.040060,0.038320,0.020580,0.022500,0.004571,23.570000,25.530001,152.500000,1709.000000,0.144400,0.424500,0.450400,0.099930,0.361300,0.087580,0.000106,0.000219,0.000142
3,84348301,1,11.420000,20.379999,77.580002,386.100006,0.095870,0.092630,0.241400,0.105200,0.179200,0.061540,0.495600,1.156000,3.445000,27.230000,0.009110,0.020450,0.056610,0.018670,0.018730,0.003187,14.910000,26.500000,98.870003,567.700012,0.131300,0.211900,0.226700,0.099930,0.282200,0.080040,0.000065,0.000686,0.000131
4,84358402,1,20.290001,14.340000,135.100006,1297.000000,0.100300,0.132800,0.198000,0.104300,0.180900,0.058830,0.757200,0.781300,5.438000,94.440002,0.011490,0.024610,0.056880,0.018850,0.017560,0.005115,22.540001,16.670000,152.199997,1575.000000,0.137400,0.205000,0.400000,0.162500,0.236400,0.076780,0.000080,0.000200,0.000103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,1,13.370000,22.389999,86.239998,551.099976,0.111000,0.115900,0.061540,0.033500,0.172600,0.056230,0.324200,1.256000,2.287000,24.530001,0.010300,0.028910,0.051980,0.010930,0.011140,0.004239,25.450001,26.400000,166.100006,686.500000,0.141000,0.211300,0.410700,0.221600,0.206000,0.071150,0.000094,0.000155,0.000109
565,926682,1,20.129999,18.840000,131.199997,1261.000000,0.097800,0.103400,0.144000,0.097910,0.175200,0.055330,0.765500,1.108000,5.203000,99.040001,0.005769,0.024230,0.039500,0.016780,0.018980,0.002498,23.690001,25.410000,155.000000,1731.000000,0.116600,0.192200,0.321500,0.162800,0.257200,0.066370,0.000078,0.000169,0.000094
566,926954,1,16.600000,18.840000,108.300003,858.099976,0.084550,0.102300,0.092510,0.053020,0.159000,0.056480,0.456400,1.075000,3.425000,48.549999,0.005903,0.037310,0.047300,0.015570,0.013180,0.003892,18.980000,34.119999,126.699997,1124.000000,0.113900,0.309400,0.340300,0.141800,0.221800,0.078200,0.000062,0.000321,0.000126
567,927241,1,13.370000,18.840000,86.239998,1265.000000,0.117800,0.092630,0.061540,0.033500,0.179200,0.070160,0.726000,1.595000,5.772000,86.220001,0.006522,0.020450,0.071170,0.016640,0.023240,0.006185,14.970000,25.410000,97.660004,1821.000000,0.165000,0.211900,0.226700,0.099930,0.282200,0.080040,0.000120,0.000193,0.000146


In [28]:
# Length of outlier revalued dataframe (unchanged)
len(data_cleaner_revalue.df)


569

## Standardization

## MIN-MAX

In [29]:
# Create Manipulator object instance from dataframes
data_remove_minmax = DataManipulator(data_cleaner_remove.df, deep=True)
data_revalue_minmax = DataManipulator(data_cleaner_revalue.df, deep=True)

In [30]:
# For removed outlier
# Minmax between 0 and 1
data_remove_minmax.minmax_scale_columns(columns_to_standardize, (0, 1))


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,concavity_dispersion_mean,concavity_dispersion_se,concavity_dispersion_worst
1,842517,1,0.834244,0.383627,0.822731,0.766613,0.298029,0.283514,0.273787,0.449232,0.470459,0.225168,0.389733,0.163702,0.325647,0.442916,0.191281,0.149438,0.166966,0.469681,0.161906,0.253450,0.842053,0.384927,0.745000,0.798449,0.388053,0.241722,0.284604,0.688634,0.371706,0.429800,0.294444,0.152101,0.291411
2,84300903,1,0.780220,0.549262,0.795956,0.686872,0.631318,0.672280,0.621928,0.818822,0.657914,0.336577,0.573678,0.186920,0.472010,0.574254,0.260450,0.521792,0.343986,0.721346,0.393931,0.353304,0.771964,0.456573,0.701698,0.687077,0.575923,0.619353,0.530569,0.899667,0.642409,0.411586,0.588889,0.184034,0.435583
4,84358402,1,0.817054,0.220371,0.843043,0.747812,0.506636,0.542627,0.623819,0.667734,0.468271,0.297651,0.584236,0.184466,0.577189,0.576953,0.659762,0.308565,0.510592,0.660708,0.260806,0.405586,0.721125,0.157148,0.699636,0.626657,0.512084,0.270929,0.471198,0.601629,0.250627,0.274981,0.444444,0.168067,0.315951
5,843786,1,0.335748,0.285102,0.358046,0.216272,0.875318,0.720601,0.497164,0.517862,0.671043,0.878188,0.199509,0.232171,0.180025,0.134222,0.362148,0.430567,0.329623,0.398528,0.371025,0.402414,0.372162,0.396418,0.364218,0.250879,0.892385,0.778723,0.630816,0.644576,0.759097,0.877308,0.944444,0.351261,0.720859
6,844359,1,0.691817,0.488815,0.699935,0.581199,0.430621,0.428763,0.355072,0.473752,0.457330,0.250336,0.301629,0.180918,0.298767,0.310129,0.123159,0.159651,0.202334,0.364178,0.156516,0.123419,0.737907,0.528557,0.706509,0.640635,0.574099,0.354424,0.445753,0.715291,0.469887,0.362256,0.394444,0.162185,0.368098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560,925292,0,0.433974,0.830081,0.439387,0.296207,0.493096,0.445986,0.140580,0.275544,0.269876,0.394295,0.226813,0.495795,0.262762,0.151668,0.343154,0.338513,0.185907,0.569926,0.348119,0.423750,0.363771,0.714768,0.342223,0.235143,0.390789,0.304899,0.156202,0.388004,0.214868,0.356312,0.400000,0.457983,0.453988
563,926125,1,0.855731,0.732032,0.915982,0.780227,0.635340,0.977036,1.000000,0.943662,0.716265,0.631879,0.770820,0.291659,0.986560,0.737324,0.279070,0.563747,0.704219,0.919734,0.341921,0.511110,0.807503,0.587698,0.884528,0.736676,0.542180,0.609988,0.777359,0.941133,0.427854,0.552618,0.605556,0.185714,0.429448
564,926424,1,0.895021,0.603522,0.906749,0.865802,0.650087,0.461774,0.768431,0.889245,0.407732,0.210403,0.965414,0.392413,0.852774,1.000000,0.570777,0.367910,0.466607,0.860147,0.087798,0.321397,0.864758,0.485975,0.795175,0.830463,0.544916,0.280930,0.483803,0.820437,0.155270,0.203769,0.522222,0.130252,0.334356
565,926682,1,0.807232,0.882437,0.807035,0.724473,0.473120,0.401971,0.453686,0.626825,0.426696,0.180201,0.591790,0.921149,0.548212,0.607236,0.231960,0.303321,0.354578,0.588153,0.299073,0.154077,0.777887,0.886448,0.718881,0.696997,0.322389,0.250611,0.378725,0.602740,0.315872,0.143309,0.433333,0.142017,0.288344


In [31]:
# For revalued outlier
data_revalue_minmax.minmax_scale_columns(columns_to_standardize, (0, 1))


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,concavity_dispersion_mean,concavity_dispersion_se,concavity_dispersion_worst
0,842302,1,0.810140,0.038417,0.859645,0.736368,0.996968,0.386911,0.254613,0.266932,0.588898,0.444189,0.252014,0.296282,0.243941,0.163545,0.429632,0.803255,0.684895,0.695138,0.819446,0.758815,0.988668,0.218969,0.391499,0.274866,0.911850,0.344351,0.332893,0.424151,0.504819,0.996878,0.474453,0.118391,0.513725
1,842517,1,1.000000,0.462156,0.969535,0.350021,0.486736,0.313015,0.359537,0.559124,0.604988,0.257384,0.511848,0.203120,0.421078,0.620657,0.321995,0.185895,0.237094,0.586947,0.222288,0.377703,0.966572,0.469691,0.898086,0.970940,0.527196,0.297159,0.354772,0.789474,0.475904,0.530440,0.386861,0.208046,0.372549
2,84300903,1,0.935242,0.661697,0.937983,0.909833,0.863574,0.742235,0.816715,0.266932,0.811746,0.384734,0.751303,0.231927,0.610332,0.804701,0.406803,0.649087,0.488464,0.901445,0.540847,0.526510,0.886119,0.557113,0.845886,0.835508,0.733547,0.740911,0.661380,0.424151,0.822490,0.507961,0.773723,0.251724,0.556863
3,84348301,1,0.326661,0.611812,0.367642,0.208330,0.655449,0.386911,0.998759,0.838247,0.588898,0.444189,0.455095,0.432547,0.428571,0.188454,0.678188,0.312423,0.721606,0.817784,0.401362,0.328292,0.395467,0.597113,0.401525,0.209727,0.602324,0.344351,0.332893,0.424151,0.504819,0.390259,0.474453,0.788506,0.513725
4,84358402,1,0.979395,0.265482,0.993472,0.990554,0.722601,0.599092,0.819197,0.831076,0.602574,0.340238,0.765047,0.228884,0.746333,0.808484,0.896397,0.383842,0.725048,0.825668,0.358073,0.604423,0.827762,0.191753,0.843400,0.762035,0.663428,0.331480,0.587372,0.689728,0.320884,0.339369,0.583942,0.229885,0.403922
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,1,0.470160,0.727064,0.461865,0.350021,0.884796,0.509825,0.254613,0.266932,0.535800,0.240507,0.252014,0.486901,0.243941,0.163545,0.787293,0.457664,0.662588,0.478756,0.120542,0.478961,0.992635,0.592990,0.958571,0.274866,0.699489,0.343232,0.603084,0.940577,0.198795,0.251483,0.686131,0.178161,0.427451
565,926682,1,0.967621,0.523509,0.951039,0.959639,0.684705,0.443799,0.595780,0.780159,0.556718,0.205984,0.774882,0.406457,0.708865,0.850920,0.371871,0.377318,0.503505,0.734998,0.410611,0.229612,0.892918,0.552165,0.866600,0.847571,0.455074,0.307605,0.472100,0.691002,0.404418,0.176866,0.569343,0.194253,0.368627
566,926954,1,0.707852,0.523509,0.701882,0.613654,0.483856,0.437989,0.382747,0.422470,0.426388,0.250096,0.408649,0.388521,0.425383,0.385136,0.384157,0.601875,0.602932,0.681997,0.196019,0.429263,0.626062,0.911340,0.632115,0.514749,0.428028,0.526217,0.499706,0.601868,0.262249,0.361536,0.452555,0.368966,0.494118
567,927241,1,0.470160,0.523509,0.461865,0.963074,0.987873,0.386911,0.254613,0.266932,0.588898,0.774837,0.728081,0.671160,0.799585,0.732652,0.440909,0.312423,0.907202,0.728866,0.568226,0.757670,0.398867,0.552165,0.391499,0.896919,0.939898,0.344351,0.332893,0.424151,0.504819,0.390259,0.875912,0.221839,0.572549


## Standard-Scaler

In [32]:
# Create Manipulator object instance from dataframes
data_remove_stdscale = DataManipulator(data_cleaner_remove.df, deep=True)
data_revalue_stdscale = DataManipulator(data_cleaner_revalue.df, deep=True)


In [33]:
# For removed outlier
data_remove_stdscale.standardize_columns(columns_to_standardize)


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,concavity_dispersion_mean,concavity_dispersion_se,concavity_dispersion_worst
1,842517,1,2.219377,-0.276298,2.080005,2.473312,-0.815128,-0.396691,0.202431,0.823329,0.143194,-0.931297,0.885648,-0.904331,0.603919,1.469501,-0.619391,-0.713349,-0.472199,0.521062,-0.897427,0.124579,2.229291,-0.303464,1.949388,2.528179,-0.333646,-0.357380,0.000487,1.364722,-0.186731,0.520049,-0.369183,-1.087781,-0.715445
2,84300903,1,1.927391,0.597940,1.939330,2.040475,1.131772,1.573589,1.987720,2.599558,1.263926,-0.326448,1.886958,-0.789931,1.451413,2.202984,-0.243937,1.415541,0.638465,2.017778,0.516963,0.772641,1.882592,0.069084,1.725114,1.977977,0.647725,1.525751,1.215090,2.344509,1.547292,0.420002,1.239724,-0.910460,0.143429
4,84358402,1,2.126473,-1.137975,2.186725,2.371261,0.403446,0.916506,1.997414,1.873436,0.130111,-0.537781,1.944431,-0.802019,2.060438,2.218059,1.923550,0.196440,1.683795,1.657149,-0.294545,1.111953,1.631114,-1.487885,1.714434,1.679486,0.314249,-0.211732,0.921910,0.960774,-0.962321,-0.330354,0.450449,-0.999121,-0.569253
5,843786,1,-0.474859,-0.796319,-0.361442,-0.513966,2.557096,1.818479,1.347924,1.153160,1.342421,2.613994,-0.149848,-0.566960,-0.239291,-0.254461,0.308082,0.893971,0.548350,0.097896,0.377331,1.091370,-0.095056,-0.243716,-0.022803,-0.176948,2.300812,2.320482,1.710122,1.160170,2.294751,2.978159,3.182555,0.018144,1.842904
6,844359,1,1.449596,0.278893,1.434839,1.466878,-0.040597,0.339436,0.619268,0.941170,0.064699,-0.794659,0.406050,-0.819503,0.448272,0.727929,-0.989163,-0.654958,-0.250291,-0.106391,-0.930281,-0.719336,1.714126,0.443389,1.750033,1.748540,0.638197,0.204630,0.796261,1.488485,0.442179,0.149040,0.177238,-1.031785,-0.258597
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560,925292,0,0.056025,2.080125,0.065921,-0.080074,0.324349,0.426724,-0.480664,-0.011403,-1.056033,-0.013092,-0.001213,0.732018,0.239789,-0.157031,0.204984,0.367667,-0.353360,1.117247,0.237699,1.229839,-0.136562,1.411663,-0.136721,-0.254689,-0.319355,-0.042338,-0.633573,-0.031045,-1.191381,0.116385,0.207595,0.610770,0.253073
563,926125,1,2.335508,1.562616,2.569944,2.547210,1.155266,3.118097,3.926496,3.199531,1.612792,1.276767,2.960109,-0.273837,4.430851,3.113681,-0.142869,1.655416,2.898653,3.197640,0.199916,1.796815,2.058383,0.750917,2.672051,2.223006,0.471459,1.479049,2.433765,2.537029,0.172933,1.194674,1.330794,-0.901127,0.106882
564,926424,1,2.547862,0.884328,2.521436,3.011718,1.241412,0.506738,2.738996,2.938004,-0.231837,-1.011458,4.019387,0.222615,3.656183,4.580647,1.440534,0.535737,1.407818,2.843264,-1.349177,0.565560,2.341602,0.221969,2.209262,2.686335,0.485751,-0.161864,0.984152,1.976659,-1.573147,-0.721512,0.875443,-1.209107,-0.459610
565,926682,1,2.073384,2.356464,1.997541,2.244577,0.207660,0.203655,1.124965,1.676829,-0.118456,-1.175423,1.985553,2.827910,1.892653,2.387182,-0.398584,0.166456,0.704924,1.225644,-0.061278,-0.520364,1.911891,2.304372,1.814112,2.026983,-0.676650,-0.313053,0.465270,0.965931,-0.544386,-1.053614,0.389735,-1.143778,-0.733719


In [34]:
# For revalued outlier
data_revalue_stdscale.standardize_columns(columns_to_standardize)


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,concavity_dispersion_mean,concavity_dispersion_se,concavity_dispersion_worst
0,842302,1,1.518324,-2.360404,1.749817,1.537012,2.016808,-0.099871,-0.243940,-0.309980,0.076963,-0.041339,-0.197573,-0.570850,-0.192154,-0.344344,-0.079236,2.169033,1.594242,1.097549,1.995475,1.960554,2.549768,-1.479690,-0.176104,-0.247174,1.714516,-0.152408,-0.096004,-0.116936,0.032959,2.871652,-0.000628,-1.435861,0.033784
1,842517,1,2.420823,-0.278212,2.266581,-0.188753,-0.848256,-0.442606,0.178757,0.900654,0.168599,-0.959064,1.049534,-1.001176,0.732814,1.765634,-0.663867,-0.766540,-0.514861,0.553020,-0.970525,0.128861,2.448361,-0.298361,2.131283,2.890826,-0.305606,-0.371568,-0.004868,1.405122,-0.128343,0.597267,-0.390637,-1.035087,-0.687826
2,84300903,1,2.112994,0.702305,2.118203,2.311862,1.267772,1.548146,2.020554,-0.309980,1.346108,-0.333428,2.198828,-0.868111,1.721056,2.615161,-0.203234,1.435956,0.669070,2.135900,0.611709,0.844054,2.079132,0.113550,1.893525,2.280282,0.778105,1.689233,1.272258,-0.116936,1.805044,0.487658,1.331901,-0.839837,0.254275
3,84348301,1,-0.779901,0.457176,-0.563858,-0.821675,0.099105,-0.099871,2.753939,2.057145,0.076963,-0.041339,0.777138,0.058572,0.771944,-0.229371,1.270794,-0.164894,1.767149,1.714828,-0.081093,-0.108619,-0.172641,0.302018,-0.130439,-0.540829,0.088949,-0.152408,-0.096004,-0.116936,0.032959,-0.086267,-0.000628,1.559673,0.033784
4,84358402,1,2.322878,-1.244643,2.379144,2.672435,0.476176,0.884235,2.030554,2.027432,0.154853,-0.552023,2.264795,-0.882171,2.431225,2.632621,2.455992,0.174705,1.783359,1.754510,-0.296100,1.218516,1.811312,-1.607926,1.882203,1.949055,0.409854,-0.212179,0.963987,0.989549,-0.993103,-0.334409,0.486882,-0.937462,-0.527468
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,1,-0.097779,1.023510,-0.120771,-0.188753,1.386937,0.470210,-0.243940,-0.309980,-0.225432,-1.041979,-0.197573,0.309638,-0.192154,-0.344344,1.863392,0.525732,1.489177,0.008492,-1.475885,0.615523,2.567970,0.282588,2.406781,-0.247174,0.599240,-0.157605,1.029433,2.034671,-1.674158,-0.762950,0.941892,-1.168678,-0.407200
565,926682,1,2.266908,0.023269,2.179601,2.534343,0.263382,0.163978,1.130491,1.816471,-0.106306,-1.211579,2.311995,-0.061939,2.235574,2.828501,-0.392965,0.143684,0.739914,1.298165,-0.035151,-0.582891,2.110335,0.090234,1.987873,2.334662,-0.684379,-0.323059,0.483841,0.994854,-0.527119,-1.126790,0.421881,-1.096744,-0.707871
566,926954,1,1.032094,0.023269,1.007926,0.988864,-0.864429,0.137030,0.272263,0.334459,-0.848550,-0.994868,0.554217,-0.144791,0.755293,0.678495,-0.326235,1.211461,1.208203,1.031412,-1.101000,0.376666,0.885640,1.782565,0.919848,0.834255,-0.826419,0.692183,0.598831,0.623491,-1.320189,-0.226323,-0.098130,-0.315747,-0.066440
567,927241,1,-0.097779,0.023269,-0.120771,2.549687,1.965737,-0.099871,-0.243940,-0.309980,0.076963,1.583054,2.087368,1.160750,2.709298,2.282590,-0.017984,-0.164894,2.641289,1.267301,0.747697,1.955047,-0.157040,0.090234,-0.176104,2.557128,1.861817,-0.152408,-0.096004,-0.116936,0.032959,-0.086267,1.786911,-0.973429,0.334454


## Normalizing

In [35]:
# Create Manipulator object instance from dataframes to Normalize
data_remove_minmax_norm = DataManipulator(data_remove_minmax.df, deep=True)
data_revalue_minmax_norm = DataManipulator(data_revalue_minmax.df, deep=True)
data_remove_stdscale_norm = DataManipulator(data_remove_stdscale.df, deep=True)
data_revalue_stdscale_norm = DataManipulator(data_revalue_stdscale.df, deep=True)

In [36]:
# Normalize dataframes
data_remove_minmax_norm.normalize_df(sep_index=2)


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,concavity_dispersion_mean,concavity_dispersion_se,concavity_dispersion_worst
1,842517,1,0.231359,0.162873,0.236026,0.203679,0.187205,0.199352,0.184421,0.242806,0.195092,0.099806,0.170113,0.055427,0.139966,0.170284,0.077232,0.154728,0.102002,0.213902,0.116813,0.104766,0.228912,0.135388,0.208075,0.203740,0.170779,0.183657,0.157330,0.266779,0.190494,0.122048,0.174624,0.054572,0.129164
2,84300903,1,0.272468,0.073489,0.281135,0.249377,0.168951,0.180953,0.208029,0.222673,0.156157,0.099260,0.194829,0.061515,0.192479,0.192400,0.220015,0.102899,0.170270,0.220330,0.086973,0.135253,0.240478,0.052405,0.233312,0.208975,0.170768,0.090348,0.157133,0.200629,0.083578,0.091700,0.148212,0.056046,0.105362
4,84358402,1,0.267905,0.189293,0.271049,0.225069,0.166757,0.166038,0.137501,0.183460,0.177101,0.096942,0.116806,0.070060,0.115697,0.120097,0.047693,0.061825,0.078354,0.141028,0.060611,0.047794,0.285754,0.204683,0.273595,0.248085,0.222319,0.137250,0.172618,0.276996,0.181963,0.140283,0.152748,0.062806,0.142546
5,843786,1,0.143115,0.183361,0.148447,0.097565,0.261908,0.240529,0.102229,0.132743,0.260019,0.285406,0.147632,0.154311,0.132382,0.100713,0.159011,0.134057,0.077374,0.175831,0.065147,0.150400,0.156120,0.188733,0.143324,0.111189,0.265873,0.183608,0.109290,0.199578,0.177241,0.263182,0.200165,0.082680,0.183847
6,844359,1,0.109889,0.171412,0.120015,0.072549,0.258316,0.247306,0.174179,0.178071,0.256608,0.238809,0.051698,0.083609,0.060468,0.034297,0.068137,0.134489,0.094849,0.127795,0.108575,0.081575,0.110970,0.188041,0.114037,0.074300,0.241518,0.238758,0.188823,0.226812,0.262407,0.196202,0.297388,0.125953,0.254513
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478,911685,0,0.190471,0.364322,0.192847,0.130005,0.216420,0.195743,0.061700,0.120936,0.118448,0.173056,0.099548,0.217604,0.115326,0.066567,0.150610,0.148573,0.081594,0.250141,0.152789,0.185984,0.159659,0.313712,0.150202,0.103204,0.171517,0.133820,0.068557,0.170295,0.094306,0.156385,0.175560,0.201009,0.199255
479,911916,1,0.209142,0.178909,0.223867,0.190688,0.155278,0.238789,0.244401,0.230632,0.175056,0.154432,0.188389,0.071282,0.241116,0.180203,0.068205,0.137780,0.172112,0.224784,0.083566,0.124916,0.197354,0.143634,0.216180,0.180044,0.132509,0.149082,0.189987,0.230014,0.104568,0.135060,0.147998,0.045389,0.104958
480,912193,0,0.244832,0.165093,0.248041,0.236840,0.177831,0.126318,0.210204,0.243252,0.111535,0.057556,0.264088,0.107344,0.233276,0.273549,0.156136,0.100641,0.127640,0.235293,0.024017,0.087918,0.236554,0.132938,0.217520,0.227172,0.149061,0.076848,0.132344,0.224430,0.042474,0.055741,0.142854,0.035630,0.091463
481,91227,0,0.256786,0.280709,0.256724,0.230460,0.150503,0.127870,0.144321,0.199397,0.135735,0.057323,0.188253,0.293024,0.174390,0.193166,0.073788,0.096488,0.112794,0.187095,0.095137,0.049013,0.247451,0.281985,0.228681,0.221720,0.102554,0.079721,0.120475,0.191736,0.100481,0.045588,0.137846,0.045177,0.091724


In [37]:
data_revalue_minmax_norm.normalize_df(sep_index=2)


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,concavity_dispersion_mean,concavity_dispersion_se,concavity_dispersion_worst
0,842302,1,0.241254,0.011440,0.255996,0.219285,0.296890,0.115219,0.075822,0.079490,0.175369,0.132276,0.075048,0.088231,0.072644,0.048703,0.127941,0.239203,0.203957,0.207007,0.244025,0.225969,0.294418,0.065207,0.116585,0.081853,0.271542,0.102545,0.099133,0.126309,0.150331,0.296863,0.141288,0.035256,0.152984
1,842517,1,0.317164,0.146579,0.307502,0.111014,0.154375,0.099277,0.114032,0.177334,0.191880,0.081633,0.162340,0.064422,0.133551,0.196850,0.102125,0.058959,0.075198,0.186158,0.070502,0.119794,0.306562,0.148969,0.284841,0.307947,0.167208,0.094248,0.112521,0.250393,0.150940,0.168237,0.122698,0.065985,0.118159
2,84300903,1,0.235076,0.166320,0.235765,0.228689,0.217062,0.186563,0.205284,0.067094,0.204035,0.096704,0.188842,0.058296,0.153409,0.202264,0.102251,0.163150,0.122777,0.226581,0.135943,0.132340,0.222729,0.140032,0.212616,0.210007,0.184379,0.186230,0.166240,0.106612,0.206735,0.127678,0.194477,0.063272,0.139969
3,84348301,1,0.108165,0.202584,0.121734,0.068983,0.217034,0.128115,0.330711,0.277562,0.194997,0.147081,0.150692,0.143226,0.141909,0.062401,0.224563,0.103450,0.238940,0.270786,0.132900,0.108705,0.130948,0.197717,0.132954,0.069445,0.199443,0.114022,0.110228,0.140446,0.167157,0.129223,0.157102,0.261091,0.170106
4,84358402,1,0.258276,0.070010,0.261989,0.261219,0.190557,0.157986,0.216031,0.219163,0.158905,0.089724,0.201751,0.060359,0.196815,0.213205,0.236389,0.101223,0.191202,0.217737,0.094427,0.159392,0.218289,0.050567,0.222413,0.200956,0.174953,0.087415,0.154896,0.181888,0.084620,0.089495,0.153991,0.060623,0.106518
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,1,0.150862,0.233297,0.148201,0.112313,0.283909,0.163590,0.081699,0.085652,0.171925,0.077173,0.080865,0.156234,0.078275,0.052478,0.252622,0.146853,0.212608,0.153621,0.038679,0.153687,0.318512,0.190276,0.307582,0.088198,0.224449,0.110134,0.193514,0.301808,0.063788,0.080695,0.220162,0.057167,0.137158
565,926682,1,0.272904,0.147648,0.268227,0.270652,0.193111,0.125167,0.168031,0.220033,0.157014,0.058095,0.218544,0.114635,0.199925,0.239990,0.104881,0.106417,0.142006,0.207296,0.115807,0.064759,0.251835,0.155730,0.244412,0.239045,0.128347,0.086756,0.133149,0.194887,0.114060,0.049882,0.160575,0.054786,0.103966
566,926954,1,0.241492,0.178602,0.239456,0.209356,0.165074,0.149425,0.130579,0.144131,0.145468,0.085323,0.139416,0.132549,0.145125,0.131394,0.131060,0.205337,0.205698,0.232672,0.066874,0.146448,0.213589,0.310915,0.215654,0.175613,0.146027,0.179525,0.170481,0.205335,0.089470,0.123343,0.154395,0.125877,0.168574
567,927241,1,0.131360,0.146265,0.129042,0.269077,0.276006,0.108101,0.071137,0.074579,0.164535,0.216485,0.203421,0.187518,0.223399,0.204699,0.123187,0.087289,0.253467,0.203641,0.158759,0.211688,0.111441,0.154272,0.109382,0.250594,0.262602,0.096210,0.093008,0.118505,0.141043,0.109036,0.244725,0.061981,0.159967


In [38]:
data_remove_stdscale_norm.normalize_df(sep_index=2)


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,concavity_dispersion_mean,concavity_dispersion_se,concavity_dispersion_worst
1,842517,1,0.227410,0.070550,0.228818,0.240752,0.133536,0.185665,0.234528,0.306718,0.149129,-0.038517,0.222639,-0.093203,0.171250,0.259927,-0.028782,0.167017,0.075331,0.238074,0.060996,0.091163,0.222124,0.008151,0.203543,0.233378,0.076424,0.180021,0.143366,0.276625,0.182562,0.049555,0.146273,-0.107424,0.016923
2,84300903,1,0.265037,-0.141834,0.272547,0.295546,0.050284,0.114230,0.248951,0.233499,0.016217,-0.067027,0.242348,-0.099961,0.256807,0.276452,0.239745,0.024484,0.209863,0.206542,-0.036711,0.138590,0.203297,-0.185445,0.213682,0.209326,0.039167,-0.026390,0.114904,0.119748,-0.119941,-0.041174,0.056143,-0.124527,-0.070950
4,84358402,1,0.280133,0.053896,0.277281,0.283473,-0.007845,0.065596,0.119673,0.181880,0.012503,-0.153567,0.078469,-0.158368,0.086628,0.140672,-0.191155,-0.126570,-0.048369,-0.020560,-0.179776,-0.139011,0.331253,0.085685,0.338192,0.337904,0.123331,0.039545,0.153877,0.287648,0.085451,0.028802,0.034251,-0.199392,-0.049974
5,843786,1,-0.009539,0.082717,0.001458,-0.026750,0.312453,0.283062,0.052350,0.084963,0.305340,0.389514,0.182058,0.081266,0.156373,0.104056,0.140045,0.108282,-0.019905,0.125343,-0.123981,0.217900,0.049242,0.088648,0.039224,0.028424,0.276851,0.181432,0.025683,0.141465,0.119165,0.391725,0.198046,-0.101987,0.119250
6,844359,1,-0.032695,0.082879,-0.013676,-0.040672,0.281574,0.266261,0.201504,0.172445,0.278374,0.246680,-0.032381,-0.036416,-0.011669,-0.040256,-0.046297,0.113824,0.053826,0.031694,0.038154,0.029067,-0.010084,0.109913,0.008597,-0.020361,0.210413,0.272948,0.193515,0.191058,0.344921,0.199405,0.389844,0.046906,0.296002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478,911685,0,0.014854,0.551498,0.017477,-0.021230,0.085994,0.113136,-0.127437,-0.003023,-0.279984,-0.003471,-0.000321,0.194078,0.063575,-0.041633,0.054347,0.097479,-0.093685,0.296213,0.063020,0.326064,-0.036206,0.374271,-0.036248,-0.067525,-0.084670,-0.011225,-0.167978,-0.008231,-0.315868,0.030857,0.055039,0.161932,0.067097
479,911916,1,0.184375,0.123360,0.202883,0.201088,0.091202,0.246156,0.309975,0.252585,0.127321,0.100794,0.233684,-0.021618,0.349791,0.245807,-0.011279,0.130686,0.228832,0.252436,0.015782,0.141848,0.162498,0.059281,0.210943,0.175494,0.037219,0.116763,0.192132,0.200284,0.013652,0.094313,0.105059,-0.071139,0.008438
480,912193,0,0.219256,0.076101,0.216982,0.259173,0.106829,0.043607,0.235704,0.252829,-0.019951,-0.087041,0.345887,0.019157,0.314632,0.394186,0.123965,0.046103,0.121149,0.244676,-0.116103,0.048669,0.201506,0.019102,0.190117,0.231172,0.041801,-0.013929,0.084691,0.170101,-0.135377,-0.062090,0.075336,-0.104049,-0.039552
481,91227,0,0.248539,0.282473,0.239448,0.269060,0.024892,0.024412,0.134851,0.201004,-0.014199,-0.140899,0.238011,0.338985,0.226875,0.286155,-0.047779,0.019953,0.084500,0.146920,-0.007345,-0.062377,0.229181,0.276228,0.217460,0.242977,-0.081111,-0.037526,0.055773,0.115787,-0.065256,-0.126298,0.046718,-0.137106,-0.087952


In [39]:
data_revalue_stdscale_norm.normalize_df(sep_index=2)


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,concavity_dispersion_mean,concavity_dispersion_se,concavity_dispersion_worst
0,842302,1,0.202131,-0.314235,0.232949,0.204619,0.268493,-0.013296,-0.032475,-0.041267,0.010246,-0.005503,-0.026302,-0.075996,-0.025581,-0.045842,-0.010549,0.288758,0.212238,0.146114,0.265653,0.261004,0.339445,-0.196988,-0.023444,-0.032906,0.228249,-0.020290,-0.012781,-0.015567,0.004388,0.382296,-0.000084,-0.191153,0.004498
1,842517,1,0.359311,-0.041294,0.336418,-0.028016,-0.125903,-0.065694,0.026532,0.133680,0.025024,-0.142349,0.155777,-0.148600,0.108768,0.262065,-0.098535,-0.113774,-0.076418,0.082082,-0.144050,0.019126,0.363398,-0.044284,0.316336,0.429071,-0.045360,-0.055150,-0.000723,0.208556,-0.019049,0.088649,-0.057980,-0.153633,-0.102091
2,84300903,1,0.247755,0.082347,0.248365,0.271072,0.148650,0.181525,0.236916,-0.036346,0.157835,-0.039095,0.257819,-0.101789,0.201799,0.306635,-0.023830,0.168370,0.078450,0.250441,0.071725,0.098968,0.243784,0.013314,0.222021,0.267370,0.091235,0.198068,0.149176,-0.013711,0.211647,0.057179,0.156169,-0.098473,0.029815
3,84348301,1,-0.154261,0.090427,-0.111529,-0.162523,0.019602,-0.019754,0.544716,0.406894,0.015223,-0.008177,0.153714,0.011585,0.152687,-0.045368,0.251357,-0.032615,0.349534,0.339185,-0.016040,-0.021484,-0.034148,0.059738,-0.025800,-0.106973,0.017594,-0.030146,-0.018989,-0.023129,0.006519,-0.017063,-0.000124,0.308496,0.006682
4,84358402,1,0.260060,-0.139345,0.266359,0.299195,0.053311,0.098995,0.227333,0.226983,0.017337,-0.061802,0.253557,-0.098764,0.272190,0.294737,0.274963,0.019559,0.199658,0.196428,-0.033150,0.136420,0.202787,-0.180017,0.210724,0.218208,0.045886,-0.023755,0.107924,0.110786,-0.111184,-0.037439,0.054509,-0.104954,-0.059053
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,1,-0.016016,0.167646,-0.019782,-0.030917,0.227174,0.077018,-0.039956,-0.050773,-0.036925,-0.170671,-0.032362,0.050717,-0.031474,-0.056402,0.305215,0.086112,0.243920,0.001391,-0.241743,0.100820,0.420621,0.046287,0.394219,-0.040486,0.098153,-0.025815,0.168616,0.333269,-0.274219,-0.124968,0.154277,-0.191424,-0.066697
565,926682,1,0.286395,0.002940,0.275365,0.320182,0.033275,0.020717,0.142823,0.229488,-0.013430,-0.153068,0.292092,-0.007825,0.282437,0.357346,-0.049646,0.018153,0.093479,0.164007,-0.004441,-0.073641,0.266614,0.011400,0.251143,0.294955,-0.086463,-0.040814,0.061127,0.125687,-0.066595,-0.142356,0.053299,-0.138560,-0.089431
566,926954,1,0.220781,0.004978,0.215611,0.211533,-0.184914,0.029313,0.058241,0.071546,-0.181518,-0.212817,0.118555,-0.030973,0.161569,0.145140,-0.069787,0.259150,0.258453,0.220635,-0.235521,0.080575,0.189452,0.381318,0.196770,0.178460,-0.176784,0.148069,0.128099,0.133374,-0.282408,-0.048414,-0.020992,-0.067543,-0.014213
567,927241,1,-0.012763,0.003037,-0.015765,0.332817,0.256593,-0.013036,-0.031842,-0.040463,0.010046,0.206640,0.272470,0.151516,0.353652,0.297953,-0.002348,-0.021524,0.344775,0.165424,0.097599,0.255198,-0.020499,0.011778,-0.022987,0.333789,0.243028,-0.019894,-0.012532,-0.015264,0.004302,-0.011261,0.233250,-0.127064,0.043657


## Save Finalised Datasets

In [48]:
## Outlier Removed MinMax Scaled Dataframe
removed_minmax = DataCleaner(data_remove_minmax.df)
# removed_minmax.df['diagnosis'].apply(lambda x: 'M' if int(x) == 1 else 'B')
removed_minmax.save_clean_data('../data/out_removed_minmax_scale.csv')

In [49]:
## Outlier Revalued MinMax Scaled Dataframe
revalued_minmax = DataCleaner(data_revalue_minmax.df)
revalued_minmax.save_clean_data('../data/out_revalued_minmax_scale.csv')


In [50]:
## Outlier Removed Standard-Scalar Scaled Dataframe
removed_stdscale = DataCleaner(data_remove_stdscale.df)
removed_stdscale.save_clean_data('../data/out_removed_std_scale.csv')


In [51]:
## Outlier Revalued Standard-Scalar Scaled Dataframe
revalued_stdscale = DataCleaner(data_revalue_stdscale.df)
revalued_stdscale.save_clean_data('../data/out_revalued_std_scale.csv')


In [52]:
## Outlier Removed MinMax Scaled Dataframe
removed_minmax_norm = DataCleaner(data_remove_minmax_norm.df)
removed_minmax_norm.save_clean_data('../data/out_removed_minmax_scale_norm.csv')


In [53]:
## Outlier Revalued MinMax Scaled Dataframe
revalued_minmax_norm = DataCleaner(data_revalue_minmax.df)
revalued_minmax_norm.save_clean_data('../data/out_revalued_minmax_scale_norm.csv')


In [54]:
## Outlier Removed Standard-Scalar Scaled Dataframe
removed_stdscale_norm = DataCleaner(data_remove_stdscale_norm.df)
removed_stdscale_norm.save_clean_data('../data/out_removed_std_scale_norm.csv')


In [55]:

## Outlier Revalued Standard-Scalar Scaled Dataframe
revalued_stdscale_norm = DataCleaner(data_revalue_stdscale_norm.df)
revalued_stdscale_norm.save_clean_data('../data/out_revalued_std_scale_norm.csv')
