# Data Engineering

# Feature Adding, Data Cleaning, Standardization and Normalization

## Importing Libraries and Project Scripts

In [48]:
# Importing Libraries
import pandas as pd
import numpy as np
import sys
import os

# Importing Scripts
sys.path.append(os.path.abspath(os.path.join('..')))
from scripts.logger_creator import CreateLogger
from scripts.data_loader import load_df_from_csv
from scripts.data_information import DataInfo
from scripts.data_cleaner import DataCleaner
from scripts.data_manipulation import DataManipulator
from scripts.utilities import calculate_concavity_dispersion


In [49]:
# Configuring Notebook Settings
pd.set_option('max_column', None)
pd.set_option('display.float_format', '{:.6f}'.format)
%matplotlib inline

## Loading Data CSV File

In [50]:
# Declaring Data File-Path
# Relative path: resolved against the directory the notebook kernel runs in.
DATAPATH = '../data/data.csv'

In [51]:
# Loading Breast Cancer Data-Set
# NOTE(review): na_values=['none'] is forwarded to the project loader; presumably
# the literal string 'none' is then read as a missing value — confirm in
# scripts.data_loader.
data_df = load_df_from_csv(DATAPATH, na_values=['none'])

In [52]:
# Extracting Information from the data
# Instantiate DataInfo Object using our dataset dataframe
# The wrapped frame is reachable afterwards as data_info.df.
# NOTE(review): deep=True presumably makes DataInfo work on its own copy of
# data_df — confirm in scripts.data_information.
data_info = DataInfo(data_df, deep=True)

In [53]:
# View Data Details
# Prints the frame's dimensions, its memory usage and a per-column summary
# (non-null count and dtype).
data_info.get_basic_description()

The DataFrame containes 569 rows and 33 columns.
Current DataFrame Memory Usage:
105962
Current DataFrame Memory Usage of columns is :
DataFrame Information: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    uint32 
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float32
 3   texture_mean             569 non-null    float32
 4   perimeter_mean           569 non-null    float32
 5   area_mean                569 non-null    float32
 6   smoothness_mean          569 non-null    float32
 7   compactness_mean         569 non-null    float32
 8   concavity_mean           569 non-null    float32
 9   concave points_mean      569 non-null    float32
 10  symmetry_mean            569 non-null    float32
 11  fractal_dimension_mean   569 

In [54]:
# Print the row/column count and return it as a (rows, columns) tuple.
data_info.get_size()

The DataFrame containes 569 rows and 33 columns.


(569, 33)

In [55]:
# Report the total number of missing cells and their share of the whole frame.
data_info.get_total_missing_values()

The total number of missing values is 569
3.03 % missing values.


569

In [56]:
# List the names of the columns that contain at least one missing value.
data_info.get_columns_with_missing_values()

['Unnamed: 32']

In [57]:
# Per-column breakdown: missing-value count and dtype of each affected column.
data_info.get_column_based_missing_values()

Unnamed: 0,missing_count,type
Unnamed: 32,569,float32


In [58]:
# Remove the column whose every value is missing (it carries no information).
# The drop stays in place so the DataInfo wrapper keeps working on the same frame.
# errors='ignore' makes the cell safe to re-run: without it a second execution
# raises KeyError because the column is already gone.
data_info.df.drop(columns=['Unnamed: 32'], inplace=True, errors='ignore')


In [59]:
# Print and return the total number of cells (entries) in the frame.
data_info.get_total_entries()

The DataFrame containes 18208 entries.


18208

In [60]:
# Descriptive statistics per numeric column (count, mean, std, quartiles, mode,
# median); transposed so that each feature is shown as one row.
data_info.get_dispersion_params().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,Mode,Median
id,569.0,30371831.432337,125020585.612224,8670.0,869218.0,906024.0,8813129.0,911320502.0,8670.0,906024.0
radius_mean,569.0,14.127292,3.524049,6.981,11.7,13.37,15.78,28.110001,12.34,13.37
texture_mean,569.0,19.28965,4.301036,9.71,16.17,18.84,21.799999,39.279999,14.93,18.84
perimeter_mean,569.0,91.969032,24.298981,43.790001,75.169998,86.239998,104.099998,188.5,82.610001,86.239998
area_mean,569.0,654.889038,351.914124,143.5,420.299988,551.099976,782.700012,2501.0,512.200012,551.099976
smoothness_mean,569.0,0.09636,0.014064,0.05263,0.08637,0.09587,0.1053,0.1634,0.1007,0.09587
compactness_mean,569.0,0.104341,0.052813,0.01938,0.06492,0.09263,0.1304,0.3454,0.1147,0.09263
concavity_mean,569.0,0.088799,0.07972,0.0,0.02956,0.06154,0.1307,0.4268,0.0,0.06154
concave points_mean,569.0,0.048919,0.038803,0.0,0.02031,0.0335,0.074,0.2012,0.0,0.0335
symmetry_mean,569.0,0.181162,0.027414,0.106,0.1619,0.1792,0.1957,0.304,0.1601,0.1792


In [61]:
# Count the duplicates reported by DataInfo (0 means nothing to de-duplicate).
len(data_info.get_duplicates())

0

In [62]:
# Names of the object-dtype (non-numeric) columns; these need encoding before scaling.
data_info.get_object_columns()

['diagnosis']

In [63]:
# Remaining are all Numeric Types
# Counting them confirms how many columns the numeric getter reports.
len(data_info.get_numeric_columns())

30

## Feature Extraction (Adding Features)

In [64]:
# Create A Data Manipulator Class Instance from the dataframe
# Built from data_info.df, i.e. the frame after the all-missing column was dropped.
# NOTE(review): deep=True presumably gives the manipulator its own copy — confirm
# in scripts.data_manipulation.
data_manipulator = DataManipulator(data_info.df, deep=True)

## Concavity Dispersion Feature (Number of Concave Points divided by Area of Nuclei)

In [65]:
# New feature 'concavity_dispersion_mean', derived from the mean concave points
# and the mean area through calculate_concavity_dispersion.
data_manipulator.add_column(
    'concavity_dispersion_mean',
    'concave points_mean',
    'area_mean',
    calculate_concavity_dispersion,
)


In [66]:
# New feature 'concavity_dispersion_se', derived from the standard-error columns
# of concave points and area through calculate_concavity_dispersion.
data_manipulator.add_column(
    'concavity_dispersion_se',
    'concave points_se',
    'area_se',
    calculate_concavity_dispersion,
)


In [67]:
# New feature 'concavity_dispersion_worst', derived from the "worst" columns of
# concave points and area through calculate_concavity_dispersion.
data_manipulator.add_column(
    'concavity_dispersion_worst',
    'concave points_worst',
    'area_worst',
    calculate_concavity_dispersion,
)


In [68]:
# Checking if columns have been Added
# The three concavity_dispersion_* names should appear at the end of the index.
data_manipulator.df.columns

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst',
       'concavity_dispersion_mean', 'concavity_dispersion_se',
       'concavity_dispersion_worst'],
      dtype='object')

In [69]:
## Encoding Diagnosis where M (Malignant) is 1 and B (Benign) is 0
# An explicit mapping replaces "anything that is not 'M' becomes 0": a missing
# value or an unexpected label now fails loudly instead of being silently counted
# as benign.
diagnosis_codes = {'M': 1, 'B': 0}

# Guard against re-execution: once the column is already integer-encoded the
# mapping is skipped (the earlier lambda would have turned every value into 0).
if not pd.api.types.is_integer_dtype(data_manipulator.df['diagnosis']):
    encoded_diagnosis = data_manipulator.df['diagnosis'].map(diagnosis_codes)
    if encoded_diagnosis.isna().any():
        raise ValueError("Unexpected diagnosis label(s); expected only 'M' or 'B'.")
    data_manipulator.df['diagnosis'] = encoded_diagnosis.astype('int64')

# Fixed seed so the displayed spot-check is the same on every run.
data_manipulator.df['diagnosis'].sample(10, random_state=42)

62     1
125    0
148    0
256    1
144    0
387    0
180    1
464    0
522    0
161    1
Name: diagnosis, dtype: int64

## Outlier- and Standardization-Ready Dataframe

In [70]:
# Frame after null-column removal, feature extraction and label encoding
current_df = data_manipulator.df

# Every column is a scaling candidate except the record identifier and the target
columns_to_standardize = current_df.columns.tolist()
for excluded_column in ('id', 'diagnosis'):
    columns_to_standardize.remove(excluded_column)

## Fixing Outliers

In [71]:
# Create Cleaner object instance from current dataframe
# Two separate cleaners over the same input, one per outlier strategy
# (dropping rows vs. revaluing the outlying cells).
# NOTE(review): deep=True presumably gives each cleaner its own copy so the two
# strategies do not affect each other — confirm in scripts.data_cleaner.
data_cleaner_remove = DataCleaner(current_df, deep=True)
data_cleaner_revalue = DataCleaner(current_df, deep=True)

## Dropping Outlier Values

In [72]:
# Drop all outlier values
# Rows are removed from data_cleaner_remove.df itself (its length shrinks
# afterwards); the call also returns the reduced frame for display.
# NOTE(review): the outlier criterion lives in scripts.data_cleaner and is not
# visible here.
data_cleaner_remove.remove_outliers()


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,concavity_dispersion_mean,concavity_dispersion_se,concavity_dispersion_worst
1,842517,1,20.570000,17.770000,132.899994,1326.000000,0.084740,0.078640,0.086900,0.070170,0.181200,0.056670,0.543500,0.733900,3.398000,74.080002,0.005225,0.013080,0.018600,0.013400,0.013890,0.003532,24.990000,23.410000,158.800003,1956.000000,0.123800,0.186600,0.241600,0.186000,0.275000,0.089020,0.000053,0.000181,0.000095
2,84300903,1,19.690001,21.250000,130.000000,1203.000000,0.109600,0.159900,0.197400,0.127900,0.206900,0.059990,0.745600,0.786900,4.585000,94.029999,0.006150,0.040060,0.038320,0.020580,0.022500,0.004571,23.570000,25.530001,152.500000,1709.000000,0.144400,0.424500,0.450400,0.243000,0.361300,0.087580,0.000106,0.000219,0.000142
4,84358402,1,20.290001,14.340000,135.100006,1297.000000,0.100300,0.132800,0.198000,0.104300,0.180900,0.058830,0.757200,0.781300,5.438000,94.440002,0.011490,0.024610,0.056880,0.018850,0.017560,0.005115,22.540001,16.670000,152.199997,1575.000000,0.137400,0.205000,0.400000,0.162500,0.236400,0.076780,0.000080,0.000200,0.000103
5,843786,1,12.450000,15.700000,82.570000,477.100006,0.127800,0.170000,0.157800,0.080890,0.208700,0.076130,0.334500,0.890200,2.217000,27.190001,0.007510,0.033450,0.036720,0.011370,0.021650,0.005082,15.470000,23.750000,103.400002,741.599976,0.179100,0.524900,0.535500,0.174100,0.398500,0.124400,0.000170,0.000418,0.000235
6,844359,1,18.250000,19.980000,119.599998,1040.000000,0.094630,0.109000,0.112700,0.074000,0.179400,0.057420,0.446700,0.773200,3.180000,53.910000,0.004314,0.013820,0.022540,0.010390,0.013690,0.002179,22.879999,27.660000,153.199997,1606.000000,0.144200,0.257600,0.378400,0.193200,0.306300,0.083680,0.000071,0.000193,0.000120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560,925292,0,14.050000,27.150000,91.379997,600.400024,0.099290,0.112600,0.044620,0.043040,0.153700,0.061710,0.364500,1.492000,2.888000,29.840000,0.007256,0.026780,0.020710,0.016260,0.020800,0.005304,15.300000,33.169998,100.199997,706.700012,0.124100,0.226400,0.132600,0.104800,0.225000,0.083210,0.000072,0.000545,0.000148
563,926125,1,20.920000,25.090000,143.000000,1347.000000,0.109900,0.223600,0.317400,0.147400,0.214900,0.068790,0.962200,1.026000,8.758000,118.800003,0.006399,0.043100,0.078450,0.026240,0.020570,0.006213,24.290001,29.410000,179.100006,1819.000000,0.140700,0.418600,0.659900,0.254200,0.292900,0.098730,0.000109,0.000221,0.000140
564,926424,1,21.559999,22.389999,142.000000,1479.000000,0.111000,0.115900,0.243900,0.138900,0.172600,0.056230,1.176000,1.256000,7.673000,158.699997,0.010300,0.028910,0.051980,0.024540,0.011140,0.004239,25.450001,26.400000,166.100006,2027.000000,0.141000,0.211300,0.410700,0.221600,0.206000,0.071150,0.000094,0.000155,0.000109
565,926682,1,20.129999,28.250000,131.199997,1261.000000,0.097800,0.103400,0.144000,0.097910,0.175200,0.055330,0.765500,2.463000,5.203000,99.040001,0.005769,0.024230,0.039500,0.016780,0.018980,0.002498,23.690001,38.250000,155.000000,1731.000000,0.116600,0.192200,0.321500,0.162800,0.257200,0.066370,0.000078,0.000169,0.000094


In [73]:
# Length of outlier removed dataframe (decreased)
# Fewer rows than the original frame confirm that rows were dropped.
len(data_cleaner_remove.df)


483

## Revaluing Outliers to Median Values

In [74]:
# Revalue outliers in the feature columns (id and diagnosis are excluded) instead
# of dropping rows; data_cleaner_revalue.df keeps its original length.
# NOTE(review): per the section heading the replacement value is the column
# median — confirm in scripts.data_cleaner.
data_cleaner_revalue.fix_outlier_columns(columns_to_standardize)


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,concavity_dispersion_mean,concavity_dispersion_se,concavity_dispersion_worst
0,842302,1,17.990000,10.380000,122.800003,1001.000000,0.118400,0.092630,0.061540,0.033500,0.179200,0.061540,0.324200,0.905300,2.287000,24.530001,0.006399,0.049040,0.053730,0.015870,0.030030,0.006193,25.379999,17.330000,97.660004,686.500000,0.162200,0.211900,0.226700,0.099930,0.282200,0.118900,0.000065,0.000103,0.000131
1,842517,1,20.570000,17.770000,132.899994,551.099976,0.084740,0.078640,0.086900,0.070170,0.181200,0.056670,0.543500,0.733900,3.398000,74.080002,0.005225,0.013080,0.018600,0.013400,0.013890,0.003532,24.990000,23.410000,158.800003,1956.000000,0.123800,0.186600,0.241600,0.186000,0.275000,0.089020,0.000053,0.000181,0.000095
2,84300903,1,19.690001,21.250000,130.000000,1203.000000,0.109600,0.159900,0.197400,0.033500,0.206900,0.059990,0.745600,0.786900,4.585000,94.029999,0.006150,0.040060,0.038320,0.020580,0.022500,0.004571,23.570000,25.530001,152.500000,1709.000000,0.144400,0.424500,0.450400,0.099930,0.361300,0.087580,0.000106,0.000219,0.000142
3,84348301,1,11.420000,20.379999,77.580002,386.100006,0.095870,0.092630,0.241400,0.105200,0.179200,0.061540,0.495600,1.156000,3.445000,27.230000,0.009110,0.020450,0.056610,0.018670,0.018730,0.003187,14.910000,26.500000,98.870003,567.700012,0.131300,0.211900,0.226700,0.099930,0.282200,0.080040,0.000065,0.000686,0.000131
4,84358402,1,20.290001,14.340000,135.100006,1297.000000,0.100300,0.132800,0.198000,0.104300,0.180900,0.058830,0.757200,0.781300,5.438000,94.440002,0.011490,0.024610,0.056880,0.018850,0.017560,0.005115,22.540001,16.670000,152.199997,1575.000000,0.137400,0.205000,0.400000,0.162500,0.236400,0.076780,0.000080,0.000200,0.000103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,1,13.370000,22.389999,86.239998,551.099976,0.111000,0.115900,0.061540,0.033500,0.172600,0.056230,0.324200,1.256000,2.287000,24.530001,0.010300,0.028910,0.051980,0.010930,0.011140,0.004239,25.450001,26.400000,166.100006,686.500000,0.141000,0.211300,0.410700,0.221600,0.206000,0.071150,0.000094,0.000155,0.000109
565,926682,1,20.129999,18.840000,131.199997,1261.000000,0.097800,0.103400,0.144000,0.097910,0.175200,0.055330,0.765500,1.108000,5.203000,99.040001,0.005769,0.024230,0.039500,0.016780,0.018980,0.002498,23.690001,25.410000,155.000000,1731.000000,0.116600,0.192200,0.321500,0.162800,0.257200,0.066370,0.000078,0.000169,0.000094
566,926954,1,16.600000,18.840000,108.300003,858.099976,0.084550,0.102300,0.092510,0.053020,0.159000,0.056480,0.456400,1.075000,3.425000,48.549999,0.005903,0.037310,0.047300,0.015570,0.013180,0.003892,18.980000,34.119999,126.699997,1124.000000,0.113900,0.309400,0.340300,0.141800,0.221800,0.078200,0.000062,0.000321,0.000126
567,927241,1,13.370000,18.840000,86.239998,1265.000000,0.117800,0.092630,0.061540,0.033500,0.179200,0.070160,0.726000,1.595000,5.772000,86.220001,0.006522,0.020450,0.071170,0.016640,0.023240,0.006185,14.970000,25.410000,97.660004,1821.000000,0.165000,0.211900,0.226700,0.099930,0.282200,0.080040,0.000120,0.000193,0.000146


In [75]:
# Length of outlier revalued dataframe (unchanged)
# Same row count as the input: only cell values were replaced.
len(data_cleaner_revalue.df)


569

## Standardization

## Normalization

In [76]:
# Create Manipulator object instance from dataframe
# This variant normalizes current_df, i.e. the data without any outlier treatment.
# NOTE(review): sep_index=2 presumably leaves the first two columns (id,
# diagnosis) untouched and normalizes the rest — confirm in
# scripts.data_manipulation.
data_normalized = DataManipulator(current_df, deep=True)
data_normalized.normalize_df(sep_index=2)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,concavity_dispersion_mean,concavity_dispersion_se,concavity_dispersion_worst
0,842302,1,0.007925,0.004573,0.054099,0.440986,0.000052,0.000122,0.000132,0.000065,0.000107,0.000035,0.000482,0.000399,0.003784,0.067580,0.000003,0.000022,0.000024,0.000007,0.000013,0.000003,0.011181,0.007635,0.081325,0.889462,0.000071,0.000293,0.000314,0.000117,0.000203,0.000052,0.000000,0.000000,0.000000
1,842517,1,0.008666,0.007486,0.055988,0.558619,0.000036,0.000033,0.000037,0.000030,0.000076,0.000024,0.000229,0.000309,0.001432,0.031209,0.000002,0.000006,0.000008,0.000006,0.000006,0.000001,0.010528,0.009862,0.066899,0.824026,0.000052,0.000079,0.000102,0.000078,0.000116,0.000038,0.000000,0.000000,0.000000
2,84300903,1,0.009367,0.010109,0.061842,0.572276,0.000052,0.000076,0.000094,0.000061,0.000098,0.000029,0.000355,0.000374,0.002181,0.044731,0.000003,0.000019,0.000018,0.000010,0.000011,0.000002,0.011212,0.012145,0.072545,0.812984,0.000069,0.000202,0.000214,0.000116,0.000172,0.000042,0.000000,0.000000,0.000000
3,84348301,1,0.016325,0.029133,0.110899,0.551922,0.000204,0.000406,0.000345,0.000150,0.000371,0.000139,0.000708,0.001652,0.004925,0.038925,0.000013,0.000107,0.000081,0.000027,0.000085,0.000013,0.021314,0.037881,0.141333,0.811515,0.000300,0.001238,0.000982,0.000368,0.000949,0.000247,0.000000,0.000001,0.000001
4,84358402,1,0.009883,0.006985,0.065808,0.631774,0.000049,0.000065,0.000096,0.000051,0.000088,0.000029,0.000369,0.000381,0.002649,0.046002,0.000006,0.000012,0.000028,0.000009,0.000009,0.000002,0.010979,0.008120,0.074137,0.767189,0.000067,0.000100,0.000195,0.000079,0.000115,0.000037,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,1,0.008541,0.008870,0.056256,0.585933,0.000044,0.000046,0.000097,0.000055,0.000068,0.000022,0.000466,0.000498,0.003040,0.062872,0.000004,0.000011,0.000021,0.000010,0.000004,0.000002,0.010082,0.010459,0.065804,0.803033,0.000056,0.000084,0.000163,0.000088,0.000082,0.000028,0.000000,0.000000,0.000000
565,926682,1,0.009344,0.013114,0.060903,0.585355,0.000045,0.000048,0.000067,0.000045,0.000081,0.000026,0.000355,0.001143,0.002415,0.045974,0.000003,0.000011,0.000018,0.000008,0.000009,0.000001,0.010997,0.017756,0.071951,0.803528,0.000054,0.000089,0.000149,0.000076,0.000119,0.000031,0.000000,0.000000,0.000000
566,926954,1,0.011644,0.019696,0.075966,0.601905,0.000059,0.000072,0.000065,0.000037,0.000112,0.000040,0.000320,0.000754,0.002402,0.034055,0.000004,0.000026,0.000033,0.000011,0.000009,0.000003,0.013313,0.023933,0.088872,0.788417,0.000080,0.000217,0.000239,0.000099,0.000156,0.000055,0.000000,0.000000,0.000000
567,927241,1,0.009230,0.013142,0.062774,0.566806,0.000053,0.000124,0.000157,0.000068,0.000107,0.000031,0.000325,0.000715,0.002586,0.038632,0.000003,0.000028,0.000032,0.000007,0.000010,0.000003,0.011533,0.017663,0.082713,0.815932,0.000074,0.000389,0.000421,0.000119,0.000183,0.000056,0.000000,0.000000,0.000000


In [77]:
# Create Manipulator object instance from the outlier-revalued dataframe
# The manipulator wraps data_cleaner_revalue.df, so it carries a "revalue" name;
# the former name data_remove_normalized wrongly suggested the outlier-removed
# frame.
data_revalue_normalized = DataManipulator(data_cleaner_revalue.df, deep=True)
# Backward-compatible alias for any cell that still refers to the old name.
data_remove_normalized = data_revalue_normalized
data_revalue_normalized.normalize_df(sep_index=2)


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,concavity_dispersion_mean,concavity_dispersion_se,concavity_dispersion_worst
0,842302,1,0.014689,0.008476,0.100270,0.817345,0.000097,0.000076,0.000050,0.000027,0.000146,0.000050,0.000265,0.000739,0.001867,0.020029,0.000005,0.000040,0.000044,0.000013,0.000025,0.000005,0.020723,0.014150,0.079742,0.560546,0.000132,0.000173,0.000185,0.000082,0.000230,0.000097,0.000000,0.000000,0.000000
1,842517,1,0.010061,0.008692,0.065004,0.269554,0.000041,0.000038,0.000043,0.000034,0.000089,0.000028,0.000266,0.000359,0.001662,0.036234,0.000003,0.000006,0.000009,0.000007,0.000007,0.000002,0.012223,0.011450,0.077672,0.956718,0.000061,0.000091,0.000118,0.000091,0.000135,0.000044,0.000000,0.000000,0.000000
2,84300903,1,0.009367,0.010109,0.061842,0.572276,0.000052,0.000076,0.000094,0.000016,0.000098,0.000029,0.000355,0.000374,0.002181,0.044731,0.000003,0.000019,0.000018,0.000010,0.000011,0.000002,0.011212,0.012145,0.072545,0.812984,0.000069,0.000202,0.000214,0.000048,0.000172,0.000042,0.000000,0.000000,0.000000
3,84348301,1,0.016325,0.029133,0.110899,0.551923,0.000137,0.000132,0.000345,0.000150,0.000256,0.000088,0.000708,0.001652,0.004925,0.038925,0.000013,0.000029,0.000081,0.000027,0.000027,0.000005,0.021314,0.037881,0.141333,0.811517,0.000188,0.000303,0.000324,0.000143,0.000403,0.000114,0.000000,0.000001,0.000000
4,84358402,1,0.009883,0.006985,0.065808,0.631774,0.000049,0.000065,0.000096,0.000051,0.000088,0.000029,0.000369,0.000381,0.002649,0.046002,0.000006,0.000012,0.000028,0.000009,0.000009,0.000002,0.010979,0.008120,0.074137,0.767189,0.000067,0.000100,0.000195,0.000079,0.000115,0.000037,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,1,0.014831,0.024837,0.095666,0.611333,0.000123,0.000129,0.000068,0.000037,0.000191,0.000062,0.000360,0.001393,0.002537,0.027211,0.000011,0.000032,0.000058,0.000012,0.000012,0.000005,0.028232,0.029285,0.184254,0.761531,0.000156,0.000234,0.000456,0.000246,0.000229,0.000079,0.000000,0.000000,0.000000
565,926682,1,0.009346,0.008747,0.060911,0.585434,0.000045,0.000048,0.000067,0.000045,0.000081,0.000026,0.000355,0.000514,0.002416,0.045981,0.000003,0.000011,0.000018,0.000008,0.000009,0.000001,0.010998,0.011797,0.071961,0.803638,0.000054,0.000089,0.000149,0.000076,0.000119,0.000031,0.000000,0.000000,0.000000
566,926954,1,0.011645,0.013217,0.075974,0.601969,0.000059,0.000072,0.000065,0.000037,0.000112,0.000040,0.000320,0.000754,0.002403,0.034058,0.000004,0.000026,0.000033,0.000011,0.000009,0.000003,0.013315,0.023936,0.088882,0.788501,0.000080,0.000217,0.000239,0.000099,0.000156,0.000055,0.000000,0.000000,0.000000
567,927241,1,0.006014,0.008475,0.038793,0.569029,0.000053,0.000042,0.000028,0.000015,0.000081,0.000032,0.000327,0.000717,0.002596,0.038784,0.000003,0.000009,0.000032,0.000007,0.000010,0.000003,0.006734,0.011430,0.043930,0.819131,0.000074,0.000095,0.000102,0.000045,0.000127,0.000036,0.000000,0.000000,0.000000


## MIN-MAX

In [27]:
# Create Manipulator object instance from dataframe
# One manipulator per outlier strategy, so min-max scaling can be compared on the
# outlier-removed and the outlier-revalued data.
data_remove_minmax = DataManipulator(data_cleaner_remove.df, deep=True)
data_revalue_minmax = DataManipulator(data_cleaner_revalue.df, deep=True)

In [28]:
# For removed outlier
# Minmax between 0 and 1
# Only the columns in columns_to_standardize are passed, so id and diagnosis keep
# their original values.
data_remove_minmax.minmax_scale_columns(columns_to_standardize, (0, 1))


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
1,842517,1,0.834244,0.383627,0.822731,0.766613,0.298029,0.283514,0.273787,0.449232,0.470459,0.225168,0.390233,0.163702,0.325647,0.442916,0.191281,0.149438,0.166966,0.469681,0.161906,0.253450,0.842053,0.384927,0.745000,0.798449,0.388053,0.241722,0.284604,0.688634,0.371706,0.429800
2,84300903,1,0.780220,0.549262,0.795956,0.686872,0.631318,0.672280,0.621928,0.818822,0.657914,0.336577,0.574027,0.186920,0.472010,0.574254,0.260450,0.521792,0.343986,0.721346,0.393931,0.353304,0.771964,0.456573,0.701698,0.687077,0.575923,0.619353,0.530569,0.899667,0.642409,0.411586
4,84358402,1,0.817054,0.220371,0.843043,0.747812,0.506636,0.542627,0.623819,0.667734,0.468271,0.297651,0.584576,0.184466,0.577189,0.576953,0.659762,0.308565,0.510592,0.660708,0.260806,0.405586,0.721125,0.157148,0.699636,0.626657,0.512084,0.270929,0.471198,0.601629,0.250627,0.274981
5,843786,1,0.335748,0.285102,0.358046,0.216272,0.875318,0.720601,0.497164,0.517862,0.671043,0.878188,0.200164,0.232171,0.180025,0.134222,0.362148,0.430567,0.329623,0.398528,0.371025,0.402414,0.372162,0.396418,0.364218,0.250879,0.892385,0.778723,0.630816,0.644576,0.759097,0.877308
6,844359,1,0.691817,0.488815,0.699935,0.581199,0.430621,0.428763,0.355072,0.473752,0.457330,0.250336,0.302201,0.180918,0.298767,0.310129,0.123159,0.159651,0.202334,0.364178,0.156516,0.123419,0.737907,0.528557,0.706509,0.640635,0.574099,0.354424,0.445753,0.715291,0.469887,0.362256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560,925292,0,0.433974,0.830081,0.439387,0.296207,0.493096,0.445986,0.140580,0.275544,0.269876,0.394295,0.227446,0.495795,0.262762,0.151668,0.343154,0.338513,0.185907,0.569926,0.348119,0.423750,0.363771,0.714768,0.342223,0.235143,0.390789,0.304899,0.156202,0.388004,0.214868,0.356312
563,926125,1,0.855731,0.732032,0.915982,0.780227,0.635340,0.977036,1.000000,0.943662,0.716265,0.631879,0.771008,0.291659,0.986560,0.737324,0.279070,0.563747,0.704219,0.919734,0.341921,0.511110,0.807503,0.587698,0.884528,0.736676,0.542180,0.609988,0.777359,0.941133,0.427854,0.552618
564,926424,1,0.895021,0.603522,0.906749,0.865802,0.650087,0.461774,0.768431,0.889245,0.407732,0.210403,0.965442,0.392413,0.852774,1.000000,0.570777,0.367910,0.466607,0.860147,0.087798,0.321397,0.864758,0.485975,0.795175,0.830463,0.544916,0.280930,0.483803,0.820437,0.155270,0.203769
565,926682,1,0.807232,0.882437,0.807035,0.724473,0.473120,0.401971,0.453686,0.626825,0.426696,0.180201,0.592124,0.921149,0.548212,0.607236,0.231960,0.303321,0.354578,0.588153,0.299073,0.154077,0.777887,0.886448,0.718881,0.696997,0.322389,0.250611,0.378725,0.602740,0.315872,0.143309


In [29]:
# For revalued outlier
# Same min-max scaling to the (0, 1) range, applied to the revalued data.
data_revalue_minmax.minmax_scale_columns(columns_to_standardize, (0, 1))


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,1,0.810140,0.038417,0.859645,0.736368,0.996968,0.386911,0.254613,0.266932,0.588898,0.444189,0.252014,0.296282,0.243941,0.163545,0.429632,0.803255,0.684895,0.695138,0.819446,0.758815,0.988668,0.218969,0.391499,0.274866,0.911850,0.344351,0.332893,0.424151,0.504819,0.996878
1,842517,1,1.000000,0.462156,0.969535,0.350021,0.486736,0.313015,0.359537,0.559124,0.604988,0.257384,0.511848,0.203120,0.421078,0.620657,0.321995,0.185895,0.237094,0.586947,0.222288,0.377703,0.966572,0.469691,0.898086,0.970940,0.527196,0.297159,0.354772,0.789474,0.475904,0.530440
2,84300903,1,0.935242,0.661697,0.937983,0.909833,0.863574,0.742235,0.816715,0.266932,0.811746,0.384734,0.751303,0.231927,0.610332,0.804701,0.406803,0.649087,0.488464,0.901445,0.540847,0.526510,0.886119,0.557113,0.845886,0.835508,0.733547,0.740911,0.661380,0.424151,0.822490,0.507961
3,84348301,1,0.326661,0.611812,0.367642,0.208330,0.655449,0.386911,0.998759,0.838247,0.588898,0.444189,0.455095,0.432547,0.428571,0.188454,0.678188,0.312423,0.721606,0.817784,0.401362,0.328292,0.395467,0.597113,0.401525,0.209727,0.602324,0.344351,0.332893,0.424151,0.504819,0.390259
4,84358402,1,0.979395,0.265482,0.993472,0.990554,0.722601,0.599092,0.819197,0.831076,0.602574,0.340238,0.765047,0.228884,0.746333,0.808484,0.896397,0.383842,0.725048,0.825668,0.358073,0.604423,0.827762,0.191753,0.843400,0.762035,0.663428,0.331480,0.587372,0.689728,0.320884,0.339369
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,1,0.470160,0.727064,0.461865,0.350021,0.884796,0.509825,0.254613,0.266932,0.535800,0.240507,0.252014,0.486901,0.243941,0.163545,0.787293,0.457664,0.662588,0.478756,0.120542,0.478961,0.992635,0.592990,0.958571,0.274866,0.699489,0.343232,0.603084,0.940577,0.198795,0.251483
565,926682,1,0.967621,0.523509,0.951039,0.959639,0.684705,0.443799,0.595780,0.780159,0.556718,0.205984,0.774882,0.406457,0.708865,0.850920,0.371871,0.377318,0.503505,0.734998,0.410611,0.229612,0.892918,0.552165,0.866600,0.847571,0.455074,0.307605,0.472100,0.691002,0.404418,0.176866
566,926954,1,0.707852,0.523509,0.701882,0.613654,0.483856,0.437989,0.382747,0.422470,0.426388,0.250096,0.408649,0.388521,0.425383,0.385136,0.384157,0.601875,0.602932,0.681997,0.196019,0.429263,0.626062,0.911340,0.632115,0.514749,0.428028,0.526217,0.499706,0.601868,0.262249,0.361536
567,927241,1,0.470160,0.523509,0.461865,0.963074,0.987873,0.386911,0.254613,0.266932,0.588898,0.774837,0.728081,0.671160,0.799585,0.732652,0.440909,0.312423,0.907202,0.728866,0.568226,0.757670,0.398867,0.552165,0.391499,0.896919,0.939898,0.344351,0.332893,0.424151,0.504819,0.390259


## Standard-Scaler

In [30]:
# Create Manipulator object instance from dataframes
# One manipulator per outlier strategy, so standard scaling can be compared on the
# outlier-removed and the outlier-revalued data.
data_remove_stdscale = DataManipulator(data_cleaner_remove.df, deep=True)
data_revalue_stdscale = DataManipulator(data_cleaner_revalue.df, deep=True)


In [31]:
# For removed outlier
# Standardizes the columns in columns_to_standardize; id and diagnosis are not
# passed and keep their original values.
data_remove_stdscale.standardize_columns(columns_to_standardize)


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
1,842517,1,2.224859,-0.275471,2.086169,2.480931,-0.811123,-0.397576,0.205810,0.830214,0.151425,-0.936730,0.893910,-0.905510,0.611615,1.478012,-0.624367,-0.719919,-0.477640,0.518889,-0.899173,0.114705,2.235281,-0.302860,1.956635,2.536527,-0.334504,-0.360396,0.000953,1.371178,-0.181647,0.513669
2,84300903,1,1.933363,0.600615,1.945675,2.048186,1.130272,1.580205,1.997254,2.611405,1.268360,-0.334883,1.894128,-0.790970,1.459772,2.211954,-0.248963,1.407383,0.635505,2.019094,0.517778,0.759524,1.889113,0.070575,1.732509,1.986246,0.641926,1.529480,1.220261,2.354829,1.554048,0.413569
4,84358402,1,2.132111,-1.138970,2.192752,2.378902,0.404006,0.920621,2.006981,1.883255,0.138386,-0.545166,1.951538,-0.803072,2.069273,2.227038,1.918234,0.189191,1.683170,1.657624,-0.295200,1.097138,1.638020,-1.490104,1.721836,1.687713,0.310129,-0.214227,0.925946,0.965637,-0.957984,-0.337183
5,843786,1,-0.464861,-0.796591,-0.352138,-0.505711,2.551568,1.826029,1.355252,1.160967,1.346589,2.590965,-0.140458,-0.567724,-0.232255,-0.247027,0.302982,0.886202,0.545189,0.094736,0.377893,1.076658,-0.085505,-0.242970,-0.014255,-0.168983,2.286690,2.327058,1.717211,1.165819,2.302228,2.973074
6,844359,1,1.456368,0.280894,1.441833,1.474712,-0.038782,0.341354,0.624084,0.948384,0.073196,-0.800770,0.414834,-0.820577,0.455845,0.735977,-0.994089,-0.661572,-0.255237,-0.110028,-0.932087,-0.724986,1.720905,0.445772,1.757411,1.756777,0.632446,0.203628,0.799810,1.495428,0.447870,0.142464
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560,925292,0,0.065133,2.085933,0.074675,-0.071911,0.325132,0.428974,-0.479641,-0.006851,-1.043740,-0.023082,0.008016,0.732853,0.247200,-0.149536,0.199898,0.360290,-0.358536,1.116464,0.238008,1.214434,-0.126948,1.416352,-0.128097,-0.246735,-0.320285,-0.044225,-0.635564,-0.030095,-1.187265,0.109792
563,926125,1,2.340796,1.567331,2.575479,2.554814,1.153700,3.130595,3.942714,3.213055,1.616044,1.260375,2.966108,-0.274240,4.441541,3.123219,-0.147908,1.647079,2.900743,3.201708,0.200157,1.778573,2.064635,0.754033,2.678821,2.231311,0.466547,1.482611,2.443658,2.548109,0.178364,1.188650
564,926424,1,2.552793,0.887609,2.527032,3.019223,1.239603,0.509293,2.751120,2.950797,-0.222336,-1.016492,4.024231,0.222823,3.666266,4.591103,1.435282,0.528235,1.406577,2.846505,-1.351741,0.553479,2.347420,0.223825,2.216337,2.694705,0.480768,-0.164179,0.988429,1.985528,-1.569399,-0.728547
565,926682,1,2.079111,2.362857,2.003811,2.252245,0.208773,0.205056,1.131524,1.686099,-0.109338,-1.179644,1.992615,2.831323,1.901357,2.396267,-0.403589,0.159229,0.702113,1.225114,-0.061510,-0.527011,1.918367,2.311189,1.821448,2.035259,-0.675781,-0.315910,0.467537,0.970814,-0.539647,-1.060824


In [32]:
# For revalued outlier
# Same standardization of the feature columns, applied to the revalued data.
data_revalue_stdscale.standardize_columns(columns_to_standardize)


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,1,1.518324,-2.360404,1.749817,1.537012,2.016808,-0.099871,-0.243940,-0.309980,0.076963,-0.041339,-0.197573,-0.570850,-0.192154,-0.344344,-0.079236,2.169033,1.594242,1.097549,1.995475,1.960554,2.549768,-1.479690,-0.176104,-0.247174,1.714516,-0.152408,-0.096004,-0.116936,0.032959,2.871652
1,842517,1,2.420823,-0.278212,2.266581,-0.188753,-0.848256,-0.442606,0.178757,0.900654,0.168599,-0.959064,1.049534,-1.001176,0.732814,1.765634,-0.663867,-0.766540,-0.514861,0.553020,-0.970525,0.128861,2.448361,-0.298361,2.131283,2.890826,-0.305606,-0.371568,-0.004868,1.405122,-0.128343,0.597267
2,84300903,1,2.112994,0.702305,2.118203,2.311862,1.267772,1.548146,2.020554,-0.309980,1.346108,-0.333428,2.198828,-0.868111,1.721056,2.615161,-0.203234,1.435956,0.669070,2.135900,0.611709,0.844054,2.079132,0.113550,1.893525,2.280282,0.778105,1.689233,1.272258,-0.116936,1.805044,0.487658
3,84348301,1,-0.779901,0.457176,-0.563858,-0.821675,0.099105,-0.099871,2.753939,2.057145,0.076963,-0.041339,0.777138,0.058572,0.771944,-0.229371,1.270794,-0.164894,1.767149,1.714828,-0.081093,-0.108619,-0.172641,0.302018,-0.130439,-0.540829,0.088949,-0.152408,-0.096004,-0.116936,0.032959,-0.086267
4,84358402,1,2.322878,-1.244643,2.379144,2.672435,0.476176,0.884235,2.030554,2.027432,0.154853,-0.552023,2.264795,-0.882171,2.431225,2.632621,2.455992,0.174705,1.783359,1.754510,-0.296100,1.218516,1.811312,-1.607926,1.882203,1.949055,0.409854,-0.212179,0.963987,0.989549,-0.993103,-0.334409
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,1,-0.097779,1.023510,-0.120771,-0.188753,1.386937,0.470210,-0.243940,-0.309980,-0.225432,-1.041979,-0.197573,0.309638,-0.192154,-0.344344,1.863392,0.525732,1.489177,0.008492,-1.475885,0.615523,2.567970,0.282588,2.406781,-0.247174,0.599240,-0.157605,1.029433,2.034671,-1.674158,-0.762950
565,926682,1,2.266908,0.023269,2.179601,2.534343,0.263382,0.163978,1.130491,1.816471,-0.106306,-1.211579,2.311995,-0.061939,2.235574,2.828501,-0.392965,0.143684,0.739914,1.298165,-0.035151,-0.582891,2.110335,0.090234,1.987873,2.334662,-0.684379,-0.323059,0.483841,0.994854,-0.527119,-1.126790
566,926954,1,1.032094,0.023269,1.007926,0.988864,-0.864429,0.137030,0.272263,0.334459,-0.848550,-0.994868,0.554217,-0.144791,0.755293,0.678495,-0.326235,1.211461,1.208203,1.031412,-1.101000,0.376666,0.885640,1.782565,0.919848,0.834255,-0.826419,0.692183,0.598831,0.623491,-1.320189,-0.226323
567,927241,1,-0.097779,0.023269,-0.120771,2.549687,1.965737,-0.099871,-0.243940,-0.309980,0.076963,1.583054,2.087368,1.160750,2.709298,2.282590,-0.017984,-0.164894,2.641289,1.267301,0.747697,1.955047,-0.157040,0.090234,-0.176104,2.557128,1.861817,-0.152408,-0.096004,-0.116936,0.032959,-0.086267


## Normalizing

In [33]:
# Build one DataManipulator (constructed with deep=True) per scaled dataset.
# The four `*_norm` names are bound in the same order as their sources:
# removed/minmax, revalued/minmax, removed/stdscale, revalued/stdscale.
(
    data_remove_minmax_norm,
    data_revalue_minmax_norm,
    data_remove_stdscale_norm,
    data_revalue_stdscale_norm,
) = (
    DataManipulator(scaled.df, deep=True)
    for scaled in (
        data_remove_minmax,
        data_revalue_minmax,
        data_remove_stdscale,
        data_revalue_stdscale,
    )
)

In [34]:
# Normalize the outlier-removed, MinMax-scaled dataset.
# In the rendered output the first two columns (id, diagnosis) are unchanged,
# so sep_index=2 presumably marks where the feature columns start - TODO confirm
# against DataManipulator.normalize_df.
# Kept as the last expression so the resulting frame is displayed.
data_remove_minmax_norm.normalize_df(sep_index=2)


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
1,842517,1,0.237385,0.167115,0.242173,0.208983,0.192081,0.204544,0.189224,0.249130,0.200173,0.102405,0.174650,0.056871,0.143611,0.174719,0.079243,0.158757,0.104659,0.219472,0.119855,0.107494,0.234873,0.138914,0.213494,0.209046,0.175227,0.188440,0.161428,0.273727,0.195455,0.125227
2,84300903,1,0.277533,0.074855,0.286360,0.254013,0.172092,0.184317,0.211895,0.226812,0.159060,0.101105,0.198566,0.062659,0.196056,0.195976,0.224105,0.104812,0.173435,0.224426,0.088589,0.137767,0.244948,0.053379,0.237649,0.212860,0.173942,0.092028,0.160054,0.204358,0.085132,0.093404
4,84358402,1,0.274510,0.193960,0.277732,0.230618,0.170869,0.170132,0.140892,0.187983,0.181467,0.099332,0.119912,0.071788,0.118550,0.123058,0.048869,0.063349,0.080285,0.144505,0.062105,0.048972,0.292799,0.209730,0.280340,0.254202,0.227801,0.140634,0.176873,0.283825,0.186450,0.143742
5,843786,1,0.149260,0.191235,0.154822,0.101754,0.273155,0.250858,0.106619,0.138443,0.271185,0.297662,0.154141,0.160937,0.138067,0.105038,0.165839,0.139813,0.080696,0.183381,0.067944,0.156858,0.162824,0.196838,0.149479,0.115964,0.277290,0.191493,0.113984,0.208149,0.184852,0.274484
6,844359,1,0.120550,0.188043,0.131659,0.079588,0.283378,0.271300,0.191078,0.195348,0.281505,0.261978,0.056935,0.091721,0.066334,0.037625,0.074748,0.147537,0.104052,0.140193,0.119109,0.089490,0.121737,0.206285,0.125101,0.081509,0.264950,0.261923,0.207143,0.248818,0.287866,0.215238
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
482,912519,0,0.201998,0.386370,0.204518,0.137873,0.229517,0.207589,0.065434,0.128255,0.125617,0.183529,0.105867,0.230773,0.122306,0.070595,0.159725,0.157565,0.086532,0.265279,0.162036,0.197239,0.169321,0.332697,0.159291,0.109450,0.181897,0.141918,0.072706,0.180601,0.100013,0.165849
483,912558,0,0.212896,0.182122,0.227886,0.194112,0.158065,0.243076,0.248789,0.234773,0.178199,0.157205,0.191818,0.072562,0.245445,0.183438,0.069429,0.140254,0.175202,0.228819,0.085066,0.127158,0.200898,0.146213,0.220061,0.183277,0.134888,0.151758,0.193398,0.234143,0.106445,0.137485
484,912600,0,0.248595,0.167630,0.251852,0.240479,0.180563,0.128259,0.213434,0.246990,0.113249,0.058440,0.268154,0.108994,0.236860,0.277753,0.158535,0.102188,0.129601,0.238908,0.024386,0.089269,0.240189,0.134981,0.220862,0.230663,0.151352,0.078029,0.134377,0.227879,0.043127,0.056597
485,913063,0,0.260648,0.284931,0.260585,0.233926,0.152766,0.129793,0.146491,0.202396,0.137776,0.058185,0.191192,0.297431,0.177013,0.196071,0.074898,0.097940,0.114490,0.189910,0.096568,0.049750,0.251173,0.286227,0.232121,0.225054,0.104097,0.080920,0.122287,0.194619,0.101992,0.046273


In [35]:
# Normalize the outlier-revalued, MinMax-scaled dataset (same sep_index=2 as
# the other normalization cells); the bare expression displays the result.
data_revalue_minmax_norm.normalize_df(sep_index=2)


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,1,0.246822,0.011704,0.261904,0.224346,0.303742,0.117878,0.077572,0.081325,0.179417,0.135329,0.076780,0.090267,0.074320,0.049827,0.130894,0.244724,0.208664,0.211785,0.249657,0.231185,0.301213,0.066712,0.119276,0.083742,0.277809,0.104912,0.101421,0.129224,0.153801,0.303714
1,842517,1,0.322592,0.149088,0.312765,0.112914,0.157017,0.100976,0.115984,0.180369,0.195164,0.083030,0.165118,0.065525,0.135836,0.200219,0.103873,0.059968,0.076485,0.189345,0.071708,0.121844,0.311809,0.151519,0.289716,0.313218,0.170069,0.095861,0.114447,0.254678,0.153523,0.171116
2,84300903,1,0.242645,0.171675,0.243356,0.236053,0.224051,0.192570,0.211894,0.069255,0.210604,0.099818,0.194923,0.060173,0.158348,0.208777,0.105543,0.168403,0.126730,0.233877,0.140321,0.136601,0.229900,0.144541,0.219462,0.216769,0.190316,0.192227,0.171593,0.110044,0.213392,0.131789
3,84348301,1,0.115421,0.216175,0.129901,0.073610,0.231594,0.136710,0.352897,0.296183,0.208079,0.156948,0.160801,0.152834,0.151430,0.066587,0.239628,0.110390,0.254969,0.288952,0.141815,0.115997,0.139733,0.210982,0.141873,0.074104,0.212823,0.121672,0.117623,0.149868,0.178371,0.137893
4,84358402,1,0.263429,0.071407,0.267215,0.266430,0.194359,0.161138,0.220340,0.223535,0.162075,0.091514,0.205775,0.061563,0.200742,0.217458,0.241105,0.103242,0.195017,0.222081,0.096311,0.162572,0.222644,0.051576,0.226850,0.204965,0.178443,0.089159,0.157986,0.185517,0.086308,0.091280
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,1,0.156484,0.241989,0.153723,0.116498,0.294487,0.169685,0.084743,0.088843,0.178331,0.080048,0.083878,0.162055,0.081191,0.054433,0.262035,0.152325,0.220529,0.159345,0.040120,0.159413,0.330379,0.197365,0.319042,0.091484,0.232811,0.114238,0.200725,0.313053,0.066165,0.083701
565,926682,1,0.278472,0.150661,0.273700,0.276175,0.197052,0.127721,0.171460,0.224523,0.160218,0.059280,0.223004,0.116975,0.204005,0.244887,0.107021,0.108589,0.144904,0.211525,0.118170,0.066080,0.256973,0.158908,0.249399,0.243923,0.130966,0.088526,0.135866,0.198864,0.116388,0.050900
566,926954,1,0.250161,0.185012,0.248051,0.216870,0.170999,0.154789,0.135266,0.149304,0.150689,0.088386,0.144420,0.137306,0.150334,0.136110,0.135764,0.212708,0.213081,0.241023,0.069275,0.151705,0.221256,0.322075,0.223395,0.181917,0.151268,0.185969,0.176600,0.212705,0.092681,0.127770
567,927241,1,0.137651,0.153271,0.135223,0.281964,0.289225,0.113278,0.074544,0.078151,0.172415,0.226853,0.213164,0.196499,0.234099,0.214502,0.129087,0.091470,0.265606,0.213394,0.166362,0.221827,0.116778,0.161660,0.114621,0.262596,0.275179,0.100817,0.097463,0.124181,0.147799,0.114258


In [36]:
# Normalize the outlier-removed, standard-scaled dataset (same sep_index=2 as
# the other normalization cells); the bare expression displays the result.
data_remove_stdscale_norm.normalize_df(sep_index=2)


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
1,842517,1,0.231278,0.071848,0.232751,0.245014,0.135209,0.189032,0.238921,0.312389,0.151727,-0.040060,0.226585,-0.094620,0.174625,0.264604,-0.029782,0.168358,0.076022,0.241534,0.061939,0.090858,0.225985,0.008443,0.207251,0.237604,0.076790,0.182964,0.145973,0.281696,0.185903,0.049473
2,84300903,1,0.268250,-0.143299,0.275880,0.299300,0.050830,0.115827,0.252507,0.236940,0.017411,-0.068590,0.245531,-0.101038,0.260344,0.280193,0.241341,0.023803,0.211767,0.208553,-0.037140,0.138036,0.206086,-0.187476,0.216632,0.212338,0.039019,-0.026953,0.116497,0.121491,-0.120528,-0.042422
4,84358402,1,0.286318,0.055223,0.283461,0.289925,-0.007624,0.067109,0.122693,0.186450,0.014390,-0.157429,0.081555,-0.161323,0.089618,0.144691,-0.195435,-0.130063,-0.050179,-0.021631,-0.183246,-0.142531,0.338326,0.087638,0.345503,0.345378,0.124337,0.040033,0.157241,0.293997,0.088050,0.028008
5,843786,1,-0.008259,0.086062,0.003045,-0.026272,0.322862,0.294277,0.054850,0.089005,0.316562,0.399512,0.189883,0.084226,0.163275,0.109117,0.144089,0.110785,-0.021416,0.129481,-0.128610,0.222853,0.052538,0.092226,0.042066,0.030821,0.284738,0.188206,0.026773,0.147222,0.124406,0.404607
6,844359,1,-0.036297,0.095547,-0.014548,-0.045646,0.322619,0.306976,0.232517,0.199150,0.319676,0.280551,-0.035956,-0.041874,-0.012482,-0.045277,-0.053803,0.129688,0.061379,0.036042,0.043874,0.032021,-0.010353,0.126677,0.010960,-0.022356,0.240062,0.314310,0.223122,0.220383,0.397110,0.228231
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
482,912519,0,0.017595,0.563493,0.020173,-0.019426,0.087831,0.115883,-0.129570,-0.001851,-0.281955,-0.006235,0.002165,0.197972,0.066778,-0.040396,0.054000,0.097328,-0.096855,0.301601,0.064295,0.328066,-0.034294,0.382612,-0.034604,-0.066653,-0.086521,-0.011947,-0.171691,-0.008130,-0.320727,0.029659
483,912558,0,0.185891,0.124468,0.204528,0.202887,0.091620,0.248612,0.313106,0.255161,0.128336,0.100091,0.235550,-0.021778,0.352719,0.248026,-0.011746,0.130801,0.230359,0.254259,0.015895,0.141243,0.163960,0.059881,0.212735,0.177197,0.037050,0.117740,0.194060,0.202355,0.014165,0.094395
484,912600,0,0.221203,0.076912,0.218970,0.261619,0.107413,0.044131,0.238388,0.255690,-0.019266,-0.088080,0.348705,0.019308,0.317686,0.397825,0.124369,0.045772,0.121882,0.246653,-0.117130,0.047960,0.203407,0.019395,0.192048,0.233500,0.041659,-0.014226,0.085649,0.172048,-0.135990,-0.063130
485,913063,0,0.252074,0.286476,0.242945,0.273065,0.025312,0.024861,0.137188,0.204425,-0.013256,-0.143022,0.241587,0.343273,0.230523,0.290527,-0.048932,0.019305,0.085125,0.148534,-0.007458,-0.063895,0.232585,0.280212,0.220835,0.246758,-0.081933,-0.038301,0.056685,0.117703,-0.065428,-0.128616


In [37]:
# Normalize the outlier-revalued, standard-scaled dataset (same sep_index=2 as
# the other normalization cells); the bare expression displays the result.
data_revalue_stdscale_norm.normalize_df(sep_index=2)


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,1,0.205930,-0.320142,0.237328,0.208465,0.273540,-0.013545,-0.033086,-0.042043,0.010439,-0.005607,-0.026797,-0.077424,-0.026062,-0.046703,-0.010747,0.294186,0.216227,0.148860,0.270646,0.265910,0.345825,-0.200690,-0.023885,-0.033524,0.232540,-0.020671,-0.013021,-0.015860,0.004470,0.389482
1,842517,1,0.366222,-0.042088,0.342889,-0.028555,-0.128324,-0.066957,0.027042,0.136251,0.025506,-0.145087,0.158774,-0.151458,0.110860,0.267105,-0.100430,-0.115962,-0.077888,0.083661,-0.146821,0.019494,0.370388,-0.045136,0.322421,0.437324,-0.046232,-0.056211,-0.000736,0.212567,-0.019416,0.090355
2,84300903,1,0.252204,0.083826,0.252826,0.275941,0.151320,0.184785,0.241171,-0.036999,0.160670,-0.039798,0.262449,-0.103617,0.205423,0.312142,-0.024258,0.171394,0.079859,0.254938,0.073013,0.100745,0.248163,0.013553,0.226009,0.272172,0.092874,0.201625,0.151855,-0.013957,0.215448,0.058206
3,84348301,1,-0.162174,0.095066,-0.117250,-0.170861,0.020608,-0.020767,0.572661,0.427768,0.016004,-0.008596,0.161600,0.012180,0.160520,-0.047696,0.264252,-0.034289,0.367465,0.356586,-0.016863,-0.022586,-0.035899,0.062802,-0.027124,-0.112461,0.018496,-0.031692,-0.019963,-0.024316,0.006854,-0.017939
4,84358402,1,0.262362,-0.140579,0.268717,0.301844,0.053783,0.099872,0.229345,0.228993,0.017490,-0.062349,0.255802,-0.099639,0.274600,0.297347,0.277397,0.019732,0.201425,0.198167,-0.033444,0.137628,0.204582,-0.181611,0.212589,0.220140,0.046292,-0.023965,0.108879,0.111767,-0.112168,-0.037771
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,1,-0.016562,0.173365,-0.020457,-0.031972,0.234924,0.079646,-0.041319,-0.052505,-0.038184,-0.176494,-0.033466,0.052447,-0.032548,-0.058326,0.315628,0.089050,0.252242,0.001438,-0.249990,0.104259,0.434971,0.047866,0.407669,-0.041867,0.101501,-0.026696,0.174369,0.344639,-0.283574,-0.129231
565,926682,1,0.290796,0.002985,0.279596,0.325102,0.033786,0.021035,0.145018,0.233015,-0.013637,-0.155420,0.296580,-0.007946,0.286777,0.362837,-0.050409,0.018432,0.094915,0.166527,-0.004509,-0.074772,0.270711,0.011575,0.255002,0.299488,-0.087791,-0.041442,0.062066,0.127619,-0.067618,-0.144543
566,926954,1,0.221358,0.004991,0.216174,0.212086,-0.185398,0.029389,0.058393,0.071733,-0.181992,-0.213373,0.118865,-0.031054,0.161991,0.145520,-0.069969,0.259827,0.259128,0.221211,-0.236136,0.080785,0.189947,0.382314,0.197284,0.178926,-0.177246,0.148455,0.128434,0.133723,-0.283146,-0.048540
567,927241,1,-0.013253,0.003154,-0.016369,0.345572,0.266427,-0.013536,-0.033062,-0.042013,0.010431,0.214560,0.282912,0.157323,0.367205,0.309371,-0.002438,-0.022349,0.357988,0.171764,0.101339,0.264978,-0.021284,0.012230,-0.023868,0.346581,0.252342,-0.020657,-0.013012,-0.015849,0.004467,-0.011692


## Save Finalised Datasets

In [38]:
# Normalized Only Dataset
# NOTE: this rebinds `data_normalized` to a DataCleaner wrapping the frame of
# whatever object the name referred to before, so the name changes meaning
# after this cell runs.
data_normalized = DataCleaner(data_normalized.df)
# Hand the output path to save_clean_data.
data_normalized.save_clean_data('../data/clean_normalized.csv')

In [78]:
# Outlier Removed and Normalized Dataset
# NOTE(review): this wraps `data_normalized.df`, the same frame the
# "Normalized Only Dataset" cell saves to clean_normalized.csv. Unless
# `data_normalized` was modified in between (this cell's execution count is
# out of sequence with its neighbours), both files contain identical data.
# Confirm which outlier-removed frame was intended here.
data_removed_normalized = DataCleaner(data_normalized.df)
data_removed_normalized.save_clean_data('../data/clean_add_out_removed_normalized.csv')


In [40]:
## Outlier Removed MinMax Scaled Dataframe
# Wrap the scaled (not normalized) frame in a DataCleaner for saving.
removed_minmax = DataCleaner(data_remove_minmax.df)
# Disabled: would map diagnosis 1 -> 'M' and anything else -> 'B'. As written the
# result of .apply is not assigned back, so enabling it would change nothing.
# removed_minmax.df['diagnosis'].apply(lambda x: 'M' if int(x) == 1 else 'B')
removed_minmax.save_clean_data('../data/out_removed_minmax_scale.csv')

In [41]:
## Outlier Revalued MinMax Scaled Dataframe
# Wrap the scaled (not normalized) frame in a DataCleaner, then hand the
# output path to save_clean_data.
revalued_minmax = DataCleaner(data_revalue_minmax.df)
revalued_minmax.save_clean_data('../data/out_revalued_minmax_scale.csv')


In [42]:
## Outlier Removed Standard-Scaler Scaled Dataframe
# Wrap the scaled (not normalized) frame in a DataCleaner, then hand the
# output path to save_clean_data.
removed_stdscale = DataCleaner(data_remove_stdscale.df)
removed_stdscale.save_clean_data('../data/out_removed_std_scale.csv')


In [43]:
## Outlier Revalued Standard-Scaler Scaled Dataframe
# Wrap the scaled (not normalized) frame in a DataCleaner, then hand the
# output path to save_clean_data.
revalued_stdscale = DataCleaner(data_revalue_stdscale.df)
revalued_stdscale.save_clean_data('../data/out_revalued_std_scale.csv')


In [44]:
## Outlier Removed, MinMax Scaled and Normalized Dataframe
# Reads from the `_norm` manipulator (the one normalize_df was called on),
# not from the scaled-only object.
removed_minmax_norm = DataCleaner(data_remove_minmax_norm.df)
removed_minmax_norm.save_clean_data('../data/out_removed_minmax_scale_norm.csv')


In [45]:
## Outlier Revalued, MinMax Scaled and Normalized Dataframe
# Wrap the frame of the *normalized* manipulator: the output file name ends in
# `_norm`, and the other `_norm` save cells all read from their `*_norm` objects.
# (This cell previously read data_revalue_minmax.df, the scaled-only frame that
# is already written to out_revalued_minmax_scale.csv.)
revalued_minmax_norm = DataCleaner(data_revalue_minmax_norm.df)
revalued_minmax_norm.save_clean_data('../data/out_revalued_minmax_scale_norm.csv')


In [46]:
## Outlier Removed, Standard-Scaler Scaled and Normalized Dataframe
# Reads from the `_norm` manipulator (the one normalize_df was called on),
# not from the scaled-only object.
removed_stdscale_norm = DataCleaner(data_remove_stdscale_norm.df)
removed_stdscale_norm.save_clean_data('../data/out_removed_std_scale_norm.csv')


In [47]:
## Outlier Revalued, Standard-Scaler Scaled and Normalized Dataframe
# Reads from the `_norm` manipulator (the one normalize_df was called on),
# not from the scaled-only object.
revalued_stdscale_norm = DataCleaner(data_revalue_stdscale_norm.df)
revalued_stdscale_norm.save_clean_data('../data/out_revalued_std_scale_norm.csv')
