# Data Exploration

## Importing Libraries and Project Scripts

In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
import sys
import os

# Importing Scripts
sys.path.append(os.path.abspath(os.path.join('..')))
from scripts.logger_creator import CreateLogger
from scripts.data_loader import load_df_from_csv
from scripts.data_information import DataInfo
from scripts.data_cleaner import DataCleaner
from scripts.data_manipulation import DataManipulator
from scripts.utilities import calculate_circumference


In [60]:
# Configuring Notebook Settings
# Use fully qualified option names: an abbreviated pattern such as 'max_column'
# is resolved by regex search over all registered options and raises OptionError
# when it matches more than one key (on recent pandas it matches both
# 'display.max_columns' and 'styler.render.max_columns').
pd.set_option('display.max_columns', None)  # None -> never truncate the column listing
pd.set_option('display.float_format', '{:.6f}'.format)  # render floats with six decimals
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

## Loading Data CSV File

In [3]:
# Declaring Data File-Path
# Relative path to the CSV data file (data/ directory, one level above this one)
DATAPATH = '../data/data.csv'

In [4]:
# Loading Breast Cancer Data-Set
# The literal string 'none' is passed as an extra missing-value marker.
# NOTE(review): presumably forwarded to pandas.read_csv(na_values=...) — confirm in scripts/data_loader
data_df = load_df_from_csv(DATAPATH, na_values=['none'])

In [5]:
# Extracting Information from the data
# Instantiate DataInfo Object using our dataset dataframe
# NOTE(review): deep=True presumably makes DataInfo work on a deep copy of data_df
# rather than on the caller's frame — confirm in scripts/data_information
data_info = DataInfo(data_df, deep=True)

In [6]:
# View Data Details
# Prints the frame's row/column counts, its memory usage and a per-column
# summary (non-null count and dtype) — see the output below
data_info.get_basic_description()

The DataFrame containes 569 rows and 33 columns.
Current DataFrame Memory Usage:
105962
Current DataFrame Memory Usage of columns is :
DataFrame Information: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    uint32 
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float32
 3   texture_mean             569 non-null    float32
 4   perimeter_mean           569 non-null    float32
 5   area_mean                569 non-null    float32
 6   smoothness_mean          569 non-null    float32
 7   compactness_mean         569 non-null    float32
 8   concavity_mean           569 non-null    float32
 9   concave points_mean      569 non-null    float32
 10  symmetry_mean            569 non-null    float32
 11  fractal_dimension_mean   569 

In [7]:
# Dataset shape: prints the row/column counts and returns them as a (rows, columns) tuple
data_info.get_size()

The DataFrame containes 569 rows and 33 columns.


(569, 33)

In [8]:
# Missing values over the whole frame: prints the total and its percentage, returns the total
data_info.get_total_missing_values()

The total number of missing values is 569
3.03 % missing values.


569

In [9]:
# Names of the columns that have missing values
data_info.get_columns_with_missing_values()

['Unnamed: 32']

In [10]:
# Table of missing-value count and dtype for the columns that have missing values
data_info.get_column_based_missing_values()

Unnamed: 0,missing_count,type
Unnamed: 32,569,float32


In [11]:
# Drop 'Unnamed: 32', the only column with missing values: all 569 of its rows are empty.
# errors='ignore' keeps this cell idempotent — without it, re-running the cell
# after the column is already gone raises KeyError.
# The drop stays in place so data_info keeps operating on the same frame object.
data_info.df.drop(columns=['Unnamed: 32'], inplace=True, errors='ignore')


In [12]:
# Total number of cells in the frame after the drop (569 rows x 32 columns); prints and returns the count
data_info.get_total_entries()

The DataFrame containes 18208 entries.


18208

In [13]:
# Dispersion statistics (count, mean, std, min, quartiles, max, Mode, Median),
# transposed so that each feature is shown as one row
data_info.get_dispersion_params().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,Mode,Median
id,569.0,30371831.4323,125020585.6122,8670.0,869218.0,906024.0,8813129.0,911320502.0,8670.0,906024.0
radius_mean,569.0,14.1273,3.524,6.981,11.7,13.37,15.78,28.11,12.34,13.37
texture_mean,569.0,19.2896,4.301,9.71,16.17,18.84,21.8,39.28,14.93,18.84
perimeter_mean,569.0,91.969,24.299,43.79,75.17,86.24,104.1,188.5,82.61,86.24
area_mean,569.0,654.889,351.9141,143.5,420.3,551.1,782.7,2501.0,512.2,551.1
smoothness_mean,569.0,0.0964,0.0141,0.0526,0.0864,0.0959,0.1053,0.1634,0.1007,0.0959
compactness_mean,569.0,0.1043,0.0528,0.0194,0.0649,0.0926,0.1304,0.3454,0.1147,0.0926
concavity_mean,569.0,0.0888,0.0797,0.0,0.0296,0.0615,0.1307,0.4268,0.0,0.0615
concave points_mean,569.0,0.0489,0.0388,0.0,0.0203,0.0335,0.074,0.2012,0.0,0.0335
symmetry_mean,569.0,0.1812,0.0274,0.106,0.1619,0.1792,0.1957,0.304,0.1601,0.1792


In [14]:
# Count the duplicates reported by DataInfo; 0 means none were found
len(data_info.get_duplicates())

0

In [15]:
# List the object-dtype columns — only 'diagnosis' in this dataset
data_info.get_object_columns()

['diagnosis']

In [16]:
# Count the columns DataInfo classifies as numeric.
# NOTE(review): 32 columns remain after the drop and exactly one is object-typed,
# so 31 numeric columns would be expected, yet this reports 30 — confirm which
# column get_numeric_columns() leaves out before relying on "all remaining are numeric".
len(data_info.get_numeric_columns())

30

## Feature Extraction

In [18]:
# Create A Data Manipulator Class Instance from the dataframe
# Built from data_info.df, i.e. the frame with 'Unnamed: 32' already removed.
# NOTE(review): deep=True presumably makes the manipulator work on its own deep copy,
# so columns added below would not appear in data_info.df — confirm in scripts/data_manipulation
data_manipulator = DataManipulator(data_info.df, deep=True)

### Concavity Dispersion Feature

In [62]:
# Add Concavity Dispersion for mean concave points and mean area
# NOTE(review): calculate_concavity_dispersion is neither imported nor defined in the
# visible cells (only calculate_circumference is imported from scripts.utilities).
# Unless it is defined in a cell outside this view, this raises NameError on a fresh
# kernel — import it in the first cell (presumably from scripts.utilities; confirm).
data_manipulator.add_column(
    'concavity_dispersion_mean', 'concave points_mean', 'area_mean', calculate_concavity_dispersion)


In [64]:
# Add Concavity Dispersion for the 'se' columns: 'concave points_se' and 'area_se'
data_manipulator.add_column(
    'concavity_dispersion_se', 'concave points_se', 'area_se', calculate_concavity_dispersion)


In [65]:
# Add Concavity Dispersion for the 'worst' columns: 'concave points_worst' and 'area_worst'
data_manipulator.add_column(
    'concavity_dispersion_worst', 'concave points_worst', 'area_worst', calculate_concavity_dispersion)


In [66]:
# Inspect the column index to verify the new feature columns are present
data_manipulator.df.columns

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'circumference_mean',
       'circumference_se', 'circumference_worst', 'concavity_dispersion_mean',
       'concavity_dispersion_se', 'concavity_dispersion_worst'],
      dtype='object')