In [None]:
# Data Cleaning

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('BreastCancer.csv')

In [3]:
data.shape

(116, 8)

In [4]:
data.head(n=20)

Unnamed: 0,Age,BMI,Glucose,Insulin,Leptin,Adiponectin,Resistin,Classification
0,48,23.5,70.0,2.707,8.8071,9.7024,7.99585,Cancer
1,83,20.690495,92.0,3.115,8.8438,5.429285,4.06405,Cancer
2,82,23.12467,91.0,4.498,17.9393,22.43204,9.27715,Cancer
3,68,21.367521,77.0,3.226,9.8827,7.16956,12.766,Cancer
4,86,21.111111,92.0,3.549,6.6994,4.81924,10.57635,Cancer
5,49,22.854458,92.0,3.226,6.8317,13.67975,10.3176,Cancer
6,89,22.7,77.0,4.69,6.964,5.589865,12.9361,Cancer
7,76,23.8,118.0,6.47,4.311,13.25132,5.1042,Cancer
8,73,22.0,97.0,3.35,4.47,10.358725,6.28445,Cancer
9,75,23.0,83.0,4.952,17.127,11.57899,7.0913,Cancer


In [5]:
data.tail(n=2)

Unnamed: 0,Age,BMI,Glucose,Insulin,Leptin,Adiponectin,Resistin,Classification
114,72,25.59,82.0,2.82,24.96,33.75,3.27,Healthy Control
115,86,27.18,138.0,19.91,90.28,14.11,4.35,Healthy Control


In [6]:
data.describe()

Unnamed: 0,Age,BMI,Glucose,Insulin,Leptin,Adiponectin,Resistin
count,116.0,112.0,115.0,115.0,116.0,115.0,115.0
mean,57.301724,27.58984,98.034783,9.941957,26.61508,10.184783,14.578449
std,16.112766,5.102647,22.472178,10.083331,19.183294,6.873161,12.342139
min,24.0,18.37,60.0,2.432,4.311,1.65602,3.21
25%,45.0,22.884615,86.0,4.3545,12.313675,5.470395,6.871175
50%,56.0,27.662416,92.0,5.819,20.271,8.300955,10.69548
75%,71.0,31.25,102.0,10.8265,37.3783,11.84398,17.46559
max,89.0,38.578759,201.0,58.46,90.28,38.04,82.1


In [8]:
data['Classification'].value_counts()

Cancer             76
Healthy Control    40
Name: Classification, dtype: int64

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116 entries, 0 to 115
Data columns (total 8 columns):
Age               116 non-null int64
BMI               112 non-null float64
Glucose           115 non-null float64
Insulin           115 non-null float64
Leptin            116 non-null float64
Adiponectin       115 non-null float64
Resistin          115 non-null float64
Classification    116 non-null object
dtypes: float64(6), int64(1), object(1)
memory usage: 6.8+ KB


In [None]:
# Imputasi missing value

In [10]:
np.sum(data.isnull())

Age               0
BMI               4
Glucose           1
Insulin           1
Leptin            0
Adiponectin       1
Resistin          1
Classification    0
dtype: int64

In [11]:
data.isnull().sum()

Age               0
BMI               4
Glucose           1
Insulin           1
Leptin            0
Adiponectin       1
Resistin          1
Classification    0
dtype: int64

In [58]:
data.isnull().sum().sum()

1183

In [12]:
#imputasi missing value data numerik
data['BMI'] = data['BMI'].fillna((data['BMI'].mean())) 
data['Age'] = data['Age'].fillna((data['Age'].median()))
data.isnull().any()

Age               False
BMI               False
Glucose            True
Insulin            True
Leptin            False
Adiponectin        True
Resistin           True
Classification    False
dtype: bool

In [13]:
# imputasi untuk data numerik yang lain
data['BMI']=data['BMI'].fillna(99)
data['Insulin']=data['Insulin'].fillna(111)
data.isnull().any()

Age               False
BMI               False
Glucose            True
Insulin           False
Leptin            False
Adiponectin        True
Resistin           True
Classification    False
dtype: bool

In [14]:
#imputasi data kategorik dengan modus
data['Age'] = data.fillna(data['Age'].value_counts().index[0])
data.isnull().any()

Age               False
BMI               False
Glucose            True
Insulin           False
Leptin            False
Adiponectin        True
Resistin           True
Classification    False
dtype: bool

In [15]:
#imputasi data kategorik dengan modus sesuai kolom masing2
data = data.apply(lambda x:x.fillna(x.value_counts().index[0]))
data.isnull().any()

Age               False
BMI               False
Glucose           False
Insulin           False
Leptin            False
Adiponectin       False
Resistin          False
Classification    False
dtype: bool

In [16]:
# Encode Labels

In [17]:
from sklearn.preprocessing import LabelEncoder

In [18]:
# Encode untuk Sex, Steroid, Anaroxia, Spleen Pabable, Spiders tanpa membuat kolom baru
data['Age'] = LabelEncoder().fit_transform(data['Age'])
data['BMI'] = LabelEncoder().fit_transform(data['BMI'])
data['Glucose'] = LabelEncoder().fit_transform(data['Glucose'])
data['Insulin'] = LabelEncoder().fit_transform(data['Insulin'])
data['Leptin'] = LabelEncoder().fit_transform(data['Leptin'])
data['Adiponectin'] = LabelEncoder().fit_transform(data['Adiponectin'])
data['Resistin'] = LabelEncoder().fit_transform(data['Resistin'])
data['Classification'] = LabelEncoder().fit_transform(data['Classification'])
data.head()

Unnamed: 0,Age,BMI,Glucose,Insulin,Leptin,Adiponectin,Resistin,Classification
0,17,33,1,3,14,68,37,0
1,47,5,19,9,15,26,4,0
2,46,30,18,33,50,107,45,0
3,35,14,5,11,21,41,66,0
4,49,11,19,19,4,20,56,0


In [None]:
## Standardize

In [21]:
from sklearn.preprocessing import StandardScaler

In [22]:
data['Age'] = StandardScaler().fit(data[['Age']]).transform(data[['Age']])
data['Classification'] = StandardScaler().fit(data[['Classification']]).transform(data[['Classification']])
data['Adiponectin'] = StandardScaler().fit(data[['Adiponectin']]).transform(data[['Adiponectin']])
data['Leptin'] = StandardScaler().fit(data[['Leptin']]).transform(data[['Leptin']])
data['Resistin'] = StandardScaler().fit(data[['Resistin']]).transform(data[['Resistin']])

  return self.partial_fit(X, y)
  """


In [23]:
data.head()

Unnamed: 0,Age,BMI,Glucose,Insulin,Leptin,Adiponectin,Resistin,Classification
0,-0.603124,33,1,3,-1.299086,0.372814,-0.602551,-0.725476
1,1.578655,5,19,9,-1.269222,-0.884176,-1.600632,-0.725476
2,1.505929,30,18,33,-0.22398,1.54002,-0.360592,-0.725476
3,0.705943,14,5,11,-1.090038,-0.435251,0.27455,-0.725476
4,1.724107,11,19,19,-1.597727,-1.063746,-0.027898,-0.725476


In [None]:
# Normalize

In [24]:
from sklearn.preprocessing import Normalizer 

In [25]:
data['Age'] = Normalizer().fit(data[['Age']]).transform(data[['Age']])
data['BMI'] = Normalizer().fit(data[['BMI']]).transform(data[['BMI']])
data['Adiponectin'] = Normalizer().fit(data[['Adiponectin']]).transform(data[['Adiponectin']])

In [26]:
data.head()

Unnamed: 0,Age,BMI,Glucose,Insulin,Leptin,Adiponectin,Resistin,Classification
0,-1.0,1.0,1,3,-1.299086,1.0,-0.602551,-0.725476
1,1.0,1.0,19,9,-1.269222,-1.0,-1.600632,-0.725476
2,1.0,1.0,18,33,-0.22398,1.0,-0.360592,-0.725476
3,1.0,1.0,5,11,-1.090038,-1.0,0.27455,-0.725476
4,1.0,1.0,19,19,-1.597727,-1.0,-0.027898,-0.725476
