##### Import Library 

In [171]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

##### Membaca csv yang tidak dirusak  dan juga yang telah dirusak

In [172]:
df = pd.read_csv('Breast_Cancer.csv') # Dataset yang tidak dirusak
df_rusak = pd.read_csv('Breast_Cancer_Rusak.csv') # Dataset yang telah dirusak sediki

##### Membagi dataset menjadi **training set** dan **testing set** dengan proporsi **70:30**

In [173]:
X = df.iloc[:,:-1] 
y = df.iloc[:,-1] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

##### Menampilkan hasil splitting 

In [174]:
print("Jumlah Data X_train : ", X_train.shape)
print("Jumlah Data X_test  : ", X_test.shape)
print("Jumlah Data y_train : ", y_train.shape)
print("Jumlah Data y_test  : ", y_test.shape)

Jumlah Data X_train :  (2816, 15)
Jumlah Data X_test  :  (1208, 15)
Jumlah Data y_train :  (2816,)
Jumlah Data y_test  :  (1208,)


##### Copy dataset menjadi *df_normalized*

In [175]:
df_norm = df.copy()
df_norm.columns

Index(['Age', 'Race', 'Marital Status', 'T Stage ', 'N Stage', '6th Stage',
       'differentiate', 'Grade', 'A Stage', 'Tumor Size', 'Estrogen Status',
       'Progesterone Status', 'Regional Node Examined',
       'Reginol Node Positive', 'Survival Months', 'Status'],
      dtype='object')

##### Melakukan normalisasi pada kolom 'Tumor Size' dan 'Regional Node Examined' menggunakan MinMaxScaler

In [176]:
min_max_scaler = MinMaxScaler()
x_scaler = min_max_scaler.fit_transform(df_norm[['Tumor Size', 'Regional Node Examined']])

In [177]:
df_normilized = pd.DataFrame(x_scaler)
df_normilized.head(10)

Unnamed: 0,0,1
0,0.021583,0.383333
1,0.244604,0.216667
2,0.446043,0.216667
3,0.122302,0.016667
4,0.28777,0.033333
5,0.136691,0.283333
6,0.05036,0.166667
7,0.208633,0.133333
8,0.733813,0.316667
9,0.223022,0.333333


##### Copy dataset menjadi *df_standardized*


In [178]:
df_standardized = df.copy()

##### Melakukan standarisasi pada dataset menggunakan Standard Scaler

In [179]:
ss = StandardScaler()
standardized_data = ss.fit_transform(df_standardized[['Tumor Size', 'Regional Node Examined']])
print('Nilai STD setelah scaling:', np.nanstd(standardized_data))

Nilai STD setelah scaling: 1.0


In [180]:
print('Nilai sebelum Standarisasi :')
print('Standar Deviasi : ', np.std(df_standardized))

print('\nNilai setelah Standarisasi : ', np.std(standardized_data))

Nilai sebelum Standarisasi :
Standar Deviasi :  Age                        8.962021
Tumor Size                21.117072
Regional Node Examined     8.098668
Reginol Node Positive      5.108696
Survival Months           22.918581
dtype: float64

Nilai setelah Standarisasi :  1.0


  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


##### Data Cleaning pada data yang memiliki *duplicate*
###### Melihat nilai duplicate sebelum dibersihkan

In [181]:
df_rusak.duplicated().sum()

14

In [182]:
df_rusak.drop_duplicates(inplace=True)

###### Melihat nilai duplicate setelah dibersihkan/didrop

In [183]:
df_rusak.duplicated().sum()

0

##### Data Cleaning pada data dengan nilai *null*

###### Melihat info 

In [184]:
df_rusak.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4023 entries, 0 to 4036
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Age                     4023 non-null   int64 
 1   Race                    4023 non-null   object
 2   Marital Status          4023 non-null   object
 3   T Stage                 4023 non-null   object
 4   N Stage                 4023 non-null   object
 5   6th Stage               4023 non-null   object
 6   differentiate           4023 non-null   object
 7   Grade                   4023 non-null   object
 8   A Stage                 4023 non-null   object
 9   Tumor Size              4023 non-null   int64 
 10  Estrogen Status         4017 non-null   object
 11  Progesterone Status     4019 non-null   object
 12  Regional Node Examined  4023 non-null   int64 
 13  Reginol Node Positive   4023 non-null   int64 
 14  Survival Months         4023 non-null   int64 
 15  Stat

In [185]:
df_rusak.isna().sum()

Age                       0
Race                      0
Marital Status            0
T Stage                   0
N Stage                   0
6th Stage                 0
differentiate             0
Grade                     0
A Stage                   0
Tumor Size                0
Estrogen Status           6
Progesterone Status       4
Regional Node Examined    0
Reginol Node Positive     0
Survival Months           0
Status                    0
dtype: int64

##### Membuat kelas SimpleImputer ber-strategy dengan nilai "*most_frequent*"

In [186]:
imputer_most_frequent = SimpleImputer(strategy="most_frequent")

In [187]:
df_rusak["Estrogen Status"] = imputer_most_frequent.fit_transform(df_rusak[["Estrogen Status"]])
df_rusak["Progesterone Status"] = imputer_most_frequent.fit_transform(df_rusak[["Progesterone Status"]])

df_rusak.isna().sum()

Age                       0
Race                      0
Marital Status            0
T Stage                   0
N Stage                   0
6th Stage                 0
differentiate             0
Grade                     0
A Stage                   0
Tumor Size                0
Estrogen Status           0
Progesterone Status       0
Regional Node Examined    0
Reginol Node Positive     0
Survival Months           0
Status                    0
dtype: int64

##### Mengubah tipe data int ke float

In [188]:
df_rusak.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 4023 entries, 0 to 4036
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Age                     4023 non-null   int64 
 1   Race                    4023 non-null   object
 2   Marital Status          4023 non-null   object
 3   T Stage                 4023 non-null   object
 4   N Stage                 4023 non-null   object
 5   6th Stage               4023 non-null   object
 6   differentiate           4023 non-null   object
 7   Grade                   4023 non-null   object
 8   A Stage                 4023 non-null   object
 9   Tumor Size              4023 non-null   int64 
 10  Estrogen Status         4023 non-null   object
 11  Progesterone Status     4023 non-null   object
 12  Regional Node Examined  4023 non-null   int64 
 13  Reginol Node Positive   4023 non-null   int64 
 14  Survival Months         4023 non-null   int64 
 15  Stat

In [189]:
df_rusak['Tumor Size'] = df_rusak['Tumor Size'].astype('float64')
df_rusak['Regional Node Examined'] = df_rusak['Regional Node Examined'].astype('float64')
df_rusak['Reginol Node Positive'] = df_rusak['Reginol Node Positive'].astype('float64')
df_rusak['Survival Months'] = df_rusak['Survival Months'].astype('float64')
df_rusak.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4023 entries, 0 to 4036
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Age                     4023 non-null   int64  
 1   Race                    4023 non-null   object 
 2   Marital Status          4023 non-null   object 
 3   T Stage                 4023 non-null   object 
 4   N Stage                 4023 non-null   object 
 5   6th Stage               4023 non-null   object 
 6   differentiate           4023 non-null   object 
 7   Grade                   4023 non-null   object 
 8   A Stage                 4023 non-null   object 
 9   Tumor Size              4023 non-null   float64
 10  Estrogen Status         4023 non-null   object 
 11  Progesterone Status     4023 non-null   object 
 12  Regional Node Examined  4023 non-null   float64
 13  Reginol Node Positive   4023 non-null   float64
 14  Survival Months         4023 non-null   

##### One Hot Encoding

In [190]:
oh_encoder  = OneHotEncoder(sparse=False)

In [191]:
kolom_enc = oh_encoder.fit_transform(df_rusak[["Status"]])
kolom_enc = pd.DataFrame(kolom_enc)

In [192]:
df_rusak.join(kolom_enc)
df_rusak.head(10)

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4.0,Positive,Positive,24.0,1.0,60.0,Alive
1,50,White,Married,T2,N2,IIIA,Moderately differentiated,2,Regional,35.0,Positive,Positive,14.0,5.0,62.0,Alive
2,58,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63.0,Positive,Positive,14.0,7.0,75.0,Alive
3,58,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,18.0,Positive,Positive,2.0,1.0,84.0,Alive
4,47,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,41.0,Positive,Positive,3.0,1.0,50.0,Alive
5,51,White,Single,T1,N1,IIA,Moderately differentiated,2,Regional,20.0,Positive,Positive,18.0,2.0,89.0,Alive
6,51,White,Married,T1,N1,IIA,Well differentiated,1,Regional,8.0,Positive,Positive,11.0,1.0,54.0,Alive
7,40,White,Married,T2,N1,IIB,Moderately differentiated,2,Regional,30.0,Positive,Positive,9.0,1.0,14.0,Dead
8,40,White,Divorced,T4,N3,IIIC,Poorly differentiated,3,Regional,103.0,Positive,Positive,20.0,18.0,70.0,Alive
9,69,White,Married,T4,N3,IIIC,Well differentiated,1,Distant,32.0,Positive,Positive,21.0,12.0,92.0,Alive
