In [69]:
# Import dependencies
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from joblib import dump

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

import pandas as pd

# for display in html format
# from IPython.display import HTML

                                            Load & Clean the Data

In [82]:
# Load cytology data in with 10 features
df_10 = pd.read_csv("./db/cytology_ml.csv")
df_10.head(2)

Unnamed: 0,id,thickness,size,shape,adhesion,single,nuclei,chromatin,nucleoli,mitosis,class
0,1000025,5,1,1,1,2,1.0,3,1,1,0
1,1002945,5,4,4,5,7,10.0,3,2,1,0


In [71]:
# Load wisconsin data in with 32 features 
df_32 = pd.read_csv("./db/Wisconsindata.csv")
df_32.head(2)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,


In [72]:
# Explore cytology data
print(df_10.shape)
print(df_10.dtypes)
df_10.head(n=2)

(699, 11)
id             int64
thickness      int64
size           int64
shape          int64
adhesion       int64
single         int64
nuclei       float64
chromatin      int64
nucleoli       int64
mitosis        int64
class          int64
dtype: object


Unnamed: 0,id,thickness,size,shape,adhesion,single,nuclei,chromatin,nucleoli,mitosis,class
0,1000025,5,1,1,1,2,1.0,3,1,1,0
1,1002945,5,4,4,5,7,10.0,3,2,1,0


In [73]:
# Explore wisconsin data
print(df_32.shape)
print(df_32.dtypes)
df_32.head(n=2)

(569, 33)
id                           int64
diagnosis                   object
radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          float64
concavity_

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,


In [74]:
# Finding missing values for both datasets 
print('cytology missing values:\n{}'.format(df_10.isnull().sum()))
print('wisconsin missing values:\n{}'.format(df_32.isnull().sum()))

# Find duplicated records
print('cytology duplicated nums: {}'.format(df_10.duplicated().sum()))
print('wisconsin duplicated nums: {}'.format(df_32.duplicated().sum()))

# Find the unique values of 'diagnosis'.
print('\nUnique values of "diagnosis": {}'.format(df_10['class'].unique()))
print('\nUnique values of "diagnosis": {}'.format(df_32['diagnosis'].unique()))

cytology missing values:
id            0
thickness     0
size          0
shape         0
adhesion      0
single        0
nuclei       16
chromatin     0
nucleoli      0
mitosis       0
class         0
dtype: int64
wisconsin missing values:
id                           0
diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst 

In [75]:
# Droppimg NaN values from cytolgy dataset 
df_10.dropna(inplace=True)

In [85]:
# Check how many malignant and bening results in both datasets 
sum_results = df_32['diagnosis'].count()
m = df_32[df_32['diagnosis'] == "M"]['diagnosis'].count()

sum_results_2 = df_10['class'].count()
m_2 = df_10[df_10['class'] == "1"]['class'].count()

print("Malignant Wisconsin: ", m)
print("Benign Wisconsin: ", sum_results - m)

print("Malignant Cytology: ", m_2)
print("Benign Cytology: ", sum_results_2 - m_2)

Malignant Wisconsin:  212
Benign Wisconsin:  357
Malignant Cytology:  0
Benign Cytology:  699


In [62]:
# Rename values "M" & "B" to 0 & 1 in order to match the cytology dataset => join 2 datasets
df_32['diagnosis'] = df_32['diagnosis'].replace({'M': '1', 'B': '0'})
df_32.head(2)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,


In [63]:
# Check how many malignant and bening results in both datasets 
sum_results = df_32['diagnosis'].count()
m = df_32[df_32['diagnosis'] == "1"]['diagnosis'].count()
print("Malignant: ", m)
print("Benign: ", sum_results - m)

Malignant:  212
Benign:  357
