# Breast Cancer synthetic data
## Part 3. Modeling

#### by Grzegorz Furdyn

## Contents

#### CRISP-DM Phase 3: Data Preparation
* [Missing values handling](#miss)

#### CRISP-DM Phase 4: Modeling
* [KNN](#knn)
* [Logistic Regression](#lr)

In [125]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 1000)
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,plot_confusion_matrix
from sklearn.metrics import precision_recall_curve,plot_precision_recall_curve,plot_roc_curve
from sklearn import metrics, set_config
set_config(display="diagram")
np.random.seed(42)
RANDOM_STATE = 42
TEST_SIZE = 0.2

In [161]:
df = pd.read_csv('Breast_cancer_transformed.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'Breast_cancer_transformed.csv'

## CRISP-DM Phase 3: Data Preparation

<a id='miss'></a>
### Missing values handling

In [127]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5241 entries, 0 to 5240
Data columns (total 48 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   leeft                  5241 non-null   int64  
 1   vit_stat               5241 non-null   int64  
 2   vit_stat_int           5241 non-null   int64  
 3   tumsoort               5241 non-null   int64  
 4   diag_basis             5241 non-null   int64  
 5   topo_sublok            5241 non-null   object 
 6   later                  5241 non-null   object 
 7   morf                   5241 non-null   int64  
 8   gedrag                 5241 non-null   int64  
 9   diffgrad               5241 non-null   int64  
 10  ct                     5241 non-null   object 
 11  cn                     5241 non-null   object 
 12  cm                     5241 non-null   object 
 13  pt                     5241 non-null   object 
 14  pn                     5241 non-null   object 
 15  pm  

Defining subset of features as X

In [128]:
df = df[['leeft', 'tumsoort', 'diag_basis', 'topo_sublok', 'later', 'morf', 'gedrag', 'diffgrad', 'ct', 'cn', 'cm',
       'pt', 'pn', 'pm', 'stadium', 'cstadium', 'pstadium', 'ond_lymf',
       'pos_lymf', 'er_stat', 'pr_stat', 'her2_stat', 'dcis_comp', 'multifoc',
       'tum_afm', 'swk', 'swk_uitslag', 'mari', 'mari_uitslag', 'okd',
       'org_chir', 'uitgebr_chir_code', 'dir_reconstr', 'chemo', 'target',
       'horm', 'rt', 'meta_rt', 'meta_chir', 'survival_5Y']]

In [129]:
categorical_features = [ 'tumsoort',
                         'diag_basis',
                         'topo_sublok',
                         'later',
                         'morf',
                         'gedrag',
                         'ct',
                         'cn',
                         'cm',
                         'pt',
                         'pn',
                         'pm',
                         'stadium',
                         'cstadium', 
                         'pstadium',
                         'er_stat',
                         'pr_stat',
                         'her2_stat',
                         'dcis_comp',
                         'multifoc',
                         'swk',
                         'swk_uitslag',
                         'mari',
                         'mari_uitslag',
                         'okd',
                         'org_chir',
                         'uitgebr_chir_code',
                         'dir_reconstr',
                         'chemo',
                         'target',
                         'horm',
                         'rt',
                         'meta_rt',
                         'meta_chir']

In [130]:
numeric_features = ['leeft',
                     'diffgrad',
                     'ond_lymf',
                     'pos_lymf',
                     'tum_afm']

In [131]:
numeric_features

['leeft', 'diffgrad', 'ond_lymf', 'pos_lymf', 'tum_afm']

In [132]:
df[numeric_features].describe()

Unnamed: 0,leeft,diffgrad,ond_lymf,pos_lymf,tum_afm
count,5241.0,5241.0,5215.0,4836.0,4075.0
mean,63.037397,3.112192,6.199233,2.283912,21.308221
std,13.689291,2.596167,14.858352,11.490108,24.634316
min,18.0,1.0,0.0,0.0,0.0
25%,54.0,2.0,1.0,0.0,8.0
50%,63.0,2.0,2.0,0.0,15.0
75%,72.0,3.0,5.0,1.0,24.0
max,104.0,9.0,99.0,99.0,245.0


In [133]:
df.isna().sum()[df.isna().sum() != 0].sort_values(ascending = False)

tum_afm     1166
pos_lymf     405
ond_lymf      26
dtype: int64

In [134]:
df[numeric_features] = df[numeric_features].fillna(df[numeric_features].median())

In [135]:
df.isna().sum()[df.isna().sum() != 0].sort_values(ascending = False)

Series([], dtype: int64)

In [136]:
df[categorical_features] = df[categorical_features].astype('category')

## CRISP-DM Phase 4: Modeling

<a id='knn'></a>
### KNN

In [137]:
#categorical_features

In [138]:
numeric_features

['leeft', 'diffgrad', 'ond_lymf', 'pos_lymf', 'tum_afm']

In [139]:
def preprocess(df):
    df_fin = pd.merge(df[categorical_features], df[numeric_features], left_index=True, right_index=True)
    df_fin = df_fin.join(df['survival_5Y'])
    return df_fin

In [140]:
df = preprocess(df)

In [141]:
X = df.drop('survival_5Y', axis = 1)

In [142]:
y = df['survival_5Y']

In [143]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

In [144]:
X_train.shape

(4192, 39)

In [145]:
X_test.shape

(1049, 39)

In [146]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

In [147]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [148]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [149]:
model_KNN = KNeighborsClassifier()

In [150]:
rf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', model_KNN)])

In [151]:
rf.fit(X_train, y_train)

In [152]:
y_pred_rf =rf.predict(X_test)

In [153]:
confusion_matrix(y_test,y_pred_rf)

array([[ 21, 164],
       [ 43, 821]], dtype=int64)

In [154]:
print(classification_report(y_test,y_pred_rf))

              precision    recall  f1-score   support

           0       0.33      0.11      0.17       185
           1       0.83      0.95      0.89       864

    accuracy                           0.80      1049
   macro avg       0.58      0.53      0.53      1049
weighted avg       0.74      0.80      0.76      1049



<a id='lr'></a>
### Logistic Regression

In [155]:
model_LR = LogisticRegressionCV()

In [156]:
rf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', model_LR)])

In [157]:
rf.fit(X_train, y_train)

In [158]:
y_pred_rf =rf.predict(X_test)

In [159]:
confusion_matrix(y_test,y_pred_rf)

array([[  7, 178],
       [  3, 861]], dtype=int64)

In [160]:
print(classification_report(y_test,y_pred_rf))

              precision    recall  f1-score   support

           0       0.70      0.04      0.07       185
           1       0.83      1.00      0.90       864

    accuracy                           0.83      1049
   macro avg       0.76      0.52      0.49      1049
weighted avg       0.81      0.83      0.76      1049

