# Breast Cancer One-Hot Encoding

Benjamin Frost 2022


In [1]:
import pandas as pd
import numpy as np
import torch.multiprocessing as mp
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder, MinMaxScaler
from matplotlib import pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.cluster import KMeans
from sklearn.preprocessing import PolynomialFeatures
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from scipy.interpolate import interp1d
from Categorization import Categorizer
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
import copy
from torch.nn.functional import one_hot
import imblearn
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint
from collections import Counter
from tslearn.clustering import TimeSeriesKMeans, silhouette_score
from tslearn.utils import to_time_series_dataset
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from dask.dataframe import from_pandas
from tsfresh.utilities.distribution import MultiprocessingDistributor
from sklearn.model_selection import StratifiedKFold, train_test_split
import hashlib 
from sklearn.metrics import precision_recall_fscore_support
from importlib import reload
from temporalHelper import TemporalHelper as TH
from concurrent.futures import ProcessPoolExecutor
import os
from torch_explain.models.explainer import Explainer
import time
from torchmetrics.functional import precision_recall
from sklearn.feature_selection import mutual_info_classif, chi2
from sklearn.linear_model import LassoCV
from torch_explain.logic.metrics import formula_consistency
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold
from func_timeout import func_set_timeout, func_timeout, FunctionTimedOut

### Loading in the data

In [2]:
# Read the raw breast cancer data set from CSV.
cancerDF = pd.read_csv('../LEN_Test/data/breast-w_csv.csv')

# Split off the 'Class' label as a one-column DataFrame and keep the
# remaining columns as the feature table.
targetDF = cancerDF.loc[:, ['Class']]
cancerDF = cancerDF.drop('Class', axis='columns')

# Starts out empty; it is not populated in any of the visible cells.
categorisationTypes = dict()

In [3]:
# Display how many rows fall into each distinct value of the target frame
# (i.e. the class balance).
targetDF.value_counts()

Class    
benign       458
malignant    241
dtype: int64

In [4]:
# Display summary statistics for each feature column; a 'count' below the
# number of rows indicates missing values in that column.
cancerDF.describe()

Unnamed: 0,Clump_Thickness,Cell_Size_Uniformity,Cell_Shape_Uniformity,Marginal_Adhesion,Single_Epi_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses
count,699.0,699.0,699.0,699.0,699.0,683.0,699.0,699.0,699.0
mean,4.41774,3.134478,3.207439,2.806867,3.216023,3.544656,3.437768,2.866953,1.589413
std,2.815741,3.051459,2.971913,2.855379,2.2143,3.643857,2.438364,3.053634,1.715078
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0
50%,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0
75%,6.0,5.0,5.0,4.0,4.0,6.0,5.0,4.0,1.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0


In [5]:
# Number of rows that have a missing value in at least one column.
rowsWithNaN = int(cancerDF.isna().any(axis=1).sum())
print(f"{cancerDF.shape[0]} rows in df, {rowsWithNaN} containing NaN values")

699 rows in df, 16 containing NaN values


### Missing values are handled by filling each column with its mode

In [7]:
# Impute every missing entry with the most frequent value of its column.
# Series.mode() may return several values on a tie; the first one is used.
def fillWithColumnMode(column):
    return column.fillna(column.mode()[0])

cancerDF = cancerDF.apply(fillWithColumnMode)

# Cast 'Bare_Nuclei' to 64-bit integers now that it no longer holds NaN.
cancerDF = cancerDF.astype({'Bare_Nuclei': np.int64})

In [8]:
# Display the feature table after the mode-imputation cell.
cancerDF

Unnamed: 0,Clump_Thickness,Cell_Size_Uniformity,Cell_Shape_Uniformity,Marginal_Adhesion,Single_Epi_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1
3,6,8,8,1,3,4,3,7,1
4,4,1,1,3,2,1,3,1,1
...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1
695,2,1,1,1,2,1,1,1,1
696,5,10,10,3,7,3,8,10,2
697,4,8,6,4,3,4,10,6,1


In [70]:
# Display the summary statistics again to check the per-column counts
# after imputation.
cancerDF.describe()

Unnamed: 0,Clump_Thickness,Cell_Size_Uniformity,Cell_Shape_Uniformity,Marginal_Adhesion,Single_Epi_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,4.41774,3.134478,3.207439,2.806867,3.216023,3.486409,3.437768,2.866953,1.589413
std,2.815741,3.051459,2.971913,2.855379,2.2143,3.621929,2.438364,3.053634,1.715078
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0
50%,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0
75%,6.0,5.0,5.0,4.0,4.0,5.0,5.0,4.0,1.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0


In [71]:
# Re-count the rows that still have a missing value in at least one column.
rowsWithNaN = int(cancerDF.isna().any(axis=1).sum())
print(f"{cancerDF.shape[0]} rows in df, {rowsWithNaN} containing NaN values")

699 rows in df, 0 containing NaN values


### One-hot encoding

In [80]:
# One-hot encode every feature. Casting to str first makes get_dummies treat
# each distinct value as its own category, so one indicator column is emitted
# per (feature, value) pair, e.g. 'Clump_Thickness_5'.
# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas
# version; pandas >= 2.0 would otherwise emit booleans, which would be written
# to the CSV as True/False.
cancerDFCategorised = pd.get_dummies(cancerDF.astype(str), dtype=np.uint8)

cancerDFCategorised

Unnamed: 0,Clump_Thickness_1,Clump_Thickness_10,Clump_Thickness_2,Clump_Thickness_3,Clump_Thickness_4,Clump_Thickness_5,Clump_Thickness_6,Clump_Thickness_7,Clump_Thickness_8,Clump_Thickness_9,...,Normal_Nucleoli_9,Mitoses_1,Mitoses_10,Mitoses_2,Mitoses_3,Mitoses_4,Mitoses_5,Mitoses_6,Mitoses_7,Mitoses_8
0,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
694,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
695,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
696,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
697,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [82]:
# Display the target frame before its labels are encoded.
targetDF

Unnamed: 0,Class
0,benign
1,benign
2,benign
3,benign
4,benign
...,...
694,benign
695,benign
696,malignant
697,malignant


### Encoding the target variable

In [83]:
# Encode the class labels as integers: benign -> 0, malignant -> 1.
categories = dict(benign=0, malignant=1)

# Take the first column of targetDF and map its labels; from here on
# targetDF is a Series rather than a DataFrame.
classColumn = targetDF.iloc[:, 0]
targetDF = classColumn.map(categories)

targetDF

0      0
1      0
2      0
3      0
4      0
      ..
694    0
695    0
696    1
697    1
698    1
Name: Class, Length: 699, dtype: int64

In [85]:
# Add the target values to the one-hot feature table as a new column.
# NOTE(review): the column is named 'Mortality14Days' although it holds the
# benign/malignant class of this breast cancer data set — presumably kept for
# compatibility with a downstream consumer; confirm before renaming.
cancerDFCategorised['Mortality14Days'] = targetDF

cancerDFCategorised

Unnamed: 0,Clump_Thickness_1,Clump_Thickness_10,Clump_Thickness_2,Clump_Thickness_3,Clump_Thickness_4,Clump_Thickness_5,Clump_Thickness_6,Clump_Thickness_7,Clump_Thickness_8,Clump_Thickness_9,...,Mitoses_1,Mitoses_10,Mitoses_2,Mitoses_3,Mitoses_4,Mitoses_5,Mitoses_6,Mitoses_7,Mitoses_8,Mortality14Days
0,0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
694,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
695,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
696,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
697,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


In [87]:
# Persist the one-hot encoded features together with the target column.
# The output directory is created first because DataFrame.to_csv does not
# create missing parent directories and would raise otherwise.
os.makedirs("./categorisedData", exist_ok=True)
cancerDFCategorised.to_csv("./categorisedData/breastCancer.csv", index=False)