# Loading Libraries and Dataset

In [2]:
# Loading libraries
import pandas as pd
import numpy as np

In [3]:
# Reading the dataset from its CSV file into a DataFrame
datasetPath = "Dataset/CubanDataset.csv"
df = pd.read_csv(datasetPath)

# Viewing The Data

In [4]:
# Previewing the first five rows of the dataset
df.head(n=5)

Unnamed: 0,id,age,menarche,menopause,agefirst,children,breastfeeding,nrelbc,biopsies,hyperplasia,...,weight,exercise,alcohol,tobacco,allergies,emotional,depressive,histologicalclass,birads,cancer
0,1,50,11,No,No,0,No,Mother,1.0,Yes,...,83.4,No,Yes,Yes,Rhinitis,Sad,Yes,3.0,3A,Yes
1,2,46,12,No,36,1,3 months,Mother/Sister,2.0,Yes,...,78.2,2,Yes,No,Medicines,Joy,No,3.0,3B,Yes
2,3,47,13,No,26,1,3 months,Sister,1.0,Yes,...,82.6,1,No,Yes,Laryngitis,Sad,No,4.0,3B,Yes
3,4,49,11,47,21,1,No,Daughter,1.0,No,...,79.4,No,Yes,Yes,No,Joy,No,3.0,4B,Yes
4,5,54,14,42,16,1,1 month,Mother,2.0,Yes,...,81.5,No,Yes,No,No,Sad,Yes,4.0,3C,Yes


In [5]:
# Viewing the shape of the dataset as a (rows, columns) tuple
df.shape

(1697, 23)

In [6]:
# Viewing the datatype and non-null count of each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1697 entries, 0 to 1696
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 1697 non-null   int64  
 1   age                1697 non-null   int64  
 2   menarche           1697 non-null   int64  
 3   menopause          1697 non-null   object 
 4   agefirst           1697 non-null   object 
 5   children           1697 non-null   object 
 6   breastfeeding      1697 non-null   object 
 7   nrelbc             1697 non-null   object 
 8   biopsies           1696 non-null   float64
 9   hyperplasia        1697 non-null   object 
 10  race               1697 non-null   object 
 11  year               1160 non-null   float64
 12  imc                1690 non-null   float64
 13  weight             1687 non-null   float64
 14  exercise           1697 non-null   object 
 15  alcohol            1697 non-null   object 
 16  tobacco            1697 

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numerical columns
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,id,age,menarche,biopsies,year,imc,weight,histologicalclass
count,1697.0,1697.0,1697.0,1696.0,1160.0,1690.0,1687.0,1160.0
mean,849.0,51.483206,11.725987,1.308373,2010.863793,25.534615,70.497214,3.72931
std,490.02602,11.929919,1.839937,1.184392,4.636249,4.961554,12.808854,1.767066
min,1.0,20.0,8.0,0.0,2001.0,5.0,13.0,1.0
25%,425.0,45.0,10.0,0.0,2008.0,22.8,61.9,3.0
50%,849.0,53.0,12.0,1.0,2011.0,25.1,69.2,3.0
75%,1273.0,61.0,13.0,2.0,2015.0,27.3,79.2,4.0
max,1697.0,90.0,17.0,5.0,2018.0,88.8,240.0,11.0


In [8]:
# Listing the distinct values found in every object-typed (categorical) column
objectColumnsDf = df.select_dtypes(include=["object"])
objectColumnsDf.apply(pd.unique)

menopause        [No, 47, 42, 44, 46, 43, 51, 40, 41, 48, 39, 4...
agefirst         [No, 36, 26, 21, 16, 20, 27, 24, 30, 35, 25, 1...
children                                    [0, 1, 2, 3, 4, 5, 5+]
breastfeeding    [No, 3 months, 1 month, 2 months, 4 months, 8 ...
nrelbc           [Mother, Mother/Sister, Sister, Daughter, No, ...
hyperplasia                                              [Yes, No]
race                                         [White, Mixed, Black]
exercise                   [No, 2, 1, Diary, 3, 5, 4, NO, 6, 0, 7]
alcohol                                                  [Yes, No]
tobacco                                                  [Yes, No]
allergies        [Rhinitis, Medicines, Laryngitis, No, Dermatit...
emotional                                               [Sad, Joy]
depressive                                               [Yes, No]
birads                            [3A, 3B, 4B, 3C, 5C, 5B, 6, nan]
cancer                                                   [Yes,

In [9]:
# Counting the distinct (non-null) values in every object-typed column
objectColumnsOnly = df.select_dtypes(include=["object"])
objectColumnsOnly.nunique()

menopause        29
agefirst         33
children          7
breastfeeding    42
nrelbc           18
hyperplasia       2
race              3
exercise         11
alcohol           2
tobacco           2
allergies        24
emotional         2
depressive        2
birads            7
cancer            2
dtype: int64

In [10]:
# Counting the missing values per column
df.isna().sum()

id                     0
age                    0
menarche               0
menopause              0
agefirst               0
children               0
breastfeeding          0
nrelbc                 0
biopsies               1
hyperplasia            0
race                   0
year                 537
imc                    7
weight                10
exercise               0
alcohol                0
tobacco                0
allergies            276
emotional              0
depressive             0
histologicalclass    537
birads               537
cancer                 0
dtype: int64

# Data Preprocessing

In [11]:
# Imputing missing year values with the median of the column
yearMedian = df["year"].median()
df["year"] = df["year"].fillna(yearMedian)

# Confirming that no null values remain in the year data
df["year"].isna().sum()

0

In [12]:
# Showing the rows whose allergies value is missing
missingAllergiesMask = df["allergies"].isna()
df.loc[missingAllergiesMask]

Unnamed: 0,id,age,menarche,menopause,agefirst,children,breastfeeding,nrelbc,biopsies,hyperplasia,...,weight,exercise,alcohol,tobacco,allergies,emotional,depressive,histologicalclass,birads,cancer
1160,1161,61,13,48,29,1,6,No,0.0,No,...,90.0,0,No,No,,Joy,No,,,No
1165,1166,54,9,33,29,2,7,No,0.0,No,...,78.0,0,No,Yes,,Joy,No,,,No
1167,1168,62,11,52,20,3,4,No,0.0,No,...,58.0,0,No,Yes,,Joy,No,,,No
1168,1169,55,10,53,23,2,3,No,0.0,No,...,70.0,0,No,Yes,,Joy,No,,,No
1169,1170,74,10,50,25,2,7,No,0.0,No,...,65.0,0,No,No,,Joy,No,,,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1687,1688,20,13,0,0,0,0,No,0.0,No,...,50.0,5,Yes,No,,Joy,Yes,,,No
1689,1690,45,12,0,26,2,8,No,0.0,No,...,65.0,1,No,No,,Joy,No,,,No
1690,1691,21,12,0,0,0,0,No,0.0,No,...,60.0,5,No,No,,Joy,No,,,No
1691,1692,21,12,0,0,0,0,No,0.0,No,...,67.0,5,Yes,No,,Joy,Yes,,,No


In [13]:
# Imputing the allergies data: a missing entry is treated as the "No" value
df["allergies"] = df["allergies"].fillna("No")

# Confirming that no missing values remain in the allergies data
df["allergies"].isna().sum()

0

## Handling histologicalclass missing values using random forest imputation

In [14]:
# Building the imputation frame: every column except the id and birads columns
histImputeDf = df.drop(columns=["id", "birads"])
histImputeDf

Unnamed: 0,age,menarche,menopause,agefirst,children,breastfeeding,nrelbc,biopsies,hyperplasia,race,...,imc,weight,exercise,alcohol,tobacco,allergies,emotional,depressive,histologicalclass,cancer
0,50,11,No,No,0,No,Mother,1.0,Yes,White,...,27.6,83.4,No,Yes,Yes,Rhinitis,Sad,Yes,3.0,Yes
1,46,12,No,36,1,3 months,Mother/Sister,2.0,Yes,Mixed,...,27.3,78.2,2,Yes,No,Medicines,Joy,No,3.0,Yes
2,47,13,No,26,1,3 months,Sister,1.0,Yes,Black,...,24.6,82.6,1,No,Yes,Laryngitis,Sad,No,4.0,Yes
3,49,11,47,21,1,No,Daughter,1.0,No,Black,...,28.6,79.4,No,Yes,Yes,No,Joy,No,3.0,Yes
4,54,14,42,16,1,1 month,Mother,2.0,Yes,White,...,28.4,81.5,No,Yes,No,No,Sad,Yes,4.0,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1692,42,12,32,20,2,3,No,0.0,No,White,...,22.0,60.0,0,Yes,No,Other,Joy,No,,No
1693,21,12,0,0,0,0,No,0.0,No,White,...,22.9,70.0,0,No,No,No,Joy,No,,No
1694,21,9,0,0,0,0,No,0.0,No,White,...,20.7,55.0,3,Yes,Yes,Other,Joy,No,,No
1695,25,12,0,0,0,0,No,0.0,No,Mixed,...,23.9,69.0,5,Yes,Yes,Other,Joy,No,,No


In [17]:
# Removing rows that contain null values in columns other than histologicalclass.
# The target column is set aside, incomplete predictor rows are dropped, and the
# target is joined back on the index (so it ends up as the last column).
histTargetColumn = histImputeDf["histologicalclass"]
histPredictorsDf = histImputeDf.drop(columns="histologicalclass").dropna()
histImputeDf = histPredictorsDf.join(histTargetColumn)
histImputeDf.isnull().sum()

age                    0
menarche               0
menopause              0
agefirst               0
children               0
breastfeeding          0
nrelbc                 0
biopsies               0
hyperplasia            0
race                   0
year                   0
imc                    0
weight                 0
exercise               0
alcohol                0
tobacco                0
allergies              0
emotional              0
depressive             0
cancer                 0
histologicalclass    526
dtype: int64

In [24]:
# Splitting data into complete data and data with missing histologicalclass values.
# The mask is built from histImputeDf's own histologicalclass column rather than from
# df: df still contains columns that histImputeDf dropped (e.g. birads), and df is
# modified in place by other cells, so a mask derived from df could misclassify rows
# or change depending on which cells have already been run.
histMissingMask = histImputeDf["histologicalclass"].isna()
histImputeMissingDf = histImputeDf.loc[histMissingMask]
histImputeCompleteDf = histImputeDf.loc[~histMissingMask]

In [34]:
# TODO: encode categorical data (cell not implemented yet)

In [35]:
# TODO: fit the random forest imputation model (cell not implemented yet)
# Reference: https://www.geeksforgeeks.org/handling-missing-values-with-random-forest/

## Handling birads missing values using random forest imputation

In [38]:
# Showing the rows whose histologicalclass value is missing
missingHistMask = df["histologicalclass"].isna()
df.loc[missingHistMask]

Unnamed: 0,id,age,menarche,menopause,agefirst,children,breastfeeding,nrelbc,biopsies,hyperplasia,...,weight,exercise,alcohol,tobacco,allergies,emotional,depressive,histologicalclass,birads,cancer
1160,1161,61,13,48,29,1,6,No,0.0,No,...,90.0,0,No,No,No,Joy,No,,,No
1161,1162,73,13,50,30,1,2,No,0.0,No,...,80.0,0,No,Yes,Other,Joy,No,,,No
1162,1163,57,12,45,23,4,10,Mother,0.0,No,...,70.0,0,No,No,Rhinitis,Joy,No,,,No
1163,1164,48,10,32,18,3,8,No,0.0,No,...,75.0,0,No,Yes,Medicines,Joy,No,,,No
1164,1165,42,12,0,0,0,0,No,0.0,No,...,83.0,2,No,No,Medicines,Joy,No,,,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1692,1693,42,12,32,20,2,3,No,0.0,No,...,60.0,0,Yes,No,Other,Joy,No,,,No
1693,1694,21,12,0,0,0,0,No,0.0,No,...,70.0,0,No,No,No,Joy,No,,,No
1694,1695,21,9,0,0,0,0,No,0.0,No,...,55.0,3,Yes,Yes,Other,Joy,No,,,No
1695,1696,25,12,0,0,0,0,No,0.0,No,...,69.0,5,Yes,Yes,Other,Joy,No,,,No


In [39]:
# Filling the missing histologicalclass entries with the constant 0.0
df["histologicalclass"] = df["histologicalclass"].fillna(0.0)

# Confirming that no missing values remain in the histologicalclass data
df["histologicalclass"].isna().sum()

0

In [40]:
# Showing the rows whose birads value is missing
missingBiradsMask = df["birads"].isna()
df.loc[missingBiradsMask]

Unnamed: 0,id,age,menarche,menopause,agefirst,children,breastfeeding,nrelbc,biopsies,hyperplasia,...,weight,exercise,alcohol,tobacco,allergies,emotional,depressive,histologicalclass,birads,cancer
1160,1161,61,13,48,29,1,6,No,0.0,No,...,90.0,0,No,No,No,Joy,No,0.0,,No
1161,1162,73,13,50,30,1,2,No,0.0,No,...,80.0,0,No,Yes,Other,Joy,No,0.0,,No
1162,1163,57,12,45,23,4,10,Mother,0.0,No,...,70.0,0,No,No,Rhinitis,Joy,No,0.0,,No
1163,1164,48,10,32,18,3,8,No,0.0,No,...,75.0,0,No,Yes,Medicines,Joy,No,0.0,,No
1164,1165,42,12,0,0,0,0,No,0.0,No,...,83.0,2,No,No,Medicines,Joy,No,0.0,,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1692,1693,42,12,32,20,2,3,No,0.0,No,...,60.0,0,Yes,No,Other,Joy,No,0.0,,No
1693,1694,21,12,0,0,0,0,No,0.0,No,...,70.0,0,No,No,No,Joy,No,0.0,,No
1694,1695,21,9,0,0,0,0,No,0.0,No,...,55.0,3,Yes,Yes,Other,Joy,No,0.0,,No
1695,1696,25,12,0,0,0,0,No,0.0,No,...,69.0,5,Yes,Yes,Other,Joy,No,0.0,,No


In [41]:
# Handling missing values in the birads data