# Loading Libraries and Dataset

In [280]:
# Installing and loading libraries
import pandas as pd
import numpy as np

# For imputing missing values
from sklearn.ensemble import RandomForestRegressor

In [281]:
# Loading the CSV dataset
df = pd.read_csv("Dataset/CubanDataset.csv")

# Viewing The Data

In [282]:
# Viewing a portion of the dataset
df.head()

Unnamed: 0,id,age,menarche,menopause,agefirst,children,breastfeeding,nrelbc,biopsies,hyperplasia,...,weight,exercise,alcohol,tobacco,allergies,emotional,depressive,histologicalclass,birads,cancer
0,1,50,11,No,No,0,No,Mother,1.0,Yes,...,83.4,No,Yes,Yes,Rhinitis,Sad,Yes,3.0,3A,Yes
1,2,46,12,No,36,1,3 months,Mother/Sister,2.0,Yes,...,78.2,2,Yes,No,Medicines,Joy,No,3.0,3B,Yes
2,3,47,13,No,26,1,3 months,Sister,1.0,Yes,...,82.6,1,No,Yes,Laryngitis,Sad,No,4.0,3B,Yes
3,4,49,11,47,21,1,No,Daughter,1.0,No,...,79.4,No,Yes,Yes,No,Joy,No,3.0,4B,Yes
4,5,54,14,42,16,1,1 month,Mother,2.0,Yes,...,81.5,No,Yes,No,No,Sad,Yes,4.0,3C,Yes


In [283]:
# Viewing the shape of the dataset
df.shape

(1697, 23)

In [284]:
# Viewing the datatype of each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1697 entries, 0 to 1696
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 1697 non-null   int64  
 1   age                1697 non-null   int64  
 2   menarche           1697 non-null   int64  
 3   menopause          1697 non-null   object 
 4   agefirst           1697 non-null   object 
 5   children           1697 non-null   object 
 6   breastfeeding      1697 non-null   object 
 7   nrelbc             1697 non-null   object 
 8   biopsies           1696 non-null   float64
 9   hyperplasia        1697 non-null   object 
 10  race               1697 non-null   object 
 11  year               1160 non-null   float64
 12  imc                1690 non-null   float64
 13  weight             1687 non-null   float64
 14  exercise           1697 non-null   object 
 15  alcohol            1697 non-null   object 
 16  tobacco            1697 

In [285]:
# Viewing the statistics of numerical data
df.describe()

Unnamed: 0,id,age,menarche,biopsies,year,imc,weight,histologicalclass
count,1697.0,1697.0,1697.0,1696.0,1160.0,1690.0,1687.0,1160.0
mean,849.0,51.483206,11.725987,1.308373,2010.863793,25.534615,70.497214,3.72931
std,490.02602,11.929919,1.839937,1.184392,4.636249,4.961554,12.808854,1.767066
min,1.0,20.0,8.0,0.0,2001.0,5.0,13.0,1.0
25%,425.0,45.0,10.0,0.0,2008.0,22.8,61.9,3.0
50%,849.0,53.0,12.0,1.0,2011.0,25.1,69.2,3.0
75%,1273.0,61.0,13.0,2.0,2015.0,27.3,79.2,4.0
max,1697.0,90.0,17.0,5.0,2018.0,88.8,240.0,11.0


In [286]:
# Viewing the unique values of categorical data
df.select_dtypes(include = "object").apply(pd.unique)

menopause        [No, 47, 42, 44, 46, 43, 51, 40, 41, 48, 39, 4...
agefirst         [No, 36, 26, 21, 16, 20, 27, 24, 30, 35, 25, 1...
children                                    [0, 1, 2, 3, 4, 5, 5+]
breastfeeding    [No, 3 months, 1 month, 2 months, 4 months, 8 ...
nrelbc           [Mother, Mother/Sister, Sister, Daughter, No, ...
hyperplasia                                              [Yes, No]
race                                         [White, Mixed, Black]
exercise                   [No, 2, 1, Diary, 3, 5, 4, NO, 6, 0, 7]
alcohol                                                  [Yes, No]
tobacco                                                  [Yes, No]
allergies        [Rhinitis, Medicines, Laryngitis, No, Dermatit...
emotional                                               [Sad, Joy]
depressive                                               [Yes, No]
birads                            [3A, 3B, 4B, 3C, 5C, 5B, 6, nan]
cancer                                                   [Yes,

In [287]:
# Viewing the amount of distinct values in each column
df.select_dtypes(include = "object").nunique()

menopause        29
agefirst         33
children          7
breastfeeding    42
nrelbc           18
hyperplasia       2
race              3
exercise         11
alcohol           2
tobacco           2
allergies        24
emotional         2
depressive        2
birads            7
cancer            2
dtype: int64

In [288]:
# Viewing the amount of null values in each column
df.isnull().sum()

id                     0
age                    0
menarche               0
menopause              0
agefirst               0
children               0
breastfeeding          0
nrelbc                 0
biopsies               1
hyperplasia            0
race                   0
year                 537
imc                    7
weight                10
exercise               0
alcohol                0
tobacco                0
allergies            276
emotional              0
depressive             0
histologicalclass    537
birads               537
cancer                 0
dtype: int64

# Data Preprocessing

## Cleaning messy data
Columns processed: "menopause", "agefirst", "breastfeeding", "exercise"

In [289]:
# Viewing the unique values of the menopause column
df["menopause"].unique()

array(['No', '47', '42', '44', '46', '43', '51', '40', '41', '48', '39',
       '49', '45', '50', '37', '52', '38', '32', '0', '33', '55', '53',
       '35', '54', '30', '60', '56', '34', '36'], dtype=object)

In [290]:
# Changing menopause column's datatype into integer and replacing the "No" value with -1
df["menopause"] = df["menopause"].replace(["No", "0"], -1).astype(int)
df["menopause"].unique()

array([-1, 47, 42, 44, 46, 43, 51, 40, 41, 48, 39, 49, 45, 50, 37, 52, 38,
       32, 33, 55, 53, 35, 54, 30, 60, 56, 34, 36])

In [291]:
# Viewing the unique values of the agefirst column
df["agefirst"].unique()

array(['No', '36', '26', '21', '16', '20', '27', '24', '30', '35', '25',
       '18', '28', '23', '22', '17', '29', '19', '31', '14', '34', '33',
       '39', '32', '37', '38', '15', '0', '43', '46', '40', '10', '9'],
      dtype=object)

In [292]:
# Changing agefirst column's datatype into integer and replacing the "No" value with -1
df["agefirst"] = df["agefirst"].replace(["No", "0"], -1).astype(int)
df["agefirst"].unique()

array([-1, 36, 26, 21, 16, 20, 27, 24, 30, 35, 25, 18, 28, 23, 22, 17, 29,
       19, 31, 14, 34, 33, 39, 32, 37, 38, 15, 43, 46, 40, 10,  9])

In [293]:
# Viewing the unique values of the breastfeeding column
df["breastfeeding"].unique()

array(['No', '3 months', '1 month', '2 months', '4 months', '8 months',
       '6 months', '5 months', '7 months', '10 months', '9 months', 'No ',
       '1 month ', '12 months', '6', '2', '10', '8', '0', '7', '4', '3',
       '5', '12', '11', '18', '16', '36', '15', '1', '72', '13', '24',
       '9', '48', '14', '17', '22', '26', '25', '28', '21'], dtype=object)

In [294]:
# Cleaning the breastfeeding column's data
# Replacing the "No" and "No " value with -1
df["breastfeeding"] = df["breastfeeding"].replace(["No", "No ", "0"], -1)

# Removing month(s) string suffixes
df["breastfeeding"] = df["breastfeeding"].astype(str).str.replace(r" months?", "", regex = True) # (Operation converts datatype to string)

# Converting column datatype into integer
df["breastfeeding"] = df["breastfeeding"].astype(int)
df["breastfeeding"].unique()

array([-1,  3,  1,  2,  4,  8,  6,  5,  7, 10,  9, 12, 11, 18, 16, 36, 15,
       72, 13, 24, 48, 14, 17, 22, 26, 25, 28, 21])

In [295]:
# Viewing the unique values of the exercise column
df["exercise"].unique()

array(['No', '2', '1', 'Diary', '3', '5', '4', 'NO', '6', '0', '7'],
      dtype=object)

In [296]:
# Viewing the count of each unique value of the exercise column
df["exercise"].value_counts()

exercise
No       712
0        394
2        206
3        162
1        113
5         34
Diary     28
4         24
7         14
NO         5
6          5
Name: count, dtype: int64

In [297]:
# Replacing the 0, "No", and "NO" value with -1
df["exercise"] = df["exercise"].replace(["No", "NO", "0"], -1)

# Replacing the "Diary" value with 0 and changing exercise column's datatype into integer
df["exercise"] = df["exercise"].replace("Diary", 0).astype(int)

df["exercise"].unique()

array([-1,  2,  1,  0,  3,  5,  4,  6,  7])

## Creating new binary columns to help identify data with multiple values
Columns processed: "nrelbc", "allergies"

In [298]:
# Viewing the unique values of the nrelbc column
df["nrelbc"].unique()

array(['Mother', 'Mother/Sister', 'Sister', 'Daughter', 'No',
       'Mother/Daughter', 'Sister/Daughter', 'Cousin', 'Aunt',
       'Grandmother', 'Mother/Grandmother', 'Aunt/Cousin',
       'Grandmother/Aunt', 'Sister/Grandmother', 'Grandmother ',
       'Mother/Aunt', 'Mother/Aunt/Cousin',
       'Mother/Grandmother/Aunt/Cousin'], dtype=object)

In [299]:
# Creating new columns to identify which relatives has experienced breast cancer before
# Identifying available values
familyMembers = ["Mother", "Sister", "Daughter", "Cousin", "Aunt", "Grandmother", "No"]

# Creating the new columns
for member in familyMembers:
    df[member] = df["nrelbc"].str.contains(member).astype(int)

# Dropping the original column
df.drop(columns = ["nrelbc"], inplace = True)

In [300]:
# Renaming the result columns
df = df.rename(columns = {
    "Mother" : "nrelbc_mother",
    "Sister" : "nrelbc_sister",
    "Daughter" : "nrelbc_daughter",
    "Cousin" : "nrelbc_cousin",
    "Aunt" : "nrelbc_aunt",
    "Grandmother" : "nrelbc_grandma",
    "No" : "nrelbc_absent"
})

In [301]:
# Viewing the amount of unique values in each column
df.nunique()

id                   1697
age                    61
menarche               10
menopause              28
agefirst               32
children                7
breastfeeding          28
biopsies                6
hyperplasia             2
race                    3
year                   18
imc                   188
weight                292
exercise                9
alcohol                 2
tobacco                 2
allergies              24
emotional               2
depressive              2
histologicalclass      11
birads                  7
cancer                  2
nrelbc_mother           2
nrelbc_sister           2
nrelbc_daughter         2
nrelbc_cousin           2
nrelbc_aunt             2
nrelbc_grandma          2
nrelbc_absent           2
dtype: int64

In [302]:
# Viewing data with missing allergies value
df[df["allergies"].isna()]

Unnamed: 0,id,age,menarche,menopause,agefirst,children,breastfeeding,biopsies,hyperplasia,race,...,histologicalclass,birads,cancer,nrelbc_mother,nrelbc_sister,nrelbc_daughter,nrelbc_cousin,nrelbc_aunt,nrelbc_grandma,nrelbc_absent
1160,1161,61,13,48,29,1,6,0.0,No,Mixed,...,,,No,0,0,0,0,0,0,1
1165,1166,54,9,33,29,2,7,0.0,No,Mixed,...,,,No,0,0,0,0,0,0,1
1167,1168,62,11,52,20,3,4,0.0,No,Mixed,...,,,No,0,0,0,0,0,0,1
1168,1169,55,10,53,23,2,3,0.0,No,Mixed,...,,,No,0,0,0,0,0,0,1
1169,1170,74,10,50,25,2,7,0.0,No,Black,...,,,No,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1687,1688,20,13,-1,-1,0,-1,0.0,No,Mixed,...,,,No,0,0,0,0,0,0,1
1689,1690,45,12,-1,26,2,8,0.0,No,White,...,,,No,0,0,0,0,0,0,1
1690,1691,21,12,-1,-1,0,-1,0.0,No,White,...,,,No,0,0,0,0,0,0,1
1691,1692,21,12,-1,-1,0,-1,0.0,No,Black,...,,,No,0,0,0,0,0,0,1


In [303]:
# Handling missing values in the allergies data
# Replacing missing values with the "No" value
df.fillna({"allergies" : "No"}, inplace = True)

# Viewing the amount of missing values in the allergies data
df["allergies"].isnull().sum()

0

In [304]:
# Viewing the unique values of the allergies column
df["allergies"].unique()

array(['Rhinitis', 'Medicines', 'Laryngitis', 'No', 'Dermatitis', 'Other',
       'Medicines/Other', 'Dermatitis/Rhinitis', 'Medicines/Rhinitis',
       'Medicines/Rhinitis/Laryngitis', 'Medicines/Rhinitis/Other',
       'Rhinitis/Laryngitis', 'Dermatitis/Rhinitis/Laryngitis',
       'Rhinitis/Other', 'Dermatitis/Rhinitis/Other',
       'Rhinitis/Laryngitis/Medicines', 'Laryngitis/Other',
       'Dermatitis/Rhinitis/Medicines', 'Rhinitis/Medicines',
       'Dermatitis/Medicines/Other', 'Laryngitis/Medicines',
       'Rhinitis/Laryngitis/Other', 'Dermatitis/Laryngitis',
       'Dermatitis/Rhinitis/Laryngitis/Medicines/Other'], dtype=object)

In [305]:
# Creating new columns to identify which allergies exist in patients
# Identifying available values
allergies = ["Rhinitis", "Medicines", "Laryngitis", "Dermatitis", "Other", "No"]

# Creating the new columns
for allergy in allergies:
    df[allergy] = df["allergies"].str.contains(allergy).astype(int)

# Dropping the original column
df.drop(columns = ["allergies"], inplace = True)

In [306]:
# Renaming the result columns
df = df.rename(columns = {
    "Rhinitis" : "rhinitis_allergy",
    "Medicines" : "medicines_allergy",
    "Laryngitis" : "laryngitis_allergy",
    "Dermatitis" : "dermatitis_allergy",
    "Other" : "other_allergy",
    "No" : "no_allergy"
})

In [307]:
# Viewing the amount of unique values in each column
df.nunique()

id                    1697
age                     61
menarche                10
menopause               28
agefirst                32
children                 7
breastfeeding           28
biopsies                 6
hyperplasia              2
race                     3
year                    18
imc                    188
weight                 292
exercise                 9
alcohol                  2
tobacco                  2
emotional                2
depressive               2
histologicalclass       11
birads                   7
cancer                   2
nrelbc_mother            2
nrelbc_sister            2
nrelbc_daughter          2
nrelbc_cousin            2
nrelbc_aunt              2
nrelbc_grandma           2
nrelbc_absent            2
rhinitis_allergy         2
medicines_allergy        2
laryngitis_allergy       2
dermatitis_allergy       2
other_allergy            2
no_allergy               2
dtype: int64

## Handling missing values for data with low significance
Columns processed: "year", "biopsies", "imc", "weight"

In [308]:
# Handling missing values in the year data
yearMedian = df["year"].median()
df.fillna({"year" : yearMedian}, inplace = True)

# Viewing the amount of null values in the year data
df["year"].isnull().sum()

0

In [309]:
# Removing rows with missing values in all columns except for the histologicalclass and birads columns
df = df.drop(columns = ["histologicalclass", "birads"]).dropna().join(df["histologicalclass"]).join(df["birads"])
# Columns affected: "biopsies", "imc", "weight"

In [310]:
# Viewing the amount of null values in each column
df.isnull().sum()

id                      0
age                     0
menarche                0
menopause               0
agefirst                0
children                0
breastfeeding           0
biopsies                0
hyperplasia             0
race                    0
year                    0
imc                     0
weight                  0
exercise                0
alcohol                 0
tobacco                 0
emotional               0
depressive              0
cancer                  0
nrelbc_mother           0
nrelbc_sister           0
nrelbc_daughter         0
nrelbc_cousin           0
nrelbc_aunt             0
nrelbc_grandma          0
nrelbc_absent           0
rhinitis_allergy        0
medicines_allergy       0
laryngitis_allergy      0
dermatitis_allergy      0
other_allergy           0
no_allergy              0
histologicalclass     526
birads                526
dtype: int64

## Preparing data needed for imputation

In [311]:
# Viewing which columns are categorical
df.select_dtypes(include = "object").apply(pd.unique)

children                 [0, 1, 2, 3, 4, 5, 5+]
hyperplasia                           [Yes, No]
race                      [White, Mixed, Black]
alcohol                               [Yes, No]
tobacco                               [Yes, No]
emotional                            [Sad, Joy]
depressive                            [Yes, No]
cancer                                [Yes, No]
birads         [3A, 3B, 4B, 3C, 5C, 5B, 6, nan]
dtype: object

In [312]:
# Encoding categorical data using one hot encoding
oneHotChildren = pd.get_dummies(df["children"], drop_first = True)
oneHotHyperplasia = pd.get_dummies(df["hyperplasia"], drop_first = True)
oneHotRace = pd.get_dummies(df["race"], drop_first = True)
oneHotAlcohol = pd.get_dummies(df["alcohol"], drop_first = True)
oneHotTobacco = pd.get_dummies(df["tobacco"], drop_first = True)
oneHotEmotional = pd.get_dummies(df["emotional"], drop_first = True)
oneHotDepressive = pd.get_dummies(df["depressive"], drop_first = True)

In [313]:
# Renaming the columns
oneHotChildren = oneHotChildren.rename(columns = {
    "1" : "1_children",
    "2" : "2_children",
    "3" : "3_children",
    "4" : "4_children",
    "5" : "5_children",
    "5+" : "more_than_5_children"
})

oneHotHyperplasia = oneHotHyperplasia.rename(columns = {
    "Yes" : "hyperplasia"
})

oneHotRace = oneHotRace.rename(columns = {
    "Mixed" : "is_mixed_race",
    "White" : "is_white_race"
})

oneHotAlcohol = oneHotAlcohol.rename(columns = {
    "Yes" : "consumed_alcohol"
})

oneHotTobacco = oneHotTobacco.rename(columns = {
    "Yes" : "consumed_tobacco"
})

oneHotEmotional = oneHotEmotional.rename(columns = {
    "Sad" : "is_sad"
})

oneHotDepressive = oneHotDepressive.rename(columns = {
    "Yes" : "is_depressive"
})

In [314]:
# Selecting relevant non categorical features for imputation
relevantDf = df.select_dtypes(exclude = "object")
relevantDf["birads"] = df["birads"]

# Combining all relevant features and newly encoded data into one dataframe
preppedDf = pd.concat([relevantDf, oneHotChildren, oneHotHyperplasia, oneHotRace, oneHotAlcohol, oneHotTobacco, oneHotEmotional, oneHotDepressive], axis = 1)
preppedDf.nunique()

id                      1686
age                       61
menarche                  10
menopause                 28
agefirst                  32
breastfeeding             28
biopsies                   6
year                      18
imc                      188
weight                   292
exercise                   9
nrelbc_mother              2
nrelbc_sister              2
nrelbc_daughter            2
nrelbc_cousin              2
nrelbc_aunt                2
nrelbc_grandma             2
nrelbc_absent              2
rhinitis_allergy           2
medicines_allergy          2
laryngitis_allergy         2
dermatitis_allergy         2
other_allergy              2
no_allergy                 2
histologicalclass         11
birads                     7
1_children                 2
2_children                 2
3_children                 2
4_children                 2
5_children                 2
more_than_5_children       2
hyperplasia                2
is_mixed_race              2
is_white_race 

## Handling histologicalclass missing values using random forest imputation

In [315]:
# Selecting required columns for imputation
histImputeDf = preppedDf.drop(["birads"], axis = 1)
histImputeDf

Unnamed: 0,id,age,menarche,menopause,agefirst,breastfeeding,biopsies,year,imc,weight,...,4_children,5_children,more_than_5_children,hyperplasia,is_mixed_race,is_white_race,consumed_alcohol,consumed_tobacco,is_sad,is_depressive
0,1,50,11,-1,-1,-1,1.0,2011.0,27.6,83.4,...,False,False,False,True,False,True,True,True,True,True
1,2,46,12,-1,36,3,2.0,2013.0,27.3,78.2,...,False,False,False,True,True,False,True,False,False,False
2,3,47,13,-1,26,3,1.0,2011.0,24.6,82.6,...,False,False,False,True,False,False,False,True,True,False
3,4,49,11,47,21,-1,1.0,2011.0,28.6,79.4,...,False,False,False,False,False,False,True,True,False,False
4,5,54,14,42,16,1,2.0,2007.0,28.4,81.5,...,False,False,False,True,False,True,True,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1692,1693,42,12,32,20,3,0.0,2011.0,22.0,60.0,...,False,False,False,False,False,True,True,False,False,False
1693,1694,21,12,-1,-1,-1,0.0,2011.0,22.9,70.0,...,False,False,False,False,False,True,False,False,False,False
1694,1695,21,9,-1,-1,-1,0.0,2011.0,20.7,55.0,...,False,False,False,False,False,True,True,True,False,False
1695,1696,25,12,-1,-1,-1,0.0,2011.0,23.9,69.0,...,False,False,False,False,True,False,True,True,False,False


In [316]:
# Splitting data into complete data and data with missing histologicalclass values
histTrainDf = histImputeDf.loc[~df.isnull().any(axis = 1)]
histTestDf = histImputeDf.loc[df.isnull().any(axis = 1)]

In [317]:
# Initializing the model
rf_hist = RandomForestRegressor()

# Selecting relevant features
features = histImputeDf.drop(columns = ["histologicalclass", "id"]).columns.tolist()

# Training the model
rf_hist.fit(histTrainDf[features], histTrainDf["histologicalclass"])
# https://www.geeksforgeeks.org/handling-missing-values-with-random-forest/

In [318]:
# Prediciting the missing histologicalclass values
pred_hist = rf_hist.predict(histTestDf[features])
pred_hist

array([4.32, 3.87, 4.16, 4.54, 3.39, 4.17, 5.7 , 4.93, 4.89, 4.35, 5.07,
       4.45, 3.74, 3.95, 4.4 , 4.85, 5.26, 4.78, 4.2 , 3.92, 3.79, 4.02,
       3.65, 5.91, 4.29, 4.48, 6.04, 5.84, 4.63, 4.71, 4.45, 3.8 , 3.66,
       4.42, 4.4 , 3.72, 4.13, 4.46, 4.83, 5.63, 5.2 , 3.94, 4.8 , 4.58,
       5.81, 4.48, 5.76, 4.67, 3.55, 5.07, 5.04, 4.36, 5.24, 5.89, 4.1 ,
       3.79, 4.12, 4.12, 4.66, 5.1 , 3.22, 4.35, 4.51, 5.41, 4.2 , 4.57,
       3.82, 4.01, 3.53, 3.9 , 4.02, 3.77, 4.51, 4.18, 3.69, 3.84, 3.69,
       3.94, 5.02, 4.66, 4.23, 5.26, 4.  , 5.02, 3.97, 5.33, 4.55, 3.9 ,
       4.71, 3.83, 4.56, 4.94, 3.91, 4.05, 3.99, 3.55, 5.22, 4.45, 4.47,
       3.52, 3.87, 3.73, 4.25, 4.41, 5.87, 4.59, 4.98, 5.37, 4.  , 5.34,
       4.02, 3.65, 4.98, 4.46, 3.52, 4.02, 5.01, 3.23, 4.62, 4.32, 4.08,
       3.78, 4.36, 3.82, 5.05, 4.62, 3.82, 5.18, 3.68, 4.  , 3.83, 4.29,
       3.63, 3.79, 3.52, 5.2 , 5.21, 3.97, 4.05, 4.35, 5.95, 4.39, 5.2 ,
       3.92, 4.93, 4.64, 4.96, 4.69, 3.78, 4.06, 4.

In [319]:
# Rounding float values from prediction and changing the values to integer
pred_hist = pred_hist.round(0).astype(int)

In [320]:
# Joining the results with the test dataframe
histTestDf.loc[:, "histologicalclass"] = pred_hist
histTestDf.loc[:, "histologicalclass"] = histTestDf["histologicalclass"].astype(int)
histTestDf["histologicalclass"].value_counts()

histologicalclass
4.0    287
5.0    178
6.0     39
3.0     17
7.0      5
Name: count, dtype: int64

In [321]:
# Joining the test dataframe with the training dataframe
histTrainDf.loc[:, "histologicalclass"] = histTrainDf["histologicalclass"].astype(int)
newImputedDf = pd.concat([histTrainDf, histTestDf], ignore_index = True)
newImputedDf = newImputedDf.sort_values(by = "id", ascending = True)

# Rejoining the birads column that got dropped
newImputedDf["birads"] = preppedDf["birads"]

## Handling birads missing values using random forest imputation