In [99]:
# import libraries

import pandas as pd
import numpy as np


The dataset has been downloaded from https://archive.ics.uci.edu/ml/datasets/Cervical+cancer+%28Risk+Factors%29#

In [100]:
# read csv

df = pd.read_csv("risk_factors_cervical_cancer.csv")

In [101]:
df.head()

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
2,34,1.0,?,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,?,?,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,?,?,0,0,0,0,0,0,0,0


At first glance, the thing that comes to notice is the '?' present as a value in lots of places.

In [102]:
df.columns

Index(['Age', 'Number of sexual partners', 'First sexual intercourse',
       'Num of pregnancies', 'Smokes', 'Smokes (years)', 'Smokes (packs/year)',
       'Hormonal Contraceptives', 'Hormonal Contraceptives (years)', 'IUD',
       'IUD (years)', 'STDs', 'STDs (number)', 'STDs:condylomatosis',
       'STDs:cervical condylomatosis', 'STDs:vaginal condylomatosis',
       'STDs:vulvo-perineal condylomatosis', 'STDs:syphilis',
       'STDs:pelvic inflammatory disease', 'STDs:genital herpes',
       'STDs:molluscum contagiosum', 'STDs:AIDS', 'STDs:HIV',
       'STDs:Hepatitis B', 'STDs:HPV', 'STDs: Number of diagnosis',
       'STDs: Time since first diagnosis', 'STDs: Time since last diagnosis',
       'Dx:Cancer', 'Dx:CIN', 'Dx:HPV', 'Dx', 'Hinselmann', 'Schiller',
       'Citology', 'Biopsy'],
      dtype='object')

Only the relevant columns are kept and all others are deleted.

In [103]:
df = df.drop(['Smokes (packs/year)', 'STDs:condylomatosis',
       'STDs:cervical condylomatosis', 'STDs:vaginal condylomatosis',
       'STDs:vulvo-perineal condylomatosis', 'STDs:syphilis',
       'STDs:pelvic inflammatory disease', 'STDs:genital herpes',
       'STDs:molluscum contagiosum', 'STDs:AIDS', 'STDs:HIV',
       'STDs:Hepatitis B', 'STDs:HPV', 'Dx:Cancer', 'Dx:CIN', 'Dx:HPV', 'Dx', 'Hinselmann', 'Schiller',
       'Citology'], axis = 1) 

In [104]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 16 columns):
Age                                 858 non-null int64
Number of sexual partners           858 non-null object
First sexual intercourse            858 non-null object
Num of pregnancies                  858 non-null object
Smokes                              858 non-null object
Smokes (years)                      858 non-null object
Hormonal Contraceptives             858 non-null object
Hormonal Contraceptives (years)     858 non-null object
IUD                                 858 non-null object
IUD (years)                         858 non-null object
STDs                                858 non-null object
STDs (number)                       858 non-null object
STDs: Number of diagnosis           858 non-null int64
STDs: Time since first diagnosis    858 non-null object
STDs: Time since last diagnosis     858 non-null object
Biopsy                              858 non-null int64
dt

This makes it clear that there is no missing value present directly, and the '?'s will have to be dealt with separately.

Other than 2 columns, the rest are of object type whereas they should be of numeric type.

In [105]:
# No missing values prsent explicitly

print(df.isnull().sum())


Age                                 0
Number of sexual partners           0
First sexual intercourse            0
Num of pregnancies                  0
Smokes                              0
Smokes (years)                      0
Hormonal Contraceptives             0
Hormonal Contraceptives (years)     0
IUD                                 0
IUD (years)                         0
STDs                                0
STDs (number)                       0
STDs: Number of diagnosis           0
STDs: Time since first diagnosis    0
STDs: Time since last diagnosis     0
Biopsy                              0
dtype: int64


In [106]:
for i in df.columns:
    print(i)
    print(df[i].unique())

Age
[18 15 34 52 46 42 51 26 45 44 27 43 40 41 39 37 38 36 35 33 31 32 30 23
 28 29 20 25 21 24 22 48 19 17 16 14 59 79 84 47 13 70 50 49]
Number of sexual partners
['4.0' '1.0' '5.0' '3.0' '2.0' '6.0' '?' '7.0' '15.0' '8.0' '10.0' '28.0'
 '9.0']
First sexual intercourse
['15.0' '14.0' '?' '16.0' '21.0' '23.0' '17.0' '26.0' '20.0' '25.0' '18.0'
 '27.0' '19.0' '24.0' '32.0' '13.0' '29.0' '11.0' '12.0' '22.0' '28.0'
 '10.0']
Num of pregnancies
['1.0' '4.0' '2.0' '6.0' '3.0' '5.0' '?' '8.0' '7.0' '0.0' '11.0' '10.0']
Smokes
['0.0' '1.0' '?']
Smokes (years)
['0.0' '37.0' '34.0' '1.266972909' '3.0' '12.0' '?' '18.0' '7.0' '19.0'
 '21.0' '15.0' '13.0' '16.0' '8.0' '4.0' '10.0' '22.0' '14.0' '0.5' '11.0'
 '9.0' '2.0' '5.0' '6.0' '1.0' '32.0' '24.0' '28.0' '20.0' '0.16']
Hormonal Contraceptives
['0.0' '1.0' '?']
Hormonal Contraceptives (years)
['0.0' '3.0' '15.0' '2.0' '8.0' '10.0' '5.0' '0.25' '7.0' '22.0' '19.0'
 '0.5' '1.0' '0.58' '9.0' '13.0' '11.0' '4.0' '12.0' '16.0' '0.33' '?'
 '0.16' '

Here, it is observed that almost all the values are given as strings.

The categorical features need to be converted to Boolean type.
The numeric features need to be converted to float type.
The missing values for each column are imputed by the mode(most frequent value), as has been followed in the book.

In [108]:
df = df.replace('?', np.NaN)
df.head()

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,IUD (years),STDs,STDs (number),STDs: Number of diagnosis,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,,,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,,,0
2,34,1.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,,,0
3,52,5.0,16.0,4.0,1.0,37.0,1.0,3.0,0.0,0.0,0.0,0.0,0,,,0
4,46,3.0,21.0,4.0,0.0,0.0,1.0,15.0,0.0,0.0,0.0,0.0,0,,,0


All the '?', i.e., the missing values, are replaced with the numpy NaN representation.  

In [110]:
# converting all values to float

df = df.astype('float64')

In [111]:
# converting categorical features to Boolean

d = {0.0:True, 1.0:False}

df['Smokes'] = df['Smokes'].map(d)
df['Hormonal Contraceptives'] = df['Hormonal Contraceptives'].map(d)
df['IUD'] = df['IUD'].map(d)
df['STDs'] = df['STDs'].map(d)

df.head()

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,IUD (years),STDs,STDs (number),STDs: Number of diagnosis,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Biopsy
0,18.0,4.0,15.0,1.0,True,0.0,True,0.0,True,0.0,True,0.0,0.0,,,0.0
1,15.0,1.0,14.0,1.0,True,0.0,True,0.0,True,0.0,True,0.0,0.0,,,0.0
2,34.0,1.0,,1.0,True,0.0,True,0.0,True,0.0,True,0.0,0.0,,,0.0
3,52.0,5.0,16.0,4.0,False,37.0,False,3.0,True,0.0,True,0.0,0.0,,,0.0
4,46.0,3.0,21.0,4.0,True,0.0,False,15.0,True,0.0,True,0.0,0.0,,,0.0


All that is left to do now is to impute the missing values.

In [112]:
df.mode(axis='rows', dropna=True)

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,IUD (years),STDs,STDs (number),STDs: Number of diagnosis,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Biopsy
0,23.0,2.0,15.0,1.0,True,0.0,False,0.0,True,0.0,True,0.0,0.0,1.0,1.0,0.0


In [113]:
cols = df.columns
df[cols]=df[cols].fillna(df.mode().iloc[0])

print(df.isnull().sum())
# no missing values 

Age                                 0
Number of sexual partners           0
First sexual intercourse            0
Num of pregnancies                  0
Smokes                              0
Smokes (years)                      0
Hormonal Contraceptives             0
Hormonal Contraceptives (years)     0
IUD                                 0
IUD (years)                         0
STDs                                0
STDs (number)                       0
STDs: Number of diagnosis           0
STDs: Time since first diagnosis    0
STDs: Time since last diagnosis     0
Biopsy                              0
dtype: int64


In [114]:
# the final dataset

df.head()

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,IUD (years),STDs,STDs (number),STDs: Number of diagnosis,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Biopsy
0,18.0,4.0,15.0,1.0,True,0.0,True,0.0,True,0.0,True,0.0,0.0,1.0,1.0,0.0
1,15.0,1.0,14.0,1.0,True,0.0,True,0.0,True,0.0,True,0.0,0.0,1.0,1.0,0.0
2,34.0,1.0,15.0,1.0,True,0.0,True,0.0,True,0.0,True,0.0,0.0,1.0,1.0,0.0
3,52.0,5.0,16.0,4.0,False,37.0,False,3.0,True,0.0,True,0.0,0.0,1.0,1.0,0.0
4,46.0,3.0,21.0,4.0,True,0.0,False,15.0,True,0.0,True,0.0,0.0,1.0,1.0,0.0


In [115]:
# saving the dataset

df.to_csv('Cervical_Cancer_Dataset_Cleaned.csv')