In [1]:
import pandas as pd

In [27]:
# Load the cervical cancer (risk factors) working data set into a DataFrame.
# Raw files:   http://archive.ics.uci.edu/ml/machine-learning-databases/00383/
# Description: http://archive.ics.uci.edu/ml/datasets/Cervical+cancer+%28Risk+Factors%29#
# The path is relative, so it is resolved against the current working directory.
data = pd.read_csv(filepath_or_buffer='risk_factors_cervical_cancer_working_dataset.csv')

In [28]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0,0,0,0,0,0,0,0
2,34,1.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,,,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,,,0,0,0,0,0,0,0,0


#  Cleaning up the data

In [29]:
# Preview the frame without the columns that consist entirely of NaN values
# (how='all'). In the recorded output this hides:
#   - STDs: Time since first diagnosis
#   - STDs: Time since last diagnosis
# dropna() returns a new DataFrame and the result is NOT assigned here, so
# `data` itself is left unchanged by this cell. The two columns are actually
# removed by the explicit drop() in the "Removing unwanted columns" cell.
# .head() keeps the preview short instead of rendering the whole frame.
data.dropna(axis=1, how='all').head()

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs:HPV,STDs: Number of diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.000000,0.00,0.0,0.00,0.0,...,0.0,0,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.000000,0.00,0.0,0.00,0.0,...,0.0,0,0,0,0,0,0,0,0,0
2,34,1.0,,1.0,0.0,0.000000,0.00,0.0,0.00,0.0,...,0.0,0,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.000000,37.00,1.0,3.00,0.0,...,0.0,0,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.000000,0.00,1.0,15.00,0.0,...,0.0,0,0,0,0,0,0,0,0,0
5,42,3.0,23.0,2.0,0.0,0.000000,0.00,0.0,0.00,0.0,...,0.0,0,0,0,0,0,0,0,0,0
6,51,3.0,17.0,6.0,1.0,34.000000,3.40,0.0,0.00,1.0,...,0.0,0,0,0,0,0,1,1,0,1
7,26,1.0,26.0,3.0,0.0,0.000000,0.00,1.0,2.00,1.0,...,0.0,0,0,0,0,0,0,0,0,0
8,45,1.0,20.0,5.0,0.0,0.000000,0.00,0.0,0.00,0.0,...,0.0,0,1,0,1,1,0,0,0,0
9,44,3.0,15.0,,1.0,1.266973,2.80,0.0,0.00,,...,0.0,0,0,0,0,0,0,0,0,0


In [30]:
# Drop the columns that are not used in the rest of the analysis:
#   - First sexual intercourse
#   - IUD, IUD (years)
#   - STDs (number)
#   - Smokes (years), Smokes (packs/year)
#   - STDs:cervical condylomatosis
#   - STDs:vaginal condylomatosis
#   - STDs:vulvo-perineal condylomatosis
#   - STDs:syphilis
#   - STDs:genital herpes
#   - STDs:Hepatitis B
#   - STDs: Number of diagnosis
#   - STDs: Time since first diagnosis
#   - STDs: Time since last diagnosis
#   - Dx
#   - Hinselmann
#   - Schiller
#   - Biopsy
#
# drop() raises a KeyError if any of these labels is missing from the frame.
data = data.drop(
    columns=[
        'First sexual intercourse',
        'IUD',
        'IUD (years)',
        'STDs (number)',
        'Smokes (years)',
        'Smokes (packs/year)',
        'STDs:cervical condylomatosis',
        'STDs:vaginal condylomatosis',
        'STDs:vulvo-perineal condylomatosis',
        'STDs:syphilis',
        'STDs:genital herpes',
        'STDs:Hepatitis B',
        'STDs: Number of diagnosis',
        'STDs: Time since first diagnosis',
        'STDs: Time since last diagnosis',
        'Dx',
        'Hinselmann',
        'Schiller',
        'Biopsy',
    ]
)

In [23]:
# We are left with
# - Age
# - Smokes (bool)
# - Number of sexual partners
# - Hormonal Contraceptives (bool) and Hormonal Contraceptives (years)
# - (bool) STDs  (the 'STDs (number)' count column was dropped above)
# - (bool) STDs:condylomatosis
# - (bool) STDs:pelvic inflammatory disease
# - (bool) STDs:molluscum contagiosum
# - (bool) STDs:AIDS
# - (bool) STDs:HIV
# - (bool) STDs:HPV
# - (bool) Dx:Cancer
# - (bool) Dx:CIN
# - (bool) Dx:HPV
# - (int) Num of pregnancies

# Our target variable is
# - (bool) Citology  (the cytology result; 'Citology' is the column's spelling in the data set)

In [31]:
# Preview the remaining columns before the rows containing NaN values are
# dropped (the row-wise dropna itself happens in the next cell).

data.head(n=5)

Unnamed: 0,Age,Number of sexual partners,Num of pregnancies,Smokes,Hormonal Contraceptives,Hormonal Contraceptives (years),STDs,STDs:condylomatosis,STDs:pelvic inflammatory disease,STDs:molluscum contagiosum,STDs:AIDS,STDs:HIV,STDs:HPV,Dx:Cancer,Dx:CIN,Dx:HPV,Citology
0,18,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0
1,15,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0
2,34,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0
3,52,5.0,4.0,1.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,1,0
4,46,3.0,4.0,0.0,1.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0


In [32]:
# Keep only complete rows: discard every row that has a NaN in any column.
data = data.dropna(axis=0, how='any')

In [33]:
# Preview the first five rows of the frame after the row-wise NaN removal.
data.head(n=5)

Unnamed: 0,Age,Number of sexual partners,Num of pregnancies,Smokes,Hormonal Contraceptives,Hormonal Contraceptives (years),STDs,STDs:condylomatosis,STDs:pelvic inflammatory disease,STDs:molluscum contagiosum,STDs:AIDS,STDs:HIV,STDs:HPV,Dx:Cancer,Dx:CIN,Dx:HPV,Citology
0,18,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0
1,15,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0
2,34,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0
3,52,5.0,4.0,1.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,1,0
4,46,3.0,4.0,0.0,1.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0


In [35]:
# Write the filtered frame to a CSV file (relative path, so it lands in the
# current working directory).
# NOTE(review): to_csv also writes the row index as an unnamed first column by
# default; confirm that downstream readers expect it.
data.to_csv(path_or_buf='filtered_01_cervical_cancer_dataset.csv')