In [1]:
import numpy as np
import pandas as pd
import sklearn.datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv("kag_risk_factors_cervical_cancer.csv")

In [3]:
print(df)

     Age Number of sexual partners First sexual intercourse  \
0     18                       4.0                     15.0   
1     15                       1.0                     14.0   
2     34                       1.0                        ?   
3     52                       5.0                     16.0   
4     46                       3.0                     21.0   
..   ...                       ...                      ...   
853   34                       3.0                     18.0   
854   32                       2.0                     19.0   
855   25                       2.0                     17.0   
856   33                       2.0                     24.0   
857   29                       2.0                     20.0   

    Num of pregnancies Smokes Smokes (years) Smokes (packs/year)  \
0                  1.0    0.0            0.0                 0.0   
1                  1.0    0.0            0.0                 0.0   
2                  1.0    0.0          

In [4]:
df = pd.DataFrame(df)

In [5]:
df.head()

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
2,34,1.0,?,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,?,?,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,?,?,0,0,0,0,0,0,0,0


In [6]:
df.tail()

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
853,34,3.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
854,32,2.0,19.0,1.0,0.0,0.0,0.0,1.0,8.0,0.0,...,?,?,0,0,0,0,0,0,0,0
855,25,2.0,17.0,0.0,0.0,0.0,0.0,1.0,0.08,0.0,...,?,?,0,0,0,0,0,0,1,0
856,33,2.0,24.0,2.0,0.0,0.0,0.0,1.0,0.08,0.0,...,?,?,0,0,0,0,0,0,0,0
857,29,2.0,20.0,1.0,0.0,0.0,0.0,1.0,0.5,0.0,...,?,?,0,0,0,0,0,0,0,0


In [7]:
df.shape

(858, 36)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 36 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   Age                                 858 non-null    int64 
 1   Number of sexual partners           858 non-null    object
 2   First sexual intercourse            858 non-null    object
 3   Num of pregnancies                  858 non-null    object
 4   Smokes                              858 non-null    object
 5   Smokes (years)                      858 non-null    object
 6   Smokes (packs/year)                 858 non-null    object
 7   Hormonal Contraceptives             858 non-null    object
 8   Hormonal Contraceptives (years)     858 non-null    object
 9   IUD                                 858 non-null    object
 10  IUD (years)                         858 non-null    object
 11  STDs                                858 non-null    object

In [9]:
from sklearn.preprocessing import FunctionTransformer

def to_float(X):
    X[X == '?'] = float('nan')
    return X.astype('float64')

transformer = FunctionTransformer(to_float)

df_float64 = transformer.transform(df)

In [10]:
df = df_float64

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 36 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Age                                 858 non-null    float64
 1   Number of sexual partners           832 non-null    float64
 2   First sexual intercourse            851 non-null    float64
 3   Num of pregnancies                  802 non-null    float64
 4   Smokes                              845 non-null    float64
 5   Smokes (years)                      845 non-null    float64
 6   Smokes (packs/year)                 845 non-null    float64
 7   Hormonal Contraceptives             750 non-null    float64
 8   Hormonal Contraceptives (years)     750 non-null    float64
 9   IUD                                 741 non-null    float64
 10  IUD (years)                         741 non-null    float64
 11  STDs                                753 non-n

In [15]:
mean_sexual_partners = df['Number of sexual partners'].mean()

df['Number of sexual partners'] = df['Number of sexual partners'].fillna(mean_sexual_partners)

In [17]:
mean_first_sexual_intercourse = df['First sexual intercourse'].mean()

df['First sexual intercourse']  = df['First sexual intercourse'].fillna(mean_first_sexual_intercourse)

In [19]:
mean_num_of_pregnancies = df['Num of pregnancies'].mean()

df['Num of pregnancies'] = df['Num of pregnancies'].fillna(mean_num_of_pregnancies)

In [21]:
mean_smokes = df['Smokes'].mean()

df['Smokes'] = df['Smokes'].fillna(mean_smokes)

In [24]:
mean_smokes_years = df['Smokes (years)'].mean()

df['Smokes (years)'] = df['Smokes (years)'].fillna(mean_smokes_years)

In [34]:
mean_smokes_pack = df['Smokes (packs/year)'].mean()

df['Smokes (packs/year)'] = df['Smokes (packs/year)'].fillna(mean_smokes_pack)

In [30]:
mean_contraceptives = df['Hormonal Contraceptives'].mean()

df['Hormonal Contraceptives'] = df['Hormonal Contraceptives'].fillna(mean_contraceptives)

In [32]:
mean_contraceptives = df['Hormonal Contraceptives (years)'].mean()

df['Hormonal Contraceptives (years)'] = df['Hormonal Contraceptives (years)'].fillna(mean_contraceptives)

In [37]:
mean_iud = df['IUD'].mean()

df['IUD'] = df['IUD'].fillna(mean_iud)

In [38]:
mean_iud_years = df['IUD (years)'].mean()

df['IUD (years)'] = df['IUD (years)'].fillna(mean_iud_years)

In [46]:
mean_STDs = df['STDs'].mean()

df['STDs'] = df['STDs'].fillna(mean_STDs)

In [45]:
mean_STDs_number = df['STDs (number)'].mean()

df['STDs (number)'] = df['STDs (number)'].fillna(mean_STDs_number)

In [47]:
mean_STDs_condylomatosis = df['STDs:condylomatosis'].mean()

df['STDs:condylomatosis'] = df['STDs:condylomatosis'].fillna(mean_STDs_condylomatosis)

In [56]:
mean_STDs_cervical_condylomatosis = df['STDs:cervical condylomatosis'].mean()

df['STDs:cervical condylomatosis'] = df['STDs:cervical condylomatosis'].fillna(mean_STDs_cervical_condylomatosis)

In [55]:
mean_STDs_vaginal_condylomatosis = df['STDs:vaginal condylomatosis'].mean()

df['STDs:vaginal condylomatosis'] = df['STDs:vaginal condylomatosis'].fillna(mean_STDs_vaginal_condylomatosis)

In [59]:
mean_STDs_perineal_condylomatosis = df['STDs:vulvo-perineal condylomatosis'].mean()

df['STDs:vulvo-perineal condylomatosis'] = df['STDs:vulvo-perineal condylomatosis'].fillna(mean_STDs_perineal_condylomatosis)

In [61]:
mean_STDs_syphilis = df['STDs:syphilis'].mean()

df['STDs:syphilis'] = df['STDs:syphilis'].fillna(mean_STDs_syphilis)

In [63]:
mean_STDs_pelvic = df['STDs:pelvic inflammatory disease'].mean()

df['STDs:pelvic inflammatory disease'] = df['STDs:pelvic inflammatory disease'].fillna(mean_STDs_pelvic)

In [65]:
mean_STDs_herpes = df['STDs:genital herpes'].mean()

df['STDs:genital herpes'] = df['STDs:genital herpes'].fillna(mean_STDs_herpes)

In [67]:
mean_STDs_molluscum = df['STDs:molluscum contagiosum'].mean()

df['STDs:molluscum contagiosum'] = df['STDs:molluscum contagiosum'].fillna(mean_STDs_molluscum)

In [69]:
mean_STDs_AIDS = df['STDs:AIDS'].mean()

df['STDs:AIDS'] = df['STDs:AIDS'].fillna(mean_STDs_AIDS)

In [70]:
mean_STDs_HIV = df['STDs:HIV'].mean()

df['STDs:HIV'] = df['STDs:HIV'].fillna(mean_STDs_HIV)

In [74]:
mean_STDs_Hepatitis = df['STDs:Hepatitis B'].mean()

df['STDs:Hepatitis B'] = df['STDs:Hepatitis B'].fillna(mean_STDs_Hepatitis)

In [72]:
mean_STDs_HPV = df['STDs:HPV'].mean()

df['STDs:HPV'] = df['STDs:HIV'].fillna(mean_STDs_HPV)

In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 36 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Age                                 858 non-null    float64
 1   Number of sexual partners           858 non-null    float64
 2   First sexual intercourse            858 non-null    float64
 3   Num of pregnancies                  858 non-null    float64
 4   Smokes                              858 non-null    float64
 5   Smokes (years)                      858 non-null    float64
 6   Smokes (packs/year)                 858 non-null    float64
 7   Hormonal Contraceptives             858 non-null    float64
 8   Hormonal Contraceptives (years)     858 non-null    float64
 9   IUD                                 858 non-null    float64
 10  IUD (years)                         858 non-null    float64
 11  STDs                                858 non-n

In [76]:
df = df.dropna(axis=1)

In [77]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 34 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Age                                 858 non-null    float64
 1   Number of sexual partners           858 non-null    float64
 2   First sexual intercourse            858 non-null    float64
 3   Num of pregnancies                  858 non-null    float64
 4   Smokes                              858 non-null    float64
 5   Smokes (years)                      858 non-null    float64
 6   Smokes (packs/year)                 858 non-null    float64
 7   Hormonal Contraceptives             858 non-null    float64
 8   Hormonal Contraceptives (years)     858 non-null    float64
 9   IUD                                 858 non-null    float64
 10  IUD (years)                         858 non-null    float64
 11  STDs                                858 non-n

In [79]:
df.isnull().sum()

Age                                   0
Number of sexual partners             0
First sexual intercourse              0
Num of pregnancies                    0
Smokes                                0
Smokes (years)                        0
Smokes (packs/year)                   0
Hormonal Contraceptives               0
Hormonal Contraceptives (years)       0
IUD                                   0
IUD (years)                           0
STDs                                  0
STDs (number)                         0
STDs:condylomatosis                   0
STDs:cervical condylomatosis          0
STDs:vaginal condylomatosis           0
STDs:vulvo-perineal condylomatosis    0
STDs:syphilis                         0
STDs:pelvic inflammatory disease      0
STDs:genital herpes                   0
STDs:molluscum contagiosum            0
STDs:AIDS                             0
STDs:HIV                              0
STDs:Hepatitis B                      0
STDs:HPV                              0


In [80]:
df.describe()

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs:HPV,STDs: Number of diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
count,858.0,858.0,858.0,858.0,858.0,858.0,858.0,858.0,858.0,858.0,...,858.0,858.0,858.0,858.0,858.0,858.0,858.0,858.0,858.0,858.0
mean,26.820513,2.527644,16.9953,2.275561,0.145562,1.219721,0.453144,0.641333,2.256419,0.112011,...,0.023904,0.087413,0.020979,0.01049,0.020979,0.027972,0.040793,0.086247,0.051282,0.064103
std,8.497948,1.642267,2.791883,1.399325,0.350189,4.057885,2.209657,0.448671,3.519082,0.29326,...,0.143183,0.302545,0.143398,0.101939,0.143398,0.164989,0.197925,0.280892,0.220701,0.245078
min,13.0,1.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20.0,2.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,25.0,2.0,17.0,2.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,32.0,3.0,18.0,3.0,0.0,0.0,0.0,1.0,2.256419,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,84.0,28.0,32.0,11.0,1.0,37.0,37.0,1.0,30.0,1.0,...,1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [81]:
df['Dx:Cancer'].value_counts()

0.0    840
1.0     18
Name: Dx:Cancer, dtype: int64

1 --> Positive

0 --> Negative

In [85]:
df = pd.DataFrame(df)

In [87]:
X = df.drop(columns='Dx:Cancer', axis=1)
Y = df['Dx:Cancer']

In [88]:
print(X)

      Age  Number of sexual partners  First sexual intercourse  \
0    18.0                        4.0                   15.0000   
1    15.0                        1.0                   14.0000   
2    34.0                        1.0                   16.9953   
3    52.0                        5.0                   16.0000   
4    46.0                        3.0                   21.0000   
..    ...                        ...                       ...   
853  34.0                        3.0                   18.0000   
854  32.0                        2.0                   19.0000   
855  25.0                        2.0                   17.0000   
856  33.0                        2.0                   24.0000   
857  29.0                        2.0                   20.0000   

     Num of pregnancies  Smokes  Smokes (years)  Smokes (packs/year)  \
0                   1.0     0.0             0.0                  0.0   
1                   1.0     0.0             0.0                

In [89]:
print(Y)

0      0.0
1      0.0
2      0.0
3      1.0
4      0.0
      ... 
853    0.0
854    0.0
855    0.0
856    0.0
857    0.0
Name: Dx:Cancer, Length: 858, dtype: float64


In [90]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

In [91]:
print(X.shape, X_train.shape, X_test.shape)

(858, 33) (686, 33) (172, 33)


Model Training

Logistic Regression

In [93]:
model = LogisticRegression()

In [94]:
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [95]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [96]:
print('Accuracy on training data = ', training_data_accuracy)

Accuracy on training data =  0.9956268221574344


In [97]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [98]:
print('Accuracy on test data = ', test_data_accuracy)

Accuracy on test data =  0.9941860465116279


In [128]:
input_data = (30, 2, 18, 1, 0, 0, 0, 1, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)

# change the input data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the numpy array as we are predicting for one datapoint
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 0):
  print('The patient have no cervical cancer.')

else:
  print('The patient have cervical cancer.')


[0.]
The patient have no cervical cancer,


