# Cancer Test Results

In [26]:
# load dataset
import numpy as np
import pandas as pd 

df = pd.read_csv('data/cancer_test_data.csv') 

df.head()

Unnamed: 0,patient_id,test_result,has_cancer
0,79452,Negative,False
1,81667,Positive,True
2,76297,Negative,False
3,36593,Negative,False
4,53717,Negative,False


In [30]:
# number of patients
num_patients = df['patient_id'].nunique()
num_patients

2914

In [32]:
# number of patients with cancer
num_cancer = len(df[df['has_cancer'] == True])
num_cancer

306

In [33]:
# number of patients without cancer
num_notcancer = len(df[df['has_cancer'] == False])
num_notcancer

2608

In [36]:
# proportion of patients with cancer
pc = num_cancer / num_patients
pc

0.10501029512697323

In [38]:
# proportion of patients without cancer
pnc = num_notcancer / num_patients
pnc

0.8949897048730268

In [69]:
# proportion of patients with cancer who test positive
num_positive = len(df[df['test_result'] == 'Positive'])
num_positive

# people with cancer
df_can = df[df['has_cancer'] == True]

# people with cancer that test negative
df_can_pos = df_can[df_can['test_result'] == 'Positive']

# portion of people with cancer that test negative
len(df_can_pos) / num_cancer

0.9052287581699346

In [60]:
# proportion of patients with cancer who test negative
# P(N|C) = ?
# p(N|C) = P(N) * P(C|N) / P(C)

# do not use formula, count the people rather:

# people with cancer
df_can = df[df['has_cancer'] == True]

# people with cancer that test negative
df_can_neg = df_can[df_can['test_result'] == 'Negative']

# portion of people with cancer that test negative
len(df_can_neg) / num_cancer

0.09477124183006536

In [62]:
# proportion of patients without cancer who test positive
# P(P|!C) = ?
# p(P|!C) =  P(P) + p(!C|P) / P(!C)

# do not use formula, count the people rather:

# people with cancer
df_notcan = df[df['has_cancer'] == False]

# people with cancer that test positive
df_notcan_pos = df_notcan[df_notcan['test_result'] == 'Positive']

# portion of people with cancer that test negative
len(df_notcan_pos) / num_notcancer

0.2036042944785276

In [71]:
# proportion of patients without cancer who test negative
# P(N|!C) = ? 
# P(N|!C) =  P(N) * P(!C|N) / P(!C)

# people without cancer
df_notcan = df[df['has_cancer'] == False]

# people with cancer that test positive
df_notcan_neg = df_notcan[df_notcan['test_result'] == 'Negative']

# portion of people with cancer that test negative
len(df_notcan_neg) / num_notcancer

0.7963957055214724

In [72]:
# P(N|!C) = 1 - P(P|!C)
1 - (len(df_notcan_pos) / num_notcancer)

0.7963957055214724

In [76]:
# values
pc = 0.105          # P(C)
ph = 1 - 0.105      # P(!C)

ppc = 0.905         # P(positive|cancer) = 0.905
pnc = 0.0951        # P(negative|cancer) = 0.095
pph = 0.204         # P(positive|~cancer) = 0.204
pnh = 0.796         # P(negative|~cancer) = 0.796

In [79]:
# P(cancer|positive)
# ------------------

# P(C|P) = ?    
# P(C|P) = P(C) * P(P|C) / P(P)
# form this formulate P(P) is not known

# calculate p(P)
# P(P) = P(C, P) + P(!C, P)

# P(C, P) = P(C) * P(P|C)
pcp = pc * ppc

# P(!C, P) = P(!C ) * P(P|!C)
php = ph * pph

# P(P) = P(C, P) + P(!C, P)
pp = pcp + php

# P(C|P) = P(C) * P(P|C) / P(P)
pcp = pc * ppc / pp

pcp

0.34230291241151994

In [81]:
# P(~cancer|positive)
# P(!C | P) = ?

php = 1 - pcp
php

0.65769708758848

In [93]:
# P(cancer|negative)
# P(C|N)
# P(C|N) = P(C) *  P(N|C) / P(N)

# P(N) = ?
# P(N) = P(C, N) + P(!C, N)
# P(C, N)?  and P(!C, N)?

# P(C, N) = P(C) * P(N|C)
pc_n = pc * pnc

# P(h, N) = P(h) * P(N|h)
ph_n = ph * pnh

# P(N) = P(C, N) + P(!C, N)
pn = pc_n + ph_n

# P(C|N) = P(C) *  P(N|C) / P(N)
pcn = pc * pnc / pn
pcn

0.013822569180328776

In [94]:
# P(~cancer|negative)
# P(!C|N)
1- pcn

0.9861774308196712