## Initiate Session and Imports

In [None]:
# Display the Spark session object. `spark` is not created in this notebook;
# presumably the kernel/environment injects it — confirm before running elsewhere.
spark

In [None]:
# Notebook-wide imports and display configuration.
import pandas as pd
# `display` and `HTML` are imported from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML
from pyspark.sql.functions import when, col

# Stop pandas from truncating cell text, columns, or rows when rendering frames.
pd.set_option('display.max_colwidth', 250)  # or -1
pd.set_option('display.max_columns', None)  # or 500
pd.set_option('display.max_rows', None)  # or 500

# Widen the notebook container to 92% of the browser window.
display(HTML("<style>.container { width:92% !important; }</style>"))

## Create Code Lists for Defined Conditions

### CUA Condition codes (to distinguish population)

In [None]:
# Codes that identify the CUA population (used with an exact-match isin()).
# NOTE(review): '10835661000119100' is a 17-digit id ending in "00"; this looks
# like spreadsheet precision loss on a long identifier — verify against the
# source terminology before relying on it.
CUA_ANY = [
    'Q51.0', '752.31', '204844007', '783231007', '783230008', '17142008',
    'Q51.1', 'Q51.10', 'Q51.11', '752.2', '360422007', '10835661000119100',
    '21346009', '722431007',
    'Q51.2', '752.35', '1230025003', '22504001',
    'Q51.3', '752.34', '31401003', '237223005', '237224004',
    '237225003', '237221007', '237220008', '237219002',
    'Q51.818', '752.39', 'Q51.811', '752.32', '253832006',
    'Q51.4', '752.33', '1372004',
    'Q51.810', '752.36', '38437003',
    'Q52.11', '752.46', '142191000119104',
    'Q52.12', 'Q52.120', 'Q52.121', 'Q52.122', 'Q52.123', 'Q52.124',
    'Q52.129', '752.47', '142201000119101',
]

### Cancer Codes

In [None]:
# Uterine cancer codes (matched exactly against the condition table's code column).
Uterine_Cancers = [
    '233.2', 'D07.0', '92788005', '179.', 'C55', '371973000',
    '10708511000119108', '94665001', '371972005', '446022000', '94215000',
    '182.0', 'C54.1', '254878006', '1178986006', '699356008', '732201008',
    '1259458001', 'C54.2', '93915004', '94434002', 'C54.3', '449054005',
    '109882003', 'C54.9', '188193007', '449073009', '860982007', '371971003',
    '182.1', 'C54.0', '94355001', '93844001', '702369008', '182.8', 'C54.8',
    '109879008', 'V10.42', 'Z85.4', '428941002', '672281000119109',
]

# Ovarian cancer codes.
Ovarian_Cancers = [
    '183.0', 'C56.9', 'Z85.43', 'V10.43', 'C56.1', 'C56.2',
    '369523007', '369530001', '369529006', '369522002',
    '10737861000119101', '10737911000119105', '369533004', '369526004',
    '369562003', '369570008', '93934004', '94455000',
    '15635721000119108', '15930821000119105', '429090009',
    '16847781000119108', '16847701000119100', '141951000119104',
    '16847741000119103', '716855006', '422782004', '254849005',
    '363443007', '827162007', '254874008', '254869000', '254863004',
    '254860001', '254852002', '254872007',
    '10737781000119101', '10737821000119106',
]

# Ovarian plus fallopian-tube cancer codes.
# The first 38 entries repeat Ovarian_Cancers; the rest are additional codes.
# Fix: '183.2' previously carried a trailing space ('183.2 '), which an exact
# isin() match can never hit; Any_Gyn_Cancers already spells it '183.2'.
OV_FT = [
    '183.0', 'C56.9', 'Z85.43', 'V10.43', 'C56.1', 'C56.2',
    '369523007', '369530001', '369529006', '369522002',
    '10737861000119101', '10737911000119105', '369533004',
    '369526004', '369562003', '369570008', '93934004', '94455000',
    '15635721000119108', '15930821000119105', '429090009',
    '16847781000119108', '16847701000119100', '141951000119104',
    '16847741000119103', '716855006', '422782004', '254849005',
    '363443007', '827162007', '254874008', '254869000', '254863004',
    '254860001', '254852002', '254872007', '10737781000119101',
    '10737821000119106', '254856004', '770601003', '1259386005',
    '424600001', '424486004', '423274005', '423480004', '183.2',
    'C57.00', '276870001', '1259320001', '1197275006', '94295001',
    '371987000', '369521009', '369514009', '369520005', '369513003',
    '363444001', '369554006', '69548005', '369517002', '369544007',
    '124361000119101',
]

# Cervical cancer codes. Several codes appear more than once; duplicates are
# kept exactly as in the original list.
Cervical_Cancers = [
    '233.1', 'D06.9', '92564006', 'Z85.41', 'V10.41',
    '180.0', 'C53.0', '180.1', 'C53.1', '180.8', 'C53.8', '180.9', 'C53.9',
    '372097009', '93779009', '94279001', '372098004', '1259514003',
    '372097009', '208041000119100', '93779009',
    '180.1', 'C53.1', '372100004', '372100004', '1259516001', '372099007',
    '93789008', '94290006',
    '180.8', 'C53.8', '188184006',
    '180.9', 'C53.9', '363354003', '188469005', '1259493008', '423973006',
    '109880006', '372024009', '188180002', '764951002', '1259433006',
    '285432005',
    'V10.41', 'Z85.41', '429484003', '1197268005', '254888007', '1259387001',
    '766930002', '722683006', '1197263001', '773775004',
]
        
# Other gynecologic cancer codes (not covered by the uterine, ovarian or
# cervical lists).
Other_Gyn_Cancers = [
    '233.39', '233.3', 'D07.39', 'D07.30', '233.30', '92594000',
    '233.32', 'D07.1', '92802003',
    '233.31', 'D07.2', '92791005',
    '181.', 'C58.', '236.1', '721567004', '94492000',
    'V10.40', 'V10.44', 'Z85.40', 'Z85.44',
    '184.8', 'C57.7', 'C57.8', '184.9', 'C57.9',
    '109878000', '415079009', '363514001',
    '184.1', 'C51.0', '363446004', '94361003', '93850006',
    '184.2', 'C51.1', '363447008', '94362005', '93851005',
    '184.3', 'C51.2', '371979001', '371980003', '94257006',
    '184.4', 'C51', '109885001', '94681006', '429635001', '94143002',
    '71111000119109', '447882007', '1144945000', '717731002', '1259681009',
    '254895003', '402912009', '254897006', 'C51.9',
    '184.0', 'C52', '94668004', '372025005', '427844008', '71101000119106',
    '770686005', '363445000', '721563000', '254893005', '105121000119102',
    '722678003',
]

# Any gynecologic cancer: one flat list assembled from the site-specific lists
# plus some codes that appear only here.
# NOTE(review): seven codes present in OV_FT ('254856004', '770601003',
# '1259386005', '424600001', '424486004', '423274005', '423480004') are not in
# this list — confirm whether that omission is intended.
Any_Gyn_Cancers = [
    # same sequence as Uterine_Cancers
    '233.2', 'D07.0', '92788005', '179.', 'C55', '371973000',
    '10708511000119108', '94665001', '371972005', '446022000', '94215000',
    '182.0', 'C54.1', '254878006', '1178986006', '699356008', '732201008',
    '1259458001', 'C54.2', '93915004', '94434002', 'C54.3', '449054005',
    '109882003', 'C54.9', '188193007', '449073009', '860982007', '371971003',
    '182.1', 'C54.0', '94355001', '93844001', '702369008', '182.8', 'C54.8',
    '109879008', 'V10.42', 'Z85.4', '428941002', '672281000119109',
    # same sequence as Ovarian_Cancers
    '183.0', 'C56.9', 'Z85.43', 'V10.43', 'C56.1', 'C56.2', '369523007',
    '369530001', '369529006', '369522002', '10737861000119101',
    '10737911000119105', '369533004', '369526004', '369562003', '369570008',
    '93934004', '94455000', '15635721000119108', '15930821000119105',
    '429090009', '16847781000119108', '16847701000119100', '141951000119104',
    '16847741000119103', '716855006', '422782004', '254849005', '363443007',
    '827162007', '254874008', '254869000', '254863004', '254860001',
    '254852002', '254872007', '10737781000119101', '10737821000119106',
    # tail of OV_FT starting at 183.2
    '183.2', 'C57.00', '276870001', '1259320001', '1197275006', '94295001',
    '371987000', '369521009', '369514009', '369520005', '369513003',
    '363444001', '369554006', '69548005', '369517002', '369544007',
    '124361000119101',
    # codes that appear only in this list
    '183.3', 'C57.10', '449259009', '94226006', '93728003', '369536007',
    '369535006', '183.4', 'C57.3', '448674007', '183.5', 'C57.20',
    '94525009', '93994001', '188204000', '183.8', 'C57.4', '428322007',
    '428944005', '94664002', '94126000', '723173003', '1259487000', '183.9',
    # same sequence as Cervical_Cancers
    '233.1', 'D06.9', '92564006', 'Z85.41', 'V10.41', '180.0', 'C53.0',
    '180.1', 'C53.1', '180.8', 'C53.8', '180.9', 'C53.9', '372097009',
    '93779009', '94279001', '372098004', '1259514003', '372097009',
    '208041000119100', '93779009', '180.1', 'C53.1', '372100004',
    '372100004', '1259516001', '372099007', '93789008', '94290006', '180.8',
    'C53.8', '188184006', '180.9', 'C53.9', '363354003', '188469005',
    '1259493008', '423973006', '109880006', '372024009', '188180002',
    '764951002', '1259433006', '285432005', 'V10.41', 'Z85.41', '429484003',
    '1197268005', '254888007', '1259387001', '766930002', '722683006',
    '1197263001', '773775004',
    # same sequence as Other_Gyn_Cancers
    '233.39', '233.3', 'D07.39', 'D07.30', '233.30', '92594000', '233.32',
    'D07.1', '92802003', '233.31', 'D07.2', '92791005', '181.', 'C58.',
    '236.1', '721567004', '94492000', 'V10.40', 'V10.44', 'Z85.40', 'Z85.44',
    '184.8', 'C57.7', 'C57.8', '184.9', 'C57.9', '109878000', '415079009',
    '363514001', '184.1', 'C51.0', '363446004', '94361003', '93850006',
    '184.2', 'C51.1', '363447008', '94362005', '93851005', '184.3', 'C51.2',
    '371979001', '371980003', '94257006', '184.4', 'C51', '109885001',
    '94681006', '429635001', '94143002', '71111000119109', '447882007',
    '1144945000', '717731002', '1259681009', '254895003', '402912009',
    '254897006', 'C51.9', '184.0', 'C52', '94668004', '372025005',
    '427844008', '71101000119106', '770686005', '363445000', '721563000',
    '254893005', '105121000119102', '722678003',
]
                  
# Breast cancer codes. '254837009' appears twice, as in the original list.
Breast_Cancers = [
    '233.0', 'D05', '92593006', '174', 'C50', '372064008', '254837009',
    '372137005', '145501000119108', '188157005', '254837009', '94544002',
    '12241031000119108', '94297009', '12241071000119106', '93796005',
    '94176003', '372093008', '94443006', '254838004', '763479005',
    '721595004', '1259422008', '286894008', '1259590008', '1259322009',
    '448952004', 'V10.3', 'Z85.3', '429087003', '48901000119103',
    '473062003', '408643008', '278054005', '708921005', '703578005',
    '286897001', '1259442004', '1259383002', '254840009', '703577000',
    '713609000', '444712000', '254839007',
]


### Other Covariates

#### CUA related

In [None]:
# Endometriosis codes. '724454004' is listed three times, as in the original.
# NOTE(review): every 17-digit id here ends in "00" (e.g. '15932101000119100'),
# whereas 17-digit ids in the cancer lists do not; this looks like spreadsheet
# precision loss on long identifiers — verify these ids against the source.
Endometriosis = [
    'N80.9', '617.9', 'N80.0', '617.0', 'N80.1', '617.1', 'N80.2', '617.2',
    'N80.3', '617.3', 'N80.4', '617.4', 'N80.5', '617.5',
    '103678008', '396224008', '129103003', '233651009', '717700009',
    '26681001', '35543003', '76376003', '198247003', '266589005', '57493005',
    '5562006', '17829005', '50993001', '8421002', '237115002', '9563009',
    '38780008', '61640006', '397318008', '44601009', '724457006', '84305006',
    '65099004', '29200001000004100', '396223002', '314049009', '5327000',
    '724456002', '15932101000119100', '52533003', '237117005', '198251001',
    '15932061000119100', '717698000', '10535001', '717699008',
    '12265461000119100', '64286001', '22611009', '15933541000119100',
    '416155004', '15965741000119100', '15965701000119100', '724454004',
    '724454004', '724454004', '15965781000119100', '713174005', '715806006',
    '1144969001', '1144938000', '1144982009', '109861009',
]

In [None]:
# Infertility codes.
# Fix: a comma was missing after '628.9', so Python's implicit string
# concatenation merged it with the next literal into '628.9763016003' and
# neither code could ever match.
Infertility = [
    'N97', 'N97.0', 'N97.1', 'N97.2', 'N97.8', 'N97.9',
    '628.2', '628.3', '628.8', '628.4', '628.9',
    '763016003', '712584009', '237145004', '6738008', '26899006',
    '198456009', '198452006', '39446004', '763017007', '227561004',
]


In [None]:
# Renal anomaly codes. "32659003" appears twice, as in the original list.
Renal_Anomalies = [
    "753.0", "753.15", "753.2", "753.20", "753.22", "753.29", "753.3",
    "753.4", "753.40",
    "Q60", "Q60.3", "Q60.4", "Q60.5", "Q61.4", "Q62.10", "Q62.2", "Q62.39",
    "Q62.5", "Q62.8", "Q62.7", "Q62.63", "Q63", "Q63.1", "Q63.2", "Q62.4",
    "Q62.1", "Q63.8",
    "204942005", "762908000", "762907005", "204938007", "32659003",
    "268232000", "361147005", "762914007", "762913001", "93290000",
    "204949001", "717744007", "717742006", "204950001", "54967001",
    "253873007", "253874001", "41729002", "373584008", "49496001",
    "429191000124106", "197811007", "44796002", "55536001", "55856005",
    "271387005", "44513007", "48061001", "361264003", "32659003",
    "92921005", "13530005", "16507009",
]

In [None]:
# Dysmenorrhea (primary or secondary)
Dysmen = [
    '625.0',
    'N94.4',
    '625.3',
    'N94.5',
]

In [None]:
# Irregular periods: absent, excessive, or other.
# Fix: a comma was missing after 'N91.2', so it was silently concatenated with
# 'N92.0' into 'N91.2N92.0' and neither code could match.
Irregular = [
    '626.0', '626.2', '626.8', '626.9',
    'N91.0', 'N91.1', 'N91.2',
    'N92.0', 'N92.1', 'N92.6', 'N93.9', 'N92',
]

In [None]:
# Spinal / vertebral anomalies (includes fused vertebral column).
# Fix: 'Q76.1' previously carried a trailing space ('Q76.1 '), which an exact
# isin() match can never hit.
spinal = [
    '756.1', 'Q76', 'Q76.0', 'Q76.1', 'Q76.2', 'Q76.3', '756.19', 'Q76.4',
    'Q76.5', 'Q76.6', 'Q76.8', 'Q76.9',
]

In [None]:
# Scoliosis (including idiopathic and kyphoscoliosis).
# 'M41' is intended to encompass all sub-codes.
# NOTE(review): the flag cell matches with isin(), i.e. exact equality, so
# 'M41' only hits rows coded exactly 'M41' — confirm stdid holds
# category-level codes, otherwise sub-codes are missed.
scoliosis = [
    '737.30',
    '737.81',
    '737.82',
    '737.83',
    'M41',
]

In [None]:
# Segmental loss of fallopian tubes: unspecified or other anomaly of the
# fallopian tubes and broad ligaments.
FT_loss = [
    '752.10',
    '752.19',
    'Q50.6',
]

In [None]:
# NS hearing loss = sensorineural hearing loss
hearing_loss = [
    '389.1',
    'H90.3',
    'H90.5',
]

In [None]:
# Cloacal dysplasia and ectopia vesicae
cloac_dys = [
    'Q64.1',
    'Q64.0',
]

In [None]:
# Miscarriage (spontaneous abortion; all codes, with or without complications).
# NOTE(review): these are category-level codes but the flag cell uses isin(),
# an exact match — confirm that stdid stores codes at this level.
Miscarriage = [
    '634',
    'O03',
    'N96',
    '629.81',
]

In [None]:
# Early-onset menstruation / precocious puberty
EOM = [
    '259.1',
    'E30.1',
]

In [None]:
# (Trapped?) blood in the reproductive system: hematometra, hematocolpos,
# hematosalpinx.
Hemato = [
    'N85.7', '621.4',
    'N89.7',
    'N83.6',
    '626.8', 'N93.9', 'N93.8',
]

In [None]:
# Preterm labor, with or without delivery (with or without infant death)
preterm = [
    '644.2',
    '644.0',
    'O60',
]

In [None]:
# Malpresentation and malposition, or restricted growth
malpresent = [
    'O32',
    'O64',
    '763.0',
    '763.1',
    '652',
]

In [None]:
# Poor / restricted fetal growth
poor_growth = [
    '656.5',
    'O36.59',
]

In [None]:
# Placental retention, with and without hemorrhage.
# Fix: a comma was missing between '641' and 'O73', so the two literals were
# concatenated into '641O73' and neither code could match.
placenta_reten = ['667', '641', 'O73', 'O44.1', 'O44.3', 'O44.5']

In [None]:
# Maternal care & complications of labor and delivery (MCCLD).
# Code meanings as recorded by the original author:
#   O60  Preterm labor
#   O61  Failed induction of labor
#   O62  Abnormalities of forces of labor (661)
#   O63  Long labor (662)
#   O64  Obstructed labor due to malposition and malpresentation of fetus
#   O65  Obstructed labor due to maternal pelvic abnormality
#   O66  Other obstructed labor
#   O67  Labor and delivery complicated by intrapartum hemorrhage, NEC
#   O69  Labor and delivery complicated by umbilical cord complications (663)
#   O71  Other obstetric trauma (665)
#   O72  Postpartum hemorrhage (666)
#   O73  Retained placenta and membranes, without hemorrhage
#   O75  Other complications of labor and delivery, NEC (669)
#   O32  Malpresentation of fetus
#   O34  Maternal care for abnormality of pelvic organs
#   O42  PPROM; O45 premature separation of placenta; O46 antepartum
#        hemorrhage; O20 hemorrhage in early pregnancy
#   O36.59  Poor fetal growth
#   O44.1/.3/.5  Placental retention with and without hemorrhage
#   Obstructed: 660.0, 660.1, 660.2, 660.3, 660.5, 660.6, 660.7, 660.8, 660.9
# NOTE(review): O45 is described above but is not in the list — confirm
# whether it should be added.
# Fix: commas were missing after '656.5' and after the second '641', which
# silently produced the bogus entries '656.5669.9' and '641O73'.
MCCLD = [
    'O60', 'O61', 'O62', 'O63', 'O64', 'O65', 'O66', 'O67', 'O69', 'O71',
    'O72', 'O73', 'O75',
    'O32', 'O34', 'O42', 'O46', 'O20',
    '644', '640', '641', '652',
    '654.0', '654.2', '654.3', '654.4', '654.5', '654.6', '654.7', '654.8',
    '654.9',
    '656.5', '669.9', 'O36.59',
    '667', '641', 'O73', 'O44.1', 'O44.3', 'O44.5',
    '658.1', '661',
    '660.0', '660.1', '660.2', '660.3', '660.5', '660.6', '660.7', '660.8',
    '660.9',
    '662', '663', '666', '665', '669',
]


In [None]:
# High-risk pregnancy
HRP = [
    'O09',
    'V23.9',
]

In [None]:
# Preterm premature rupture of membranes (also covered within MCCLD)
PPROM = [
    'O42.01',
    'O42.11',
    'O42.91',
    '658.1',
]

In [None]:
# OCLD = other complications of labor and delivery
OCLD = [
    'O75',
    '669.9',
]

In [None]:
# C-section, general / any evidence. Codes need to be pulled from both the
# condition table and the procedure table.
# 'O75.82' appears twice, as in the original list.
# Fix: a comma was missing between '59510' and '59514', so the two literals
# were concatenated into '5951059514' and neither code could match.
# NOTE(review): '74.0' is in ccsect but not here — confirm whether the
# "any C-section" list should include it.
csect = [
    'O82', 'O34.21', 'O75.82', '59510', '59514', '59618', '654.23',
    '161805006', '712655005',
    '725951006', '654.21', '712653003', '736018001', '17744000', '59525',
    '200151008',
    '236985002', '736026009', '654.20', '200144004', '398307005',
    '736118004', '649.82',
    '725949007', '712654009', '59620', '177141003', '200149009',
    '302254004', '64756007',
    '200150009', '57271003', '709004006', '736020003', '74.99',
    '302253005', '788180009',
    '74.1', '74.2', '84195007', '59622', '177143000', '649.81',
    '236986001', 'O75.82',
    '199331008', '10D00Z0', '10D00Z1', '10D00Z2',
]

In [None]:
# Classical C-section (procedure codes); "elective upper segment cesarean"
ccsect = [
    '74.0',
    '10D00Z0',
    '84195007',
    '736018001',
    '736020003',
    '236986001',
]

In [None]:
# Low C-section; includes emergency and elective
lcsect = [
    '74.1',
    '10D00Z1',
    '236985002',
    '736026009',
    '398307005',
    '736118004',
    '709004006',
    '788180009',
    '177143000',
]

In [None]:
# Extraperitoneal C-section
epcsect = [
    '74.2',
    '10D00Z2',
    '57271003',
]

### Cancer risk factors/ lifestyle

In [None]:
# HPV codes. '35904009' and '766842008' each appear twice, as in the original.
HPV = [
    'R87.81', 'R87.82', 'R87.820', 'R87.821',
    '240532009', '35904009', '35904009', '721587003', '787723002',
    '718591004', '871634008', '721266009', '720005005', '25361000087102',
    '302812006', '766839002', '24461000087106', '766848007',
    '24471000087102', '441667007', '766842008', '766842008', '766827005',
    '24431000087104', '126251000119100', '126241000119102',
    '24451000087108', '24441000087105',
]

In [None]:
# HIV
HIV = [
    '042',
    'B20',
]

In [None]:
# Other STIs: gonorrhea, syphilis, herpes, chlamydia, other
STI = [
    '098', 'A54',
    '097', '091', '095',
    'A50', 'A51', 'A52', 'A53',
    '054.1', 'A60', 'A63',
    '099.8', 'A56', 'A55', '099.5',
]

In [None]:
# Smoking and history of tobacco use
Smoker = [
    '305.1',
    'F17',
    'Z87.891',
    'Z72.0',
    'V69.8',
    'V15.82',
]

In [None]:
# Substance abuse, other: opioids, sedatives, cannabis, amphetamines, cocaine,
# hallucinogens
Substance_Abuse = [
    '304', '304.0', '304.3', '304.5', '304.7',
    '305.2', '305.4', '305.7',
    'F11', 'F13', 'F12', 'F15', 'F14', 'F16',
]

In [None]:
# Alcohol dependence / abuse (not mental-illness related)
Alcohol = [
    '303',
    '305.00',
    '305.01',
    'F10.1',
    'F10.2',
]

In [None]:
# Diabetes type 2.
# Fix: a comma was missing after '250.30', so it was concatenated with
# '250.12' into '250.30250.12' and neither code could match.
Db2 = [
    '250.00', '250.02', '250.10', '250.20', '250.22', '250.30',
    '250.12', '250.32', '250.40', '250.42', '250.50', '250.52',
    '250.60', '250.62', '250.70', '250.72', '250.80', '250.82',
    '250.90', '250.92', 'E11',
]

In [None]:
# Hypertensive diseases, ischemic heart diseases, pulmonary circulation
# diseases, and other forms of heart disease.
# Built from contiguous code ranges; the resulting list has the same elements,
# in the same order, as the hand-written version.
CVD = (
    # three-digit numeric codes: 401-405, 410-417, 420-429
    [str(code) for code in (*range(401, 406), *range(410, 418), *range(420, 430))]
    # I1x codes (no 'I14'; includes 'I1A')
    + ['I10', 'I11', 'I12', 'I13', 'I15', 'I16', 'I1A']
    # I20-I28 and I30-I52
    + [f'I{number}' for number in (*range(20, 29), *range(30, 53))]
)

In [None]:
# Obesity
Obesity = [
    '278.00',
    '278.01',
    'E66.9',
    'E66.01',
    'E66.09',
    'E66.1',
    'E66.2',
]

In [None]:
# Family history of gynecologic cancer: ovary, uterus, cervix
Family_History_Gyn = [
    'V16.4',
    'Z80.4',
]

In [None]:
# Family history of cancer: any / other
FHC = [
    'Z80',
    'V16',
]

In [None]:
# Menopause: based on female climacteric states or asymptomatic menopause.
# Fixes:
#  * the comma after Z78.0 sat inside the string ('Z78.0,' '627.2'), so the two
#    literals were concatenated into the single bogus entry 'Z78.0,627.2';
#  * 'N951.1' is not a well-formed code and is corrected to 'N95.1'
#    (NOTE(review): confirm 'N95.1' is the code that was intended).
Menopause = ['N95.1', 'Z78.0', '627.2', 'V49.81']

In [None]:
# Pregnancy: pregnant state, or Z3A = weeks of gestation (can be classified
# further).
Pregnancy = [
    'Z33',
    'Z3A',
    '765.2',
    'V22',
    'O82',
    'O80',
    '650',
]

In [None]:
# Ectopic or abdominal pregnancy
Ectopic = [
    '633.0',
    '633.9',
    'O00',
]

In [None]:
# Lynch syndrome: coded as genetic susceptibility to malignant neoplasm of
# other organs (typically colorectal and endometrial).
lynch = [
    'V84.09',
    'Z15.09',
]

In [None]:
# PCOS
PCOS = [
    '256.4',
    'E28.2',
]

## Call in Condition/ Procedure Tables from Stored DB

In [None]:
# Make non_CUA_db the current database so the unqualified table names in the
# queries below resolve against it.
spark.sql("use non_CUA_db")

In [None]:
# Skeleton table to build from: every column of personid_table.
pid_query = """
    SELECT *
    FROM personid_table
"""
pid = spark.sql(pid_query)
pid

In [None]:
# Number of rows in the pid frame (triggers a Spark job).
pid.count()

In [None]:
# Condition-code table used by every flag cell below: person id, the standard
# code (exposed as `stdid`) and its display text, read from dem_con_left.
non_con_query = """
    SELECT personid, standardid AS stdid, display
    FROM dem_con_left
"""
non_con = spark.sql(non_con_query)
non_con

In [None]:
# Preview the first rows of the condition table.
# Fix: this referenced `non_concat`, which is never defined in the notebook;
# the frame created in the preceding cell is `non_con`.
non_con.limit(5).toPandas()

In [None]:
# Number of distinct persons present in the condition table.
# Fix: this referenced `non_concat`, which is never defined in the notebook;
# the frame created above is `non_con`.
non_con.select('personid').distinct().count()

In [None]:
# Procedure table (only used for procedures, i.e. C-sections): person id,
# procedure code and its display text.
non_pro_query = """
    SELECT personid, procedure_code,procedure_display
    FROM procedure_table
"""
non_pro = spark.sql(non_pro_query)
non_pro

## Initial binary pulls

In [None]:
# Flag rows whose standard code is one of the CUA condition codes.
# Fix: the flagged frame used to be assigned back to CUA_ANY, replacing the
# code list with a DataFrame; a second run of this cell then passed that
# DataFrame to isin() and failed. The frame now has its own name.
# NOTE(review): CUA_con is not defined in the visible notebook (only non_con is
# loaded above) — confirm which table this cell is meant to read.
CUA_flagged = CUA_con.withColumn(
    "CUA_ANY",
    when(col("stdid").isin(CUA_ANY), 1).otherwise(0))

condition_column = 'CUA_ANY'

# Keep only rows where the flag is 1.
filtered_CUA = CUA_flagged.filter(col(condition_column) == 1)

# One row per person.
distinct_CUA = filtered_CUA.dropDuplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_CUA.count())

# Drop the code-level columns, leaving personid and the flag.
clean_CUA = distinct_CUA.drop('stdid', 'display')

In [None]:
# Preview the first rows of the person-level CUA flag table.
# Fix: the closing parenthesis was missing, which made the cell a SyntaxError.
clean_CUA.limit(5).toPandas()

In [None]:
# Add a 0/1 indicator: 1 when the row's standard code is in Endometriosis.
condition_column = 'endo'
non_endo = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(Endometriosis), 1).otherwise(0),
)

# Keep the matching rows, then reduce to a single row per person.
filtered_endo = non_endo.where(non_endo[condition_column] == 1)
distinct_endo = filtered_endo.drop_duplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_endo.count())

# Drop the code-level columns so only personid and the flag remain.
clean_endo = distinct_endo.drop('stdid', 'display')

In [None]:
# Add a 0/1 indicator: 1 when the row's standard code is in Infertility.
non_infertility = non_con.withColumn(
    "infertility",
    when(col("stdid").isin(Infertility), 1).otherwise(0))

# Fix: condition_column was not set in this cell, so the filter below reused
# the value left by the previous cell ('endo'), a column that does not exist
# on non_infertility.
condition_column = 'infertility'

# Keep only rows where the flag is 1.
filtered_infertility = non_infertility.filter(col(condition_column) == 1)

# One row per person.
distinct_infertility = filtered_infertility.dropDuplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_infertility.count())

# Drop the code-level columns so only personid and the flag remain.
clean_infertility = distinct_infertility.drop('stdid', 'display')

In [None]:
# Add a 0/1 indicator: 1 when the row's standard code is in Renal_Anomalies.
condition_column = 'RA'
non_RA = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(Renal_Anomalies), 1).otherwise(0),
)

# Keep the matching rows, then reduce to a single row per person.
filtered_RA = non_RA.where(non_RA[condition_column] == 1)
distinct_RA = filtered_RA.drop_duplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_RA.count())

# Drop the code-level columns so only personid and the flag remain.
clean_RA = distinct_RA.drop('stdid', 'display')

In [None]:
# Add a 0/1 indicator: 1 when the row's standard code is in Dysmen.
non_Dys = non_con.withColumn(
    "Dysmen",
    when(col("stdid").isin(Dysmen), 1).otherwise(0))

# Fix: condition_column was not set in this cell, so the filter below reused
# the value left by the previous cell ('RA'), a column that does not exist on
# non_Dys.
condition_column = 'Dysmen'

# Keep only rows where the flag is 1.
filtered_Dys = non_Dys.filter(col(condition_column) == 1)

# One row per person.
distinct_Dys = filtered_Dys.dropDuplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_Dys.count())

# Drop the code-level columns so only personid and the flag remain.
clean_Dys = distinct_Dys.drop('stdid', 'display')

In [None]:
# Add a 0/1 indicator: 1 when the row's standard code is in Irregular.
condition_column = 'Irregular'
non_Irreg = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(Irregular), 1).otherwise(0),
)

# Keep the matching rows, then reduce to a single row per person.
filtered_Irreg = non_Irreg.where(non_Irreg[condition_column] == 1)
distinct_Irreg = filtered_Irreg.drop_duplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_Irreg.count())

# Drop the code-level columns so only personid and the flag remain.
clean_Irreg = distinct_Irreg.drop('stdid', 'display')

In [None]:
# Add a 0/1 indicator: 1 when the row's standard code is in spinal.
condition_column = 'spinal'
non_spinal = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(spinal), 1).otherwise(0),
)

# Keep the matching rows, then reduce to a single row per person.
filtered_spinal = non_spinal.where(non_spinal[condition_column] == 1)
distinct_spinal = filtered_spinal.drop_duplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_spinal.count())

# Drop the code-level columns so only personid and the flag remain.
clean_spinal = distinct_spinal.drop('stdid', 'display')

In [None]:
# Add a 0/1 indicator: 1 when the row's standard code is in scoliosis.
condition_column = 'scoliosis'
non_scoliosis = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(scoliosis), 1).otherwise(0),
)

# Keep the matching rows, then reduce to a single row per person.
filtered_scoliosis = non_scoliosis.where(non_scoliosis[condition_column] == 1)
distinct_scoliosis = filtered_scoliosis.drop_duplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_scoliosis.count())

# Drop the code-level columns so only personid and the flag remain.
clean_scoliosis = distinct_scoliosis.drop('stdid', 'display')

In [None]:
# Add a 0/1 indicator: 1 when the row's standard code is in FT_loss.
condition_column = 'FT_loss'
non_FT = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(FT_loss), 1).otherwise(0),
)

# Keep the matching rows, then reduce to a single row per person.
filtered_FT = non_FT.where(non_FT[condition_column] == 1)
distinct_FT = filtered_FT.drop_duplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_FT.count())

# Drop the code-level columns so only personid and the flag remain.
clean_FT = distinct_FT.drop('stdid', 'display')

In [None]:
# Add a 0/1 indicator: 1 when the row's standard code is in hearing_loss.
condition_column = 'hearing_loss'
non_hl = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(hearing_loss), 1).otherwise(0),
)

# Keep the matching rows, then reduce to a single row per person.
filtered_hl = non_hl.where(non_hl[condition_column] == 1)
distinct_hl = filtered_hl.drop_duplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_hl.count())

# Drop the code-level columns so only personid and the flag remain.
clean_hl = distinct_hl.drop('stdid', 'display')

In [None]:
# Add a 0/1 indicator: 1 when the row's standard code is in cloac_dys.
condition_column = 'cloac_dys'
non_cloacdys = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(cloac_dys), 1).otherwise(0),
)

# Keep the matching rows, then reduce to a single row per person.
filtered_cloacdys = non_cloacdys.where(non_cloacdys[condition_column] == 1)
distinct_cloacdys = filtered_cloacdys.drop_duplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_cloacdys.count())

# Drop the code-level columns so only personid and the flag remain.
clean_cloacdys = distinct_cloacdys.drop('stdid', 'display')

In [None]:
# Add a 0/1 indicator: 1 when the row's standard code is in Miscarriage.
condition_column = 'mc'
non_mc = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(Miscarriage), 1).otherwise(0),
)

# Keep the matching rows, then reduce to a single row per person.
filtered_mc = non_mc.where(non_mc[condition_column] == 1)
distinct_mc = filtered_mc.drop_duplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_mc.count())

# Drop the code-level columns so only personid and the flag remain.
clean_mc = distinct_mc.drop('stdid', 'display')

In [None]:
# Add a 0/1 indicator: 1 when the row's standard code is in EOM.
condition_column = 'EOM'
non_eom = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(EOM), 1).otherwise(0),
)

# Keep the matching rows, then reduce to a single row per person.
filtered_eom = non_eom.where(non_eom[condition_column] == 1)
distinct_eom = filtered_eom.drop_duplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_eom.count())

# Drop the code-level columns so only personid and the flag remain.
clean_eom = distinct_eom.drop('stdid', 'display')

In [None]:
# Add a 0/1 indicator: 1 when the row's standard code is in Hemato.
condition_column = 'hemato'
non_hemato = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(Hemato), 1).otherwise(0),
)

# Keep the matching rows, then reduce to a single row per person.
filtered_hemato = non_hemato.where(non_hemato[condition_column] == 1)
distinct_hemato = filtered_hemato.drop_duplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_hemato.count())

# Drop the code-level columns so only personid and the flag remain.
clean_hemato = distinct_hemato.drop('stdid', 'display')

In [None]:
# Add a 0/1 indicator: 1 when the row's standard code is in preterm.
condition_column = 'preterm'
non_preterm = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(preterm), 1).otherwise(0),
)

# Keep the matching rows, then reduce to a single row per person.
filtered_preterm = non_preterm.where(non_preterm[condition_column] == 1)
distinct_preterm = filtered_preterm.drop_duplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_preterm.count())

# Drop the code-level columns so only personid and the flag remain.
clean_preterm = distinct_preterm.drop('stdid', 'display')

In [None]:
# Add a 0/1 indicator: 1 when the row's standard code is in malpresent.
condition_column = 'malpresent'
non_mal = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(malpresent), 1).otherwise(0),
)

# Keep the matching rows, then reduce to a single row per person.
filtered_mal = non_mal.where(non_mal[condition_column] == 1)
distinct_mal = filtered_mal.drop_duplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_mal.count())

# Drop the code-level columns so only personid and the flag remain.
clean_mal = distinct_mal.drop('stdid', 'display')

In [None]:
# Add a 0/1 indicator: 1 when the row's standard code is in poor_growth.
condition_column = 'poor_growth'
non_pg = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(poor_growth), 1).otherwise(0),
)

# Keep the matching rows, then reduce to a single row per person.
filtered_pg = non_pg.where(non_pg[condition_column] == 1)
distinct_pg = filtered_pg.drop_duplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_pg.count())

# Drop the code-level columns so only personid and the flag remain.
clean_pg = distinct_pg.drop('stdid', 'display')

In [None]:
# Add a 0/1 indicator: 1 when the row's standard code is in placenta_reten.
condition_column = 'placenta_reten'
non_pr = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(placenta_reten), 1).otherwise(0),
)

# Keep the matching rows, then reduce to a single row per person.
filtered_pr = non_pr.where(non_pr[condition_column] == 1)
distinct_pr = filtered_pr.drop_duplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_pr.count())

# Drop the code-level columns so only personid and the flag remain.
clean_pr = distinct_pr.drop('stdid', 'display')

In [None]:
# Add a 0/1 indicator: 1 when the row's standard code is in HPV.
condition_column = 'HPV'
non_HPV = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(HPV), 1).otherwise(0),
)

# Keep the matching rows, then reduce to a single row per person.
filtered_HPV = non_HPV.where(non_HPV[condition_column] == 1)
distinct_HPV = filtered_HPV.drop_duplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_HPV.count())

# Drop the code-level columns so only personid and the flag remain.
clean_HPV = distinct_HPV.drop('stdid', 'display')

In [None]:
# Add a 0/1 indicator: 1 when the row's standard code is in HIV.
condition_column = 'HIV'
non_HIV = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(HIV), 1).otherwise(0),
)

# Keep the matching rows, then reduce to a single row per person.
filtered_HIV = non_HIV.where(non_HIV[condition_column] == 1)
distinct_HIV = filtered_HIV.drop_duplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_HIV.count())

# Drop the code-level columns so only personid and the flag remain.
clean_HIV = distinct_HIV.drop('stdid', 'display')

In [None]:
# Add a 0/1 indicator: 1 when the row's standard code is in STI.
condition_column = 'STI'
non_STI = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(STI), 1).otherwise(0),
)

# Keep the matching rows, then reduce to a single row per person.
filtered_STI = non_STI.where(non_STI[condition_column] == 1)
distinct_STI = filtered_STI.drop_duplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_STI.count())

# Drop the code-level columns so only personid and the flag remain.
clean_STI = distinct_STI.drop('stdid', 'display')

In [None]:
# Add a 0/1 indicator: 1 when the row's standard code is in Smoker.
condition_column = 'Smoker'
non_Smoker = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(Smoker), 1).otherwise(0),
)

# Keep the matching rows, then reduce to a single row per person.
filtered_Smoker = non_Smoker.where(non_Smoker[condition_column] == 1)
distinct_Smoker = filtered_Smoker.drop_duplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_Smoker.count())

# Drop the code-level columns so only personid and the flag remain.
clean_Smoker = distinct_Smoker.drop('stdid', 'display')

In [None]:
# Add a 0/1 indicator: 1 when the row's standard code is in Substance_Abuse.
condition_column = 'SA'
non_SA = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(Substance_Abuse), 1).otherwise(0),
)

# Keep the matching rows, then reduce to a single row per person.
filtered_SA = non_SA.where(non_SA[condition_column] == 1)
distinct_SA = filtered_SA.drop_duplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_SA.count())

# Drop the code-level columns so only personid and the flag remain.
clean_SA = distinct_SA.drop('stdid', 'display')

In [None]:
# Add a 0/1 indicator: 1 when the row's standard code is in Alcohol.
condition_column = 'AA'
non_AA = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(Alcohol), 1).otherwise(0),
)

# Keep the matching rows, then reduce to a single row per person.
filtered_AA = non_AA.where(non_AA[condition_column] == 1)
distinct_AA = filtered_AA.drop_duplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_AA.count())

# Drop the code-level columns so only personid and the flag remain.
clean_AA = distinct_AA.drop('stdid', 'display')

In [None]:
# Add a 0/1 indicator: 1 when the row's standard code is in Db2.
condition_column = 'Db2'
non_Db2 = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(Db2), 1).otherwise(0),
)

# Keep the matching rows, then reduce to a single row per person.
filtered_Db2 = non_Db2.where(non_Db2[condition_column] == 1)
distinct_Db2 = filtered_Db2.drop_duplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_Db2.count())

# Drop the code-level columns so only personid and the flag remain.
clean_Db2 = distinct_Db2.drop('stdid', 'display')

In [None]:
# Add a 0/1 indicator: 1 when the row's standard code is in CVD.
condition_column = 'CVD'
non_CVD = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(CVD), 1).otherwise(0),
)

# Keep the matching rows, then reduce to a single row per person.
filtered_CVD = non_CVD.where(non_CVD[condition_column] == 1)
distinct_CVD = filtered_CVD.drop_duplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_CVD.count())

# Drop the code-level columns so only personid and the flag remain.
clean_CVD = distinct_CVD.drop('stdid', 'display')

In [None]:
# Add a 0/1 indicator: 1 when the row's standard code is in Family_History_Gyn.
condition_column = 'FHGC'
non_FHGC = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(Family_History_Gyn), 1).otherwise(0),
)

# Keep the matching rows, then reduce to a single row per person.
filtered_FHGC = non_FHGC.where(non_FHGC[condition_column] == 1)
distinct_FHGC = filtered_FHGC.drop_duplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_FHGC.count())

# Drop the code-level columns so only personid and the flag remain.
clean_FHGC = distinct_FHGC.drop('stdid', 'display')

In [None]:
# Add a 0/1 indicator: 1 when the row's standard code is in FHC.
condition_column = 'FHC'
non_FHC = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(FHC), 1).otherwise(0),
)

# Keep the matching rows, then reduce to a single row per person.
filtered_FHC = non_FHC.where(non_FHC[condition_column] == 1)
distinct_FHC = filtered_FHC.drop_duplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_FHC.count())

# Drop the code-level columns so only personid and the flag remain.
clean_FHC = distinct_FHC.drop('stdid', 'display')

In [None]:
# Add a 0/1 indicator: 1 when the row's standard code is in Menopause.
condition_column = 'meno'
non_meno = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(Menopause), 1).otherwise(0),
)

# Keep the matching rows, then reduce to a single row per person.
filtered_meno = non_meno.where(non_meno[condition_column] == 1)
distinct_meno = filtered_meno.drop_duplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_meno.count())

# Drop the code-level columns so only personid and the flag remain.
clean_meno = distinct_meno.drop('stdid', 'display')

In [None]:
# Add a 0/1 indicator: 1 when the row's standard code is in Pregnancy.
condition_column = 'preg'
non_preg = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(Pregnancy), 1).otherwise(0),
)

# Keep the matching rows, then reduce to a single row per person.
filtered_preg = non_preg.where(non_preg[condition_column] == 1)
distinct_preg = filtered_preg.drop_duplicates(['personid'])

# Number of distinct patients with at least one matching code.
print(distinct_preg.count())

# Drop the code-level columns so only personid and the flag remain.
clean_preg = distinct_preg.drop('stdid', 'display')

In [None]:
condition_column = 'ectop'

# 1 when the row's stdid is in the Ectopic code list, else 0.
non_ectop = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(Ectopic), 1).otherwise(0),
)

# Flagged rows only, reduced to one row per personid.
filtered_ectop = non_ectop.where(col(condition_column) == 1)
distinct_ectop = filtered_ectop.dropDuplicates(['personid'])

# Distinct patients carrying the flag.
print(distinct_ectop.count())

clean_ectop = distinct_ectop.drop('stdid', 'display')

In [None]:
condition_column = 'lynch'

# 1 when the row's stdid is in the lynch code list, else 0.
non_lynch = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(lynch), 1).otherwise(0),
)

# Flagged rows only, reduced to one row per personid.
filtered_lynch = non_lynch.where(col(condition_column) == 1)
distinct_lynch = filtered_lynch.dropDuplicates(['personid'])

# Distinct patients carrying the flag.
print(distinct_lynch.count())

clean_lynch = distinct_lynch.drop('stdid', 'display')

In [None]:
condition_column = 'PCOS'

# 1 when the row's stdid is in the PCOS code list, else 0.
non_PCOS = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(PCOS), 1).otherwise(0),
)

# Flagged rows only, reduced to one row per personid.
filtered_PCOS = non_PCOS.where(col(condition_column) == 1)
distinct_PCOS = filtered_PCOS.dropDuplicates(['personid'])

# Distinct patients carrying the flag.
print(distinct_PCOS.count())

clean_PCOS = distinct_PCOS.drop('stdid', 'display')

In [None]:
condition_column = 'PPROM'

# 1 when the row's stdid is in the PPROM code list, else 0.
PPROMc = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(PPROM), 1).otherwise(0),
)

# Flagged rows only, reduced to one row per personid.
filtered_PPROM = PPROMc.where(col(condition_column) == 1)
distinct_PPROM = filtered_PPROM.dropDuplicates(['personid'])

# Distinct patients carrying the flag.
print(distinct_PPROM.count())

clean_PPROM = distinct_PPROM.drop('stdid', 'display')

In [None]:
condition_column = 'MCCLD'

# 1 when the row's stdid is in the MCCLD code list, else 0.
MCCLDc = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(MCCLD), 1).otherwise(0),
)

# Flagged rows only, reduced to one row per personid.
filtered_MCCLD = MCCLDc.where(col(condition_column) == 1)
distinct_MCCLD = filtered_MCCLD.dropDuplicates(['personid'])

# Distinct patients carrying the flag.
print(distinct_MCCLD.count())

clean_MCCLD = distinct_MCCLD.drop('stdid', 'display')

In [None]:
condition_column = 'HRP'

# 1 when the row's stdid is in the HRP code list, else 0.
HRPc = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(HRP), 1).otherwise(0),
)

# Flagged rows only, reduced to one row per personid.
filtered_HRP = HRPc.where(col(condition_column) == 1)
distinct_HRP = filtered_HRP.dropDuplicates(['personid'])

# Distinct patients carrying the flag.
print(distinct_HRP.count())

clean_HRP = distinct_HRP.drop('stdid', 'display')

In [None]:
condition_column = 'csect'

# Condition-table side: 1 when the row's stdid is in the csect code list, else 0.
csectc = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(csect), 1).otherwise(0),
)

# Flagged rows only, reduced to one row per personid.
filtered_csect = csectc.where(col(condition_column) == 1)
distinct_csect = filtered_csect.dropDuplicates(['personid'])

# Distinct patients carrying the flag.
print(distinct_csect.count())

clean_csect = distinct_csect.drop('stdid', 'display')

In [None]:
condition_column = 'csect'

# Procedure-table side: 1 when the row's procedure_code is in the csect code list, else 0.
csectp = non_pro.withColumn(
    condition_column,
    when(col("procedure_code").isin(csect), 1).otherwise(0),
)

# Flagged rows only, reduced to one row per personid.
filtered_csect1 = csectp.where(col(condition_column) == 1)
distinct_csect1 = filtered_csect1.dropDuplicates(['personid'])

# Distinct patients carrying the flag.
print(distinct_csect1.count())

clean_csect1 = distinct_csect1.drop('procedure_code', 'procedure_display')

In [None]:
# Stack the condition-based and procedure-based csect rows (union matches columns by position).
combined_csect = clean_csect.union(clean_csect1)
print(combined_csect.count())

In [None]:
# Collapse fully identical rows left by the union of the two sources.
csect2 = combined_csect.dropDuplicates()
csect2.count()

In [None]:
condition_column = 'ccsect'

# Condition-table side: 1 when the row's stdid is in the ccsect code list, else 0.
ccsectc = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(ccsect), 1).otherwise(0),
)

# Flagged rows only, reduced to one row per personid.
filtered_ccsect = ccsectc.where(col(condition_column) == 1)
distinct_ccsect = filtered_ccsect.dropDuplicates(['personid'])

# Distinct patients carrying the flag.
print(distinct_ccsect.count())

clean_ccsect = distinct_ccsect.drop('stdid', 'display')

In [None]:
condition_column = 'ccsect'

# Procedure-table side: 1 when the row's procedure_code is in the ccsect code list, else 0.
ccsectp = non_pro.withColumn(
    condition_column,
    when(col("procedure_code").isin(ccsect), 1).otherwise(0),
)

# Flagged rows only, reduced to one row per personid.
filtered_ccsect1 = ccsectp.where(col(condition_column) == 1)
distinct_ccsect1 = filtered_ccsect1.dropDuplicates(['personid'])

# Distinct patients carrying the flag.
print(distinct_ccsect1.count())

clean_ccsect1 = distinct_ccsect1.drop('procedure_code', 'procedure_display')

In [None]:
# Stack the condition-based and procedure-based ccsect rows (union matches columns by position).
combined_ccsect = clean_ccsect.union(clean_ccsect1)
combined_ccsect.count()

In [None]:
# Collapse fully identical rows left by the union of the two sources.
ccsect2 = combined_ccsect.dropDuplicates()
ccsect2.count()

In [None]:
condition_column = 'lcsect'

# Condition-table side: 1 when the row's stdid is in the lcsect code list, else 0.
lcsectc = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(lcsect), 1).otherwise(0),
)

# Flagged rows only, reduced to one row per personid.
filtered_lcsect = lcsectc.where(col(condition_column) == 1)
distinct_lcsect = filtered_lcsect.dropDuplicates(['personid'])

# Distinct patients carrying the flag.
print(distinct_lcsect.count())

clean_lcsect = distinct_lcsect.drop('stdid', 'display')

In [None]:
condition_column = 'lcsect'

# Procedure-table side: 1 when the row's procedure_code is in the lcsect code list, else 0.
lcsectp = non_pro.withColumn(
    condition_column,
    when(col("procedure_code").isin(lcsect), 1).otherwise(0),
)

# Flagged rows only, reduced to one row per personid.
filtered_lcsect1 = lcsectp.where(col(condition_column) == 1)
distinct_lcsect1 = filtered_lcsect1.dropDuplicates(['personid'])

# Distinct patients carrying the flag.
print(distinct_lcsect1.count())

clean_lcsect1 = distinct_lcsect1.drop('procedure_code', 'procedure_display')

In [None]:
# Stack the condition-based and procedure-based lcsect rows (union matches columns by position).
combined_lcsect = clean_lcsect.union(clean_lcsect1)
combined_lcsect.count()

In [None]:
# Collapse fully identical rows left by the union of the two sources.
lcsect2 = combined_lcsect.dropDuplicates()
lcsect2.count()

In [None]:
condition_column = 'epcsect'

# Condition-table side: 1 when the row's stdid is in the epcsect code list, else 0.
epcsectc = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(epcsect), 1).otherwise(0),
)

# Flagged rows only, reduced to one row per personid.
filtered_epcsect = epcsectc.where(col(condition_column) == 1)
distinct_epcsect = filtered_epcsect.dropDuplicates(['personid'])

# Distinct patients carrying the flag.
print(distinct_epcsect.count())

clean_epcsect = distinct_epcsect.drop('stdid', 'display')

In [None]:
condition_column = 'epcsect'

# Procedure-table side: 1 when the row's procedure_code is in the epcsect code list, else 0.
epcsectp = non_pro.withColumn(
    condition_column,
    when(col("procedure_code").isin(epcsect), 1).otherwise(0),
)

# Flagged rows only, reduced to one row per personid.
filtered_epcsect1 = epcsectp.where(col(condition_column) == 1)
distinct_epcsect1 = filtered_epcsect1.dropDuplicates(['personid'])

# Distinct patients carrying the flag.
print(distinct_epcsect1.count())

clean_epcsect1 = distinct_epcsect1.drop('procedure_code', 'procedure_display')

In [None]:
# Stack the condition-based and procedure-based epcsect rows (union matches columns by position).
combined_epcsect = clean_epcsect.union(clean_epcsect1)
combined_epcsect.count()

In [None]:
# Collapse fully identical rows left by the union of the two sources.
epcsect2 = combined_epcsect.dropDuplicates()
epcsect2.count()

#### Cancer (Outcomes last)

In [None]:
# NOTE(review): this cell recomputes the same PCOS flag as an earlier cell in this
# notebook and sits under the cancer-outcomes heading; it looks like a leftover copy.
condition_column = 'PCOS'

# 1 when the row's stdid is in the PCOS code list, else 0.
non_PCOS = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(PCOS), 1).otherwise(0),
)

# Flagged rows only, reduced to one row per personid.
filtered_PCOS = non_PCOS.where(col(condition_column) == 1)
distinct_PCOS = filtered_PCOS.dropDuplicates(['personid'])

# Distinct patients carrying the flag.
print(distinct_PCOS.count())

clean_PCOS = distinct_PCOS.drop('stdid', 'display')

In [None]:
condition_column = 'UTC'

# 1 when the row's stdid is in the Uterine_Cancers code list, else 0.
UTC = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(Uterine_Cancers), 1).otherwise(0),
)

# Flagged rows only, reduced to one row per personid.
filtered_UTC = UTC.where(col(condition_column) == 1)
distinct_UTC = filtered_UTC.dropDuplicates(['personid'])

# Distinct patients carrying the flag.
print(distinct_UTC.count())

clean_UTC = distinct_UTC.drop('stdid', 'display')

In [None]:
# Ovarian-cancer flag.
# The flag column must be named 'OVC': the filter below and the later join cell
# (which fills nulls in 'OVC') both look it up by that name. It was previously
# created as "OVC2", so the filter referenced a column that did not exist.
condition_column = 'OVC'

# 1 when the row's stdid is in the Ovarian_Cancers code list, else 0.
OVC = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(Ovarian_Cancers), 1).otherwise(0),
)

# Flagged rows only, reduced to one row per personid.
filtered_OVC = OVC.filter(col(condition_column) == 1)
distinct_OVC = filtered_OVC.dropDuplicates(['personid'])

# Distinct patients carrying the flag.
print(distinct_OVC.count())

# Drop the code columns; the flag column stays for the join.
clean_OVC = distinct_OVC.drop('stdid', 'display')

In [None]:
# Flag built from the OV_FT code list.
# The flagged DataFrame gets its own name (OV_FTc, like csectc / Breastc) so that it
# does not overwrite the OV_FT code list. With the old assignment `OV_FT = ...`,
# re-running this cell passed a DataFrame to isin() instead of the list.
condition_column = 'OV_FT'

# 1 when the row's stdid is in the OV_FT code list, else 0.
OV_FTc = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(OV_FT), 1).otherwise(0),
)

# Flagged rows only, reduced to one row per personid.
filtered_OV_FT = OV_FTc.filter(col(condition_column) == 1)
distinct_OV_FT = filtered_OV_FT.dropDuplicates(['personid'])

# Distinct patients carrying the flag.
print(distinct_OV_FT.count())

# Drop the code columns; the flag column stays for the join.
clean_OV_FT = distinct_OV_FT.drop('stdid', 'display')

In [None]:
condition_column = 'CVX'

# 1 when the row's stdid is in the Cervical_Cancers code list, else 0.
CVX = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(Cervical_Cancers), 1).otherwise(0),
)

# Flagged rows only, reduced to one row per personid.
filtered_CVX = CVX.where(col(condition_column) == 1)
distinct_CVX = filtered_CVX.dropDuplicates(['personid'])

# Distinct patients carrying the flag.
print(distinct_CVX.count())

clean_CVX = distinct_CVX.drop('stdid', 'display')

In [None]:
condition_column = 'Other_Gyn'

# 1 when the row's stdid is in the Other_Gyn_Cancers code list, else 0.
Other_Gyn = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(Other_Gyn_Cancers), 1).otherwise(0),
)

# Flagged rows only, reduced to one row per personid.
filtered_Other_Gyn = Other_Gyn.where(col(condition_column) == 1)
distinct_Other_Gyn = filtered_Other_Gyn.dropDuplicates(['personid'])

# Distinct patients carrying the flag.
print(distinct_Other_Gyn.count())

clean_Other_Gyn = distinct_Other_Gyn.drop('stdid', 'display')

In [None]:
condition_column = 'Any_Gyn'

# 1 when the row's stdid is in the Any_Gyn_Cancers code list, else 0.
Any_Gyn = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(Any_Gyn_Cancers), 1).otherwise(0),
)

# Flagged rows only, reduced to one row per personid.
filtered_Any_Gyn = Any_Gyn.where(col(condition_column) == 1)
distinct_Any_Gyn = filtered_Any_Gyn.dropDuplicates(['personid'])

# Distinct patients carrying the flag.
print(distinct_Any_Gyn.count())

clean_Any_Gyn = distinct_Any_Gyn.drop('stdid', 'display')

In [None]:
condition_column = 'Breastc'

# 1 when the row's stdid is in the Breast_Cancers code list, else 0.
Breastc = non_con.withColumn(
    condition_column,
    when(col("stdid").isin(Breast_Cancers), 1).otherwise(0),
)

# Flagged rows only, reduced to one row per personid.
filtered_Breastc = Breastc.where(col(condition_column) == 1)
distinct_Breastc = filtered_Breastc.dropDuplicates(['personid'])

# Distinct patients carrying the flag.
print(distinct_Breastc.count())

clean_Breastc = distinct_Breastc.drop('stdid', 'display')

## Combine Data

In [None]:
# Patient id list read from personid_table; it is the starting frame of the
# left-join chain in the cells below.
pid = spark.sql("""
    select personid
    from personid_table
    """)
# Mark pid for caching. As the cell's last expression, the returned DataFrame is displayed.
pid.cache()

In [None]:
column_name = 'CUA_ANY'

# Left join: rows of pid without a match in clean_CUA_ANY get a null CUA_ANY.
pid_CUA_ANY = pid.join(clean_CUA_ANY, on='personid', how='left')

# Those nulls become 0.
Binary_CUA_ANY = pid_CUA_ANY.na.fill({column_name: 0})
Binary_CUA_ANY1 = Binary_CUA_ANY.dropDuplicates()

Binary_CUA_ANY1.select('personid').count()

In [None]:
column_name = 'endo'

# Left join: rows of Binary_CUA_ANY1 without a match in clean_endo get a null endo.
pid_endo = Binary_CUA_ANY1.join(clean_endo, on='personid', how='left')

# Those nulls become 0.
Binary_endo = pid_endo.na.fill({column_name: 0})
Binary_endo1 = Binary_endo.dropDuplicates()

Binary_endo1.select('personid').count()

In [None]:
column_name = 'infertility'

# Left join: rows of Binary_endo1 without a match in clean_infertility get a null infertility.
pid_infertility = Binary_endo1.join(clean_infertility, on='personid', how='left')

# Those nulls become 0.
Binary_infertility = pid_infertility.na.fill({column_name: 0})
Binary_infertility1 = Binary_infertility.dropDuplicates()

Binary_infertility1.select('personid').count()

In [None]:
column_name = 'RA'

# Left join: rows of Binary_infertility1 without a match in clean_RA get a null RA.
pid_RA = Binary_infertility1.join(clean_RA, on='personid', how='left')

# Those nulls become 0.
Binary_RA = pid_RA.na.fill({column_name: 0})
Binary_RA1 = Binary_RA.dropDuplicates()

Binary_RA1.select('personid').count()

In [None]:
column_name = 'Dys'

# Left join: rows of Binary_RA1 without a match in clean_Dys get a null Dys.
pid_Dys = Binary_RA1.join(clean_Dys, on='personid', how='left')

# Those nulls become 0.
Binary_Dys = pid_Dys.na.fill({column_name: 0})
Binary_Dys1 = Binary_Dys.dropDuplicates()

Binary_Dys1.select('personid').count()

In [None]:
column_name = 'Irreg'

# Left join: rows of Binary_Dys1 without a match in clean_Irreg get a null Irreg.
pid_Irreg = Binary_Dys1.join(clean_Irreg, on='personid', how='left')

# Those nulls become 0.
Binary_Irreg = pid_Irreg.na.fill({column_name: 0})
Binary_Irreg1 = Binary_Irreg.dropDuplicates()

Binary_Irreg1.select('personid').count()

In [None]:
column_name = 'spinal'

# Left join: rows of Binary_Irreg1 without a match in clean_spinal get a null spinal.
pid_spinal = Binary_Irreg1.join(clean_spinal, on='personid', how='left')

# Those nulls become 0.
Binary_spinal = pid_spinal.na.fill({column_name: 0})
Binary_spinal1 = Binary_spinal.dropDuplicates()

Binary_spinal1.select('personid').count()

In [None]:
column_name = 'scoliosis'

# Left join: rows of Binary_spinal1 without a match in clean_scoliosis get a null scoliosis.
pid_scoliosis = Binary_spinal1.join(clean_scoliosis, on='personid', how='left')

# Those nulls become 0.
Binary_scoliosis = pid_scoliosis.na.fill({column_name: 0})
Binary_scoliosis1 = Binary_scoliosis.dropDuplicates()

Binary_scoliosis1.select('personid').count()

In [None]:
column_name = 'FT'

# Left join: rows of Binary_scoliosis1 without a match in clean_FT get a null FT.
pid_FT = Binary_scoliosis1.join(clean_FT, on='personid', how='left')

# Those nulls become 0.
Binary_FT = pid_FT.na.fill({column_name: 0})
Binary_FT1 = Binary_FT.dropDuplicates()

Binary_FT1.select('personid').count()

In [None]:
column_name = 'hl'

# Left join: rows of Binary_FT1 without a match in clean_hl get a null hl.
pid_hl = Binary_FT1.join(clean_hl, on='personid', how='left')

# Those nulls become 0.
Binary_hl = pid_hl.na.fill({column_name: 0})
Binary_hl1 = Binary_hl.dropDuplicates()

Binary_hl1.select('personid').count()

In [None]:
column_name = 'cloacdys'

# Left join: rows of Binary_hl1 without a match in clean_cloacdys get a null cloacdys.
pid_cloacdys = Binary_hl1.join(clean_cloacdys, on='personid', how='left')

# Those nulls become 0.
Binary_cloacdys = pid_cloacdys.na.fill({column_name: 0})
Binary_cloacdys1 = Binary_cloacdys.dropDuplicates()

Binary_cloacdys1.select('personid').count()

In [None]:
column_name = 'mc'

# Left join: rows of Binary_cloacdys1 without a match in clean_mc get a null mc.
pid_mc = Binary_cloacdys1.join(clean_mc, on='personid', how='left')

# Those nulls become 0.
Binary_mc = pid_mc.na.fill({column_name: 0})
Binary_mc1 = Binary_mc.dropDuplicates()

Binary_mc1.select('personid').count()

In [None]:
column_name = 'eom'

# Left join: rows of Binary_mc1 without a match in clean_eom get a null eom.
pid_eom = Binary_mc1.join(clean_eom, on='personid', how='left')

# Those nulls become 0.
Binary_eom = pid_eom.na.fill({column_name: 0})
Binary_eom1 = Binary_eom.dropDuplicates()

Binary_eom1.select('personid').count()

In [None]:
column_name = 'hemato'

# Left join: rows of Binary_eom1 without a match in clean_hemato get a null hemato.
pid_hemato = Binary_eom1.join(clean_hemato, on='personid', how='left')

# Those nulls become 0.
Binary_hemato = pid_hemato.na.fill({column_name: 0})
Binary_hemato1 = Binary_hemato.dropDuplicates()

Binary_hemato1.select('personid').count()

In [None]:
column_name = 'preterm'

# Left join: rows of Binary_hemato1 without a match in clean_preterm get a null preterm.
pid_preterm = Binary_hemato1.join(clean_preterm, on='personid', how='left')

# Those nulls become 0.
Binary_preterm = pid_preterm.na.fill({column_name: 0})
Binary_preterm1 = Binary_preterm.dropDuplicates()

Binary_preterm1.select('personid').count()

In [None]:
column_name = 'mal'

# Left join: rows of Binary_preterm1 without a match in clean_mal get a null mal.
pid_mal = Binary_preterm1.join(clean_mal, on='personid', how='left')

# Those nulls become 0.
Binary_mal = pid_mal.na.fill({column_name: 0})
Binary_mal1 = Binary_mal.dropDuplicates()

Binary_mal1.select('personid').count()

In [None]:
column_name = 'pg'

# Left join: rows of Binary_mal1 without a match in clean_pg get a null pg.
pid_pg = Binary_mal1.join(clean_pg, on='personid', how='left')

# Those nulls become 0.
Binary_pg = pid_pg.na.fill({column_name: 0})
Binary_pg1 = Binary_pg.dropDuplicates()

Binary_pg1.select('personid').count()

In [None]:
column_name = 'pr'

# Left join: rows of Binary_pg1 without a match in clean_pr get a null pr.
pid_pr = Binary_pg1.join(clean_pr, on='personid', how='left')

# Those nulls become 0.
Binary_pr = pid_pr.na.fill({column_name: 0})
Binary_pr1 = Binary_pr.dropDuplicates()

Binary_pr1.select('personid').count()

In [None]:
column_name = 'HPV'

# Left join: rows of Binary_pr1 without a match in clean_HPV get a null HPV.
pid_HPV = Binary_pr1.join(clean_HPV, on='personid', how='left')

# Those nulls become 0.
Binary_HPV = pid_HPV.na.fill({column_name: 0})
Binary_HPV1 = Binary_HPV.dropDuplicates()

Binary_HPV1.select('personid').count()

In [None]:
column_name = 'HIV'

# Left join: rows of Binary_HPV1 without a match in clean_HIV get a null HIV.
pid_HIV = Binary_HPV1.join(clean_HIV, on='personid', how='left')

# Those nulls become 0.
Binary_HIV = pid_HIV.na.fill({column_name: 0})
Binary_HIV1 = Binary_HIV.dropDuplicates()

Binary_HIV1.select('personid').count()

In [None]:
column_name = 'STI'

# Left join: rows of Binary_HIV1 without a match in clean_STI get a null STI.
pid_STI = Binary_HIV1.join(clean_STI, on='personid', how='left')

# Those nulls become 0.
Binary_STI = pid_STI.na.fill({column_name: 0})
Binary_STI1 = Binary_STI.dropDuplicates()

Binary_STI1.select('personid').count()

In [None]:
column_name = 'Smoker'

# Left join: rows of Binary_STI1 without a match in clean_Smoker get a null Smoker.
pid_Smoker = Binary_STI1.join(clean_Smoker, on='personid', how='left')

# Those nulls become 0.
Binary_Smoker = pid_Smoker.na.fill({column_name: 0})
Binary_Smoker1 = Binary_Smoker.dropDuplicates()

Binary_Smoker1.select('personid').count()

In [None]:
column_name = 'SA'

# Left join: rows of Binary_Smoker1 without a match in clean_SA get a null SA.
pid_SA = Binary_Smoker1.join(clean_SA, on='personid', how='left')

# Those nulls become 0.
Binary_SA = pid_SA.na.fill({column_name: 0})
Binary_SA1 = Binary_SA.dropDuplicates()

Binary_SA1.select('personid').count()

In [None]:
column_name = 'AA'

# Left join: rows of Binary_SA1 without a match in clean_AA get a null AA.
pid_AA = Binary_SA1.join(clean_AA, on='personid', how='left')

# Those nulls become 0.
Binary_AA = pid_AA.na.fill({column_name: 0})
Binary_AA1 = Binary_AA.dropDuplicates()

Binary_AA1.select('personid').count()

In [None]:
column_name = 'Db2'

# Left join: rows of Binary_AA1 without a match in clean_Db2 get a null Db2.
pid_Db2 = Binary_AA1.join(clean_Db2, on='personid', how='left')

# Those nulls become 0.
Binary_Db2 = pid_Db2.na.fill({column_name: 0})
Binary_Db21 = Binary_Db2.dropDuplicates()

Binary_Db21.select('personid').count()

In [None]:
column_name = 'CVD'

# Left join: rows of Binary_Db21 without a match in clean_CVD get a null CVD.
pid_CVD = Binary_Db21.join(clean_CVD, on='personid', how='left')

# Those nulls become 0.
Binary_CVD = pid_CVD.na.fill({column_name: 0})
Binary_CVD1 = Binary_CVD.dropDuplicates()

Binary_CVD1.select('personid').count()

In [None]:
column_name = 'FHGC'

# Left join: rows of Binary_CVD1 without a match in clean_FHGC get a null FHGC.
pid_FHGC = Binary_CVD1.join(clean_FHGC, on='personid', how='left')

# Those nulls become 0.
Binary_FHGC = pid_FHGC.na.fill({column_name: 0})
Binary_FHGC1 = Binary_FHGC.dropDuplicates()

Binary_FHGC1.select('personid').count()

In [None]:
column_name = 'FHC'

# Left join: rows of Binary_FHGC1 without a match in clean_FHC get a null FHC.
pid_FHC = Binary_FHGC1.join(clean_FHC, on='personid', how='left')

# Those nulls become 0.
Binary_FHC = pid_FHC.na.fill({column_name: 0})
Binary_FHC1 = Binary_FHC.dropDuplicates()

Binary_FHC1.select('personid').count()

In [None]:
column_name = 'meno'

# Left join: rows of Binary_FHC1 without a match in clean_meno get a null meno.
pid_meno = Binary_FHC1.join(clean_meno, on='personid', how='left')

# Those nulls become 0.
Binary_meno = pid_meno.na.fill({column_name: 0})
Binary_meno1 = Binary_meno.dropDuplicates()

Binary_meno1.select('personid').count()

In [None]:
column_name = 'preg'

# Left join: rows of Binary_meno1 without a match in clean_preg get a null preg.
pid_preg = Binary_meno1.join(clean_preg, on='personid', how='left')

# Those nulls become 0.
Binary_preg = pid_preg.na.fill({column_name: 0})
Binary_preg1 = Binary_preg.dropDuplicates()

Binary_preg1.select('personid').count()

In [None]:
# NOTE(review): this cell repeats the preg join of the cell directly above it
# (same inputs, same outputs); it looks like a leftover copy.
column_name = 'preg'

# Left join: rows of Binary_meno1 without a match in clean_preg get a null preg.
pid_preg = Binary_meno1.join(clean_preg, on='personid', how='left')

# Those nulls become 0.
Binary_preg = pid_preg.na.fill({column_name: 0})
Binary_preg1 = Binary_preg.dropDuplicates()

Binary_preg1.select('personid').count()

In [None]:
column_name = 'ectop'

# Left join: rows of Binary_preg1 without a match in clean_ectop get a null ectop.
pid_ectop = Binary_preg1.join(clean_ectop, on='personid', how='left')

# Those nulls become 0.
Binary_ectop = pid_ectop.na.fill({column_name: 0})
Binary_ectop1 = Binary_ectop.dropDuplicates()

Binary_ectop1.select('personid').count()

In [None]:
column_name = 'lynch'

# Left join: rows of Binary_ectop1 without a match in clean_lynch get a null lynch.
pid_lynch = Binary_ectop1.join(clean_lynch, on='personid', how='left')

# Those nulls become 0.
Binary_lynch = pid_lynch.na.fill({column_name: 0})
Binary_lynch1 = Binary_lynch.dropDuplicates()

Binary_lynch1.select('personid').count()

In [None]:
# Add the PCOS flag to the running covariate matrix.
column_name = 'PCOS'

# Left join: rows of Binary_lynch1 without a match in clean_PCOS get a null PCOS.
pid_PCOS = Binary_lynch1.join(clean_PCOS, 'personid', 'left')

# Replace those nulls with 0 so the column is a complete 0/1 flag.
Binary_PCOS = pid_PCOS.fillna({column_name: 0})
Binary_PCOS1 = Binary_PCOS.dropDuplicates()

# Row-count check, shown as the cell output. The stray
# "= Binary_PCOS1.join(clean_lynch, ...)" that followed this call assigned to a
# function call, which is a SyntaxError, so the cell could not run.
Binary_PCOS1.select('personid').count()


In [None]:
# Add the PPROM flag to the running covariate matrix.
column_name = 'PPROM'

# Left join: rows of Binary_PCOS1 without a match in clean_PPROM get a null PPROM.
pid_PPROM = Binary_PCOS1.join(clean_PPROM, 'personid', 'left')

# Replace those nulls with 0 so the column is a complete 0/1 flag.
Binary_PPROM = pid_PPROM.fillna({column_name: 0})
Binary_PPROM1 = Binary_PPROM.dropDuplicates()

# Row-count check, shown as the cell output. The stray
# "= Binary_PPROM1.join(clean_lynch, ...)" that followed this call assigned to a
# function call, which is a SyntaxError, so the cell could not run.
Binary_PPROM1.select('personid').count()


In [None]:
column_name = 'MCCLD'

# Left join: rows of Binary_PPROM1 without a match in clean_MCCLD get a null MCCLD.
pid_MCCLD = Binary_PPROM1.join(clean_MCCLD, on='personid', how='left')

# Those nulls become 0.
Binary_MCCLD = pid_MCCLD.na.fill({column_name: 0})
Binary_MCCLD1 = Binary_MCCLD.dropDuplicates()

Binary_MCCLD1.select('personid').count()

In [None]:
column_name = 'HRP'

# Left join: rows of Binary_MCCLD1 without a match in clean_HRP get a null HRP.
pid_HRP = Binary_MCCLD1.join(clean_HRP, on='personid', how='left')

# Those nulls become 0.
Binary_HRP = pid_HRP.na.fill({column_name: 0})
Binary_HRP1 = Binary_HRP.dropDuplicates()

Binary_HRP1.select('personid').count()

In [None]:
column_name = 'csect'

# Left join: rows of Binary_HRP1 without a match in csect2 get a null csect.
pid_csect = Binary_HRP1.join(csect2, on='personid', how='left')

# Those nulls become 0.
Binary_csect = pid_csect.na.fill({column_name: 0})
Binary_csect1 = Binary_csect.dropDuplicates()

Binary_csect1.select('personid').count()

In [None]:
column_name = 'ccsect'

# Left join: rows of Binary_csect1 without a match in ccsect2 get a null ccsect.
pid_ccsect = Binary_csect1.join(ccsect2, on='personid', how='left')

# Those nulls become 0.
Binary_ccsect = pid_ccsect.na.fill({column_name: 0})
Binary_ccsect1 = Binary_ccsect.dropDuplicates()

Binary_ccsect1.select('personid').count()

In [None]:
column_name = 'lcsect'

# Left join: rows of Binary_ccsect1 without a match in lcsect2 get a null lcsect.
pid_lcsect = Binary_ccsect1.join(lcsect2, on='personid', how='left')

# Those nulls become 0.
Binary_lcsect = pid_lcsect.na.fill({column_name: 0})
Binary_lcsect1 = Binary_lcsect.dropDuplicates()

Binary_lcsect1.select('personid').count()

In [None]:
column_name = 'epcsect'

# Left join: rows of Binary_lcsect1 without a match in epcsect2 get a null epcsect.
pid_epcsect = Binary_lcsect1.join(epcsect2, on='personid', how='left')

# Those nulls become 0.
Binary_epcsect = pid_epcsect.na.fill({column_name: 0})
Binary_epcsect1 = Binary_epcsect.dropDuplicates()

Binary_epcsect1.select('personid').count()

In [None]:

# Preview 25 rows of the matrix built so far, collected to the driver as a pandas DataFrame.
# NOTE(review): the output contains patient-level rows (personid); clear it before sharing the notebook.
Binary_epcsect1.limit(25).toPandas()

In [None]:
column_name = 'UTC'

# Left join: rows of Binary_epcsect1 without a match in clean_UTC get a null UTC.
pid_UTC = Binary_epcsect1.join(clean_UTC, on='personid', how='left')

# Those nulls become 0.
Binary_UTC = pid_UTC.na.fill({column_name: 0})
Binary_UTC1 = Binary_UTC.dropDuplicates()

Binary_UTC1.select('personid').count()

In [None]:
column_name = 'OVC'

# Left join: rows of Binary_UTC1 without a match in clean_OVC get a null OVC.
pid_OVC = Binary_UTC1.join(clean_OVC, on='personid', how='left')

# Those nulls become 0.
Binary_OVC = pid_OVC.na.fill({column_name: 0})
Binary_OVC1 = Binary_OVC.dropDuplicates()

Binary_OVC1.select('personid').count()

In [None]:
column_name = 'OV_FT'

# Left join: rows of Binary_OVC1 without a match in clean_OV_FT get a null OV_FT.
pid_OV_FT = Binary_OVC1.join(clean_OV_FT, on='personid', how='left')

# Those nulls become 0.
Binary_OV_FT = pid_OV_FT.na.fill({column_name: 0})
Binary_OV_FT1 = Binary_OV_FT.dropDuplicates()

Binary_OV_FT1.select('personid').count()

In [None]:
# Add the CVX flag to the running covariate matrix.
column_name = 'CVX'

# Continue the chain from Binary_OV_FT1, the output of the preceding OV_FT join.
# Joining onto Binary_OVC1 (as before) skipped that step, so the OV_FT column
# never reached the saved matrix.
pid_CVX = Binary_OV_FT1.join(clean_CVX, 'personid', 'left')

# Rows without a match in clean_CVX come back null; replace those nulls with 0.
Binary_CVX = pid_CVX.fillna({column_name: 0})
Binary_CVX1 = Binary_CVX.dropDuplicates()

# Row-count check, shown as the cell output.
Binary_CVX1.select('personid').count()

In [None]:
column_name = 'Other_Gyn'

# Left join: rows of Binary_CVX1 without a match in clean_Other_Gyn get a null Other_Gyn.
pid_Other_Gyn = Binary_CVX1.join(clean_Other_Gyn, on='personid', how='left')

# Those nulls become 0.
Binary_Other_Gyn = pid_Other_Gyn.na.fill({column_name: 0})
Binary_Other_Gyn1 = Binary_Other_Gyn.dropDuplicates()

Binary_Other_Gyn1.select('personid').count()

In [None]:
column_name = 'Any_Gyn'

# Left join: rows of Binary_Other_Gyn1 without a match in clean_Any_Gyn get a null Any_Gyn.
pid_Any_Gyn = Binary_Other_Gyn1.join(clean_Any_Gyn, on='personid', how='left')

# Those nulls become 0.
Binary_Any_Gyn = pid_Any_Gyn.na.fill({column_name: 0})
Binary_Any_Gyn1 = Binary_Any_Gyn.dropDuplicates()

Binary_Any_Gyn1.select('personid').count()

In [None]:
column_name = 'Breastc'

# Left join: rows of Binary_Any_Gyn1 without a match in clean_Breastc get a null Breastc.
pid_Breastc = Binary_Any_Gyn1.join(clean_Breastc, on='personid', how='left')

# Those nulls become 0.
Binary_Breastc = pid_Breastc.na.fill({column_name: 0})
Binary_Breastc1 = Binary_Breastc.dropDuplicates()

Binary_Breastc1.select('personid').count()

In [None]:
# Persist the final joined matrix as the table non_CUA_db.non_covariate_matrix.
# NOTE(review): no write mode is set; with Spark's default save mode this presumably
# raises if the table already exists, so a re-run fails — confirm whether
# .mode('overwrite') is intended.
Binary_Breastc1.write.saveAsTable('non_CUA_db.non_covariate_matrix')