# Libraries

In [161]:
import pandas as pd
import numpy as np
import datetime as dt
import time
import geonamescache 
from geopy.geocoders import Nominatim
from geopy.geocoders import GoogleV3
import re
import pycountry
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OrdinalEncoder
import pickle


# 1) Sql Tables

In [162]:
my_studies = pd.read_pickle("my_studies.pkl")
my_terminations = pd.read_pickle("my_terminations.pkl")  
my_terminations2 = pd.read_pickle("my_terminations2.pkl")  
my_conditions = pd.read_pickle("my_conditions.pkl")  
my_covid = pd.read_pickle("my_covid.pkl")  
my_placebo = pd.read_pickle("my_placebo.pkl")  
my_interventions = pd.read_pickle("my_interventions.pkl")  
my_interventions_types = pd.read_pickle("my_interventions_types.pkl")  
my_soc = pd.read_pickle("my_soc.pkl")  
my_intervention_methods2 = pd.read_pickle("my_intervention_methods2.pkl")  
my_intervention_methods = pd.read_pickle("my_intervention_methods.pkl")  
my_adverse = pd.read_pickle("my_adverse.pkl")  
my_adverse_system = pd.read_pickle("my_adverse_system.pkl")  
my_designs = pd.read_pickle("my_designs.pkl")  
my_eligibilities = pd.read_pickle("my_eligibilities.pkl")  
my_outcomes = pd.read_pickle("my_outcomes.pkl")  
my_locations = pd.read_pickle("my_locations.pkl")  
my_documents = pd.read_pickle("my_documents.pkl")  

# 2) Load Data

### Load Data

In [163]:
df = pd.read_csv(r"C:\Users\Eugenia\OneDrive\Documents\THESIS\ctg-studies.csv")
pd.set_option('display.max.columns', 35)  # Number of Columns : 35 > 30
df = df.rename(columns={'NCT Number': 'nct_id'})

df.head()

Unnamed: 0,nct_id,Study Title,Study URL,Acronym,Study Status,Brief Summary,Study Results,Conditions,Interventions,Primary Outcome Measures,Secondary Outcome Measures,Other Outcome Measures,Sponsor,Collaborators,Sex,Age,Phases,Enrollment,Funder Type,Study Type,Study Design,Other IDs,Start Date,Primary Completion Date,Completion Date,First Posted,Results First Posted,Last Update Posted,Locations,Study Documents
0,NCT04385680,Chlorhexidine Vaginal Preparation for Reductio...,https://clinicaltrials.gov/study/NCT04385680,,COMPLETED,The study aims to assess the beneficial value ...,NO,Postpartum Endometritis|Wound Infection|Chlorh...,DRUG: Chlorhexidine Gluconate vaginal solution...,"post-cesarean endometritis, uterine fundal ten...","Significant leukocytosis, increase of WBCs cou...",,Zagazig University,,FEMALE,"ADULT, OLDER_ADULT",PHASE1|PHASE2,840.0,OTHER_GOV,INTERVENTIONAL,Allocation: RANDOMIZED|Intervention Model: PAR...,chlorhex. post cs endometritis,2020-05-15,2022-03-10,2022-08-30,2020-05-13,,2022-09-23,"Armed Forces Hospitals Southern Region, Khamis...",
1,NCT05017480,A Study to Evaluate the Efficacy and Safety of...,https://clinicaltrials.gov/study/NCT05017480,,COMPLETED,This study will evaluate the efficacy and safe...,YES,Moderate-to-severe Atopic Dermatitis,DRUG: CBP-201|DRUG: Placebo,"Investigator Global Assessment (IGA) (0-1), Th...","Eczema Area and Severity Index (EASI)-75, The ...",,"Suzhou Connect Biopharmaceuticals, Ltd.",,ALL,"CHILD, ADULT, OLDER_ADULT",PHASE2,330.0,INDUSTRY,INTERVENTIONAL,Allocation: RANDOMIZED|Intervention Model: PAR...,CBP-201-CN002,2021-08-31,2022-12-01,2023-09-28,2021-08-23,2024-05-01,2024-05-01,"Connect Investigative Site 33, Hefei, Anhui, C...","Study Protocol, https://cdn.clinicaltrials.gov..."
2,NCT01136980,Randomized EsophyX Versus Sham / Placebo Contr...,https://clinicaltrials.gov/study/NCT01136980,RESPECT,COMPLETED,The objective of the study is to evaluate the ...,YES,Gastroesophageal Reflux Disease|Hiatal Hernia,DEVICE: TIF Transoral Fundoplication|OTHER: Sh...,Number of Participants With a Clinically Signi...,Normalization of Esophageal Acid Exposure - as...,,EndoGastric Solutions,,ALL,"ADULT, OLDER_ADULT",,129.0,INDUSTRY,INTERVENTIONAL,Allocation: RANDOMIZED|Intervention Model: PAR...,D01010,2011-04,2015-04,2018-03,2010-06-04,2021-12-03,2021-12-03,"Cedars Sinai Medical Center, Los Angeles, Cali...",
3,NCT01560780,Prasugrel for Prevention of Early Saphenous Ve...,https://clinicaltrials.gov/study/NCT01560780,,COMPLETED,This is a randomized-controlled clinical trial...,YES,Coronary Artery Bypass,DRUG: Prasugrel|DRUG: Placebo,Prevalence of Intragraft Thrombus at 12-month ...,Number of Patients With Severe Bleeding Using ...,,VA Office of Research and Development,,ALL,"ADULT, OLDER_ADULT",PHASE3,84.0,FED,INTERVENTIONAL,Allocation: RANDOMIZED|Intervention Model: PAR...,CLIN-007-11F,2013-02-01,2018-04-30,2018-05-31,2012-03-22,2019-10-09,2019-10-30,"San Francisco VA Medical Center, San Francisco...","Study Protocol and Statistical Analysis Plan, ..."
4,NCT03447080,Co-ingestion of Rice Bran Soymilk or Plain Soy...,https://clinicaltrials.gov/study/NCT03447080,,COMPLETED,bread and that rice-bran soymilk will have an ...,NO,"Diabetes Mellitus, Type 2",OTHER: Control|OTHER: Control|OTHER: Ricebran ...,Change in postprandial blood glucose over 180 ...,Change in postprandial plasma insulin over 180...,,Singapore Institute of Food and Biotechnology ...,,MALE,ADULT,,17.0,OTHER_GOV,INTERVENTIONAL,Allocation: RANDOMIZED|Intervention Model: CRO...,2017/00286,2017-06-02,2019-01-18,2019-01-18,2018-02-27,,2019-05-13,"Clinical Nutrition Reseach Centre, Singapore, ...",


### Duplicated Studies

- nct_id always unique --> given from Site page

In [164]:
df = df.merge(my_studies[['nct_id', 'official_title']], on = 'nct_id', how = 'left')

# Shape before
display(df.shape)  
df.drop_duplicates()  
display(df.shape)  # Shape after --> Same --> No duplicate rows in all columns

# Non Unique Study Title or IDs (But not in all columns) --> # Some Studies have Same Study Title but Different nct_id
display(df.describe(include = ['object'])[["nct_id", "Study Title", "official_title", "Acronym", "Other IDs"]])
display(df[df.duplicated(subset = ["Study Title", "Other IDs"], keep = False)].sort_values(by = "Study Title"))  # Some even have Same Other IDs which is given by Sponsors, Study itself etc.



(177601, 31)

(177601, 31)

Unnamed: 0,nct_id,Study Title,official_title,Acronym,Other IDs
count,177601,177601,175206,48070,177585
unique,177601,177257,174270,40297,175579
top,NCT04385680,Sun Protection Factor Assay,Sun Protection Factor (SPF) Assay: UVA Protect...,RCT,1
freq,1,14,23,70,90


Unnamed: 0,nct_id,Study Title,Study URL,Acronym,Study Status,Brief Summary,Study Results,Conditions,Interventions,Primary Outcome Measures,Secondary Outcome Measures,Other Outcome Measures,Sponsor,Collaborators,Sex,Age,Phases,Enrollment,Funder Type,Study Type,Study Design,Other IDs,Start Date,Primary Completion Date,Completion Date,First Posted,Results First Posted,Last Update Posted,Locations,Study Documents,official_title
94774,NCT02692599,Safety and Immunogenicity Study of Live Attenu...,https://clinicaltrials.gov/study/NCT02692599,,COMPLETED,The purpose of this study is to evaluate the i...,NO,Mumps,BIOLOGICAL: investigational live attenuated mu...,The seroconversion rates (SCRs) of susceptible...,The incidences of adverse events (AEs) of each...,,"Sinovac (Dalian) Vaccine Technology Co., Ltd.",,ALL,CHILD,PHASE3,1150.0,INDUSTRY,INTERVENTIONAL,Allocation: RANDOMIZED|Intervention Model: PAR...,PRO-MUMPS-3001,2016-01,2016-03,2016-07-15,2016-02-26,,2017-10-26,Dingxing County Center for Disease Control and...,,"A Blind, Randomized and Controlled Clinical Tr..."
170597,NCT05065177,Safety and Immunogenicity Study of Live Attenu...,https://clinicaltrials.gov/study/NCT05065177,,COMPLETED,The purpose of this study is to evaluate the i...,NO,Mumps,BIOLOGICAL: Investigational live attenuated mu...,The seroconversion rates (SCRs) of susceptible...,The incidences of adverse events (AEs) of each...,,"Sinovac Research and Development Co., Ltd.",,ALL,CHILD,PHASE3,1140.0,INDUSTRY,INTERVENTIONAL,Allocation: RANDOMIZED|Intervention Model: PAR...,PRO-MUMPS-3001,2016-01,2016-03,2016-04,2021-10-01,,2021-10-01,Dingxing County Center for Disease Control and...,,"A Blind, Randomized and Controlled Clinical Tr..."
2866,NCT01602874,Study Evaluating Tigecycline Versus Ceftriaxon...,https://clinicaltrials.gov/study/NCT01602874,,WITHDRAWN,The main purpose of this study is to compare t...,NO,Community Acquired Bacterial Pneumonia|Complic...,DRUG: Tigecycline|DRUG: Tigecycline|DRUG: cIAI...,"Clinical efficacy response (cure, failure, or ...",Clinical response at the IV last day of therap...,,Pfizer,,ALL,CHILD,PHASE3,0.0,INDUSTRY,INTERVENTIONAL,Allocation: RANDOMIZED|Intervention Model: PAR...,3074K4-3340|B1811003,2011-01,2014-05,2014-05,2012-05-21,,2013-02-22,,,"Multicenter, Randomized, And Double-Blind Stud..."
140890,NCT00914888,Study Evaluating Tigecycline Versus Ceftriaxon...,https://clinicaltrials.gov/study/NCT00914888,,WITHDRAWN,The main purpose of this study is to compare t...,NO,Community Acquired Bacterial Pneumonia|Complic...,DRUG: Tigecycline|DRUG: Tigecycline|DRUG: cIAI...,"Clinical efficacy response (cure, failure, or ...",Clinical response at the IV last day of therap...,,Wyeth is now a wholly owned subsidiary of Pfizer,,ALL,CHILD,PHASE3,0.0,INDUSTRY,INTERVENTIONAL,Allocation: RANDOMIZED|Intervention Model: PAR...,3074K4-3340|B1811003,2011-01,2014-05,2014-05,2009-06-05,,2012-06-07,,,"Multicenter, Randomized, And Double-Blind Stud..."
15703,NCT04043130,The Evaluation of Pulse: A Mobile Health App a...,https://clinicaltrials.gov/study/NCT04043130,,COMPLETED,This study used a randomized controlled design...,NO,Unprotected Sex|Contraceptive Usage,BEHAVIORAL: Pulse,"Unprotected sex, no contraceptive, Ever having...",Reproductive and sexual health care utilizatio...,,Child Trends,Healthy Teen Network|Ewald and Wasserman|MetaM...,FEMALE,ADULT,,2317.0,OTHER,INTERVENTIONAL,Allocation: RANDOMIZED|Intervention Model: PAR...,TP2AH000038,2016-11,2019-11,2020-06,2019-08-02,,2020-06-16,"Healthy Teen Network, Baltimore, Maryland, 212...",,The Evaluation of Pulse: A Mobile Health App a...
157402,NCT03253783,The Evaluation of Pulse: A Mobile Health App a...,https://clinicaltrials.gov/study/NCT03253783,,COMPLETED,This study uses a randomized controlled design...,NO,Unprotected Sex|Contraceptive Usage|Reproducti...,BEHAVIORAL: Pulse,"Unprotected sex, no contraceptive, Ever having...",Reproductive and sexual health care utilizatio...,,Healthy Teen Network,Child Trends|Ewald and Wasserman|Meta Media,FEMALE,ADULT,,1304.0,OTHER,INTERVENTIONAL,Allocation: |Intervention Model: |Masking: NON...,TP2AH000038,2016-11,2017-12,2018-01,2017-08-18,,2018-01-23,"Healthy Teen Network, Baltimore, Maryland, 212...",,The Evaluation of Pulse: A Mobile Health App a...
11512,NCT04761731,To Evaluate the Efficacy and Safety of ADVAGRA...,https://clinicaltrials.gov/study/NCT04761731,Assign,COMPLETED,This study's objective is to evaluate the inci...,NO,Liver Transplantation,DRUG: ADVAGRAF®,Incidence rate of biopsy confirmed acute rejec...,"Severity of biopsy confirmed acute rejection, ...",,Linical Korea,"Astellas Pharma Korea, Inc.|National Cancer Ce...",ALL,"ADULT, OLDER_ADULT",PHASE4,31.0,INDUSTRY,INTERVENTIONAL,Allocation: NA|Intervention Model: SINGLE_GROU...,Assign,2015-07,2017-12-14,2017-12-14,2021-02-21,,2021-02-21,,,"A Single Center, Single Arm, Open-label Study ..."
20192,NCT03423225,To Evaluate the Efficacy and Safety of ADVAGRA...,https://clinicaltrials.gov/study/NCT03423225,,COMPLETED,This study's objective is to evaluate the inci...,NO,Liver Transplantation,DRUG: ADVAGRAF®,Incidence rate of acute rejection reaction con...,Severity of acute rejection reaction confirmed...,,"Kim, Seoung-Hoon","Astellas Pharma Korea, Inc.|Linical Korea",ALL,"ADULT, OLDER_ADULT",PHASE4,31.0,OTHER_GOV,INTERVENTIONAL,Allocation: NA|Intervention Model: SINGLE_GROU...,Assign,2016-03-22,2017-07-18,2017-12-14,2018-02-06,,2018-02-06,,,"A Single Center, Single Arm, Open-label Study ..."


##  Drop Rows

In [165]:
df = df.drop(df[~df["nct_id"].isin(my_studies["nct_id"])].index)  # Leipoun 15 ereunes apo th bash --> sxetikes me Devide + Diagnostics + etc. = display(177601-177586)
display(df.shape)

(177586, 31)

# 3) Columns Format Dtypes

## Study Status
Information Source: https://clinicaltrials.gov/policy/protocol-definitions#study-status
- Withdrawn Studies = Permenantely Stopped as Noted in Data Source. Withdrawn is Replaced as Terminated. 

In [166]:
display(df['Study Status'].value_counts(dropna=False))  # No null values

df["Study Status"] = df["Study Status"].replace("WITHDRAWN", "TERMINATED") # Replace "WITHDRAWN" with "TERMINATED" : withdrawn studies are permanently stopped based on Source
df["Study Status"] = df["Study Status"].astype("category")
display(df['Study Status'].value_counts(dropna = False))  # No null values
display(df.shape)

Study Status
COMPLETED     151707
TERMINATED     17478
WITHDRAWN       8401
Name: count, dtype: int64

Study Status
COMPLETED     151707
TERMINATED     25879
Name: count, dtype: int64

(177586, 31)

## Age_List

In [167]:
display(df["Age"].value_counts(dropna=False))  
 
df["Age"] = df["Age"].str.replace(', ', ',').str.split(",")   # No apply sort(list(x)) needed here - No duplicated elements with diff list-elem order

display(Counter([elem for row_list in df["Age"].dropna() for elem in row_list]))  
# display(set(elem for row_list in df["Age"].dropna() for elem in row_list)) # Unique values in a list type element column
df = df.rename(columns = {"Age": "Age_List"})

display(df["Age_List"].value_counts(dropna=False))  
display(df.shape)

Age
ADULT, OLDER_ADULT           113474
ADULT                         32525
CHILD                         11733
CHILD, ADULT, OLDER_ADULT     10558
CHILD, ADULT                   6475
OLDER_ADULT                    2821
Name: count, dtype: int64

Counter({'ADULT': 163032, 'OLDER_ADULT': 126853, 'CHILD': 28766})

Age_List
[ADULT, OLDER_ADULT]           113474
[ADULT]                         32525
[CHILD]                         11733
[CHILD, ADULT, OLDER_ADULT]     10558
[CHILD, ADULT]                   6475
[OLDER_ADULT]                    2821
Name: count, dtype: int64

(177586, 31)

## Sex

In [168]:
display(df["Sex"].value_counts(dropna=False))  

# Fillna from my_eligibilities df
df = df.merge(my_eligibilities[["nct_id", "gender"]], on = "nct_id", how = "left")
df['Sex'] = df['Sex'].fillna(df['gender'])
df = df.drop(columns=['gender'])
df["Sex"].isnull().sum()  # Just 1 row/point filled

# Fillna from Brief Summary
df.loc[ df["Sex"].isnull() & df["Brief Summary"].str.contains(r"(?i)\bmale\b|\bmen\b", na=False) & \
                 df["Brief Summary"].str.contains(r"(?i)\bfemale\b|\bwomen\b", na=False), "Sex"] = "ALL"

df.loc[ df["Sex"].isnull() & ~ df["Brief Summary"].str.contains(r"(?i)\bmale\b|\bmen\b", na=False) & \
                 df["Brief Summary"].str.contains(r"(?i)\bfemale\b|\bwomen\b", na=False), "Sex"] = "FEMALE"

df.loc[ df["Sex"].isnull() & df["Brief Summary"].str.contains(r"(?i)\bmale\b|\bmen\b", na=False) & \
                ~ df["Brief Summary"].str.contains(r"(?i)\bfemale\b|\bwomen\b", na=False), "Sex"] = "MALE"

display(df["Sex"].value_counts(dropna=False))
display(df.shape)

Sex
ALL       151315
FEMALE     17076
MALE        9100
NaN           95
Name: count, dtype: int64

Sex
ALL       151321
FEMALE     17080
MALE        9100
None          85
Name: count, dtype: int64

(177586, 31)

## Funder

### Funder_Type

PRS Source: https://register.clinicaltrials.gov/prs/html/account-organizations.html#page-top
Companies Source : https://companiesmarketcap.com/pharmaceuticals/largest-pharmaceutical-companies-by-market-cap/
1. U.S. National Institutes of Health
2. Other U.S. Federal agencies (for example, Food and Drug Administration, Centers for Disease Control and Prevention, or U.S. Department of Veterans Affairs)
3. Industry (for example: pharmaceutical and device companies)
4. All others (including individuals, universities, and community-based organizations)

In [169]:
df = df.rename(columns={"Funder Type": "Funder_Type"})
display(df["Funder_Type"].value_counts(dropna=False))  
display(df.shape)

Funder_Type
OTHER        120444
INDUSTRY      49624
OTHER_GOV      3443
FED            1628
NIH            1391
NETWORK         871
INDIV           170
UNKNOWN          13
AMBIG             2
Name: count, dtype: int64

(177586, 31)

### Sponsor_Collab_List

In [170]:
pharma = pd.read_csv(r"C:\Users\Eugenia\OneDrive\Documents\THESIS\Pharmaceuticals.csv")
pharma = pharma.rename(columns = {pharma.columns[0]: 'Pharma'})
pharma["Pharma"] = pharma["Pharma"].apply(lambda x: x.rstrip("\xa0"))
pharma = pharma.drop_duplicates(subset=["Pharma"]) 
pharma = pharma["Pharma"]

pattern = '|'.join(pharma.unique()).lower()  # Create a regex pattern from the pharma list
display(pattern)
pharma


"180 life sciences|2seventy bio|3m|4d molecular therapeutics|89bio|a. nattermann|aadi bioscience|aah|aardvark therapeutics|aarti drugs|abbott india|abbott|abbvie|abcellera|abeona therapeutics|abiomed|abivax|abraxis bioscience|absci|ac immune|acadia pharmaceuticals|acadia|acceleron|acelyrin|acg group|achieve life sciences|aci limited|aclaris therapeutics|acme laboratories|acorda|acrivon therapeutics|act|actavis|actelion|actinium pharmaceuticals|acumen pharmaceuticals|acura|adagene|adaptimmune therapeutics|adaptive biotechnologies|adc therapeutics|adcock ingram|addex therapeutics|adial pharmaceuticals|adicet bio|adlai nortye|adma biologics|advanced accelerator applications|advanz|advaxis|adverum biotechnologies|aeon biopharma|aerie|aerovate therapeutics|affimed|aft pharmaceuticals|agenus|agios pharmaceuticals|agios|aileron therapeutics|aim immunotech|ajanta|akbarieh|akebia therapeutics|akero therapeutics|akeso|akorn|alaunos therapeutics|alcon|aldeyra therapeutics|alector|alembic pharmace

0               180 Life Sciences
1                    2seventy bio
2                              3M
3       4D Molecular Therapeutics
4                           89bio
                  ...            
1437                     Zura Bio
1438           Zydus Lifesciences
1439                        Zydus
1440                    Zymeworks
1442                 ZymoGenetics
Name: Pharma, Length: 1389, dtype: object

In [171]:
# Sponsor
display(df["Sponsor"].isnull().sum()) 

df.loc[df["Sponsor"].str.lower().str.contains(pattern, case = False, na=False) == True, "Sponsor_Type"] = "Pharmaceutical"
df.loc[df["Sponsor"].str.lower().str.contains("pharm|farma", case = False, na = False) == True, "Sponsor_Type"] = "Pharmaceutical"
df.loc[df["Sponsor"].str.lower().str.contains("hosp|hopital|clinic", case = False, na = False) == True, "Sponsor_Type"] = "Hospital"
df.loc[df["Sponsor"].str.lower().str.contains("univer|college|school", case = False, na = False) == True, "Sponsor_Type"] = "University"
df.loc[(df["Sponsor_Type"].isnull()), 'Sponsor_Type'] = "Other" 

display(df["Sponsor_Type"].value_counts(dropna=False))
display(df.shape)

0

  df.loc[df["Sponsor"].str.lower().str.contains(pattern, case = False, na=False) == True, "Sponsor_Type"] = "Pharmaceutical"


Sponsor_Type
University        77065
Other             46697
Pharmaceutical    39629
Hospital          14195
Name: count, dtype: int64

(177586, 32)

In [172]:
# Collaborators
display(df["Collaborators"].isnull().sum()) 

df.loc[df["Collaborators"].str.lower().str.contains(pattern, case = False, na=False) == True, "Collaborator_Type"] = "Pharmaceutical"
df.loc[df["Collaborators"].str.lower().str.contains("pharm|farma", case = False, na = False) == True, "Collaborator_Type"] = "Pharmaceutical"
df.loc[df["Collaborators"].str.lower().str.contains("hosp|hopital|clinic", case = False, na = False) == True, "Collaborator_Type"] = "Hospital"
df.loc[df["Collaborators"].str.lower().str.contains("univer|college|school", case = False, na = False) == True, "Collaborator_Type"] = "University"

df.loc[(df["Collaborator_Type"].isnull()) & df["Collaborators"].notnull() , 'Collaborator_Type'] = "Other" 

display(df["Collaborator_Type"].value_counts(dropna=False))
display(df.shape)

119854

  df.loc[df["Collaborators"].str.lower().str.contains(pattern, case = False, na=False) == True, "Collaborator_Type"] = "Pharmaceutical"


Collaborator_Type
NaN               119854
Other              22160
University         16578
Pharmaceutical     15057
Hospital            3937
Name: count, dtype: int64

(177586, 33)

In [173]:
df["Sponsor_Collab_List"] = df[['Sponsor_Type', 'Collaborator_Type']].apply(lambda row: [val for val in row if pd.notna(val) and val != "None"],axis=1)
df["Sponsor_Collab_List"] = df["Sponsor_Collab_List"].apply(lambda x: sorted(set(x)))

display(df["Sponsor_Collab_List"].value_counts(dropna = False))
display(df.shape)

Sponsor_Collab_List
[University]                    58037
[Other]                         35746
[Pharmaceutical]                35342
[Other, University]             16378
[Hospital]                      10097
[Pharmaceutical, University]     7224
[Other, Pharmaceutical]          7102
[Hospital, University]           3297
[Hospital, Other]                2893
[Hospital, Pharmaceutical]       1470
Name: count, dtype: int64

(177586, 34)

## Study_Documents

Source of unique values: https://clinicaltrials.gov/policy/protocol-definitions#a1-upload

These include : 
- ICF (Informed Consent Form)
- SAP (Statistical Analysis Plan)
- CSR (Clinical Study Report)  --> did not use because its table has less information for documents
- STUDY_PROTOCOL
- ANALYTIC_CODE --> did not use because its table has less information for documents

### Study_Documents_List

In [174]:
df = df.merge(my_documents[["nct_id", "Study_Documents"]], on = "nct_id", how = "left")
df["Study_Documents"] = df["Study_Documents"].apply(lambda x: ['None'] if (isinstance(x, list) and len(x) == 0) else x)
display(Counter([elem for row_list in df["Study_Documents"].dropna() for elem in row_list]))

df = df.rename(columns = {"Study_Documents": "Study_Documents_List"})


Counter({'None': 145696,
         'Protocol': 30401,
         'Analysis Plan': 29512,
         'Consent Form': 7460})

### Document_Counts

In [None]:
# " Document_Counts" Column
df["Document_Counts"] = df["Study_Documents_List"].apply(lambda x: len(x) if x != ["None"] else 0)
display(df["Document_Counts"].value_counts(dropna=False))
display(df.shape)

Document_Counts
0    145696
2     23627
3      5928
1      2335
Name: count, dtype: int64

(177586, 36)

In [176]:
df["Study_Documents_List"] = df["Study_Documents_List"].apply(lambda x: ["None"] if x == [] else x)  # So not to be dropped
display(df["Study_Documents_List"].value_counts(dropna=False))
display(df.shape)

Study_Documents_List
[None]                                     145696
[Analysis Plan, Protocol]                   23269
[Consent Form, Analysis Plan, Protocol]      5928
[Consent Form]                               1174
[Protocol]                                    872
[Consent Form, Protocol]                      332
[Analysis Plan]                               289
[Consent Form, Analysis Plan]                  26
Name: count, dtype: int64

(177586, 36)

### Document_Counts_Bin


In [177]:
df["Document_Counts_Bin"] = "No"
df.loc[df["Document_Counts"] != 0, "Document_Counts_Bin"] = "Yes"
display(df["Document_Counts_Bin"].value_counts(dropna=False))
display(df.shape)

Document_Counts_Bin
No     145696
Yes     31890
Name: count, dtype: int64

(177586, 37)

## Datetime

- Source Date input : Filter dates have the format of mm/dd/yyyy. Dates Displayed in Outputs (after filtering choices menu) have the Format of yyyyy-mm-dd (or at least yyyy-mm).
- All Datetimes converted to same format : Year-Month is Chosen as Many Dates Include only until Month.


In [178]:
display(df[["Start Date","Completion Date"]].map(len , na_action = 'ignore').nunique()) # Dates are in yyyy-mm-dd (len=10) and yyyy-mm (len=7) format
df[["Start Date","Completion Date"]].head(2)

Start Date         2
Completion Date    2
dtype: int64

Unnamed: 0,Start Date,Completion Date
0,2020-05-15,2022-08-30
1,2021-08-31,2023-09-28


In [179]:
df[["Start Date","Completion Date"]] = df[["Start Date","Completion Date"]].apply(pd.to_datetime, format='%Y-%m', exact=False, errors='coerce')
df[["Start Date","Completion Date"]] = df[["Start Date","Completion Date"]].apply(lambda col: col.dt.to_period('M'))

display(df[["Start Date","Completion Date"]].head(2))


Unnamed: 0,Start Date,Completion Date
0,2020-05,2022-08
1,2021-08,2023-09


### Datetime_Year

In [180]:
df[["Start_Date_Year","Completion_Date_Year"]] = df[["Start Date","Completion Date"]].apply(lambda x: x.dt.year.mask(x.isna(), np.NaN))  

df[["Start_Date_Year","Completion_Date_Year"]] = df[["Start_Date_Year","Completion_Date_Year"]].apply(lambda x: x.convert_dtypes(convert_integer = True))

display(df[["Start_Date_Year","Completion_Date_Year"]].head(2))


Unnamed: 0,Start_Date_Year,Completion_Date_Year
0,2020,2022
1,2021,2023


### Date_Year_Categ

In [181]:
df[["Start_Date_Year_Categ","Completion_Date_Year_Categ"]] = df[["Start_Date_Year","Completion_Date_Year"]]\
        .apply(lambda col: pd.cut(col, bins = [col.min()-1, 2019, col.max()], ordered = True, include_lowest = False))
display(df.shape)
df[["Start_Date_Year_Categ","Completion_Date_Year_Categ"]].head(3)



(177586, 41)

Unnamed: 0,Start_Date_Year_Categ,Completion_Date_Year_Categ
0,"(2019, 2024]","(2019, 2024]"
1,"(2019, 2024]","(2019, 2024]"
2,"(2010, 2019]","(2010, 2019]"


### Datetime_Gaps

In [182]:
df["Completion_Gap"] = df["Completion Date"] - df["Start Date"]

df[["Completion_Gap"]] = df[["Completion_Gap"]].apply(lambda col: col.astype('str').str.strip('< *MonthEnds>').replace("NaT", np.NaN).replace("", '0').astype('float'))
df[["Completion_Gap",]] = df[["Completion_Gap"]].convert_dtypes(convert_integer = True)

display(df[["Completion_Gap"]].head())
df[["Completion_Gap"]].describe()

Unnamed: 0,Completion_Gap
0,27
1,25
2,83
3,63
4,19


Unnamed: 0,Completion_Gap
count,177586.0
mean,24.605977
std,21.612307
min,0.0
25%,9.0
50%,19.0
75%,35.0
max,163.0


### Datetime_Year Categorize


In [183]:
# df[["Start_Date_Year", "Completion_Date_Year"]] = df[["Start_Date_Year", "Completion_Date_Year"]].apply(lambda col: pd.Categorical(col, ordered = True))
df["Start_Date_Year"].head(2)


0    2020
1    2021
Name: Start_Date_Year, dtype: Int64

In [184]:
df[["Start_Date_Year", "Completion_Date_Year"]].isnull().sum()

Start_Date_Year         0
Completion_Date_Year    0
dtype: int64

## Interventions

- Source of unique values: https://clinicaltrials.gov/policy/protocol-definitions#arms-groups-interventions

### Intervention_Method_List

In [185]:
with open("intervention_methods.pkl", "rb") as f:
    data = pickle.load(f)

oral1 = data["oral1"]
oral2 = data["oral2"]
injection1 = data["injection1"]
injection2 = data["injection2"]
topical1 = data["topical1"]


In [186]:
# Injection
df.loc[df['Brief Summary'].str.lower().str.contains(injection1, case = False, na = False) == True, 'Injection'] = "Injection"
df.loc[df['Brief Summary'].str.lower().str.lower().str.contains(injection2, case = True, na = False) == True, 'Injection'] = "Injection"

# Oral
df.loc[df['Brief Summary'].str.lower().str.contains(oral1, case = False, na = False) == True, 'Oral'] = "Oral"
df.loc[df['Brief Summary'].str.lower().str.lower().str.contains(oral2, case = True, na = False) == True, 'Oral'] = "Oral"

# Topical
df.loc[df['Brief Summary'].str.lower().str.contains(topical1, case = False, na = False) == True, 'Topical'] = "Topical"


In [187]:
df = df.merge(my_intervention_methods[['nct_id', 'Interv_method']], on = "nct_id", how = "left")
df = df.rename({"Interv_method" : "Intervention_Method_List"}, axis = 1)

display(df['Intervention_Method_List'].value_counts(dropna = False))
display(df.shape)
df["Intervention_Method_List"].head(2)

Intervention_Method_List
NaN                           97977
[Injection]                   33013
[Oral]                        21849
[Topical]                     11641
[Injection, Oral]              5394
[Injection, Topical]           4800
[Oral, Topical]                1945
[Injection, Oral, Topical]      967
Name: count, dtype: int64

(177586, 46)

0      [Topical]
1    [Injection]
Name: Intervention_Method_List, dtype: object

In [188]:
def interv(row):    
    methods = []
    for col in ['Injection', 'Oral', 'Topical']:
        if pd.notna(row[col]) and row[col] not in methods:
            methods.append(row[col])
    return methods

df['Intervention_Method_List'] = df.apply(interv, axis=1)
df['Intervention_Method_List'] = df['Intervention_Method_List'].apply(lambda x: sorted(set(x)))
df['Intervention_Method_List'] = df['Intervention_Method_List'].apply(lambda x: ["Unknown"] if isinstance(x, list) and len(x) == 0 else x)

display(df['Intervention_Method_List'].value_counts(dropna=False))
display(df.shape)

Intervention_Method_List
[Unknown]                     91906
[Injection]                   42399
[Topical]                     13449
[Oral]                        11004
[Injection, Topical]           9791
[Injection, Oral]              5675
[Injection, Oral, Topical]     1698
[Oral, Topical]                1664
Name: count, dtype: int64

(177586, 46)

In [189]:
df = df.drop(columns = ['Injection', 'Oral', 'Topical'] , axis = 1)

### Intervention_Type_List

In [190]:
df = df.merge(my_interventions_types[["nct_id", "intervention_type"]], on = "nct_id", how = "left")
df = df.rename({"intervention_type" : "Intervention_Type_List"}, axis = 1)
display(df["Intervention_Type_List"].value_counts(dropna=False))
display(set(elem for row_list in df["Intervention_Type_List"] for elem in row_list))
display(df["Intervention_Type_List"].isnull().sum())  # No Null Values
display(df.shape)

Intervention_Type_List
[DRUG]                                              59601
[OTHER]                                             27134
[BEHAVIORAL]                                        20997
[DEVICE]                                            20567
[PROCEDURE]                                         10267
                                                    ...  
[DIAGNOSTIC_TEST, DIETARY_SUPPLEMENT, PROCEDURE]        1
[BEHAVIORAL, DRUG, OTHER, RADIATION]                    1
[DEVICE, GENETIC, RADIATION]                            1
[BEHAVIORAL, DRUG, GENETIC]                             1
[BIOLOGICAL, DEVICE, OTHER, RADIATION]                  1
Name: count, Length: 248, dtype: int64

{'BEHAVIORAL',
 'BIOLOGICAL',
 'COMBINATION_PRODUCT',
 'DEVICE',
 'DIAGNOSTIC_TEST',
 'DIETARY_SUPPLEMENT',
 'DRUG',
 'GENETIC',
 'OTHER',
 'PROCEDURE',
 'RADIATION'}

0

(177586, 44)

### Intevention_Counts

In [None]:
# Intevention_Counts" Column
df["Intervention_Counts"] = df["Intervention_Type_List"].apply(lambda x: len(x)) # no null in Intervention_Type_List
display(df["Intervention_Counts"].value_counts(dropna=False))  
display(df["Intervention_Counts"].isnull().sum())
display(df.shape)

Intervention_Counts
1    154269
2     21012
3      2008
4       255
5        41
6         1
Name: count, dtype: int64

0

(177586, 45)

## Placebo_Bin

In [192]:
my_placebo2 = my_placebo[my_placebo["group_type"] == "PLACEBO_COMPARATOR"]
df = df.merge(my_placebo2[["nct_id", "group_type"]], on = "nct_id", how = "left")
df = df.rename(columns={"group_type" : "Placebo_Bin"})
display(df["Placebo_Bin"].value_counts(dropna = False))

df["Placebo_Bin"] = df["Placebo_Bin"].replace({np.NaN : "No", "PLACEBO_COMPARATOR" : "Yes"})
display(df["Placebo_Bin"].value_counts(dropna = False)) 
display(df.shape)


Placebo_Bin
NaN                   144416
PLACEBO_COMPARATOR     33170
Name: count, dtype: int64

Placebo_Bin
No     144416
Yes     33170
Name: count, dtype: int64

(177586, 46)

## Standard_Care_Bin

In [193]:
df = df.merge(my_soc[["nct_id", "group_type"]], on = "nct_id", how = "left")
df = df.rename(columns={"group_type" : "Standard_Care_Bin"})
display(df["Standard_Care_Bin"].value_counts(dropna=False))

df["Standard_Care_Bin"] = df["Standard_Care_Bin"].replace({np.NaN : "No", "ACTIVE_COMPARATOR" : "Yes"})
display(df["Standard_Care_Bin"].value_counts(dropna=False))
display(df.shape)


Standard_Care_Bin
NaN                  170012
ACTIVE_COMPARATOR      7574
Name: count, dtype: int64

Standard_Care_Bin
No     170012
Yes      7574
Name: count, dtype: int64

(177586, 47)

## Healthy_Bin

In [194]:
df = df.merge(my_eligibilities[["nct_id", "healthy_volunteers"]], on = "nct_id", how = "left")
df = df.rename(columns={"healthy_volunteers" : "Healthy_Bin"})
df['Healthy_Bin'] = df['Healthy_Bin'].replace({'Condition' : "No", "Healthy" : "Yes"})

display(df["Healthy_Bin"].value_counts(dropna = False))
display(df.shape)

Healthy_Bin
No     118750
Yes     58770
NaN        66
Name: count, dtype: int64

(177586, 48)

## Arm_Counts

In [195]:
df = df.merge(my_studies[["nct_id", "number_of_arms"]], on = "nct_id", how = "left")
df = df.rename(columns = {"number_of_arms" : "Arm_Counts"})
display(df['Arm_Counts'].value_counts(dropna = False))
display(df.shape)

Arm_Counts
2.0     98050
1.0     40766
3.0     20491
4.0      9911
5.0      2529
6.0      2254
NaN       910
8.0       682
7.0       635
9.0       348
10.0      269
12.0      174
11.0      127
13.0       80
16.0       77
14.0       74
15.0       50
18.0       43
17.0       21
20.0       15
32.0       13
19.0       12
24.0       11
22.0        9
21.0        7
23.0        6
25.0        4
27.0        4
30.0        3
26.0        3
40.0        1
37.0        1
29.0        1
43.0        1
44.0        1
31.0        1
34.0        1
28.0        1
Name: count, dtype: int64

(177586, 49)

## Conditions
- Source MEsH Ontologies and Drugs : https://meshb.nlm.nih.gov/treeView  

### Covid_19_Bin

In [196]:
df = df.merge(my_covid[["nct_id", "mesh_term"]], on = "nct_id", how = "left")
df = df.rename(columns={"mesh_term" : "Covid_19_Bin"})
display(df["Covid_19_Bin"].value_counts(dropna=False))
df["Covid_19_Bin"] = df["Covid_19_Bin"].replace({np.NaN : "No", "COVID-19": "Yes"})
display(df["Covid_19_Bin"].value_counts(dropna=False))
display(df.shape)

Covid_19_Bin
NaN         174786
COVID-19      2800
Name: count, dtype: int64

Covid_19_Bin
No     174786
Yes      2800
Name: count, dtype: int64

(177586, 50)

### Conditions_List

In [198]:
df = df.merge(my_conditions[["nct_id", "Category"]], on = "nct_id", how = "left")
df = df.rename(columns = {"Category" : "Conditions_List"})
display(df["Conditions_List"].value_counts(dropna=False))
df["Conditions_List"] = df["Conditions_List"].apply(lambda x: ["None"] if isinstance(x, list) and len(x) == 0 else x)

display(Counter([elem for row_list in df["Conditions_List"] for elem in row_list]))
display(df.shape)

Conditions_List
[Diseases]                                                                                                  105207
[]                                                                                                           39471
[Diseases, Psychiatry, Psychology]                                                                            9740
[Psychiatry, Psychology]                                                                                      7711
[Diseases, Phenomena, Processes, Psychiatry, Psychology]                                                      7489
[Diagnostic, Equipment, Diseases, Phenomena, Processes]                                                       3820
[Diseases, Phenomena, Processes]                                                                              2330
[Diseases, Health Care]                                                                                        374
[Anthropology, Sociology, Psychiatry, Psychology]               

Counter({'Diseases': 130078,
         'None': 39471,
         'Psychiatry, Psychology': 25798,
         'Phenomena, Processes': 14366,
         'Diagnostic, Equipment': 4332,
         'Health Care': 607,
         'Anthropology, Sociology': 412,
         'Anatomy': 310,
         'Chemicals, Drugs': 3})

(177586, 51)

In [199]:
df.loc[df['Conditions_List'].apply(lambda x: x== ['None']) & df['Intervention_Type_List'].apply(lambda x: x!= ['DRUG'])][['Conditions', 'Intervention_Type_List', 'Healthy_Bin']]

Unnamed: 0,Conditions,Intervention_Type_List,Healthy_Bin
20,Specific Positive Memories|Specific Positive F...,[BEHAVIORAL],No
22,Granulocyte/ Polymorphonuclear Cells,[DEVICE],Yes
30,Study is Open to Seniors Age 50 or Older,[BEHAVIORAL],Yes
32,COPD,"[BEHAVIORAL, DEVICE]",No
34,Malaria|Plasmodium Falciparum,[BIOLOGICAL],Yes
...,...,...,...
177554,Intravenous Catheterization,[OTHER],Yes
177558,Healthy,[DEVICE],Yes
177571,Healthy Volunteers,[OTHER],Yes
177573,Cancer|Health Literacy|Health Insurance|Health...,[BEHAVIORAL],Yes


### Comorbidity_Counts

In [200]:
df = df.merge(my_conditions[["nct_id", "Comorbidity"]], on = "nct_id", how = "left")
df = df.rename(columns = {"Comorbidity" : "Comorbidity_Counts"})

display(df["Comorbidity_Counts"].value_counts(dropna=False))
display(df.shape)

Comorbidity_Counts
2     42791
0     39471
1     38826
3     27620
4     16568
5      8283
6      2486
7      1028
8       360
9        95
10       34
11       12
14        4
13        3
12        3
16        1
15        1
Name: count, dtype: int64

(177586, 52)

### Comorbidity_Bin


In [201]:
df['Comorbidity_Bin'] = df.loc[df["Comorbidity_Counts"] <= 1, "Comorbidity_Bin"] = 'No'
df['Comorbidity_Bin'] = df.loc[df["Comorbidity_Counts"] >= 2, "Comorbidity_Bin"] = 'Yes'

df['Comorbidity_Bin'].value_counts(dropna = False)

Comorbidity_Bin
Yes    177586
Name: count, dtype: int64

## Adverse

### Adverse_List

In [202]:
df = df.merge(my_adverse[["nct_id", "event_type"]], on = "nct_id", how = "left")
df = df.rename(columns={"event_type" : "Adverse_List"})
df["Adverse_List"] = df["Adverse_List"].apply(lambda x: ['None'] if (isinstance(x, list) and len(x) == 0) else x)

display(df["Adverse_List"].value_counts(dropna=False))
display(df.shape)

Adverse_List
[None]                     144050
[Death, Other, Serious]     12233
[Other, Serious]             6847
[Other]                      6822
[Death]                      3070
[Death, Other]               2827
[Death, Serious]             1038
[Serious]                     699
Name: count, dtype: int64

(177586, 54)

### Adverse_Counts

In [203]:
df = df.merge(my_adverse[["nct_id","event_sum"]], on = "nct_id", how = "left")
df = df.rename(columns={"event_sum" : "Adverse_Counts"}) # number of adverse noticed at the participants 
display(df["Adverse_Counts"].value_counts(dropna=False))
display(df["Adverse_Counts"].isnull().sum())
display(df.shape)

Adverse_Counts
0.0       146459
1.0         2494
3.0         1401
2.0          856
6.0          855
           ...  
846.0          1
3960.0         1
4298.0         1
1301.0         1
1144.0         1
Name: count, Length: 1542, dtype: int64

0

(177586, 55)

### Adverse_System_List

In [204]:
df = df.merge(my_adverse_system[["nct_id", "organ_system"]], on = "nct_id", how = "left")
df = df.rename(columns={"organ_system" : "Adverse_System_List"})
df["Adverse_System_List"].value_counts(dropna=False)


Adverse_System_List
NaN                                                                                                                                                                                                                                                                                              147041
[General]                                                                                                                                                                                                                                                                                           551
[Skin, Subcutaneous]                                                                                                                                                                                                                                                                                485
[Gastrointestinal]                                                                          

In [205]:

df.loc[(df["Adverse_System_List"].isnull() & df["Adverse_List"].apply(lambda x: x == ["None"])), "Adverse_System_List"] = "None"
df.loc[(df["Adverse_System_List"].isnull() & df["Adverse_List"].apply(lambda x: x == ["Death"])), "Adverse_System_List"] = "Death"  

df['Adverse_System_List'] = df['Adverse_System_List'].apply(lambda x: ['None'] if x== 'None' else x)
df['Adverse_System_List'] = df['Adverse_System_List'].apply(lambda x: ['Death'] if x== 'Death' else x)


df["Adverse_System_List"] = df["Adverse_System_List"].apply(lambda x: sorted(set(x)) if isinstance(x, str) else x)

display(df["Adverse_System_List"].value_counts(dropna=False))
display(df.shape)

Adverse_System_List
[None]                                                                                                                                                                                                                                                                                           143998
[Death]                                                                                                                                                                                                                                                                                            3043
[General]                                                                                                                                                                                                                                                                                           551
[Skin, Subcutaneous]                                                                        

(177586, 56)

### Adverse_System_Counts

In [206]:
df['Adverse_System_Counts'] = df['Adverse_System_List'].apply(lambda x: len(x) if isinstance(x, list) and x != ['None'] else 0)
display(df['Adverse_System_Counts'].value_counts(dropna = False))

Adverse_System_Counts
0     143998
1       7065
2       2336
3       1892
4       1661
5       1475
6       1416
7       1319
8       1303
9       1239
12      1165
10      1164
11      1151
17      1099
14      1074
13      1074
18      1059
16      1032
15      1027
19      1021
20       919
21       784
22       583
23       362
24       205
25       107
26        39
27        17
Name: count, dtype: int64

### Adverse_System_Counts_Log


In [207]:
df['Adverse_System_Counts_Log'] = np.log1p(df["Adverse_System_Counts"]) 
df['Adverse_System_Counts_Log'].describe()

count    177586.000000
mean          0.368930
std           0.850432
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           3.332205
Name: Adverse_System_Counts_Log, dtype: float64

## Termination

In [208]:
df = df.merge(my_terminations[["nct_id", "why_stopped"]], on = "nct_id", how = "left")
df = df.rename(columns={"why_stopped" : "Termination"})
df.loc[(df['Termination'].isnull()) & (df['Study Status'] =='TERMINATED'), 'Termination'] = np.NaN
df["Termination"] = df["Termination"].replace({np.NaN : "None"})
display(df['Termination'].value_counts(dropna=False))  
display(df["Termination"].isnull().sum())  
display(df.shape)

Termination
None              153617
Enrollment          9947
Cther               5779
Fund                4816
Administration      1799
Efficacy            1271
Supply               357
Name: count, dtype: int64

0

(177586, 59)

## Study Design

Source of Unique Values: 
https://clinicaltrials.gov/policy/protocol-definitions#study-design

Split Study Design into 4 columns --> 
1. Allocation (Randomized-Non Randomized) --> N/A (not applicable): For a single-arm trial --> https://clinicaltrials.gov/policy/protocol-definitions#study-sponsor-collaborators
2. Intervention_model (Single Grup, Crossover etc.)
3. Masking  (Blindness, None, Double etc.)
4. Primary purpose (Treatment, Screening etc.)


### Allocation 


In [209]:
df = df.merge(my_designs[["nct_id", "allocation"]], on = "nct_id", how = "left")
df = df.rename(columns={"allocation" : "Allocation"})
display(df["Allocation"].value_counts(dropna=False))
display(df.shape)

Allocation
RANDOMIZED        120394
NA                 40258
NON_RANDOMIZED     16531
None                 403
Name: count, dtype: int64

(177586, 60)

In [210]:
# Allocation Fillna
df.loc[(df["Allocation"].isnull()) & (df["Brief Summary"].str.contains(r"(?i)\snon.randomized\s|open label" , case = False , na = False) ) , "Allocation"] = "NON_RANDOMIZED"
df.loc[(df["Allocation"].isnull()) & (df["Brief Summary"].str.contains(r"(?i)\srandomized\s")) & ~(df["Brief Summary"].str.contains(r"(?i)\snon.randomized\s", case = False , na = False)) , "Allocation"] = "RANDOMIZED"

df["Allocation"] = df["Allocation"].replace({"NA" : "NOT_APPLICABLE"})
display(df["Allocation"].value_counts(dropna=False))
display(df.shape)

Allocation
RANDOMIZED        120407
NOT_APPLICABLE     40258
NON_RANDOMIZED     16549
None                 372
Name: count, dtype: int64

(177586, 60)

### Intervention_Model

In [211]:
df = df.merge(my_designs[["nct_id", "intervention_model"]], on = "nct_id", how = "left")
df = df.rename(columns={"intervention_model" : "Intervention_Model"})
display(df["Intervention_Model"].value_counts(dropna=False))
display(df.shape)

Intervention_Model
PARALLEL        107157
SINGLE_GROUP     45310
CROSSOVER        17496
SEQUENTIAL        5076
FACTORIAL         2338
None               209
Name: count, dtype: int64

(177586, 61)

In [212]:
# Fillna Intervention model
# display(df.loc[df["Intervention_Model"].isnull() & df["Brief Summary"].str.contains(r"(?i)PARALLEL|SINGLE GROUP|CROSSOVER|SEQUENTIAL|FACTORIAL"), "Brief Summary"].values[0:2])

df.loc[df["Intervention_Model"].isnull() & df["Brief Summary"].str.contains(r"PARALLEL", case = False), "Intervention_Model"] = "PARALLEL"
df.loc[df["Intervention_Model"].isnull() & df["Brief Summary"].str.contains(r"SINGLE GROUP", case = False), "Intervention_Model"] = "SINGLE_GROUP"
df.loc[df["Intervention_Model"].isnull() & df["Brief Summary"].str.contains(r"CROSSOVER", case = False), "Intervention_Model"] = "CROSSOVER"
df.loc[df["Intervention_Model"].isnull() & df["Brief Summary"].str.contains(r"SEQUENTIAL", case = False), "Intervention_Model"] = "SEQUENTIAL"
df.loc[df["Intervention_Model"].isnull() & df["Brief Summary"].str.contains(r"FACTORIAL", case = False), "Intervention_Model"] = "FACTORIAL"

display(df["Intervention_Model"].value_counts(dropna=False))
display(df.shape)

Intervention_Model
PARALLEL        107159
SINGLE_GROUP     45310
CROSSOVER        17498
SEQUENTIAL        5082
FACTORIAL         2338
None               199
Name: count, dtype: int64

(177586, 61)

### Masking

In [213]:
df = df.merge(my_designs[["nct_id", "masking"]], on = "nct_id", how = "left")
df = df.rename(columns={"masking" : "Masking"})
display(df["Masking"].value_counts(dropna=False))
display(df.shape)

Masking
NONE         93879
SINGLE       28519
DOUBLE       24101
QUADRUPLE    17726
TRIPLE       13100
None           261
Name: count, dtype: int64

(177586, 62)

In [214]:
# Fillna Masking 
df.loc[df["Masking"].isnull() & df["Brief Summary"].str.contains(r"(?i)OPEN.LABEL") , "Masking"] = "NONE"
df.loc[df["Masking"].isnull() & df["Brief Summary"].str.contains(r"(?i)SINGLE.BLIND") , "Masking"] = "SINGLE"
df.loc[df["Masking"].isnull() & df["Brief Summary"].str.contains(r"(?i)DOUBLE.BLIND") , "Masking"] = "DOUBLE"
df.loc[df["Masking"].isnull() & df["Brief Summary"].str.contains(r"(?i)TRIPLE.BLIND") , "Masking"] = "TRIPLE"
df.loc[df["Masking"].isnull() & df["Brief Summary"].str.contains(r"(?i)QUADRUPLE.BLIND") , "Masking"] = "QUADRUPLE"

display(df["Masking"].value_counts(dropna = False))
display(df.shape)

Masking
NONE         93890
SINGLE       28519
DOUBLE       24105
QUADRUPLE    17726
TRIPLE       13100
None           246
Name: count, dtype: int64

(177586, 62)

### Masking_Detail_List

In [215]:
df = df.merge(my_designs[["nct_id", "masking_detail"]], on = "nct_id", how = "left")
df = df.rename(columns={"masking_detail" : "Masking_Detail_List"})

In [216]:
df.loc[df["Masking_Detail_List"].apply(lambda x: isinstance(x, list) and len(x) == 0)]["Masking"].value_counts(dropna=False)  


Masking
NONE      93890
None        246
DOUBLE      171
SINGLE       35
Name: count, dtype: int64

In [217]:
# Fillna Masking_Detail
df.loc[df["Masking_Detail_List"].apply(lambda x: isinstance(x, list) and len(x) == 0)]["Masking"].value_counts(dropna=False)  # Masking = 'NONE' --> Masking_Details = ['NONE']
df.loc[(df["Masking_Detail_List"].apply(lambda x: isinstance(x, list) and len(x) == 0)) & (df["Masking"] == "NONE"), "Masking_Detail_List"] = "NONE"
df["Masking_Detail_List"] = df["Masking_Detail_List"].apply(lambda x: ["NONE"] if x == "NONE" else x)
df["Masking_Detail_List"] = df["Masking_Detail_List"].apply(lambda x: np.NaN if isinstance(x, list) and len(x) == 0 else x)

display(df["Masking_Detail_List"].value_counts(dropna=False))
display(df.shape)

Masking_Detail_List
[NONE]                                                           93890
[OUTCOMES_ASSESSOR, INVESTIGATOR, PARTICIPANT, CARE_PROVIDER]    17726
[PARTICIPANT, INVESTIGATOR]                                      13168
[OUTCOMES_ASSESSOR]                                              12450
[PARTICIPANT]                                                    12017
[OUTCOMES_ASSESSOR, INVESTIGATOR, PARTICIPANT]                    5776
[OUTCOMES_ASSESSOR, PARTICIPANT]                                  5618
[PARTICIPANT, INVESTIGATOR, CARE_PROVIDER]                        5238
[INVESTIGATOR]                                                    3378
[OUTCOMES_ASSESSOR, INVESTIGATOR]                                 2609
[PARTICIPANT, CARE_PROVIDER]                                      1866
[OUTCOMES_ASSESSOR, PARTICIPANT, CARE_PROVIDER]                   1591
[CARE_PROVIDER]                                                    639
[OUTCOMES_ASSESSOR, INVESTIGATOR, CARE_PROVIDER]         

(177586, 63)

### Primary Purpose

In [218]:
df = df.merge(my_designs[["nct_id", "primary_purpose"]], on = "nct_id", how = "left")
df = df.rename(columns={"primary_purpose" : "Primary_Purpose"})
df["Primary_Purpose"] = df["Primary_Purpose"].replace({np.NaN : "UNKNOWN"})
display(df["Primary_Purpose"].value_counts(dropna=False))
display(df.shape)

Primary_Purpose
TREATMENT                   103582
PREVENTION                   19900
OTHER                        13198
SUPPORTIVE_CARE              11431
BASIC_SCIENCE                11308
DIAGNOSTIC                    7819
HEALTH_SERVICES_RESEARCH      5448
UNKNOWN                       2352
SCREENING                     1760
DEVICE_FEASIBILITY             788
Name: count, dtype: int64

(177586, 64)

## Outcomes

In [219]:
df = df.drop(columns = ['Primary Outcome Measures', 'Secondary Outcome Measures','Other Outcome Measures'] , axis = 1)

### Outcomes_List

In [220]:
df = df.merge(my_outcomes[["nct_id", "outcome_type"]], on = "nct_id", how = "left")
df = df.rename(columns={"outcome_type" : "Outcomes_List"})
display(df["Outcomes_List"].value_counts(dropna=False))
df["Outcomes_List"] = df["Outcomes_List"].apply(lambda x: ['None'] if (isinstance(x, list) and len(x) == 0) else x)
display(df.shape)

Outcomes_List
[]                                                     135774
[PRIMARY, SECONDARY]                                    29859
[PRIMARY]                                                7551
[OTHER_PRE_SPECIFIED, PRIMARY, SECONDARY]                3395
[OTHER_PRE_SPECIFIED, PRIMARY]                            464
[POST_HOC, PRIMARY, SECONDARY]                            381
[OTHER_PRE_SPECIFIED, POST_HOC, PRIMARY, SECONDARY]       110
[POST_HOC, PRIMARY]                                        42
[OTHER_PRE_SPECIFIED, POST_HOC, PRIMARY]                   10
Name: count, dtype: int64

(177586, 62)

### Outcomes_Counts

In [221]:
df["Outcomes_Counts"] = df["Outcomes_List"].apply(lambda x: len(x) if x != ["None"] else 0)
display(df["Outcomes_Counts"].value_counts(dropna=False))
display(df.shape)

Outcomes_Counts
0    135774
2     30365
1      7551
3      3786
4       110
Name: count, dtype: int64

(177586, 63)

### Outcomes_Bin 


In [222]:
df["Outcomes_Bin"] = "No"
df.loc[df["Outcomes_Counts"] != 0, "Outcomes_Bin"] = "Yes"
display(df["Outcomes_Bin"].value_counts(dropna=False))

Outcomes_Bin
No     135774
Yes     41812
Name: count, dtype: int64

## Locations

In [223]:
# Countries
df = df.merge(my_locations[[ "nct_id","facilities_country"]], on = "nct_id", how = "left") # already sorted(set(x)) in df_locations
df = df.rename(columns={"facilities_country" : "Countries"}) 
df["Countries"] = df["Countries"].apply(lambda x: ['None'] if isinstance(x, list) and len(x) == 0 else x)
display(df["Countries"].value_counts(dropna=False))
display(df.shape)

Countries
[United States]                                                                                                                                                                                                  59268
[None]                                                                                                                                                                                                           11520
[China]                                                                                                                                                                                                           7839
[France]                                                                                                                                                                                                          7124
[Turkey]                                                                                                                          

(177586, 65)

### Country_Counts

In [224]:
df = df.merge(my_locations[[ "nct_id","Country_Counts"]], on = "nct_id", how = "left")
display(df["Country_Counts"].value_counts(dropna=False))
display(df.shape) 

Country_Counts
1     151043
0      11520
2       4665
3       1905
4       1403
5       1058
6        882
7        740
8        628
9        475
10       450
11       368
12       307
13       259
14       233
15       206
16       183
18       150
17       145
19       136
20        98
21        89
22        85
23        68
24        63
26        53
25        52
28        40
27        39
29        37
31        34
30        26
32        21
33        18
34        15
37        14
35        11
42         9
36         9
41         8
38         7
49         6
44         5
40         5
43         5
45         3
48         3
39         2
46         2
51         1
57         1
59         1
Name: count, dtype: int64

(177586, 66)

### Continents_List


In [225]:
Countries = pd.DataFrame(geonamescache.GeonamesCache().get_countries()).transpose().reset_index()
Countries = Countries[["name", "continentcode"]]
Countries = Countries.replace({"NA": "North America", "OC": "Oceania", "EU": "Europe", "AS": "Asia", "AF": "Africa", "SA": "South America"})
geo_dict = dict(zip( Countries['name'], Countries['continentcode']))

df['Continents_List'] = df['Countries'].apply(lambda x: [geo_dict.get(item, "None") for item in x] if isinstance(x, list) else ["None"])
df['Continents_List'] = df['Continents_List'].apply(lambda x: ["None"] if len(x) == 0 else x)
df['Continents_List'] = df['Continents_List'].apply(lambda x: sorted(set(x)))

display(df['Continents_List'].value_counts(dropna = False))
display(df.shape)

Continents_List
[North America]                                    67782
[Europe]                                           41614
[Asia]                                             32456
[None]                                             13063
[Africa]                                            7184
                                                   ...  
[Africa, Asia, None, South America]                    1
[Asia, None, South America]                            1
[Africa, None, South America]                          1
[Asia, None, North America, Oceania]                   1
[Africa, North America, Oceania, South America]        1
Name: count, Length: 114, dtype: int64

(177586, 67)

### Continent_Counts

In [226]:
df["Continent_Counts"] = df["Continents_List"].apply(lambda x: len(x) if isinstance(x, list) and x !=['None'] else 0)
display(df["Continent_Counts"].value_counts(dropna = False))
display(df.shape)

Continent_Counts
1    153885
0     13063
2      4869
3      2506
4      1534
5       896
6       599
7       234
Name: count, dtype: int64

(177586, 68)

### City_Counts

In [227]:
df = df.merge(my_locations[[ "nct_id","City_Counts"]], on = "nct_id", how = "left") # Cities already sorted(set(x)) in df_locations --> correct counts
display(df["City_Counts"].value_counts(dropna=False))
display(df.shape) 

City_Counts
1      124904
0       11520
2        9011
3        4379
4        3085
        ...  
313         1
242         1
306         1
232         1
342         1
Name: count, Length: 398, dtype: int64

(177586, 69)

## Enrollment


### Enrollment


In [228]:
display(df["Enrollment"].value_counts(dropna=False)) # useful for qcut
display(df["Enrollment"].isnull().sum()) # Null values
display(df["Enrollment"].head(2)) # useful for qcut
display(df.shape)

Enrollment
0.0        8396
60.0       5056
30.0       4996
20.0       4781
40.0       4606
           ... 
4485.0        1
2164.0        1
27395.0       1
5219.0        1
2367.0        1
Name: count, Length: 3907, dtype: int64

54

0    840.0
1    330.0
Name: Enrollment, dtype: float64

(177586, 69)

### Enrollment_Log


In [229]:
df['Enrollment_Log'] = np.log1p(df['Enrollment'])
df['Enrollment_Log'].describe()  

count    177532.000000
mean          3.892819
std           1.587854
min           0.000000
25%           3.091042
50%           3.931826
75%           4.744932
max          16.817285
Name: Enrollment_Log, dtype: float64

## Phases

- 5 phases and NA: 
1. Early Phase 1 --> formerly listed as Phase 0, 
2. Phase 1, 
3. Phase 2, 
4. Phase 3,  
5. Phase 4. 
6. Not Applicable --> used to describe trials without FDA-defined phases, including trials of devices or behavioral interventions.



### Phases

In [230]:
# Same with db and csv
df["Phases"] = df["Phases"].replace({np.NaN : "NOT_APPLICABLE", "EARLY_PHASE1" : "PHASE0"})  
df["Phases"] = df["Phases"].str.split( "|" )  # No apply sort(list(x)) needed here - No duplicated elements with diff list-elem order

display(df["Phases"].value_counts(dropna=False))  
display(df.shape)

Phases
[NOT_APPLICABLE]    97831
[PHASE1]            21764
[PHASE2]            20896
[PHASE3]            13817
[PHASE4]            13073
[PHASE1, PHASE2]     5411
[PHASE2, PHASE3]     2490
[PHASE0]             2304
Name: count, dtype: int64

(177586, 70)

# 4) Drop Cols

In [231]:
df = df.drop(columns = ["Study Title", "official_title", "Other IDs", "Acronym", "Study URL","Study Type", "Study Results", 'Brief Summary', "Locations", \
                        "Study Design", "Interventions","Start Date", "Completion Date", "First Posted", "Results First Posted","Last Update Posted",\
                        "Primary Completion Date", "First Posted", "Results First Posted", "Last Update Posted", "Primary Completion Date", "Conditions",\
                        'Sponsor', 'Collaborators', 'Sponsor_Type', 'Collaborator_Type', "Countries", "Study Documents"] , axis = 1) 

# 5) NaN Check

In [232]:
df.isnull().sum().sort_values(ascending=False)

Arm_Counts                    910
Masking_Detail_List           452
Allocation                    372
Masking                       246
Intervention_Model            199
Sex                            85
Healthy_Bin                    66
Enrollment_Log                 54
Enrollment                     54
Outcomes_Bin                    0
Adverse_System_Counts_Log       0
Adverse_List                    0
City_Counts                     0
Continent_Counts                0
Adverse_Counts                  0
Adverse_System_List             0
Adverse_System_Counts           0
Termination                     0
Outcomes_Counts                 0
Continents_List                 0
Country_Counts                  0
Comorbidity_Counts              0
Primary_Purpose                 0
Outcomes_List                   0
Comorbidity_Bin                 0
nct_id                          0
Conditions_List                 0
Study Status                    0
Age_List                        0
Phases        

# 6) Dfs Split

In [233]:
mlb = MultiLabelBinarizer()
dummies = pd.DataFrame(mlb.fit_transform(df['Phases']), columns = mlb.classes_, index = df.index)
df = pd.concat([df, dummies] , axis = 1)
df.head(2)

Unnamed: 0,nct_id,Study Status,Sex,Age_List,Phases,Enrollment,Funder_Type,Sponsor_Collab_List,Study_Documents_List,Document_Counts,Document_Counts_Bin,Start_Date_Year,Completion_Date_Year,Start_Date_Year_Categ,Completion_Date_Year_Categ,Completion_Gap,Intervention_Method_List,...,Masking,Masking_Detail_List,Primary_Purpose,Outcomes_List,Outcomes_Counts,Outcomes_Bin,Country_Counts,Continents_List,Continent_Counts,City_Counts,Enrollment_Log,NOT_APPLICABLE,PHASE0,PHASE1,PHASE2,PHASE3,PHASE4
0,NCT04385680,COMPLETED,FEMALE,"[ADULT, OLDER_ADULT]","[PHASE1, PHASE2]",840.0,OTHER_GOV,[University],[None],0,No,2020,2022,"(2019, 2024]","(2019, 2024]",27,[Topical],...,SINGLE,[PARTICIPANT],PREVENTION,[None],0,No,1,[Asia],1,1,6.734592,0,0,1,1,0,0
1,NCT05017480,COMPLETED,ALL,"[CHILD, ADULT, OLDER_ADULT]",[PHASE2],330.0,INDUSTRY,[Pharmaceutical],"[Analysis Plan, Protocol]",2,Yes,2021,2023,"(2019, 2024]","(2019, 2024]",25,[Unknown],...,QUADRUPLE,"[OUTCOMES_ASSESSOR, INVESTIGATOR, PARTICIPANT,...",TREATMENT,"[PRIMARY, SECONDARY]",2,Yes,1,[Asia],1,30,5.802118,0,0,0,1,0,0


In [None]:
df0 = df[df["PHASE0"] == 1].reset_index(drop = True).drop(columns = ['NOT_APPLICABLE', 'PHASE0', 'PHASE1','PHASE2', 'PHASE3', 'PHASE4'])
df1 = df[df["PHASE1"] == 1].reset_index(drop = True).drop(columns = ['NOT_APPLICABLE', 'PHASE0', 'PHASE1','PHASE2', 'PHASE3', 'PHASE4'])
df2 = df[df["PHASE2"] == 1].reset_index(drop = True).drop(columns = ['NOT_APPLICABLE', 'PHASE0', 'PHASE1','PHASE2', 'PHASE3', 'PHASE4'])
df3 = df[df["PHASE3"] == 1].reset_index(drop = True).drop(columns = ['NOT_APPLICABLE', 'PHASE0', 'PHASE1','PHASE2', 'PHASE3', 'PHASE4'])
df4 = df[df["PHASE4"] == 1].reset_index(drop = True).drop(columns = ['NOT_APPLICABLE', 'PHASE0', 'PHASE1','PHASE2', 'PHASE3', 'PHASE4'])
df5 = df[df["NOT_APPLICABLE"] == 1].reset_index(drop = True).drop(columns = ['NOT_APPLICABLE', 'PHASE0', 'PHASE1','PHASE2', 'PHASE3', 'PHASE4'])

display(df0.shape)
display(df1.shape)
display(df2.shape)
display(df3.shape)
display(df4.shape)
display(df5.shape)

(2304, 46)

(27175, 46)

(28797, 46)

(16307, 46)

(13073, 46)

(97831, 46)

## Count_Categ_Columns

In [235]:
display(df['Enrollment'].isnull().sum())
df_test = df.loc[(df['Enrollment'].isnull()) & (df['Study Status'] == 'TERMINATED')][['nct_id', 'Study Status', 'Termination']]
df_test = pd.merge(left = df_test, right =  my_terminations2[['nct_id', 'why_stopped']], on='nct_id', how ='left')
df_test # --> No values to be filled as 0

54

Unnamed: 0,nct_id,Study Status,Termination,why_stopped
0,NCT01899235,TERMINATED,Enrollment,Slow recruitment
1,NCT01444118,TERMINATED,Efficacy,The early termination is not related to safety...
2,NCT01705808,TERMINATED,,
3,NCT01801943,TERMINATED,Enrollment,participants are no longer being examined or r...
4,NCT02267551,TERMINATED,Enrollment,low enrollment


### Enrollment_Categ

In [236]:
display(df0['Enrollment'].describe())
display(df1['Enrollment'].describe())
display(df2['Enrollment'].describe())
display(df3['Enrollment'].describe())
display(df4['Enrollment'].describe())
display(df5['Enrollment'].describe())

df0['Enrollment_Categ'] = pd.cut(df0["Enrollment"], bins = [df0["Enrollment"].min(), df0["Enrollment"].median(), df0["Enrollment"].max()], ordered = True, include_lowest = False)
df1['Enrollment_Categ'] = pd.cut(df1["Enrollment"], bins = [df1["Enrollment"].min(), df1["Enrollment"].median(), df1["Enrollment"].max()], ordered = True, include_lowest = False)
df2['Enrollment_Categ'] = pd.cut(df2["Enrollment"], bins = [df2["Enrollment"].min(), df2["Enrollment"].median(), df2["Enrollment"].max()], ordered = True, include_lowest = False)
df3['Enrollment_Categ'] = pd.cut(df3["Enrollment"], bins = [df3["Enrollment"].min(), df3["Enrollment"].median(), df3["Enrollment"].max()], ordered = True, include_lowest = False)
df4['Enrollment_Categ'] = pd.cut(df4["Enrollment"], bins = [df4["Enrollment"].min(), df4["Enrollment"].median(), df4["Enrollment"].max()], ordered = True, include_lowest = False)
df5['Enrollment_Categ'] = pd.cut(df5["Enrollment"], bins = [df5["Enrollment"].min(), df5["Enrollment"].median(), df5["Enrollment"].max()], ordered = True, include_lowest = False)

display(df0["Enrollment_Categ"])
display(df.shape)

count    2304.000000
mean       49.031250
std       207.342017
min         0.000000
25%         7.000000
50%        20.000000
75%        43.000000
max      5998.000000
Name: Enrollment, dtype: float64

count    27167.000000
mean        46.909964
std        107.316392
min          0.000000
25%         14.000000
50%         28.000000
75%         51.000000
max       8231.000000
Name: Enrollment, dtype: float64

count     28791.000000
mean        111.011358
std        1093.740620
min           0.000000
25%          15.000000
50%          41.000000
75%         100.000000
max      144539.000000
Name: Enrollment, dtype: float64

count     16299.000000
mean        568.006626
std        4944.845306
min           0.000000
25%          52.000000
50%         159.000000
75%         419.000000
max      477102.000000
Name: Enrollment, dtype: float64

count     13070.000000
mean        390.859985
std        5927.643855
min           0.000000
25%          25.000000
50%          60.000000
75%         139.750000
max      393387.000000
Name: Enrollment, dtype: float64

count    9.780100e+04
mean     1.065257e+03
std      7.888849e+04
min      0.000000e+00
25%      2.400000e+01
50%      5.200000e+01
75%      1.120000e+02
max      2.012121e+07
Name: Enrollment, dtype: float64

0          (0.0, 20.0]
1       (20.0, 5998.0]
2       (20.0, 5998.0]
3          (0.0, 20.0]
4       (20.0, 5998.0]
             ...      
2299       (0.0, 20.0]
2300    (20.0, 5998.0]
2301    (20.0, 5998.0]
2302       (0.0, 20.0]
2303       (0.0, 20.0]
Name: Enrollment_Categ, Length: 2304, dtype: category
Categories (2, interval[float64, right]): [(0.0, 20.0] < (20.0, 5998.0]]

(177586, 52)

### Document_Counts_Categ

In [237]:
display(df0['Document_Counts'].describe())
display(df1['Document_Counts'].describe())
display(df2['Document_Counts'].describe())
display(df3['Document_Counts'].describe())
display(df4['Document_Counts'].describe())
display(df5['Document_Counts'].describe())

df0['Document_Counts_Categ'] = pd.cut(df0["Document_Counts"], bins = [df0['Document_Counts'].min(), 1, df0['Document_Counts'].max()], ordered = True, include_lowest = False)
df1['Document_Counts_Categ'] = pd.cut(df1["Document_Counts"], bins = [df1['Document_Counts'].min(), 1, df1['Document_Counts'].max()], ordered = True, include_lowest = False)
df2['Document_Counts_Categ'] = pd.cut(df2["Document_Counts"], bins = [df2['Document_Counts'].min(), 1, df2['Document_Counts'].max()], ordered = True, include_lowest = False)
df3['Document_Counts_Categ'] = pd.cut(df3["Document_Counts"], bins = [df3['Document_Counts'].min(), 1, df3['Document_Counts'].max()], ordered = True, include_lowest = False)
df4['Document_Counts_Categ'] = pd.cut(df4["Document_Counts"], bins = [df4['Document_Counts'].min(), 1, df4['Document_Counts'].max()], ordered = True, include_lowest = False)
df5['Document_Counts_Categ'] = pd.cut(df5["Document_Counts"], bins = [df5['Document_Counts'].min(), 1, df5['Document_Counts'].max()], ordered = True, include_lowest = False)

df0["Document_Counts_Categ"].head()


count    2304.000000
mean        0.313802
std         0.801681
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         3.000000
Name: Document_Counts, dtype: float64

count    27175.000000
mean         0.319853
std          0.771775
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          3.000000
Name: Document_Counts, dtype: float64

count    28797.000000
mean         0.669653
std          1.010337
min          0.000000
25%          0.000000
50%          0.000000
75%          2.000000
max          3.000000
Name: Document_Counts, dtype: float64

count    16307.000000
mean         0.603851
std          0.951184
min          0.000000
25%          0.000000
50%          0.000000
75%          2.000000
max          3.000000
Name: Document_Counts, dtype: float64

count    13073.000000
mean         0.454372
std          0.899520
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          3.000000
Name: Document_Counts, dtype: float64

count    97831.000000
mean         0.283346
std          0.757042
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          3.000000
Name: Document_Counts, dtype: float64

0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
Name: Document_Counts_Categ, dtype: category
Categories (2, interval[int64, right]): [(0, 1] < (1, 3]]

### Outcomes_Counts_Categ

In [253]:
display(df0['Outcomes_Counts'].describe())
display(df1['Outcomes_Counts'].describe())
display(df2['Outcomes_Counts'].describe())
display(df3['Outcomes_Counts'].describe())
display(df4['Outcomes_Counts'].describe())
display(df5['Outcomes_Counts'].describe())

df0['Outcomes_Counts_Categ'] = pd.cut(df0["Outcomes_Counts"], bins = [df0['Outcomes_Counts'].min(), 1, df0['Outcomes_Counts'].max()], ordered = True, include_lowest = False)
df1['Outcomes_Counts_Categ'] = pd.cut(df1["Outcomes_Counts"], bins = [df1['Outcomes_Counts'].min(), 1, df1['Outcomes_Counts'].max()], ordered = True, include_lowest = False)
df2['Outcomes_Counts_Categ'] = pd.cut(df2["Outcomes_Counts"], bins = [df2['Outcomes_Counts'].min(), 1, df2['Outcomes_Counts'].max()], ordered = True, include_lowest = False)
df3['Outcomes_Counts_Categ'] = pd.cut(df3["Outcomes_Counts"], bins = [df3['Outcomes_Counts'].min(), 1, df3['Outcomes_Counts'].max()], ordered = True, include_lowest = False)
df4['Outcomes_Counts_Categ'] = pd.cut(df4["Outcomes_Counts"], bins = [df4['Outcomes_Counts'].min(), 1, df4['Outcomes_Counts'].max()], ordered = True, include_lowest = False)
df5['Outcomes_Counts_Categ'] = pd.cut(df5["Outcomes_Counts"], bins = [df5['Outcomes_Counts'].min(), 1, df5['Outcomes_Counts'].max()], ordered = True, include_lowest = False)

df0["Outcomes_Counts_Categ"].head()

count    2304.000000
mean        0.243924
std         0.658593
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         4.000000
Name: Outcomes_Counts, dtype: float64

count    27175.000000
mean         0.405299
std          0.809481
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          4.000000
Name: Outcomes_Counts, dtype: float64

count    28797.000000
mean         0.856166
std          1.026768
min          0.000000
25%          0.000000
50%          0.000000
75%          2.000000
max          4.000000
Name: Outcomes_Counts, dtype: float64

count    16307.000000
mean         0.918808
std          1.033372
min          0.000000
25%          0.000000
50%          0.000000
75%          2.000000
max          4.000000
Name: Outcomes_Counts, dtype: float64

count    13073.000000
mean         0.632602
std          0.942762
min          0.000000
25%          0.000000
50%          0.000000
75%          2.000000
max          4.000000
Name: Outcomes_Counts, dtype: float64

count    97831.000000
mean         0.268105
std          0.689748
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          4.000000
Name: Outcomes_Counts, dtype: float64

0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
Name: Outcomes_Counts_Categ, dtype: category
Categories (2, interval[int64, right]): [(0, 1] < (1, 4]]

### Adverse_Counts_Categ
- Many values out of main distribution (around 0) --> No pd.Cut --> use just binary (Adverse_Bin)?

In [238]:
display(df0['Adverse_Counts'].describe())
display(df1['Adverse_Counts'].describe())
display(df2['Adverse_Counts'].describe())
display(df3['Adverse_Counts'].describe())
display(df4['Adverse_Counts'].describe())
display(df5['Adverse_Counts'].describe())

df0['Adverse_Counts_Log'] = np.log1p(df0['Adverse_Counts'])
df1['Adverse_Counts_Log'] = np.log1p(df1['Adverse_Counts'])
df2['Adverse_Counts_Log'] = np.log1p(df2['Adverse_Counts'])
df3['Adverse_Counts_Log'] = np.log1p(df3['Adverse_Counts'])
df4['Adverse_Counts_Log'] = np.log1p(df4['Adverse_Counts'])
df5['Adverse_Counts_Log'] = np.log1p(df5['Adverse_Counts'])

display(df0['Adverse_Counts_Log'].describe())


count    2304.000000
mean        0.980469
std         6.281296
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max       130.000000
Name: Adverse_Counts, dtype: float64

count    27175.000000
mean         9.194922
std         58.667264
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max       3385.000000
Name: Adverse_Counts, dtype: float64

count    28797.000000
mean        32.399243
std        123.021921
min          0.000000
25%          0.000000
50%          0.000000
75%         19.000000
max       5288.000000
Name: Adverse_Counts, dtype: float64

count    16307.000000
mean       162.888882
std        738.428413
min          0.000000
25%          0.000000
50%          0.000000
75%         87.000000
max      47910.000000
Name: Adverse_Counts, dtype: float64

count    13073.000000
mean        28.143884
std        655.397065
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max      70245.000000
Name: Adverse_Counts, dtype: float64

count    97831.000000
mean         5.300631
std        158.296170
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max      28085.000000
Name: Adverse_Counts, dtype: float64

count    2304.000000
mean        0.158556
std         0.613821
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         4.875197
Name: Adverse_Counts_Log, dtype: float64

### Completion_Gap_Categ

In [239]:
display(df0['Completion_Gap'].describe())
display(df1['Completion_Gap'].describe())
display(df2['Completion_Gap'].describe())
display(df3['Completion_Gap'].describe())
display(df4['Completion_Gap'].describe())
display(df5['Completion_Gap'].describe())

df0['Completion_Gap_Categ'] = pd.cut(df0["Completion_Gap"], bins = [df0["Completion_Gap"].min(), df0["Completion_Gap"].median(), df0["Completion_Gap"].max()], ordered = True, include_lowest = False)
df1['Completion_Gap_Categ'] = pd.cut(df1["Completion_Gap"], bins = [df1["Completion_Gap"].min(), df1["Completion_Gap"].median(), df1["Completion_Gap"].max()], ordered = True, include_lowest = False)
df2['Completion_Gap_Categ'] = pd.cut(df2["Completion_Gap"], bins = [df2["Completion_Gap"].min(), df2["Completion_Gap"].median(), df2["Completion_Gap"].max()], ordered = True, include_lowest = False)
df3['Completion_Gap_Categ'] = pd.cut(df3["Completion_Gap"], bins = [df3["Completion_Gap"].min(), df3["Completion_Gap"].median(), df3["Completion_Gap"].max()], ordered = True, include_lowest = False)
df4['Completion_Gap_Categ'] = pd.cut(df4["Completion_Gap"], bins = [df4["Completion_Gap"].min(), df4["Completion_Gap"].median(), df4["Completion_Gap"].max()], ordered = True, include_lowest = False)
df5['Completion_Gap_Categ'] = pd.cut(df5["Completion_Gap"], bins = [df5["Completion_Gap"].min(), df5["Completion_Gap"].median(), df5["Completion_Gap"].max()], ordered = True, include_lowest = False)

display(df0['Completion_Gap_Categ'].value_counts())

count       2304.0
mean     24.924913
std      20.113227
min            0.0
25%           10.0
50%           20.0
75%           35.0
max          128.0
Name: Completion_Gap, dtype: Float64

count      27175.0
mean     22.270543
std      23.265677
min            0.0
25%            5.0
50%           14.0
75%           33.0
max          150.0
Name: Completion_Gap, dtype: Float64

count      28797.0
mean     31.597354
std       24.07007
min            0.0
25%           13.0
50%           25.0
75%           44.0
max          163.0
Name: Completion_Gap, dtype: Float64

count      16307.0
mean     29.560557
std      22.722842
min            0.0
25%           13.0
50%           24.0
75%           40.0
max          160.0
Name: Completion_Gap, dtype: Float64

count      13073.0
mean     25.428823
std      19.978733
min            0.0
25%           11.0
50%           21.0
75%           35.0
max          149.0
Name: Completion_Gap, dtype: Float64

count      97831.0
mean     22.925443
std      20.217512
min            0.0
25%            8.0
50%           17.0
75%           32.0
max          157.0
Name: Completion_Gap, dtype: Float64

Completion_Gap_Categ
(20.0, 128.0]    1120
(0.0, 20.0]      1111
Name: count, dtype: int64

### Intervention_Counts_Categ

In [257]:
display(df0['Intervention_Counts'].describe())
display(df1['Intervention_Counts'].describe())
display(df2['Intervention_Counts'].describe())
display(df3['Intervention_Counts'].describe())
display(df4['Intervention_Counts'].describe())
display(df5['Intervention_Counts'].describe())

df0['Intervention_Counts_Categ'] = pd.cut(df0["Intervention_Counts"], bins = [0, 1, df0["Intervention_Counts"].max()], ordered = True, include_lowest = False)
df1['Intervention_Counts_Categ'] = pd.cut(df1["Intervention_Counts"], bins = [0, 1, df1["Intervention_Counts"].max()], ordered = True, include_lowest = False)
df2['Intervention_Counts_Categ'] = pd.cut(df2["Intervention_Counts"], bins = [0, 1, df2["Intervention_Counts"].max()], ordered = True, include_lowest = False)
df3['Intervention_Counts_Categ'] = pd.cut(df3["Intervention_Counts"], bins = [0, 1, df3["Intervention_Counts"].max()], ordered = True, include_lowest = False)
df4['Intervention_Counts_Categ'] = pd.cut(df4["Intervention_Counts"], bins = [0, 1, df4["Intervention_Counts"].max()], ordered = True, include_lowest = False)
df5['Intervention_Counts_Categ'] = pd.cut(df5["Intervention_Counts"], bins = [0, 1, df5["Intervention_Counts"].max()], ordered = True, include_lowest = False)

display(df0['Intervention_Counts_Categ'].value_counts())

count    2304.000000
mean        1.280816
std         0.544707
min         1.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         4.000000
Name: Intervention_Counts, dtype: float64

count    27175.000000
mean         1.162870
std          0.438167
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          6.000000
Name: Intervention_Counts, dtype: float64

count    28797.000000
mean         1.225614
std          0.497290
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          6.000000
Name: Intervention_Counts, dtype: float64

count    16307.000000
mean         1.166554
std          0.403410
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          5.000000
Name: Intervention_Counts, dtype: float64

count    13073.000000
mean         1.166832
std          0.409602
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          5.000000
Name: Intervention_Counts, dtype: float64

count    97831.000000
mean         1.115516
std          0.350750
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          5.000000
Name: Intervention_Counts, dtype: float64

Intervention_Counts_Categ
(0, 1]    1757
(1, 4]     547
Name: count, dtype: int64

### Comorbidity_Counts_Categ
- See plots --> Comorbidity has a "cutpoint" at 2 counts in most Phases

In [241]:
display(df0['Comorbidity_Counts'].describe())
display(df1['Comorbidity_Counts'].describe())
display(df2['Comorbidity_Counts'].describe())
display(df3['Comorbidity_Counts'].describe())
display(df4['Comorbidity_Counts'].describe())
display(df5['Comorbidity_Counts'].describe())

df0['Comorbidity_Counts_Categ'] = pd.cut(df0["Comorbidity_Counts"], bins = [0, df0["Comorbidity_Counts"].quantile(0.75), df0["Comorbidity_Counts"].max()], ordered = True, include_lowest = False) 
df1['Comorbidity_Counts_Categ'] = pd.cut(df1["Comorbidity_Counts"], bins = [0, df1["Comorbidity_Counts"].quantile(0.75), df1["Comorbidity_Counts"].max()], ordered = True, include_lowest = False) 
df2['Comorbidity_Counts_Categ'] = pd.cut(df2["Comorbidity_Counts"], bins = [0, df2["Comorbidity_Counts"].quantile(0.75), df2["Comorbidity_Counts"].max()], ordered = True, include_lowest = False) 
df3['Comorbidity_Counts_Categ'] = pd.cut(df3["Comorbidity_Counts"], bins = [0, df3["Comorbidity_Counts"].quantile(0.75), df3["Comorbidity_Counts"].max()], ordered = True, include_lowest = False) 
df4['Comorbidity_Counts_Categ'] = pd.cut(df4["Comorbidity_Counts"], bins = [0, df4["Comorbidity_Counts"].quantile(0.75), df4["Comorbidity_Counts"].max()], ordered = True, include_lowest = False) 
df5['Comorbidity_Counts_Categ'] = pd.cut(df5["Comorbidity_Counts"], bins = [0, df5["Comorbidity_Counts"].quantile(0.75), df5["Comorbidity_Counts"].max()], ordered = True, include_lowest = False) 

df0['Comorbidity_Counts_Categ'].value_counts(dropna = False)

count    2304.000000
mean        1.914931
std         1.583859
min         0.000000
25%         1.000000
50%         2.000000
75%         3.000000
max        10.000000
Name: Comorbidity_Counts, dtype: float64

count    27175.000000
mean         1.665097
std          1.567864
min          0.000000
25%          0.000000
50%          1.000000
75%          3.000000
max         14.000000
Name: Comorbidity_Counts, dtype: float64

count    28797.000000
mean         2.365073
std          1.425452
min          0.000000
25%          1.000000
50%          2.000000
75%          3.000000
max         13.000000
Name: Comorbidity_Counts, dtype: float64

count    16307.000000
mean         2.275035
std          1.441306
min          0.000000
25%          1.000000
50%          2.000000
75%          3.000000
max         10.000000
Name: Comorbidity_Counts, dtype: float64

count    13073.000000
mean         2.064790
std          1.597747
min          0.000000
25%          1.000000
50%          2.000000
75%          3.000000
max         14.000000
Name: Comorbidity_Counts, dtype: float64

count    97831.000000
mean         1.818166
std          1.633407
min          0.000000
25%          0.000000
50%          2.000000
75%          3.000000
max         16.000000
Name: Comorbidity_Counts, dtype: float64

Comorbidity_Counts_Categ
(0.0, 3.0]     1465
NaN             500
(3.0, 10.0]     339
Name: count, dtype: int64

### Adverse_System_Counts_Categ

In [242]:
display(df0['Adverse_System_Counts'].describe())
display(df1['Adverse_System_Counts'].describe())
display(df2['Adverse_System_Counts'].describe())
display(df3['Adverse_System_Counts'].describe())
display(df4['Adverse_System_Counts'].describe())
display(df5['Adverse_System_Counts'].describe())

df0['Adverse_System_Counts_Categ'] = pd.cut(df0["Adverse_System_Counts"], bins = [0, 1, df0["Adverse_System_Counts"].max()], ordered = True, include_lowest = False)  
df1['Adverse_System_Counts_Categ'] = pd.cut(df1["Adverse_System_Counts"], bins = [0, 1, df1["Adverse_System_Counts"].max()], ordered = True, include_lowest = False)  
df2['Adverse_System_Counts_Categ'] = pd.cut(df2["Adverse_System_Counts"], bins = [0, 1, df2["Adverse_System_Counts"].max()], ordered = True, include_lowest = False)  
df3['Adverse_System_Counts_Categ'] = pd.cut(df3["Adverse_System_Counts"], bins = [0, 1, df3["Adverse_System_Counts"].max()], ordered = True, include_lowest = False)  
df4['Adverse_System_Counts_Categ'] = pd.cut(df4["Adverse_System_Counts"], bins = [0, 1, df4["Adverse_System_Counts"].max()], ordered = True, include_lowest = False)  
df5['Adverse_System_Counts_Categ'] = pd.cut(df5["Adverse_System_Counts"], bins = [0, 1, df5["Adverse_System_Counts"].max()], ordered = True, include_lowest = False)  

df0['Adverse_System_Counts_Categ'].value_counts(dropna = False)

count    2304.000000
mean        0.393229
std         1.869579
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max        21.000000
Name: Adverse_System_Counts, dtype: float64

count    27175.000000
mean         2.082134
std          5.120196
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max         26.000000
Name: Adverse_System_Counts, dtype: float64

count    28797.000000
mean         4.487863
std          6.858169
min          0.000000
25%          0.000000
50%          0.000000
75%          8.000000
max         27.000000
Name: Adverse_System_Counts, dtype: float64

count    16307.000000
mean         5.228491
std          7.414357
min          0.000000
25%          0.000000
50%          0.000000
75%         10.000000
max         27.000000
Name: Adverse_System_Counts, dtype: float64

count    13073.000000
mean         1.506234
std          3.857524
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max         27.000000
Name: Adverse_System_Counts, dtype: float64

count    97831.000000
mean         0.314082
std          1.696128
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max         27.000000
Name: Adverse_System_Counts, dtype: float64

Adverse_System_Counts_Categ
NaN            2096
(1.0, 21.0]     130
(0.0, 1.0]       78
Name: count, dtype: int64

### City_Counts_Categ

In [243]:
display(df0['City_Counts'].describe())
display(df1['City_Counts'].describe())
display(df2['City_Counts'].describe())
display(df3['City_Counts'].describe())
display(df4['City_Counts'].describe())
display(df5['City_Counts'].describe())

df0['City_Counts_Categ'] = pd.cut(df0["City_Counts"], bins = [0, 1, df0["City_Counts"].max()], ordered = True, include_lowest = False)  
df1['City_Counts_Categ'] = pd.cut(df1["City_Counts"], bins = [0, 1, df1["City_Counts"].max()], ordered = True, include_lowest = False)  
df2['City_Counts_Categ'] = pd.cut(df2["City_Counts"], bins = [0, 1, df2["City_Counts"].max()], ordered = True, include_lowest = False)  
df3['City_Counts_Categ'] = pd.cut(df3["City_Counts"], bins = [0, 1, df3["City_Counts"].max()], ordered = True, include_lowest = False)  
df4['City_Counts_Categ'] = pd.cut(df4["City_Counts"], bins = [0, 1, df4["City_Counts"].max()], ordered = True, include_lowest = False)  
df5['City_Counts_Categ'] = pd.cut(df5["City_Counts"], bins = [0, 1, df5["City_Counts"].max()], ordered = True, include_lowest = False)  

df0['City_Counts_Categ'].value_counts(dropna = False)

count    2304.000000
mean        1.228299
std         3.785609
min         0.000000
25%         1.000000
50%         1.000000
75%         1.000000
max       156.000000
Name: City_Counts, dtype: float64

count    27175.000000
mean         2.865428
std          6.149534
min          0.000000
25%          1.000000
50%          1.000000
75%          2.000000
max        282.000000
Name: City_Counts, dtype: float64

count    28797.000000
mean         9.132514
std         22.769810
min          0.000000
25%          1.000000
50%          1.000000
75%          8.000000
max        666.000000
Name: City_Counts, dtype: float64

count    16307.000000
mean        27.063592
std         58.389480
min          0.000000
25%          1.000000
50%          3.000000
75%         29.000000
max       1087.000000
Name: City_Counts, dtype: float64

count    13073.000000
mean         4.422397
std         18.304551
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max        617.000000
Name: City_Counts, dtype: float64

count    97831.000000
mean         1.581963
std          4.980605
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max        691.000000
Name: City_Counts, dtype: float64

City_Counts_Categ
(0.0, 1.0]      1992
NaN              159
(1.0, 156.0]     153
Name: count, dtype: int64

### Country_Counts_Categ

In [244]:
display(df0['Country_Counts'].describe())
display(df1['Country_Counts'].describe())
display(df2['Country_Counts'].describe())
display(df3['Country_Counts'].describe())
display(df4['Country_Counts'].describe())
display(df5['Country_Counts'].describe())

df0['Country_Counts_Categ'] = pd.cut(df0["Country_Counts"], bins = [0, 1, df0["Country_Counts"].max()], ordered = True, include_lowest = False) 
df1['Country_Counts_Categ'] = pd.cut(df1["Country_Counts"], bins = [0, 1, df1["Country_Counts"].max()], ordered = True, include_lowest = False) 
df2['Country_Counts_Categ'] = pd.cut(df2["Country_Counts"], bins = [0, 1, df2["Country_Counts"].max()], ordered = True, include_lowest = False) 
df3['Country_Counts_Categ'] = pd.cut(df3["Country_Counts"], bins = [0, 1, df3["Country_Counts"].max()], ordered = True, include_lowest = False) 
df4['Country_Counts_Categ'] = pd.cut(df4["Country_Counts"], bins = [0, 1, df4["Country_Counts"].max()], ordered = True, include_lowest = False) 
df5['Country_Counts_Categ'] = pd.cut(df5["Country_Counts"], bins = [0, 1, df5["Country_Counts"].max()], ordered = True, include_lowest = False) 

df0['Country_Counts_Categ'].value_counts(dropna = False)

count    2304.000000
mean        0.960938
std         0.456765
min         0.000000
25%         1.000000
50%         1.000000
75%         1.000000
max        10.000000
Name: Country_Counts, dtype: float64

count    27175.000000
mean         1.237387
std          1.216826
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max         33.000000
Name: Country_Counts, dtype: float64

count    28797.000000
mean         1.854881
std          2.809360
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max         46.000000
Name: Country_Counts, dtype: float64

count    16307.000000
mean         3.782363
std          6.402418
min          0.000000
25%          1.000000
50%          1.000000
75%          3.000000
max         59.000000
Name: Country_Counts, dtype: float64

count    13073.00000
mean         1.23713
std          1.96712
min          0.00000
25%          1.00000
50%          1.00000
75%          1.00000
max         42.00000
Name: Country_Counts, dtype: float64

count    97831.000000
mean         0.988695
std          0.656901
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max         49.000000
Name: Country_Counts, dtype: float64

Country_Counts_Categ
(0.0, 1.0]     2118
NaN             159
(1.0, 10.0]      27
Name: count, dtype: int64

### Continent_Counts_Categ

In [245]:
display(df0['Continent_Counts'].describe())
display(df1['Continent_Counts'].describe())
display(df2['Continent_Counts'].describe())
display(df3['Continent_Counts'].describe())
display(df4['Continent_Counts'].describe())
display(df5['Continent_Counts'].describe())

df0['Continent_Counts_Categ'] = pd.cut(df0["Continent_Counts"], bins = [0, 1, df0["Continent_Counts"].max()], ordered = True, include_lowest = False) 
df1['Continent_Counts_Categ'] = pd.cut(df1["Continent_Counts"], bins = [0, 1, df1["Continent_Counts"].max()], ordered = True, include_lowest = False) 
df2['Continent_Counts_Categ'] = pd.cut(df2["Continent_Counts"], bins = [0, 1, df2["Continent_Counts"].max()], ordered = True, include_lowest = False) 
df3['Continent_Counts_Categ'] = pd.cut(df3["Continent_Counts"], bins = [0, 1, df3["Continent_Counts"].max()], ordered = True, include_lowest = False) 
df4['Continent_Counts_Categ'] = pd.cut(df4["Continent_Counts"], bins = [0, 1, df4["Continent_Counts"].max()], ordered = True, include_lowest = False) 
df5['Continent_Counts_Categ'] = pd.cut(df5["Continent_Counts"], bins = [0, 1, df5["Continent_Counts"].max()], ordered = True, include_lowest = False) 

df0['Continent_Counts_Categ'].value_counts(dropna = False)

count    2304.000000
mean        0.934028
std         0.281097
min         0.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         3.000000
Name: Continent_Counts, dtype: float64

count    27175.000000
mean         1.050414
std          0.504449
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          7.000000
Name: Continent_Counts, dtype: float64

count    28797.000000
mean         1.196583
std          0.833727
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          7.000000
Name: Continent_Counts, dtype: float64

count    16307.000000
mean         1.593610
std          1.478366
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          7.000000
Name: Continent_Counts, dtype: float64

count    13073.000000
mean         0.986231
std          0.535285
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          7.000000
Name: Continent_Counts, dtype: float64

count    97831.000000
mean         0.938608
std          0.319288
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          7.000000
Name: Continent_Counts, dtype: float64

Continent_Counts_Categ
(0.0, 1.0]    2121
NaN            169
(1.0, 3.0]      14
Name: count, dtype: int64

### Arm_Counts_Categ
- See thsesis_Vizual --> Many data points in specific values. 1-2 many data points and less in >=3. + Skewed --> Visible change after 2 (=median in all dfs) --> Better categorical.

In [246]:
display(df0['Arm_Counts'].describe())
display(df1['Arm_Counts'].describe())
display(df2['Arm_Counts'].describe())
display(df3['Arm_Counts'].describe())
display(df4['Arm_Counts'].describe())
display(df5['Arm_Counts'].describe())

df0['Arm_Counts_Categ'] = pd.cut(df0["Arm_Counts"], bins = [0, 2, df0["Arm_Counts"].max()], ordered = True, include_lowest = False)  
df1['Arm_Counts_Categ'] = pd.cut(df1["Arm_Counts"], bins = [0, 2, df1["Arm_Counts"].max()], ordered = True, include_lowest = False)  
df2['Arm_Counts_Categ'] = pd.cut(df2["Arm_Counts"], bins = [0, 2, df2["Arm_Counts"].max()], ordered = True, include_lowest = False)  
df3['Arm_Counts_Categ'] = pd.cut(df3["Arm_Counts"], bins = [0, 2, df3["Arm_Counts"].max()], ordered = True, include_lowest = False)  
df4['Arm_Counts_Categ'] = pd.cut(df4["Arm_Counts"], bins = [0, 2, df4["Arm_Counts"].max()], ordered = True, include_lowest = False)  
df5['Arm_Counts_Categ'] = pd.cut(df5["Arm_Counts"], bins = [0, 2, df5["Arm_Counts"].max()], ordered = True, include_lowest = False)  

df0['Arm_Counts_Categ'].value_counts(dropna = False)

count    2281.000000
mean        1.935116
std         1.409927
min         1.000000
25%         1.000000
50%         2.000000
75%         2.000000
max        40.000000
Name: Arm_Counts, dtype: float64

count    26980.000000
mean         2.754337
std          2.402645
min          1.000000
25%          1.000000
50%          2.000000
75%          3.000000
max         44.000000
Name: Arm_Counts, dtype: float64

count    28674.000000
mean         2.336542
std          1.753676
min          1.000000
25%          1.000000
50%          2.000000
75%          3.000000
max         43.000000
Name: Arm_Counts, dtype: float64

count    16281.000000
mean         2.264849
std          1.167064
min          1.000000
25%          2.000000
50%          2.000000
75%          2.000000
max         32.000000
Name: Arm_Counts, dtype: float64

count    13030.000000
mean         2.085111
std          0.986773
min          1.000000
25%          2.000000
50%          2.000000
75%          2.000000
max         32.000000
Name: Arm_Counts, dtype: float64

count    97286.000000
mean         2.064860
std          1.063453
min          1.000000
25%          2.000000
50%          2.000000
75%          2.000000
max         32.000000
Name: Arm_Counts, dtype: float64

Arm_Counts_Categ
(0.0, 2.0]     1893
(2.0, 40.0]     388
NaN              23
Name: count, dtype: int64

## Fillna [] list columns

In [249]:
def fill_list(dfis, cols):
    for dfi, col in zip(dfis, cols):
        dfi[col] = dfi[col].apply(lambda x: ["None"] if x == [] else x)  # So not to be dropped

dfis = [df0, df1, df2, df3, df4, df5]
cols = ['Study_Documents_List', 'Conditions_List', 'Conditions_List', 'Outcomes_List', 'Adverse_List', 'Adverse_System_List', 'Masking_Detail_List', 'Outcomes_List', 'Continents_List']
fill_list(dfis, cols)


# 7) Save Dfs

In [259]:
df.to_pickle("df.pkl")  # pickle better to save dtypes e.g category, period etc. --> csv does not save dtypes
df0.to_pickle("df0.pkl")
df1.to_pickle("df1.pkl")
df2.to_pickle("df2.pkl")
df3.to_pickle("df3.pkl")
df4.to_pickle("df4.pkl")
df5.to_pickle("df5.pkl")
