# Libraries

In [1]:
import pandas as pd
import numpy as np
import pickle
from sqlalchemy import create_engine
from sqlalchemy import text

# 1) SQL

## Engine

In [2]:
host = "localhost"           
port = "5432"                
database = "AACT"
user = "postgres"
password = "pgAdmin4321"
connection_string = f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{database}"
engine = create_engine(connection_string)

# Tables

## my_studies

In [3]:
create_table_sql = """
DROP TABLE IF EXISTS ctgov.my_studies;

CREATE TABLE ctgov.my_studies AS
SELECT DISTINCT ON 
    (
studies.nct_id,
studies.brief_title,
studies.official_title,
studies.source,
studies.source_class,
studies.overall_status,
studies.phase,
studies.number_of_arms,
studies.enrollment
    ) 
studies.nct_id,
studies.brief_title,
studies.official_title,
studies.source,
studies.source_class,
studies.overall_status,
studies.phase,
studies.number_of_arms,
studies.enrollment

from ctgov.studies
WHERE TO_CHAR(studies.start_date, 'YYYY-MM') >= '2011-01'
  AND studies.study_type = 'INTERVENTIONAL'
  AND studies.overall_status in ('COMPLETED', 'WITHDRAWN', 'TERMINATED')

ORDER BY studies.nct_id;
"""

with engine.connect() as conn:
    conn.execute(text(create_table_sql))
    conn.commit()
   
my_studies = pd.read_sql("select * from ctgov.my_studies;" , con = engine)
my_studies

Unnamed: 0,nct_id,brief_title,official_title,source,source_class,overall_status,phase,number_of_arms,enrollment
0,NCT00125528,D-cycloserine in the Management of Chronic Low...,D-Cycloserine in the Management of Chronic Low...,Northwestern University,OTHER,COMPLETED,PHASE2,2.0,41.0
1,NCT00170209,Rifampin Versus Isoniazid for the Treatment of...,A Randomized Trial to Compare Effectiveness of...,McGill University,OTHER,COMPLETED,PHASE3,2.0,844.0
2,NCT00183482,Family Cognitive Behavioral Therapy for Preven...,Family Cognitive Behavioral Prevention of Depr...,Vanderbilt University,OTHER,COMPLETED,,2.0,304.0
3,NCT00194714,Vaccine Therapy in Treating Patients With Stag...,Phase I/II Study of Combination Immunotherapy ...,University of Washington,OTHER,COMPLETED,PHASE1/PHASE2,1.0,22.0
4,NCT00258791,Effects of Pretreatment With Ibuprofen in Post...,Effects of Pretreatment With Ibuprofen in Post...,Norwegian University of Science and Technology,OTHER,WITHDRAWN,,,0.0
...,...,...,...,...,...,...,...,...,...
182231,NCT06875349,Foot Reflexology and Abdominal Massage,The Effects of Foot Reflexology and Abdominal ...,Karadeniz Technical University,OTHER,COMPLETED,,3.0,90.0
182232,NCT06875557,Mind Matters; Unveiling the Impact of Digital ...,MIND MATTERS; UNVEILING THE IMPACT OF DIGITAL ...,Fatima Jinnah Women University,OTHER,COMPLETED,PHASE1/PHASE2,2.0,376.0
182233,NCT06875661,Treatment of Adolescent Idiopathic Scoliosis w...,Treatment of Adolescent Idiopathic Scoliosis w...,University of Novi Sad,OTHER,COMPLETED,,2.0,34.0
182234,NCT06875674,Selenium Biofortification of Strawberries,Selenium Biofortification of Strawberries and ...,University of Palermo,OTHER,COMPLETED,,3.0,44.0


## my_terminations

In [3]:
create_table_sql = """
DROP TABLE IF EXISTS ctgov.my_terminations;

CREATE TABLE ctgov.my_terminations AS
SELECT DISTINCT ON 
    (
studies.nct_id,
studies.why_stopped,
studies.overall_status
    ) 
studies.nct_id,
studies.why_stopped,
studies.overall_status

from ctgov.studies
WHERE TO_CHAR(studies.start_date, 'YYYY-MM') >= '2011-01'
  AND studies.study_type = 'INTERVENTIONAL'
  AND studies.overall_status in ('COMPLETED', 'WITHDRAWN', 'TERMINATED')

ORDER BY studies.nct_id;
"""
with engine.connect() as conn:
    conn.execute(text(create_table_sql))
    conn.commit()
    
my_terminations = pd.read_sql("select * from ctgov.my_terminations;" , con = engine)
my_terminations2 = my_terminations.copy()
my_terminations

Unnamed: 0,nct_id,why_stopped,overall_status
0,NCT00125528,,COMPLETED
1,NCT00170209,,COMPLETED
2,NCT00183482,,COMPLETED
3,NCT00194714,,COMPLETED
4,NCT00258791,first postponed then cancelled as national dru...,WITHDRAWN
...,...,...,...
182231,NCT06875349,,COMPLETED
182232,NCT06875557,,COMPLETED
182233,NCT06875661,,COMPLETED
182234,NCT06875674,,COMPLETED


In [5]:
my_terminations.loc[my_terminations["why_stopped"].str.contains("accru|enrol|participant|recruit|patient|inclusion|human|subjects", case = False) == True, "why_stopped"] = "Enrollment"  # 1st cause "human resources"
my_terminations.loc[my_terminations["why_stopped"].str.contains("fund|sponsor|business|company|portfolio|corporate|resources|financ", case = False) == True, "why_stopped"] = "Fund"
my_terminations.loc[my_terminations["why_stopped"].str.contains("PI", case = True) == True, "why_stopped"] = "Administration"
my_terminations.loc[my_terminations["why_stopped"].str.contains("administrative|management|strateg|internal|organiz|policy|FDA", case = False) == True, "why_stopped"] = "Administration"
my_terminations.loc[my_terminations["why_stopped"].str.contains("researcher|Researcher|researchers|Researchers|investigator|Investigator|investigators|Investigators", case = True) == True, "why_stopped"] = "Administration"
my_terminations.loc[my_terminations["why_stopped"].str.contains("futility|efficacy|ethical|safety|sufficient|risk", case = False) == True, "why_stopped"] = "Efficacy"
my_terminations.loc[my_terminations["why_stopped"].str.contains("drug|logistic", case = False) == True, "why_stopped"] = "Supply"
my_terminations.loc[my_terminations["why_stopped"].str.contains("covid|pandemic", case = False) == True, "why_stopped"] = "Covid"
my_terminations.loc[~my_terminations["why_stopped"].isin(["Enrollment","Fund","Administration","Efficacy","Supply","COVID-19"]) & ~my_terminations["why_stopped"].isnull(), "why_stopped"] = "Cther"

my_terminations["why_stopped"].value_counts(dropna=False)

why_stopped
None              156688
Enrollment         10346
Cther               6228
Fund                5309
Administration      1979
Efficacy            1304
Supply               382
Name: count, dtype: int64

## my_conditions

In [6]:
category_map1 = {
    "A": "Anatomy",
    "B": "Organisms",
    "C": "Diseases",
    "D": "Chemicals and Drugs",
    "E": "Analytical, Diagnostic and Therapeutic Techniques and Equipment",
    "F": "Psychiatry and Psychology",
    "G": "Phenomena and Processes",
    "H": "Disciplines and Occupations",
    "I": "Anthropology, Education, Sociology and Social Phenomena",
    "J": "Technology, Industry, and Agriculture",
    "K": "Humanities",
    "L": "Information Science",
    "M": "Named Groups",
    "N": "Health Care",
    "V": "Publication Characteristics",
    "Z": "Geographic Locations"
}

category_map = {
    "A": "Anatomy",
    "B": "Organisms",
    "C": "Diseases",
    "D": "Chemicals, Drugs",
    "E": "Diagnostic, Equipment",
    "F": "Psychiatry, Psychology",
    "G": "Phenomena, Processes",
    "H": "Disciplines, Occupations",
    "I": "Anthropology, Sociology",
    "J": "Technology, Industry, Agriculture",
    "K": "Humanities",
    "L": "Information Science",
    "M": "Named Groups",
    "N": "Health Care",
    "V": "Publication Character.",
    "Z": "Geographic"
}

my_conditions2 = pd.DataFrame({"Code" : category_map.keys(), "Category" : category_map.values()})
my_conditions2

Unnamed: 0,Code,Category
0,A,Anatomy
1,B,Organisms
2,C,Diseases
3,D,"Chemicals, Drugs"
4,E,"Diagnostic, Equipment"
5,F,"Psychiatry, Psychology"
6,G,"Phenomena, Processes"
7,H,"Disciplines, Occupations"
8,I,"Anthropology, Sociology"
9,J,"Technology, Industry, Agriculture"


In [7]:
create_table_sql ="""
DROP TABLE IF EXISTS ctgov.my_conditions;

CREATE TABLE ctgov.my_conditions AS
SELECT DISTINCT ON 
    (
    studies.nct_id,
    mesh_headings.qualifier, 
    mesh_headings.heading
    )
    studies.nct_id, 
    mesh_headings.qualifier, 
    mesh_headings.heading

FROM ctgov.studies
LEFT JOIN ctgov.conditions
    ON studies.nct_id = conditions.nct_id
LEFT JOIN ctgov.browse_conditions
    ON studies.nct_id = browse_conditions.nct_id
LEFT JOIN ctgov.mesh_terms
    ON browse_conditions.mesh_term = mesh_terms.mesh_term
LEFT JOIN ctgov.mesh_headings
    ON mesh_terms.qualifier = mesh_headings.qualifier
    
WHERE TO_CHAR(studies.start_date, 'YYYY-MM') >= '2011-01'
	AND study_type = 'INTERVENTIONAL'
  	AND overall_status IN ('COMPLETED', 'WITHDRAWN', 'TERMINATED')
      
ORDER BY studies.nct_id, mesh_headings.qualifier, mesh_headings.heading;
"""

with engine.connect() as conn:
    conn.execute(text(create_table_sql))
    conn.commit()

# 	conditions.name,
#   browse_conditions.mesh_term, 
  
my_conditions = pd.read_sql("select * from ctgov.my_conditions;", con = engine)

my_conditions["Code"] = my_conditions["qualifier"].str[0]
my_conditions = my_conditions.merge(my_conditions2[["Code", "Category"]], on = "Code", how = "left")

my_conditions = my_conditions.groupby("nct_id").agg({
    "qualifier": lambda x: sorted(set(i for i in x if pd.notnull(i))),
    "heading": lambda x: sorted(set(i for i in x if pd.notnull(i))),
    "Code": lambda x: sorted(set(i for i in x if pd.notnull(i))),
    "Category": lambda x: sorted(set(i for i in x if pd.notnull(i))),
}).reset_index()
my_conditions["Comorbidity1"] = my_conditions["Code"].apply(lambda x: len(x))
my_conditions["Comorbidity"] = my_conditions["qualifier"].apply(lambda x: len(x))

my_conditions

: 

: 

## my_covid

In [None]:
create_table_sql = """
DROP TABLE IF EXISTS ctgov.my_covid;

CREATE TABLE ctgov.my_covid AS
SELECT DISTINCT ON 
(
studies.nct_id, 
browse_conditions.mesh_term
)
    studies.nct_id,
    browse_conditions.mesh_term
    
FROM ctgov.studies
LEFT JOIN ctgov.browse_conditions
    ON studies.nct_id = browse_conditions.nct_id
    
WHERE TO_CHAR(studies.start_date, 'YYYY-MM') >= '2011-01'
  AND study_type = 'INTERVENTIONAL'
  AND overall_status IN ('COMPLETED', 'WITHDRAWN', 'TERMINATED')
  AND browse_conditions.mesh_term ILIKE '%covid%'
ORDER BY studies.nct_id, browse_conditions.mesh_term;
"""

with engine.connect() as conn:
    conn.execute(text(create_table_sql))
    conn.commit()

my_covid = pd.read_sql("select * from ctgov.my_covid;", con = engine)
my_covid = my_covid.drop_duplicates(subset = ["nct_id"])
my_covid["mesh_term"] = my_covid["mesh_term"].replace("Post-Acute COVID-19 Syndrome", "COVID-19")
my_covid

## my_placebo

In [None]:
create_table_sql ="""
DROP TABLE IF EXISTS ctgov.my_placebo;

CREATE TABLE ctgov.my_placebo AS
SELECT DISTINCT ON (
    studies.nct_id,
    design_groups.group_type   
)
    studies.nct_id,
    design_groups.group_type

    FROM ctgov.studies
LEFT JOIN ctgov.design_groups
    ON studies.nct_id = design_groups.nct_id
    
WHERE TO_CHAR(studies.start_date, 'YYYY-MM') >= '2011-01'
  AND studies.study_type = 'INTERVENTIONAL'
  AND studies.overall_status IN ('COMPLETED', 'WITHDRAWN', 'TERMINATED')
ORDER BY 
    studies.nct_id, 
    design_groups.group_type;
"""

with engine.connect() as conn:
    conn.execute(text(create_table_sql))
    conn.commit()


my_placebo = pd.read_sql("select * from ctgov.my_placebo;", con = engine)
my_placebo

## my_interventions

In [None]:
create_table_sql ="""
DROP TABLE IF EXISTS ctgov.my_interventions;

CREATE TABLE ctgov.my_interventions AS
SELECT DISTINCT ON (
    studies.nct_id,
    browse_interventions.mesh_term      
)
    studies.nct_id,
    browse_interventions.mesh_term

    FROM ctgov.studies
LEFT JOIN ctgov.browse_interventions
    ON studies.nct_id = browse_interventions.nct_id

WHERE TO_CHAR(studies.start_date, 'YYYY-MM') >= '2011-01'
  AND studies.study_type = 'INTERVENTIONAL'
  AND studies.overall_status IN ('COMPLETED', 'WITHDRAWN', 'TERMINATED')
ORDER BY 
    studies.nct_id, 
    browse_interventions.mesh_term;
"""
#     interventions.name

with engine.connect() as conn:
    conn.execute(text(create_table_sql))
    conn.commit()


my_interventions = pd.read_sql("select * from ctgov.my_interventions;", con = engine)
my_interventions

## my_interventions_types

In [None]:
create_table_sql = """
DROP TABLE IF EXISTS ctgov.my_interventions_types;

CREATE TABLE ctgov.my_interventions_types AS
SELECT DISTINCT ON (
    studies.nct_id,
    interventions.intervention_type
)
    studies.nct_id,
    interventions.intervention_type

FROM ctgov.studies

LEFT JOIN ctgov.interventions
    ON studies.nct_id = interventions.nct_id

WHERE TO_CHAR(studies.start_date, 'YYYY-MM') >= '2011-01'
  AND studies.study_type = 'INTERVENTIONAL'
  AND studies.overall_status IN ('COMPLETED', 'WITHDRAWN', 'TERMINATED')

ORDER BY 
    studies.nct_id,
    interventions.intervention_type;
"""

with engine.connect() as conn:
    conn.execute(text(create_table_sql))
    conn.commit()

my_interventions_types = pd.read_sql("select * from ctgov.my_interventions_types;", con = engine)
my_interventions_types = my_interventions_types.groupby("nct_id")["intervention_type"].agg(lambda x: sorted(set(i for i in x if i is not None))).reset_index()
# my_interventions_types['intervention_type'] = my_interventions_types['intervention_type'].apply(lambda x: list(set(sorted(x))))
my_interventions_types

## my_soc

In [None]:
create_table_sql ="""
DROP TABLE IF EXISTS ctgov.my_intervention_methods2;

CREATE TABLE ctgov.my_intervention_methods2 AS
SELECT DISTINCT ON 
(
    studies.nct_id,
    interventions.description, 
    design_groups.description,
    design_groups.group_type
    
)
    studies.nct_id,
    design_groups.description AS design_groups_description,
    interventions.description AS interventions_description,
    design_groups.group_type

    FROM ctgov.studies

LEFT JOIN ctgov.interventions
    ON studies.nct_id = interventions.nct_id
LEFT JOIN ctgov.design_groups
    ON studies.nct_id = design_groups.nct_id

WHERE TO_CHAR(studies.start_date, 'YYYY-MM') >= '2011-01'
  AND studies.study_type = 'INTERVENTIONAL'
  AND studies.overall_status IN ('COMPLETED', 'WITHDRAWN', 'TERMINATED')
ORDER BY 
    studies.nct_id,
    design_groups.group_type;
"""

with engine.connect() as conn:
    conn.execute(text(create_table_sql))
    conn.commit()


my_intervention_methods2 = pd.read_sql("select * from ctgov.my_intervention_methods2;", con = engine)

my_soc = my_intervention_methods2[my_intervention_methods2["group_type"] == "ACTIVE_COMPARATOR"]
my_soc = my_soc[my_soc["design_groups_description"].str.contains("Standard", case = False, na = False) | my_soc["interventions_description"].str.contains("Standard", case = False, na = False)]
my_soc = my_soc.drop_duplicates(subset=["nct_id"])
display(my_soc) 
display(my_intervention_methods2) 



## my_intervention_methods2

## my_intervention_methods

In [None]:
create_table_sql = """
DROP TABLE IF EXISTS ctgov.my_intervention_methods;

CREATE TABLE ctgov.my_intervention_methods AS

select distinct on 
(
nct_id, 
intervention_type, 
description
) 
studies.nct_id,
interventions.intervention_type,
interventions.description

from ctgov.studies
left join ctgov.interventions
on studies.nct_id = interventions.nct_id

WHERE TO_CHAR(studies.start_date, 'YYYY-MM') >= '2011-01'
  AND studies.study_type = 'INTERVENTIONAL'
  AND studies.overall_status IN ('COMPLETED', 'WITHDRAWN', 'TERMINATED')
order by studies.nct_id, interventions.description, interventions.intervention_type;
"""

with engine.connect() as conn:
    conn.execute(text(create_table_sql))
    conn.commit()

#  AND interventions.intervention_type = 'DRUG'

my_intervention_methods = pd.read_sql("select * from ctgov.my_intervention_methods;", con = engine)
my_intervention_methods  # only Drug intervention_type chosen --> only 132185 rows (not 182....)
my_intervention_methods

In [None]:
# Strings --> all lowercases --> cause case = True in some contains
injection1 = r"inject|vaccine|bolus|vein|intra|parenteral|intravenous|infusion|subcutaneous|insuline|chemotherapy|subcutaneous|antibod|vial|intramuscular|intraarterial|syring|sub-cutaneous|blood"
injection2 = r"\biv\b|\bi\.v\.\b|\bim\b|\bi\.m\.\b|\bsc\b|\bsq\b|\bit\b|\bip\b"
oral1 = r"\boral|tablet|capsule|pill|supplement|mouth|sublingual|buccal|chew"
oral2 = r"\spo\s|\sper\sos\s|\sp\.o\.\s|p\.o\."
topical1 = r"\bgel\b|patch|cream|ointment|balm|nebul|toothpaste|intranasal|nasal|inhal|aerosol|lotion|local|vaginal|rectal|spray|drops|opthalm|otic|transdermal|suppository|douche|breath|skin"

In [None]:
interv_data = {
    "oral1": oral1,
    "oral2":oral2,
    "injection1": injection1,
    "injection2": injection2,
    "topical1": topical1,
}

with open("intervention_methods.pkl", "wb") as f:
    pickle.dump(interv_data, f)

In [None]:
# Injection
my_intervention_methods.loc[my_intervention_methods['description'].str.contains(injection1, case = False, na = False) == True, 'Injection'] = "Injection"
my_intervention_methods.loc[my_intervention_methods['description'].str.lower().str.contains(injection2, case = True, na = False) == True, 'Injection'] = "Injection"  # str.lower() --> case = True

# Oral
my_intervention_methods.loc[my_intervention_methods['description'].str.contains(oral1, case = False, na = False) == True, 'Oral'] = "Oral"
my_intervention_methods.loc[my_intervention_methods['description'].str.lower().str.contains(oral2, case = True, na = False) == True, 'Oral'] = "Oral"  # str.lower() --> case = True

# Topical
my_intervention_methods.loc[my_intervention_methods['description'].str.contains(topical1, case = False, na = False) == True, 'Topical'] = "Topical"

my_intervention_methods

In [None]:
# Injection
my_intervention_methods2.loc[my_intervention_methods2['interventions_description'].str.contains(injection1, case = False, na = False) == True, 'Injection2'] = "Injection"
my_intervention_methods2.loc[my_intervention_methods2['interventions_description'].str.lower().str.contains(injection2, case = True, na = False) == True, 'Injection2'] = "Injection"  # str.lower() --> case = True

# Oral
my_intervention_methods2.loc[my_intervention_methods2['interventions_description'].str.contains(oral1, case = False, na = False) == True, 'Oral2'] = "Oral"
my_intervention_methods2.loc[my_intervention_methods2['interventions_description'].str.lower().str.contains(oral2, case = True, na = False) == True, 'Oral2'] = "Oral"  # str.lower() --> case = True

# Topical
my_intervention_methods2.loc[my_intervention_methods2['interventions_description'].str.contains(topical1, case = False, na = False) == True, 'Topical2'] = "Topical"

my_intervention_methods2

In [None]:
my_intervention_methods = my_intervention_methods.merge(my_intervention_methods2[['nct_id', 'interventions_description', 'Injection2', 'Oral2', 'Topical2']], on = "nct_id", how = "left")
my_intervention_methods = my_intervention_methods.loc[my_intervention_methods["Injection2"].notnull() | my_intervention_methods["Oral2"].notnull() | my_intervention_methods["Topical2"].notnull() | my_intervention_methods["Injection"].notnull() | my_intervention_methods["Oral"].notnull() | my_intervention_methods["Topical"].notnull()]
my_intervention_methods

In [None]:
my_intervention_methods.loc[my_intervention_methods["Injection"].isnull() &  my_intervention_methods["Injection2"].notnull(), "Injection"] = my_intervention_methods["Injection2"]
my_intervention_methods.loc[my_intervention_methods["Oral"].isnull() &  my_intervention_methods["Oral2"].notnull(), "Oral"] = my_intervention_methods["Oral2"]
my_intervention_methods.loc[my_intervention_methods["Topical"].isnull() &  my_intervention_methods["Topical2"].notnull(), "Topical"] = my_intervention_methods["Topical2"]

my_intervention_methods.loc[my_intervention_methods["Injection"] ==  my_intervention_methods["Injection2"], "Injection2"] = np.nan
my_intervention_methods.loc[my_intervention_methods["Oral"] ==  my_intervention_methods["Oral2"], "Oral2"] = np.nan
my_intervention_methods.loc[my_intervention_methods["Topical"] ==  my_intervention_methods["Topical2"], "Topical2"] = np.nan

my_intervention_methods = my_intervention_methods.drop_duplicates()
my_intervention_methods

In [None]:
my_intervention_methods = my_intervention_methods.drop(columns = ['interventions_description', 'description'], axis = 1)

my_intervention_methods["Interv_method"] = my_intervention_methods[["Injection", "Oral", "Topical", "Injection2", "Oral2", "Topical2"]].apply(lambda row: [val for val in row if pd.notna(val) and val != "None"], axis=1)
my_intervention_methods = my_intervention_methods.groupby("nct_id")["Interv_method"].agg(lambda x: sorted(set(sum(x, [])))).reset_index()
my_intervention_methods = my_intervention_methods[['nct_id', 'Interv_method']]
my_intervention_methods

## my_adverse

### my_adverse

In [None]:
create_table_sql = """
DROP TABLE IF EXISTS ctgov.my_adverse;

CREATE TABLE ctgov.my_adverse AS
SELECT DISTINCT ON 
(
  studies.nct_id, 
	reported_event_totals.event_type,
	reported_event_totals.subjects_affected
  )
  studies.nct_id, 
	reported_event_totals.event_type,
	reported_event_totals.subjects_affected as event_count
	
FROM ctgov.studies
LEFT JOIN ctgov.reported_event_totals
  ON studies.nct_id = reported_event_totals.nct_id

WHERE TO_CHAR(studies.start_date, 'YYYY-MM') >= '2011-01'
  AND studies.study_type = 'INTERVENTIONAL'
  AND studies.overall_status IN ('COMPLETED', 'WITHDRAWN', 'TERMINATED')

ORDER BY 
	studies.nct_id, 
    reported_event_totals.event_type;
"""

with engine.connect() as conn:
    conn.execute(text(create_table_sql))
    conn.commit()

# drop_withdrawals.reason as withrawals_reason,
# drop_withdrawals.count as withrawals_count,
#	reported_event_totals.classification,
#	reported_event_totals.classification as event_classification,

# LEFT JOIN ctgov.drop_withdrawals
# ON studies.nct_id = drop_withdrawals.nct_id

my_adverse = pd.read_sql("select * from ctgov.my_adverse;", con = engine)
my_adverse = my_adverse.replace({'other':'Other', 'serious':'Serious', 'deaths': 'Death'})  # Do not replace [] --> event_count is needed as 0
my_adverse.loc[my_adverse["event_count"] == 0.0, ["event_type", "event_count"]] = np.nan    # nan = []
my_adverse = my_adverse.groupby("nct_id").agg({
    "event_type": lambda x: sorted(set(i for i in x if pd.notnull(i))),
    "event_count": lambda x: sorted(set(i for i in x if pd.notnull(i))),
}).reset_index()
my_adverse["event_sum"] = my_adverse["event_count"].apply(lambda x: sum(i for i in x if pd.notnull(i)))
my_adverse

Unnamed: 0,nct_id,event_type,event_count,event_sum
0,NCT00125528,"[Death, Other]","[3.0, 5.0]",8.0
1,NCT00170209,[],[],0.0
2,NCT00183482,[],[],0.0
3,NCT00194714,"[Death, Other, Serious]","[3.0, 11.0, 21.0]",35.0
4,NCT00258791,[],[],0.0
...,...,...,...,...
182231,NCT06875349,[],[],0.0
182232,NCT06875557,[],[],0.0
182233,NCT06875661,[],[],0.0
182234,NCT06875674,[],[],0.0


### my_adverse_system

In [None]:
create_table_sql = """
DROP TABLE IF EXISTS ctgov.my_adverse_system;

CREATE TABLE ctgov.my_adverse_system AS
SELECT DISTINCT ON
(
studies.nct_id,
reported_events.organ_system
)
studies.nct_id,
reported_events.organ_system

FROM ctgov.reported_events
LEFT JOIN ctgov.studies
    ON studies.nct_id = reported_events.nct_id

WHERE TO_CHAR(studies.start_date, 'YYYY-MM') >= '2011-01'
  AND studies.study_type = 'INTERVENTIONAL'
  AND studies.overall_status in ('COMPLETED','WITHDRAWN', 'TERMINATED')
ORDER BY studies.nct_id;
"""

with engine.connect() as conn:
    conn.execute(text(create_table_sql))
    conn.commit()

my_adverse_system = pd.read_sql("select * from ctgov.my_adverse_system;", con = engine)
# display(my_adverse_system['organ_system'].unique())
my_adverse_system['organ_system'] = my_adverse_system['organ_system'].replace({
      'General disorders' : 'General', 
      'Nervous system disorders' : 'Nervous System',
      'Skin and subcutaneous tissue disorders' : 'Skin, Subcutaneous',
      'Blood and lymphatic system disorders' : 'Blood, Lymphatic', 
      'Cardiac disorders' : 'Cardiac',
      'Eye disorders' : 'Eye', 
      'Gastrointestinal disorders' : 'Gastrointestinal',
      'Infections and infestations' : 'Infection',
      'Injury, poisoning and procedural complications' : 'Injury, Poisoning, Procedural', 
      # 'Investigations',
      'Metabolism and nutrition disorders' : 'Metabolism, Nutrition',
      'Musculoskeletal and connective tissue disorders' : 'Musculoskeletal',
      'Psychiatric disorders' : 'Psychiatric', 
      'Renal and urinary disorders' : 'Renal, Urinary',
      'Reproductive system and breast disorders' : 'Reproductive, Breast',
      'Respiratory, thoracic and mediastinal disorders' : 'Respiratory',
      'Surgical and medical procedures' : 'Surgical, Medical', 
      'Vascular disorders': 'Vascular',
      'Immune system disorders' : 'Immune System', 
      'Hepatobiliary disorders' : 'Hepatobiliary',
      'Ear and labyrinth disorders' : 'Ear', 
      'Endocrine disorders' : 'Endocrine',
      'Neoplasms benign, malignant and unspecified (incl cysts and polyps)' : 'Neoplasms',
      # 'Product Issues', 
      'Congenital, familial and genetic disorders' : 'Congenital, Genetic',
      'Pregnancy, puerperium and perinatal conditions' : 'Pregnancy, Puerperium, Perinatal',
      'Social circumstances' : 'Social'})
my_adverse_system["organ_system"] = my_adverse_system["organ_system"].replace(r"\s*disorders$", "", regex=True)
my_adverse_system = my_adverse_system.groupby("nct_id").agg({"organ_system": lambda x: sorted(set(i for i in x if pd.notnull(i)))}).reset_index()
my_adverse_system

## my_designs

In [None]:
create_table_sql = """
DROP TABLE IF EXISTS ctgov.my_designs;

CREATE TABLE ctgov.my_designs AS
SELECT DISTINCT ON 
(
studies.nct_id,
designs.allocation,
designs.intervention_model,
designs.primary_purpose,
designs.masking,
designs.subject_masked,
designs.caregiver_masked,
designs.investigator_masked,
designs.outcomes_assessor_masked
)

studies.nct_id,
designs.allocation,
designs.intervention_model,
designs.primary_purpose,
designs.masking,
designs.subject_masked,
designs.caregiver_masked,
designs.investigator_masked,
designs.outcomes_assessor_masked

FROM ctgov.studies
LEFT JOIN ctgov.designs
  ON studies.nct_id = designs.nct_id
WHERE TO_CHAR(studies.start_date, 'YYYY-MM') >= '2011-01'
  AND studies.study_type = 'INTERVENTIONAL'
  AND studies.overall_status IN ('COMPLETED', 'WITHDRAWN', 'TERMINATED')
ORDER BY studies.nct_id;

"""

with engine.connect() as conn:
    conn.execute(text(create_table_sql))
    conn.commit()


my_designs = pd.read_sql("select * from ctgov.my_designs;", con = engine)

my_designs["subject_masked"] = my_designs["subject_masked"].replace({True : "PARTICIPANT", False : np.NaN})
my_designs["caregiver_masked"] = my_designs["caregiver_masked"].replace({True : "CARE_PROVIDER", False : np.NaN})
my_designs["investigator_masked"] = my_designs["investigator_masked"].replace({True : "INVESTIGATOR", False : np.NaN})
my_designs["outcomes_assessor_masked"] = my_designs["outcomes_assessor_masked"].replace({True : "OUTCOMES_ASSESSOR", False : np.NaN})

my_designs["masking_detail"] = my_designs[["subject_masked", "caregiver_masked", "investigator_masked", "outcomes_assessor_masked"]].apply(
    lambda row: [val for val in row if pd.notna(val) and val != "None"], axis=1)
my_designs['masking_detail'] = my_designs['masking_detail'].apply(lambda x: list(set(sorted(x))))
my_designs

## my_eligibilities

In [None]:
create_table_sql = """
DROP TABLE IF EXISTS ctgov.my_eligibilities;

CREATE TABLE ctgov.my_eligibilities AS
SELECT DISTINCT ON 
(
studies.nct_id, 
eligibilities.gender, 
eligibilities.healthy_volunteers
) 
studies.nct_id, 
eligibilities.gender, 
eligibilities.healthy_volunteers
                               
FROM ctgov.studies
LEFT JOIN ctgov.eligibilities
	ON studies.nct_id = eligibilities.nct_id
WHERE TO_CHAR(studies.start_date, 'YYYY-MM') >= '2011-01'
  AND studies.study_type = 'INTERVENTIONAL'
  AND studies.overall_status in ('COMPLETED', 'WITHDRAWN', 'TERMINATED')
ORDER BY studies.nct_id;
"""
# eligibilities.minimum_age, 
# eligibilities.maximum_age, 

with engine.connect() as conn:
    conn.execute(text(create_table_sql))
    conn.commit()

    
my_eligibilities = pd.read_sql("select * from ctgov.my_eligibilities;", con = engine)
my_eligibilities["healthy_volunteers"] = my_eligibilities["healthy_volunteers"].replace({True : "Healthy", False : "Condition", None : np.NaN})
my_eligibilities

## my_outcomes

In [None]:
create_table_sql ="""
DROP TABLE IF EXISTS ctgov.my_outcomes;

CREATE TABLE ctgov.my_outcomes AS
SELECT DISTINCT ON 
( 
    studies.nct_id, 
    outcomes.outcome_type 
)
    studies.nct_id, 
    outcomes.outcome_type 

FROM ctgov.studies

LEFT JOIN ctgov.outcomes
  ON studies.nct_id = outcomes.nct_id

WHERE TO_CHAR(studies.start_date, 'YYYY-MM') >= '2011-01'
  AND studies.study_type = 'INTERVENTIONAL'
  AND studies.overall_status in ('COMPLETED', 'WITHDRAWN', 'TERMINATED')

ORDER BY 
    studies.nct_id, 
    outcomes.outcome_type;

"""

with engine.connect() as conn:
    conn.execute(text(create_table_sql))
    conn.commit()
   
my_outcomes = pd.read_sql("select * from ctgov.my_outcomes;", con = engine)
my_outcomes = my_outcomes.groupby("nct_id")["outcome_type"].agg(lambda x: sorted(set(i for i in x if i is not None))).reset_index()
my_outcomes

## my_locations

In [None]:
create_table_sql ="""
DROP TABLE IF EXISTS ctgov.my_locations;

CREATE TABLE ctgov.my_locations AS
SELECT DISTINCT ON (
    studies.nct_id,
    countries.name,
    facilities.country,
    facilities.state,
    facilities.city
)
    studies.nct_id,
    countries.name AS countries_country,
    facilities.country AS facilities_country,
    facilities.state AS facilities_state,
    facilities.city AS facilities_city
    
FROM ctgov.studies 
LEFT JOIN ctgov.facilities
    ON studies.nct_id = facilities.nct_id
LEFT JOIN ctgov.countries
    ON studies.nct_id = countries.nct_id
WHERE TO_CHAR(studies.start_date, 'YYYY-MM') >= '2011-01'
  AND studies.study_type = 'INTERVENTIONAL'
  AND studies.overall_status IN ('COMPLETED', 'WITHDRAWN', 'TERMINATED')
ORDER BY 
    studies.nct_id, 
    countries.name, 
    facilities.country, 
    facilities.state, 
    facilities.city;

"""

with engine.connect() as conn:
    conn.execute(text(create_table_sql))
    conn.commit()

   
my_locations = pd.read_sql("select * from ctgov.my_locations;", con = engine)
my_locations["facilities_country"] = my_locations["facilities_country"].replace('Korea, Republic of', 'South Korea')  # replace Korea with South korea, as North Korea is isolated and not posting to worldwide sites.

my_locations = my_locations.groupby("nct_id").agg({
    "countries_country": lambda x: sorted(set(i for i in x if pd.notnull(i))),
    "facilities_country": lambda x: sorted(set(i for i in x if pd.notnull(i))),
    "facilities_state": lambda x: sorted(set(i for i in x if pd.notnull(i))),
    "facilities_city": lambda x: sorted(set(i for i in x if pd.notnull(i)))
}).reset_index()

my_locations["Country_Counts"] = my_locations["facilities_country"].apply(lambda x: len(x))
my_locations["State_Counts"] = my_locations["facilities_state"].apply(lambda x: len(x))
my_locations["City_Counts"] = my_locations["facilities_city"].apply(lambda x: len(x))

my_locations

Unnamed: 0,nct_id,countries_country,facilities_country,facilities_state,facilities_city,Country_Counts,State_Counts,City_Counts
0,NCT00125528,[United States],[United States],[Illinois],[Chicago],1,1,1
1,NCT00170209,"[Australia, Benin, Brazil, Canada, Ghana, Guin...","[Australia, Benin, Brazil, Canada, Ghana, Guin...","[Africa, Alberta, British Columbia, New South ...","[Bandung, Conakry, Cotonou, Edmonton, Kumasi, ...",7,6,9
2,NCT00183482,[United States],[United States],[Tennessee],[Nashville],1,1,1
3,NCT00194714,[United States],[United States],[Washington],[Seattle],1,1,1
4,NCT00258791,[Norway],[],[],[],0,0,0
...,...,...,...,...,...,...,...,...
182231,NCT06875349,[Turkey],[Turkey],[],[Trabzon],1,0,1
182232,NCT06875557,[Pakistan],[Pakistan],[Punjab],[Rawalpindi],1,1,1
182233,NCT06875661,[Serbia],[Serbia],[],[Novi Sad],1,0,1
182234,NCT06875674,[Italy],[Italy],[PA],[Palermo],1,1,1


## my_documents

In [None]:
create_table_sql ="""
DROP TABLE IF EXISTS ctgov.my_documents;

CREATE TABLE ctgov.my_documents AS
SELECT DISTINCT ON 
( 
studies.nct_id, 
provided_documents.has_protocol, 
provided_documents.has_icf, 
provided_documents.has_sap
)
studies.nct_id, 
provided_documents.has_protocol, 
provided_documents.has_icf, 
provided_documents.has_sap 
FROM ctgov.studies
LEFT JOIN ctgov.provided_documents
  ON studies.nct_id = provided_documents.nct_id
WHERE TO_CHAR(studies.start_date, 'YYYY-MM') >= '2011-01'
  AND studies.study_type = 'INTERVENTIONAL'
  AND studies.overall_status in ('COMPLETED', 'WITHDRAWN', 'TERMINATED')
ORDER BY studies.nct_id;

"""

with engine.connect() as conn:
    conn.execute(text(create_table_sql))
    conn.commit()
    
my_documents = pd.read_sql("select * from ctgov.my_documents;", con = engine)

my_documents["has_protocol"] = my_documents["has_protocol"].replace({True : "Protocol", False : np.NaN})
my_documents["has_icf"] = my_documents["has_icf"].replace({True : "Consent Form", False : np.NaN})
my_documents["has_sap"] = my_documents["has_sap"].replace({True : "Analysis Plan", False : np.NaN})

my_documents = my_documents.groupby("nct_id", as_index=False).agg({
    'has_protocol': 'first',
    'has_icf': 'first',
    'has_sap': 'first'
}).reset_index()

my_documents["Study_Documents"] = my_documents[["has_protocol", "has_icf", "has_sap"]].apply(
    lambda row: [val for val in row if pd.notna(val) and val != "None"],
    axis=1)
my_documents['Study_Documents'] = my_documents['Study_Documents'].apply(lambda x: list(set(sorted(x))))
my_documents

# 2) Save Dfs

In [6]:
my_studies.to_pickle("my_studies.pkl")
my_terminations.to_pickle("my_terminations.pkl")  
my_terminations2.to_pickle("my_terminations2.pkl")  
my_conditions.to_pickle("my_conditions.pkl")  
my_covid.to_pickle("my_covid.pkl")  
my_placebo.to_pickle("my_placebo.pkl")  
my_interventions.to_pickle("my_interventions.pkl")  
my_interventions_types.to_pickle("my_interventions_types.pkl")  
my_soc.to_pickle("my_soc.pkl")  
my_intervention_methods2.to_pickle("my_intervention_methods2.pkl")  
my_intervention_methods.to_pickle("my_intervention_methods.pkl")  
my_adverse.to_pickle("my_adverse.pkl")  
my_adverse_system.to_pickle("my_adverse_system.pkl")  
my_designs.to_pickle("my_designs.pkl")  
my_eligibilities.to_pickle("my_eligibilities.pkl")  
my_outcomes.to_pickle("my_outcomes.pkl")  
my_locations.to_pickle("my_locations.pkl")  
my_documents.to_pickle("my_documents.pkl")  

NameError: name 'my_studies' is not defined