In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

In [62]:
hemonc_data = pd.read_csv("hemonc_joined.csv")
hemonc_data.head()

Unnamed: 0,component,regulator,date,condition,accelerated,withdrawn,first_in_class,note,context,stage_or_status,risk_stratification,demographics,prior_therapy,prior_therapy_negation,prior_therapy_setting,response_contingency,time_contingency,prior_biomarker,with,biomarker,biomarker_negation,study_yn,study,string,date_added.x,drug_CUI,drug_INN,investigational,main_class,class_type,CanMED_major_class,CanMED_major_class_CUI,CanMED_minor_class,CanMED_minor_class_CUI,date_added.y
0,Abarelix,FDA,2003-11-25,Prostate cancer,False,False,True,,,Symptomatic,,men,,,,,,,,,,True,Koch et al. 2003,2003-11-25: Approved for palliative treatment ...,2022-09-05,3652,,False,GnRH antagonist,mechanistic,,,,,2019-08-29
1,Abciximab,FDA,1994-12-22,NONE,False,False,True,No linked condition,,,,,,,,,,,,,,,,1994-12-22: Initial approval (label not availa...,2022-09-05,3,,False,Anti-GPIIb-IIIa antibody,mechanistic,,,,,2019-05-27
2,Abemaciclib,FDA,2017-09-28,Breast cancer,False,False,False,,,Metastatic,,women and men,Antiestrogen AND Chemotherapy,False,Metastatic,,,,,HR+ and HER2-,FALSE AND TRUE,True,MONARCH 1,2017-09-28: FDA approved as monotherapy for wo...,2022-09-05,4,abemaciclib,False,CDK4/6 inhibitor,mechanistic,CDK inhibitor,44963.0,CDK4 inhibitor|CDK6 inhibitor,32998|32999,2019-05-27
3,Abemaciclib,FDA,2017-09-28,Breast cancer,False,False,False,,,Advanced OR Metastatic,,women,Antiestrogen,False,,,,,Fulvestrant,HR+ and HER2-,FALSE AND TRUE,True,MONARCH 2,2017-09-28: FDA approved in combination with <...,2022-09-05,4,abemaciclib,False,CDK4/6 inhibitor,mechanistic,CDK inhibitor,44963.0,CDK4 inhibitor|CDK6 inhibitor,32998|32999,2019-05-27
4,Abemaciclib,FDA,2018-02-26,Breast cancer,False,False,False,,,Advanced OR Metastatic,,postmenopausal women,,,,,,,Aromatase inhibitor,HR+ and HER2-,FALSE AND TRUE,True,MONARCH 3,2018-02-26: FDA approved in combination with a...,2022-09-05,4,abemaciclib,False,CDK4/6 inhibitor,mechanistic,CDK inhibitor,44963.0,CDK4 inhibitor|CDK6 inhibitor,32998|32999,2019-05-27


## Drug approval dates


In [75]:
# Cols of interest
cols = [
    "component",
    "regulator",
    "date",
    "condition",
    "withdrawn",
    "drug_CUI",
    "with",
    "string",
]
approv_hem_data = hemonc_data[cols]

# check for missing / null values in year and in condition
approv_hem_data = approv_hem_data[approv_hem_data["condition"] != "NONE"]
approv_hem_data = approv_hem_data[approv_hem_data["regulator"] == "FDA"]

# Convert 'date' to datetime year month day and withdraw to boolean
approv_hem_data["date"] = pd.to_datetime(approv_hem_data["date"], errors="coerce")
approv_hem_data["withdrawn"] = approv_hem_data["withdrawn"].astype(bool)

# create a new column 'approval_year' and set it to the year of the date if withdrawn is false
approv_hem_data["approval_year"] = np.where(
    ~approv_hem_data["withdrawn"], approv_hem_data["date"].dt.year, np.nan
)

# if withdrawn is true then create a new column 'withdrawn_year' and set it to the year of the date
approv_hem_data["withdrawal_year"] = np.where(
    approv_hem_data["withdrawn"], approv_hem_data["date"].dt.year, np.nan
)

# Convert 'active_years' to a list of years
approv_hem_data["active_years"] = [
    (
        list(range(int(approval), int(withdrawal) + 1))
        if not np.isnan(withdrawal) and not np.isnan(approval)
        else list(range(int(approval), 2025)) if not np.isnan(approval) else np.nan
    )
    for approval, withdrawal in zip(
        approv_hem_data["approval_year"], approv_hem_data["withdrawal_year"]
    )
]

approv_hem_data.head()

Unnamed: 0,component,regulator,date,condition,withdrawn,drug_CUI,with,string,approval_year,withdrawal_year,active_years
0,Abarelix,FDA,2003-11-25,Prostate cancer,False,3652,,2003-11-25: Approved for palliative treatment ...,2003.0,,"[2003, 2004, 2005, 2006, 2007, 2008, 2009, 201..."
2,Abemaciclib,FDA,2017-09-28,Breast cancer,False,4,,2017-09-28: FDA approved as monotherapy for wo...,2017.0,,"[2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]"
3,Abemaciclib,FDA,2017-09-28,Breast cancer,False,4,Fulvestrant,2017-09-28: FDA approved in combination with <...,2017.0,,"[2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]"
4,Abemaciclib,FDA,2018-02-26,Breast cancer,False,4,Aromatase inhibitor,2018-02-26: FDA approved in combination with a...,2018.0,,"[2018, 2019, 2020, 2021, 2022, 2023, 2024]"
11,Abemaciclib,FDA,2021-10-12,Breast cancer,False,4,Aromatase inhibitor OR Tamoxifen,2021-10-12: Approved with endocrine therapy (t...,2021.0,,"[2021, 2022, 2023, 2024]"


In [76]:
# confirm there is only one row for each drug + with combination
approv_hem_data[
    approv_hem_data.duplicated(subset=["drug_CUI", "with"], keep=False)
].sort_values(by=["drug_CUI", "with"])

Unnamed: 0,component,regulator,date,condition,withdrawn,drug_CUI,with,string,approval_year,withdrawal_year,active_years
551,Capivasertib,FDA,2023-11-16,Breast cancer,False,2,Fulvestrant,2023-11-16: Approved with fulvestrant for adul...,2023.0,,"[2023, 2024]"
552,Capivasertib,FDA,2023-11-16,Breast cancer,False,2,Fulvestrant,2023-11-16: Approved with fulvestrant for adul...,2023.0,,"[2023, 2024]"
16,Abiraterone,FDA,2011-04-28,Prostate cancer,False,6,,"2011-04-28: <a target=""_blank"" rel=""noreferrer...",2011.0,,"[2011, 2012, 2013, 2014, 2015, 2016, 2017, 201..."
19,Abiraterone,FDA,2012-12-10,Prostate cancer,False,6,,"2012-12-10: <a target=""_blank"" rel=""noreferrer...",2012.0,,"[2012, 2013, 2014, 2015, 2016, 2017, 2018, 201..."
27,Acalabrutinib,FDA,2017-10-31,Mantle cell lymphoma,False,7,,2017-10-31: Granted accelerated approval for t...,2017.0,,"[2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]"
...,...,...,...,...,...,...,...,...,...,...,...
2226,Pertuzumab and Trastuzumab hyaluronidase,FDA,2020-06-29,Breast cancer,False,111391,Chemotherapy,2020-06-29: Initial approval in combination wi...,2020.0,,"[2020, 2021, 2022, 2023, 2024]"
2219,Pertuzumab and Trastuzumab hyaluronidase,FDA,2020-06-29,Breast cancer,False,111391,Docetaxel,2020-06-29: Initial approval in combination wi...,2020.0,,"[2020, 2021, 2022, 2023, 2024]"
2220,Pertuzumab and Trastuzumab hyaluronidase,FDA,2020-06-29,Breast cancer,False,111391,Docetaxel,2020-06-29: Initial approval in combination wi...,2020.0,,"[2020, 2021, 2022, 2023, 2024]"
1393,Iobenguane I 131,FDA,2018-07-30,Paraganglioma,False,115124,,2018-07-30: Initial approval for the treatment...,2018.0,,"[2018, 2019, 2020, 2021, 2022, 2023, 2024]"


In [77]:
# check duplicate entries in drug approval for differences
# some are due to combination therapy rows-> how want to handle active years for these?
# need to go through string column to see differences -> ? in other metadata columns

## Trial Status dates


In [66]:
hemonc_data.columns

Index(['component', 'regulator', 'date', 'condition', 'accelerated', 'withdrawn', 'first_in_class', 'note', 'context', 'stage_or_status', 'risk_stratification', 'demographics', 'prior_therapy', 'prior_therapy_negation', 'prior_therapy_setting', 'response_contingency', 'time_contingency', 'prior_biomarker', 'with', 'biomarker', 'biomarker_negation', 'study_yn', 'study', 'string', 'date_added.x', 'drug_CUI', 'drug_INN', 'investigational', 'main_class', 'class_type', 'CanMED_major_class', 'CanMED_major_class_CUI', 'CanMED_minor_class', 'CanMED_minor_class_CUI', 'date_added.y'], dtype='object')

In [71]:
# Cols of interest
cols = [
    "component",
    "condition",
    "study_yn",
    "study",
    "string",
    "drug_CUI",
]
trial_hem_data = hemonc_data[cols]

# Extract the year from the 'string' column
trial_hem_data["trial_year"] = trial_hem_data["string"].str.extract("(\d{4})")

# check for missing / null values in year and in condition
trial_hem_data = trial_hem_data[trial_hem_data["condition"] != "nan"]

trial_hem_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trial_hem_data["trial_year"] = trial_hem_data["string"].str.extract("(\d{4})")


Unnamed: 0,component,condition,study_yn,study,string,drug_CUI,trial_year
0,Abarelix,Prostate cancer,True,Koch et al. 2003,2003-11-25: Approved for palliative treatment ...,3652,2003
1,Abciximab,NONE,,,1994-12-22: Initial approval (label not availa...,3,1994
2,Abemaciclib,Breast cancer,True,MONARCH 1,2017-09-28: FDA approved as monotherapy for wo...,4,2017
3,Abemaciclib,Breast cancer,True,MONARCH 2,2017-09-28: FDA approved in combination with <...,4,2017
4,Abemaciclib,Breast cancer,True,MONARCH 3,2018-02-26: FDA approved in combination with a...,4,2018


In [None]:
# check what is in string- why differences