In [1]:
import pandas as pd
import numpy as np
import os
import pyarrow as pa
import pyarrow.parquet as pq

# Lift pandas' display limits (None = no limit) so wide/long frames are shown in full.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# Set the max column width to None as well, to view the full text of each cell
pd.set_option("display.max_colwidth", None)

In [2]:
# Relative path of the directory that holds the "<transformation>.parquet" files read by this notebook.
df_dir = "../../data/coral/post_edit/"

In [3]:
def load_dataset(pre_filtered_df_dir, transformation):
    """Read ``<pre_filtered_df_dir>/<transformation>.parquet`` into a DataFrame.

    Args:
        pre_filtered_df_dir: Directory expected to contain the parquet file.
        transformation: File stem of the parquet file (".parquet" is appended).

    Returns:
        The DataFrame read from the file, or ``None`` (after printing a
        notice) when no file exists at the computed path.
    """
    dataset_path = os.path.join(pre_filtered_df_dir, f"{transformation}.parquet")

    # Happy path first: hand the file straight to pandas when it is there.
    if os.path.exists(dataset_path):
        return pd.read_parquet(dataset_path)

    print(f"Dataset not found at path: {dataset_path}")
    return None


def merge_datasets_with_transformation(pre_filtered_df_dir):
    """Load every transformation's dataset and stack them into one DataFrame.

    Each dataset that loads successfully gets two extra columns before
    stacking: ``transformation_type`` (the transformation name) and
    ``question_index`` (the row's 0-based position within its own file).
    Datasets for which ``load_dataset`` returns ``None`` are skipped.

    Args:
        pre_filtered_df_dir: Directory passed through to ``load_dataset``.

    Returns:
        The concatenation of all loaded datasets with a fresh 0..n-1 index,
        or an empty, column-less DataFrame when nothing could be loaded.
    """
    transformations = ["original", "brand_to_generic", "generic_to_brand"]

    # Collect the frames and concatenate once at the end; concatenating
    # inside the loop re-copies all accumulated rows on every iteration.
    frames = []
    for transformation in transformations:
        df = load_dataset(pre_filtered_df_dir, transformation)
        if df is None:
            continue
        # assign() returns a new frame, so the loaded frame is not mutated.
        frames.append(
            df.assign(
                transformation_type=transformation,
                question_index=range(len(df)),
            )
        )

    if not frames:
        # pd.concat raises on an empty list; keep the "empty frame" contract.
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)


merged_df = merge_datasets_with_transformation(df_dir)

if not merged_df.empty:
    # Sort only once we know rows were loaded: when no dataset is found the
    # merge returns a column-less frame, and sorting it by "coral_idx" would
    # raise KeyError before the failure message below could be printed.
    merged_df = merged_df.sort_values(by=["coral_idx"])
    print("\nMerged Dataset")
    print(f"Contains {len(merged_df)} rows.")
else:
    print("Failed to load and merge datasets for inspection.")


Merged Dataset
Contains 555 rows.


In [4]:
# Peek at the first row of the merged frame.
# NOTE(review): the rendered output includes the full note_text of that row;
# clear cell outputs before sharing or committing this notebook.
merged_df.head(1)

Unnamed: 0,coral_idx,Sex,UCSFDerivedRaceEthnicity_X,BirthDate,note_text,type,local_id,found_keywords,transformation_type,question_index
257,0,Female,Native Hawaiian or Other Pacific Islander,1964-03-25,"Medical Oncology Consult Note Patient Name: ***** ***** Patient MRN: ***** Patient DOB: 11/23/1963 Date of Visit: 12/30/2019 Provider: ***** ***** ***** Primary Care Provider: None Per Patient Provider Referring MD: Reason for visit: Chief Complaint Patient presents with  New Patient Evaluation Diagnosis: 1. Malignant neoplasm of overlapping sites of right breast in female, estrogen receptor positive (CMS code) NM Whole Body Bone Scan MR Brain with and without Contrast Complete Blood Count with Differential Comprehensive Metabolic Panel (BMP, AST, ALT, T.BILI, ALKP, TP, ALB) Cancer Antigen 15-3 Carcinoembryonic Antigen Activated Partial Thromboplastin Time Prothrombin Time Ambulatory Referral to Integrative Medicine History of Present Illness: 56 year old female diagnosed in May 2013 with a multifocal Stage IIA right breast cancer. She had a mastectomy with sentinel node and implant reconstruction in June 2013 and had 2.3 and 2.4 cm tumors found with a negative sentinel node. The tumors were G2, ER and PR positive and her 2 neu negative. She did not have radiation or chemotherapy and declined tamoxifen. She has had ***** but no other imaging since her diagnosis for screening and was treated at *****. She reports feeling discomfort and pressure in her waist and back for a few months and nausea and burping for a week. She went to the ER and had a CT scan which shows widely metastatic cancer. She has involvement of the lungs, peritoneum, liver and ovary with a local recurrence near the right axilla and implant. She takes a long list of supplements which she will send to me. She wants to have natural therapies. She is very scared and appears anxious. Her husband is here and very supportive. We discussed how we need to get a biopsy to evaluate the phenotype of the cancer and she is agreeable. I will set her up to see Dr. \n ***** this week for a biopsy of the right axilla in the office. 
We discussed how treatment would be palliative but that treatment could improve her quality of life as well as length of life. She asked me multiple times if this can be cured. I will refer her to the ***** center. I also ordered a MRI of brain and bone scan as well as labs to complete her work up. Oncologic history Patient Active Problem List Diagnosis Date Noted  Malignant neoplasm of overlapping sites of right breast in female, estrogen receptor positive (CMS code) 01/02/2020  Mass of right chest wall 01/01/2020 Stage at *****: Cancer Staging Malignant neoplasm of overlapping sites of right breast in female, estrogen receptor positive (CMS code) Staging form: Breast, AJCC 8th Edition - Pathologic stage from 07/04/2013: Stage ***** (pT2, *****(sn), *****, G2, ER+, PR+, *****-) - Signed by ***** ***** *****, MD on 01/02/2020 Medications: No current outpatient medications on file. No current facility-administered medications for this visit. Allergies: Allergies/Contraindications Allergen Reactions  Penicillins Rash Rash Rash Medical History: Past Medical History: Diagnosis Date  Anxiety  Breast cancer (CMS code)  Hyperlipidemia Surgical History: Past Surgical History: Procedure Laterality Date  CYST REMOVAL neck  MASTECTOMY Right with sentinel node and implant placement. 
 MYOMECTOMY Social History: Social History Tobacco Use  Smoking status: Never Smoker Substance and Sexual Activity  Alcohol use: Yes Comment: rare  Drug use: Never  Sexual activity: Yes Partners: Male *****: ***** ***** Status: married ***** husband Gynecologic History: G 3 P 1 AB 2 Menarche 12 Menopause 52 OCP 0 years HRT 0 years Fertility treatments 1-2 years of IVF Family History: Family \n History Problem Relation Name Age of Onset  Lung cancer Mother  Lung cancer Father  Esophageal cancer Maternal Uncle Review of systems: CONSTITUTIONAL: No fevers, chills, night sweats, loss of appetite or weight loss H/E/N/T: No headaches, vision changes, mouth sores CV: No *****, orthopnea PULM: No cough, hemoptysis or sob. ABD: No abdominal pain, nausea, vomiting, diarrhea, or bleeding GU: No dysuria or hematuria SKIN: No rash or bruising NEURO: No difficulty with speech, balance, or gait, no headache MUSC: No joint pain PSYCH: No depression, or anxiety or trouble sleeping. Pain Pain Score/Location 12/30/19 1611 *****: 0 Code status: Advance Care Planning Full code. Performance status: 0 - Asymptomatic Physical exam: BP 117/74 | Pulse 96 | Temp 36 C (96.8 F) | Resp 20 | Ht 154.1 cm (5' 0.67"") Comment: 12/30/2019@***** | Wt 63.5 kg (140 lb 1.6 oz) | SpO2 99% | BMI 26.76 kg/m GEN: No acute distress, awake and alert, appears stated age HEENT: No oral lesions Neck supple no lymphadenopathy Cardiac: Regular rate, rhythm no murmur Lungs: clear to ascultation Abdomen: Soft, non tender, non distended, she has hepatomegaly and omental masses that are palpable in the upper and mid abdomen. Non tender. Extremity: No edema Lymph: no peripheral adenopathy Neuro: *****-12 grossly intact, normal UE and LE strength. Skin: No rash Breast left breast without mass or skin changes, right mastectomy site with mass in the axilla felt 3 cm in size. Labs: No visits with results within 1 Month(s) from this visit. 
Latest known visit with results is: Hospital Outpatient Visit on 12/11/2001 Component Date Value Ref Range Status  Human Chorionic Gonadotropin for P***** 12/11/2001 <2 IU/L Final IMPRESSION: CT chest, abdomen and pelvis 12/24/2019 1. Widespread metastases consistent with metastatic \n breast carcinoma. 2. Right axillary adenopathy with 1.8 cm spiculated mass along superolateral margin of right breast implant, suspicious for regional recurrence. 3. Multiple small pulmonary nodules, right greater than left, likely metastatic. 4. Large anterior right and left lobe hepatic masses likely metastatic. 5. Peritoneal carcinomatosis with omental *****, nodularity, and small possibly malignant effusion. 6. Mildly enlarged and hyperdense ovaries which could reflect breast metastases to ovaries. 7. Appendix distended with fluid without associated inflammatory wall thickening. A mucocele of the appendix would be a consideration. 8. No CT evidence of osseous metastasis. Psychologic/emotional well-being/support She is distressed. She has good support from her husband who is here today. Assessment / Plan: 1. Stage II right breast cancer s/p right mastectomy with sentinel node in June 2013 ( 2.4 and 2.3 cm tumors, node negative ER and PR positive and her 2 negative.) No further adjuvant therapy given and patient declined tamoxifen. 2. Metastatic relapse of her breast cancer. She needs tissue confirmation and we will complete her staging work up. 3. Appointment with Dr. ***** on Thursday to biopsy mass in right axilla in the office. 4. RTC with me after completed work up to formulate a plan. If she is HR+/ her 2 negative on biopsy will recommend palbociclib and *****. 5. I will ask the nurse to send her information about the treatment. 6. ***** ***** Referral asap. ***** ***** *****, MD Medical Oncology/Hematology TIME SPENT: 75 minutes examining patient, reviewing chart, coordinating care, discussing with patient and/or family. 
\n",breast,0,"[ibrance, tamoxifen]",brand_to_generic,72


In [5]:
import os
import pandas as pd
from collections import Counter

# Load the "original" dataset from df_dir (df_dir must already be defined).
dataset_path = os.path.join(df_dir, "original.parquet")
original_df = pd.read_parquet(dataset_path)

# Pool the per-note keyword lists into one flat list, then tally it.
all_keywords = []
for note_keywords in original_df["found_keywords"]:
    all_keywords.extend(note_keywords)
keyword_counts = Counter(all_keywords)

# Tabulate the tallies, most frequent keyword first.
keyword_counts_df = pd.DataFrame(
    keyword_counts.items(), columns=["Keyword", "Count"]
).sort_values(by="Count", ascending=False)
print(keyword_counts_df)

# Distinct keywords seen across all notes.
print(f"Number of unique keywords: {len(keyword_counts)}")

# Every keyword occurrence is one entry of the flat list, so its length is
# the sum of all per-keyword counts.
total_keywords_count = len(all_keywords)
print(f"Total count of keywords: {total_keywords_count}")

                   Keyword  Count
4              gemcitabine     82
13              paclitaxel     50
11                abraxane     37
31               tamoxifen     34
5             capecitabine     33
34               letrozole     31
10              omeprazole     29
12                prilosec     27
2               irinotecan     26
6              oxaliplatin     24
54                   taxol     23
21                  xeloda     15
36                    dexa     15
18           dexamethasone     15
32                  femara     14
64               denosumab     13
45                arimidex     11
40             trastuzumab     11
15                decadron     11
52                 cytoxan     10
16             carboplatin     10
9               prednisone     10
46             amoxicillin     10
27          hydrocortisone     10
72              everolimus      9
47              exemestane      9
53             anastrozole      8
51                taxotere      8
82            

In [6]:
# save original df to csv for inspection, alongside the parquet files in df_dir
# (path is derived from df_dir so the two locations cannot drift apart)
original_df.to_csv(os.path.join(df_dir, "original.csv"), index=False)

In [8]:
import pandas as pd
import re


def extract_assessment_plan(note_text):
    """Return the text that follows the assessment/plan header of a note.

    The note is normalized first: surrounding whitespace is stripped, every
    run of whitespace is collapsed to a single space, and the text is
    lower-cased. The returned text is therefore lower-cased as well.

    Headers are tried in priority order (most specific first) and each must
    be followed immediately by a colon. The first header in that order that
    occurs anywhere in the note wins, so an earlier, less specific section
    such as an imaging "IMPRESSION:" no longer shadows a later
    "Assessment / Plan:" section.

    Args:
        note_text: The raw note. Non-string values (e.g. None or NaN for a
            missing note) are treated as "no match".

    Returns:
        Everything after the chosen "<header>:" up to the end of the note,
        stripped of surrounding whitespace, or ``None`` when no header is
        found or the input is not a string.
    """
    # Missing notes show up as None/NaN in a DataFrame column; they have no
    # .strip(), so bail out instead of raising AttributeError.
    if not isinstance(note_text, str):
        return None

    # Section headers to look for, highest priority first.
    phrases = [
        "ASSESSMENT & PLAN",
        "ASSESSMENT / PLAN",
        "ASSESSMENT \\ PLAN",
        "ASSESSMENT AND PLAN",
        "ASSESSMENT",
        "FINAL IMPRESSION",
        "IMPRESSION",
        "PLAN",
    ]

    # Collapse whitespace and lower-case so the header match is insensitive
    # to case and line breaks.
    note_text_normalized = re.sub(r"\s+", " ", note_text.strip()).lower()

    for phrase in phrases:
        header = phrase.lower() + ":"
        start = note_text_normalized.find(header)
        if start != -1:
            # Keep everything after the header's colon.
            return note_text_normalized[start + len(header):].strip()

    return None


# Work on a copy so `original_df` (defined in an earlier cell) stays untouched
df = original_df.copy()

# One extracted assessment/plan string (or None) per note
df["assessment_plan"] = df["note_text"].apply(extract_assessment_plan)

# Pick out the notes for which nothing was extracted and print each of them
unmatched_notes = df.loc[df["assessment_plan"].isna(), "note_text"]
for index, note_text in unmatched_notes.items():
    print(f"No match found for index {index}:")
    print(note_text)
    print()  # Blank line between notes for readability

# Optionally, display the DataFrame
# print(df[["note_text", "assessment_plan"]])

No match found for index 0:
 UCSF Cancer Center GI Medical Oncology Program  ***** ***** *****, ***** *****  ***** ***** ***** *****-*****  Phone: *****-*****-***** | Fax: *****-*****-*****     Follow-up *****  ***** : ***** *****, MD, *****      11/24/16    ***** had the pleasure of seeing ***** ***** ***** for follow up of metastatic pancreatic adenocarcinoma with peritoneal carcinomatosis at our GI Medical Oncology practice at the UCSF ***** ***** Family Comprehensive Cancer Center.    Patient Identification and Oncology History  ***** ***** ***** is a 61 y.o. female who presents to our GI Oncology practice at ***** ***** ***** Family Comprehensive Cancer Center for evaluation of metastatic pancreatic adenocarcinoma with peritoneal carcinomatosis.     Patient initially presented to ***** Community Hospital with loose BM, weight loss, abdominal pain and jaundice on 11/16/15. She was found to have elevated LFTs/hyperbilirubinemia (t.bili 8.7). An abdominal ultrasound was obtained whic

In [None]:
# write out when happy
def write_df_to_parquet(output_dir, local_dataset_name, split, version, df):
    """Write ``df`` to ``<output_dir>/<local_dataset_name>/<split>/<version>.parquet``.

    The parent directory of the target file is created if it does not exist
    yet. The frame is written without its index, and the destination path is
    printed afterwards.

    Args:
        output_dir: Root directory of the output tree.
        local_dataset_name: Sub-directory under ``output_dir``.
        split: Sub-directory under the dataset directory.
        version: File stem of the parquet file (".parquet" is appended).
        df: Object exposing ``to_parquet(path, index=...)``.
    """
    path_parts = (output_dir, local_dataset_name, split, f"{version}.parquet")
    split_path = os.path.join(*path_parts)

    # Create the folder that will contain the file (no-op when it exists).
    parent_dir, _file_name = os.path.split(split_path)
    os.makedirs(parent_dir, exist_ok=True)

    df.to_parquet(split_path, index=False)
    print(f"DataFrame written to {split_path}")