In [1]:
import json
import re
import time
from collections import defaultdict
import pandas as pd
import requests
import os

In [2]:
# Endpoint queried by fetch_clinical_trials_data (ClinicalTrials.gov v2 studies API).
API_BASE_URL = "https://clinicaltrials.gov/api/v2/studies"
# Term searched for inside the eligibility criteria; it is also matched against
# the lower-cased criteria text when sentences are parsed downstream.
SEARCH_KEYWORD = 'fitzpatrick'
# Centre point (latitude, longitude) of the geographic distance filter.
BOSTON_LAT = 42.36
BOSTON_LON = -71.06
# Radius of the distance filter; the query appends the unit "mi" to this value.
SEARCH_RADIUS_MI = 25
# Facilities whose `state` equals this value are reported in their own
# "<state>_facilities" column; all others go to "other_facilities".
STATE_TO_ISOLATE = "Massachusetts"

# Cache file for the raw API response; the download is skipped when it exists.
RAW_JSON_FILENAME = "fitzpatrick_search.json"
# Destination of the processed, one-row-per-sentence dataset.
FINAL_OUTPUT_CSV = "final_fitzpatrick_trials_dataset.csv"

In [3]:
def fetch_clinical_trials_data(api_url, keyword, geo_lat, geo_lon, radius_mi, fields, output_filename,
                               request_timeout=30):
    """
    Searches the ClinicalTrials.gov API with geographic and eligibility filters
    and saves the raw results to a JSON file.

    The file acts as a cache: when `output_filename` already exists nothing is
    downloaded. Because of that, the file is only written after *every* page
    was retrieved successfully; a partially fetched result set is discarded so
    a later run retries instead of silently reusing truncated data.

    Args:
        api_url: Studies endpoint to query.
        keyword: Term searched for within the eligibility criteria area.
        geo_lat: Latitude of the centre of the distance filter.
        geo_lon: Longitude of the centre of the distance filter.
        radius_mi: Radius of the distance filter, in miles.
        fields: Iterable of field names to request; joined with commas.
        output_filename: Path of the JSON cache file to create.
        request_timeout: Seconds to wait for each HTTP request before it is
            treated as failed (prevents a stalled connection from hanging the
            cell indefinitely).

    Returns:
        None. On success the file contains ``{"studies": [...]}``.
    """
    if os.path.exists(output_filename):
        print(f"[*] Raw data file '{output_filename}' already exists. Skipping download.")
        return

    all_studies = []
    page_count = 1
    next_page_token = None
    fetch_failed = False

    location_filter = f"distance({geo_lat},{geo_lon},{radius_mi}mi)"
    eligibility_search = f'AREA[EligibilityCriteria]({keyword})'

    params = {
        'query.term': eligibility_search,
        'filter.geo': location_filter,
        'fields': ",".join(fields),
        'pageSize': 100
    }

    print("[*] Starting API query to fetch clinical trial data...")
    print(f"    - Search Logic: {eligibility_search}")

    while True:
        if next_page_token:
            params['pageToken'] = next_page_token

        try:
            response = requests.get(api_url, params=params, timeout=request_timeout)
            response.raise_for_status()
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON on requests
            # versions whose decode error is not a RequestException subclass.
            print(f"\n[!] API request failed on page {page_count}: {e}")
            fetch_failed = True
            break

        current_studies = data.get('studies', [])
        if not current_studies:
            print("[*] No more studies found, ending search.")
            break

        all_studies.extend(current_studies)
        print(f"[*] Page {page_count}: Fetched {len(current_studies)} studies. Total so far: {len(all_studies)}")

        next_page_token = data.get('nextPageToken')
        if not next_page_token:
            print("[*] All pages have been retrieved.")
            break

        page_count += 1
        time.sleep(0.5)  # Be polite to the API

    if fetch_failed:
        # Writing a truncated file would make the cache guard above skip every
        # later attempt, so the incomplete result set is dropped instead.
        print(f"[!] Download incomplete; discarding {len(all_studies)} partially fetched studies "
              f"so '{output_filename}' is not cached in a truncated state. Re-run to retry.")
        return

    if all_studies:
        print(f"\n[*] Compiling {len(all_studies)} total studies into '{output_filename}'...")
        try:
            with open(output_filename, 'w', encoding='utf-8') as f:
                json.dump({'studies': all_studies}, f, ensure_ascii=False, indent=2)
            print(f"[*] Successfully saved raw data to {output_filename}")
        except IOError as e:
            print(f"[!] Error writing to file: {e}")
    else:
        print("\n[!] No studies were found to save.")

# --- Execute Data Fetching ---
# Fields requested from the API for each study (joined with commas in the query).
fields_to_get = ["NCTId", "protocolSection", "resultsSection"]
fetch_clinical_trials_data(
    api_url=API_BASE_URL,
    keyword=SEARCH_KEYWORD,
    geo_lat=BOSTON_LAT,
    geo_lon=BOSTON_LON,
    radius_mi=SEARCH_RADIUS_MI,
    fields=fields_to_get,
    output_filename=RAW_JSON_FILENAME,
)

[*] Raw data file 'fitzpatrick_search.json' already exists. Skipping download.


In [None]:
def parse_eligibility_criteria(study_record, keyword):
    """
    Finds sentences mentioning a keyword in the eligibility criteria.

    The criteria text is divided at the FIRST case-insensitive occurrence of
    "exclusion criteria": everything before it is treated as the inclusion
    section, everything after it (including any later mentions of the phrase)
    as the exclusion section. Each section is then cut into "sentences" at
    periods and newlines, and the keyword is matched case-insensitively.

    Args:
        study_record: Study dictionary; the text is read from
            protocolSection -> eligibilityModule -> eligibilityCriteria.
        keyword: Term to look for (any letter case).

    Returns:
        A list of dictionaries, each containing the stripped sentence under
        'sentence' and a boolean 'is_exclusion' indicating if it's from the
        exclusion section. Empty when there is no criteria text.
    """
    protocol = study_record.get('protocolSection') or {}
    eligibility_module = protocol.get('eligibilityModule') or {}
    eligibility_text = eligibility_module.get('eligibilityCriteria', '')
    if not eligibility_text:
        return []

    # maxsplit=1 keeps the text that follows a second "exclusion criteria"
    # mention inside the exclusion section instead of dropping it.
    parts = re.split(r'exclusion criteria', eligibility_text, maxsplit=1, flags=re.IGNORECASE)
    inclusion_text = parts[0]
    exclusion_text = parts[1] if len(parts) > 1 else ""

    # Sentences are lower-cased for comparison, so the keyword must be as well.
    needle = keyword.lower()

    found_sentences = []
    for section_text, is_exclusion in ((inclusion_text, False), (exclusion_text, True)):
        for sentence in re.split(r'[.\n]', section_text):
            if needle in sentence.lower() and sentence.strip():
                found_sentences.append({'sentence': sentence.strip(), 'is_exclusion': is_exclusion})

    return found_sentences


def extract_and_standardize_scores(sentence):
    """
    Analyzes a sentence to extract and format Fitzpatrick scores.

    The parsing is heuristic and works on the lower-cased sentence:

    1. Sentences containing 'wrinkle', 'severity' or 'questionnaire' are
       labelled 'Not a Skin Type Score'.
    2. Sentences containing the whole word 'all' or 'any' select every type.
       Only whole words count, so "small", "many" or "allergy" do not trigger
       this rule.
    3. Otherwise Roman (I-VI) and Arabic (1-6) numerals are collected. When a
       "<a> - <b>" / "<a> to <b>" / "<a> through <b>" pattern is present and the
       first numeral is lower than the last, every type from first to last is
       selected. In all other cases the numerals found are selected
       individually.

    NOTE(review): the range uses the first and last numeral of the whole
    sentence, so a mixed sentence such as "1, 2 or 4-6" is reported as I-VI.

    Args:
        sentence: Text to analyse.

    Returns:
        A dictionary with 'extracted_score' (readable label) and binary flags
        'Type_I' ... 'Type_VI'. An empty dictionary when `sentence` is not a
        string.
    """
    if not isinstance(sentence, str):
        return {}

    text = sentence.lower()
    standardized = {'Type_I': 0, 'Type_II': 0, 'Type_III': 0, 'Type_IV': 0, 'Type_V': 0, 'Type_VI': 0}
    result = {'extracted_score': 'Not Specified'}
    result.update(standardized)  # Start with all types as 0

    # Rule 1: Filter out irrelevant sentences
    if any(word in text for word in ['wrinkle', 'severity', 'questionnaire']):
        result['extracted_score'] = 'Not a Skin Type Score'
        return result

    # Rule 2: Handle 'all' or 'any' as whole words only
    if re.search(r'\b(?:all|any)\b', text):
        result.update({k: 1 for k in standardized})
        result['extracted_score'] = 'All'
        return result

    # --- Score Parsing Logic ---
    roman_map = {'I': 1, 'II': 2, 'III': 3, 'IV': 4, 'V': 5, 'VI': 6}
    to_roman_map = {v: k for k, v in roman_map.items()}

    # Helper function to safely convert Roman or Arabic numerals to an integer
    def _to_int(num_str):
        if num_str.isdigit():
            return int(num_str)
        return roman_map.get(num_str.upper())

    # Find all Roman and Arabic numerals (the pattern only admits values 1-6)
    numerals_found = re.findall(r'\b(vi|v|iv|iii|ii|i|[1-6])\b', text)
    range_match = re.search(r'\b([ivx\d]+)\s*(?:-|to|through)\s*([ivx\d]+)\b', text)
    scores = [_to_int(n) for n in numerals_found]

    if range_match and len(scores) >= 2 and scores[0] < scores[-1]:
        start_num, end_num = scores[0], scores[-1]
        for i in range(start_num, end_num + 1):
            standardized[f"Type_{to_roman_map[i]}"] = 1
        result['extracted_score'] = f"{to_roman_map[start_num]}-{to_roman_map[end_num]}"
    elif scores:
        # Also reached when a range pattern exists but its endpoints are not
        # ascending: report exactly the numerals seen rather than nothing.
        roman_scores = []
        for score in sorted(set(scores)):
            roman_version = to_roman_map[score]
            standardized[f"Type_{roman_version}"] = 1
            roman_scores.append(roman_version)
        result['extracted_score'] = ", ".join(roman_scores)

    result.update(standardized)
    return result


def extract_study_details(study_record, state_to_isolate):
    """
    Extracts locations, status, and race demographics from a study record.

    Args:
        study_record: Study dictionary with an optional 'protocolSection' and
            an optional 'resultsSection'.
        state_to_isolate: Facilities whose 'state' equals this value are listed
            under '<state_to_isolate>_facilities'; all others under
            'other_facilities'.

    Returns:
        A dictionary with 'status', the two facility strings ("; "-joined,
        empty when there are none) and one 'Race_<category>' integer per
        category of the "Race (NIH/OMB)" baseline measure. When the record has
        no protocol section only the defaults are returned.

    Missing, empty or null intermediate structures are treated as absent, and
    measurement values that are not integers (e.g. "NA") count as 0.

    NOTE(review): every measurement of a category is summed, and only the first
    class of the measure is read. If the results also carry a "total" group
    next to the per-arm groups the sum would double count - confirm against the
    API schema.
    """
    details = {
        'status': "N/A",
        f'{state_to_isolate}_facilities': "",
        'other_facilities': "",
    }

    protocol = study_record.get('protocolSection', {})
    if not protocol:
        return details

    def _to_count(raw_value):
        # Non-numeric placeholders must not abort the whole pipeline.
        try:
            return int(raw_value)
        except (TypeError, ValueError):
            return 0

    # --- Extract Status ---
    details['status'] = (protocol.get('statusModule') or {}).get('overallStatus', 'N/A')

    # --- Extract Locations ---
    iso_locs, other_locs = [], []
    locations = (protocol.get('contactsLocationsModule') or {}).get('locations') or []
    for loc in locations:
        facility_str = f"{loc.get('facility', 'N/A')} ({loc.get('city', 'N/A')}, {loc.get('state', 'N/A')})"
        if loc.get('state') == state_to_isolate:
            iso_locs.append(facility_str)
        else:
            other_locs.append(facility_str)
    details[f'{state_to_isolate}_facilities'] = "; ".join(iso_locs)
    details['other_facilities'] = "; ".join(other_locs)

    # --- Extract Race Demographics ---
    race_counts = {}
    results = study_record.get('resultsSection') or {}
    baseline_measures = (results.get('baselineCharacteristicsModule') or {}).get('measures') or []
    for measure in baseline_measures:
        if measure.get('title') != "Race (NIH/OMB)":
            continue
        # `or [{}]` also covers an explicitly empty `classes` list, which
        # would otherwise fail on the [0] lookup.
        classes = measure.get('classes') or [{}]
        for category in classes[0].get('categories') or []:
            race_title = category.get('title')
            if not race_title:
                continue
            race_counts[f"Race_{race_title.replace(' ', '_')}"] = sum(
                _to_count(m.get('value', 0)) for m in category.get('measurements') or []
            )

    details.update(race_counts)
    return details

# Emits a confirmation line so that running this definitions-only cell produces visible output.
print("Helper functions defined.")

Helper functions defined.


In [None]:
def main_processing_pipeline():
    """
    Run the complete processing and enrichment pipeline.

    Steps: read the cached raw studies, emit one row per inclusion-criteria
    sentence that mentions the search keyword (score flags plus study details),
    drop rows that are not skin type scores, cast the race columns to integers,
    move 'other_facilities' to the last column, then write and preview the CSV.
    Returns nothing; progress and problems are reported via print.
    """
    # --- 1. Load Raw Data ---
    try:
        with open(RAW_JSON_FILENAME, 'r', encoding='utf-8') as raw_file:
            payload = json.load(raw_file)
        studies = payload.get('studies', [])
        print(f"[*] Loaded {len(studies)} studies from '{RAW_JSON_FILENAME}'.")
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"[!] Error loading raw JSON file: {e}. Please run Cell 3 to fetch the data.")
        return

    # --- 2. Process Each Study ---
    processed_rows = []
    for study in studies:
        nct_id = study.get('protocolSection', {}).get('identificationModule', {}).get('nctId', 'N/A')
        eligibility_sentences = parse_eligibility_criteria(study, SEARCH_KEYWORD)
        study_details = extract_study_details(study, STATE_TO_ISOLATE)

        # Only sentences from the inclusion section become rows; a study with
        # no matching sentences therefore contributes nothing.
        inclusion_sentences = [
            info['sentence'] for info in eligibility_sentences if not info['is_exclusion']
        ]
        for sentence in inclusion_sentences:
            processed_rows.append({
                'nctId': nct_id,
                **extract_and_standardize_scores(sentence),
                **study_details,
            })

    if not processed_rows:
        print("[!] No studies with the specified keyword in inclusion criteria were found.")
        return

    # --- 3. Create and Clean DataFrame ---
    all_rows = pd.DataFrame(processed_rows)

    # Keep only the rows that describe an actual skin type score
    is_skin_type_score = all_rows['extracted_score'] != 'Not a Skin Type Score'
    scores_table = all_rows.loc[is_skin_type_score].copy()
    print(f"[*] Dropped {len(all_rows) - len(scores_table)} rows that were not skin type scores.")

    # --- Convert race columns to numeric types for calculations ---
    for race_col in (name for name in scores_table.columns.tolist() if name.startswith('Race_')):
        # Anything that cannot be parsed as a number becomes NaN and then 0
        as_numbers = pd.to_numeric(scores_table[race_col], errors='coerce')
        scores_table[race_col] = as_numbers.fillna(0).astype(int)
    print("[*] Converted race demographic columns to integer type.")

    # --- Reorder columns to place 'other_facilities' last ---
    column_order = scores_table.columns.tolist()
    if 'other_facilities' in column_order:
        column_order.remove('other_facilities')
        scores_table = scores_table[column_order + ['other_facilities']]
        print("[*] Reordered columns to place 'other_facilities' at the end.")

    # --- 4. Save Final CSV ---
    try:
        scores_table.to_csv(FINAL_OUTPUT_CSV, index=False, encoding='utf-8')
        print(f"\n[*] Success! Final dataset with {len(scores_table)} rows saved to '{FINAL_OUTPUT_CSV}'.")
        print("\n--- Final Data Preview ---")
        display(scores_table.head())
    except IOError as e:
        print(f"[!] Error writing final CSV file: {e}")

# --- Run the Pipeline ---
# Executes the load -> parse -> clean -> export sequence defined above; the call returns nothing
# and reports its progress through printed messages.
main_processing_pipeline()

[*] Loaded 49 studies from 'fitzpatrick_search.json'.
[*] Dropped 2 rows that were not skin type scores.
[*] Converted race demographic columns to integer type.
[*] Reordered columns to place 'other_facilities' at the end.

[*] Success! Final dataset with 46 rows saved to 'final_fitzpatrick_trials_dataset.csv'.

--- Final Data Preview ---


Unnamed: 0,nctId,extracted_score,Type_I,Type_II,Type_III,Type_IV,Type_V,Type_VI,status,Massachusetts_facilities,Race_American_Indian_or_Alaska_Native,Race_Asian,Race_Native_Hawaiian_or_Other_Pacific_Islander,Race_Black_or_African_American,Race_White,Race_More_than_one_race,Race_Unknown_or_Not_Reported,other_facilities
0,NCT01559922,All,1,1,1,1,1,1,COMPLETED,"Call Suneva for Info (Wellesley, Massachusetts)",4,16,2,56,216,0,0,"Call Suneva for Info (Beverly Hills, Californi..."
1,NCT00836342,I-IV,1,1,1,1,0,0,COMPLETED,Clinical Unit for Research Trials in Skin - MG...,0,6,0,2,152,2,0,
2,NCT05411484,All,1,1,1,1,1,1,ACTIVE_NOT_RECRUITING,MGH Clinical Unit for Research Trials And Outc...,0,0,0,0,0,0,0,
3,NCT01754233,III-IV,0,0,1,1,0,0,COMPLETED,"Skin Care Physicians (Chestnut Hill, Massachus...",0,0,0,0,0,0,0,
4,NCT01438047,I-VI,1,1,1,1,1,1,WITHDRAWN,"BWH (Boston, Massachusetts)",0,0,0,0,0,0,0,
