# **Sources**

* [metapub](https://pypi.org/project/metapub/)
* [NCI thesaurus](https://ncit.nci.nih.gov/ncitbrowser/pages/home.jsf?version=24.06d)

* https://github.com/NCIEVS/evsrestapi-client-SDK/tree/master/python-examples
* https://medium.com/nirman-tech-blog/locationtagger-a-python-package-to-extract-locations-from-text-or-web-page-dbb05f1648d3

* [dateparser](https://dateparser.readthedocs.io/en/latest/)


# **Installs & Imports**

In [None]:
!pip install metapub -q

In [None]:
!pip install medspacy -q

In [None]:
!pip install spacy==3.7.5 -q
!pip install scispacy -q
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bc5cdr_md-0.5.1.tar.gz -q

In [None]:
!pip install nltk -q
!python -m spacy download en_core_web_sm -q
!pip install locationtagger -q

In [None]:
!pip install dateparser -q

In [None]:
import pandas

In [None]:
from metapub import PubMedFetcher

In [None]:
import scispacy
import spacy
from spacy import displacy

In [None]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download("maxent_ne_chunker")
nltk.download('words')
import locationtagger

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [None]:
import re

In [None]:
from dateparser.search import search_dates
from dateparser import DateDataParser

# **Data preparation**

## GeoARK attributes dataframes

In [None]:
# Read the GeoARK attribute table from the Excel workbook at file_path.
file_path = "/content/drive/MyDrive/Su2024/GeoARK/geoark_attributes_06-2023.xlsx"
all_attributes_dataframe = pandas.read_excel(file_path)

In [None]:
# Read the ISO category lookup table; the CSV is decoded as latin-1.
ISO_Code_dataframe = pandas.read_csv("/content/drive/MyDrive/Su2024/GeoARK/ISO_Code.csv", encoding='latin-1')

In [None]:
# Show the column labels of the ISO lookup table.
ISO_Code_dataframe.keys()

Index(['Name', 'Code', 'Definition'], dtype='object')

In [None]:
def write_ISO_category_names(ISO_dataframe: pandas.DataFrame, all_attributes_dataframe: pandas.DataFrame):
  """
  Fill the "iso_key_add" column with the ISO category name that matches each row's "iso_key".

  Input:
    ISO_dataframe: pandas.DataFrame, lookup table with "Code" and "Name" columns
    all_attributes_dataframe: pandas.DataFrame with an "iso_key" column; it is modified in place
  Output:
    the same all_attributes_dataframe object; rows with a missing or unknown code keep their old value
  """
  # Make sure the target column exists and can hold strings (an all-NaN column is float typed).
  if "iso_key_add" not in all_attributes_dataframe.columns:
    all_attributes_dataframe["iso_key_add"] = None
  all_attributes_dataframe["iso_key_add"] = all_attributes_dataframe["iso_key_add"].astype(object)

  for rowNum, row in all_attributes_dataframe.iterrows():
    iso_code = row["iso_key"]
    if pandas.isna(iso_code):
      continue
    iso_category_name = ISO_dataframe.loc[ISO_dataframe['Code'] == iso_code, "Name"]
    if iso_category_name.empty:
      print(f"No matching ISO category found for code: {iso_code} in row {row}")
    else:
      # One .loc write: chained indexing (df[col][row] = value) may assign to a temporary copy.
      all_attributes_dataframe.loc[rowNum, "iso_key_add"] = iso_category_name.iloc[0]
  return all_attributes_dataframe

In [None]:
def write_to_seperate_csvs(dataframe: pandas.DataFrame, output_dir: str = "/content/drive/MyDrive/Su2024/GeoARK"):
  """
  Split dataframe by its "dataset_id" column and write every part to its own file.

  Input:
    dataframe: pandas.DataFrame with a "dataset_id" column
    output_dir: str, folder that receives one file per dataset id
  Output:
    dict mapping each dataset id to the rows of dataframe that carry it
  """
  all_datasets = {}
  for dataset_id in dataframe['dataset_id'].unique():
    # Slice the frame that was passed in, not the notebook-level global.
    dataset = dataframe.loc[dataframe['dataset_id'] == dataset_id]
    # NOTE(review): the content is CSV text although the name ends in ".xlsx";
    # the name is kept as-is because existing files already use it - confirm before renaming.
    file_path = f"{output_dir}/{dataset_id}.xlsx"
    dataset.to_csv(file_path, encoding="utf-8")
    all_datasets[dataset_id] = dataset
  return all_datasets

In [None]:
# Fill "iso_key_add" from the ISO lookup table.
all_attributes_dataframe = write_ISO_category_names(ISO_Code_dataframe, all_attributes_dataframe)

In [None]:
# Save under a .csv name next to the source workbook: file_path is the .xlsx file that
# pandas.read_excel() loads, and writing CSV text to it would overwrite the workbook.
all_attributes_dataframe.to_csv(file_path.rsplit(".", 1)[0] + ".csv", encoding="utf-8")

In [None]:
def ISO_category_only_dataframe(ISO_category_name: str, all_attributes_dataframe: pandas.DataFrame):
  """
  Input:
    ISO_category_name: str, value to look for in the "iso_key_add" column
    all_attributes_dataframe: pandas.DataFrame with an "iso_key_add" column
  Output:
    pandas.DataFrame holding only the rows whose "iso_key_add" equals ISO_category_name
  """
  category_mask = all_attributes_dataframe['iso_key_add'] == ISO_category_name
  matching_rows = all_attributes_dataframe.loc[category_mask]
  return matching_rows

In [None]:
# Split the attribute table by dataset_id; the call also writes one file per dataset.
all_datasets = write_to_seperate_csvs(all_attributes_dataframe)

In [None]:
# Keep only the rows whose iso_key_add is "health".
health_attributes_dataframe = ISO_category_only_dataframe("health", all_attributes_dataframe)

In [None]:
# Save the health-only subset as CSV.
health_attributes_dataframe.to_csv("/content/drive/MyDrive/Su2024/GeoARK/health_attributes.csv", encoding="utf-8")

In [None]:
# Print the column labels of the attribute table.
print(all_attributes_dataframe.keys())

Index(['dataset_id', 'attr_label', 'attr_orig', 'attr_desc', 'attr_id',
       'start_date', 'end_date', 'attr_dtype', 'iso_key', 'iso_key_add',
       'scale', 'positional_accuracy', 'spatial_rep', 'datum',
       'coordinate_system', 'entity_type', 'tags', 'originator_id'],
      dtype='object')


In [None]:
# Show the dataset ids that key the per-dataset frames.
all_datasets.keys()

dict_keys(['6fbbd315_01_01', 'a781336d_01_01', 'a781336d_02_01', 'a781336d_03_01', 'ddbfd2c3_01_01', '04d18a18_01_01', '04d18a18_02_01', '04d18a18_03_01', '04d18a18_04_01', '04d18a18_05_01', 'ddbfd2c3_02_01', '04d18a18_06_01', '04d18a18_07_01', '04d18a18_08_01', 'd8409bee_01_01', '04d18a18_09_01', '1cf20ca7_01_01', '2023_Q1_HIFLD_DataCatalog'])

In [None]:
# Hand-classified & renamed the dataset .xlsx files.
# Every name starts with a dataset id; the text in parentheses is the hand-assigned topic.
# 'ddbfd2c3_01_01.xlsx' carries no topic label.
xlsx_files = ['6fbbd315_01_01 (hospital beds).xlsx',
 'a781336d_01_01 (supermarket proximity).xlsx',
 'a781336d_02_01 (food accessability).xlsx',
 'a781336d_03_01 (demographics, age, employment, nationality, ethnicity).xlsx',
 'ddbfd2c3_01_01.xlsx',
 '04d18a18_01_01 (employment).xlsx',
 '04d18a18_02_01 (veterans).xlsx',
 '04d18a18_03_01 (health coverage).xlsx',
 '04d18a18_04_01 (housing).xlsx',
 '04d18a18_05_01 (race, sex, age).xlsx',
 'ddbfd2c3_02_01 (smoking).xlsx',
 '04d18a18_06_01 (transportation).xlsx',
 '04d18a18_07_01 (low income).xlsx',
 '04d18a18_08_01 (population demographics).xlsx',
 'd8409bee_01_01 (Mizzou degrees).xlsx',
 '04d18a18_09_01 (household types).xlsx',
 '1cf20ca7_01_01 (EJ data).xlsx']

In [None]:
# write_ISO_category_names(ISO_dataframe, attributes) expects the lookup table first and one
# DataFrame second, so apply it to every per-dataset frame instead of to the dict itself.
# Each frame is copied first because the dict values are slices of the full attribute table.
all_datasets = {
  dataset_id: write_ISO_category_names(ISO_Code_dataframe, dataset.copy())
  for dataset_id, dataset in all_datasets.items()
}

## Pubmed dataframes

In [None]:
def create_pubmed_dataframe(query: str, articles_count: int, columns_names=None) -> pandas.DataFrame:
  """
  Query PubMed through metapub and collect the matching articles in a dataframe.

  Input:
    query: str, PubMed search query
    articles_count: int, maximum number of PMIDs to request
    columns_names: list of column names for the dataframe;
      defaults to ["pmid", "title", "abstract", "year", "citation", "link"]
  Output:
    pandas.DataFrame with one row per PMID
  """
  if columns_names is None:
    # Fresh list on every call instead of a shared mutable default argument.
    columns_names = ["pmid", "title", "abstract", "year", "citation", "link"]

  fetcher = PubMedFetcher()

  # NOTE(review): in metapub's signature the second positional parameter of pmids_for_query
  # is `since`, so the result limit is passed by keyword - confirm against the installed version.
  pmids = fetcher.pmids_for_query(query, retmax=articles_count)
  articles = [fetcher.article_by_pmid(pmid) for pmid in pmids]

  pubmed_dataframe = pandas.DataFrame(columns=columns_names)
  pubmed_dataframe["pmid"] = pmids
  pubmed_dataframe["title"] = [article.title for article in articles]
  pubmed_dataframe["abstract"] = [article.abstract for article in articles]
  pubmed_dataframe["year"] = [article.year for article in articles]
  pubmed_dataframe["citation"] = [article.citation for article in articles]
  # The "link" column used to stay empty; fill it with the article's PubMed page.
  pubmed_dataframe["link"] = [f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/" for pmid in pmids]

  return pubmed_dataframe

In [None]:
def delete_empty_abstract_rows(pubmed_dataframe: pandas.DataFrame):
  """
  Input:
    pubmed_dataframe: pandas.DataFrame with an "abstract" column
  Output:
    new pandas.DataFrame without the index labels whose "abstract" is missing;
    the dataframe passed in is left unchanged
  """
  abstract_is_missing = pubmed_dataframe["abstract"].isna()
  labels_to_drop = pubmed_dataframe.index[abstract_is_missing.to_numpy()]
  return pubmed_dataframe.drop(labels_to_drop)

In [None]:
# Build the PubMed table for the "cancer" query, drop rows without an abstract, and save it.
pubmed_dataframe = create_pubmed_dataframe("cancer", 10)
pubmed_dataframe = delete_empty_abstract_rows(pubmed_dataframe)
# index=False: otherwise every save/reload round trip adds another "Unnamed: 0" column.
pubmed_dataframe.to_csv("/content/drive/MyDrive/Su2024/GeoARK/pubmed_cancer.csv", encoding="utf-8", index=False)

In [None]:
# Reload the saved PubMed table from CSV.
pubmed_dataframe = pandas.read_csv("/content/drive/MyDrive/Su2024/GeoARK/pubmed_cancer.csv", encoding="utf-8")

In [None]:
# Preview the first rows of the PubMed table.
pubmed_dataframe.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,pmid,title,abstract,year,citation,link
0,1,1,39141399,Minimal Access vs Conventional Nipple-Sparing ...,IMPORTANCE: While nipple-sparing mastectomy (N...,2024,"Kim JH, et al. Minimal Access vs Conventional ...",
1,2,2,39141388,Long-Term Survival in Patients With Advanced M...,IMPORTANCE: Long-term survival data from clini...,2024,"van Not OJ, et al. Long-Term Survival in Patie...",
2,4,4,39141363,Exploring the Germline Genetics of In Situ and...,IMPORTANCE: It is unknown whether germline gen...,2024,"Ingold N, et al. Exploring the Germline Geneti...",
3,6,6,39141355,Pressure-enhanced sensing of tissue oxygenatio...,Fluorescence guidance is routinely used in sur...,2024,"Petusseau AF, et al. Pressure-enhanced sensing...",
4,7,7,39141353,Circadian period is compensated for repressor ...,Most mammalian cells have molecular circadian ...,2024,"Gabriel CH, et al. Circadian period is compens...",


# **Annotate cancer-related texts**

In [None]:
# Distinct iso_key_add values present in one dataset.
all_datasets["6fbbd315_01_01"]['iso_key_add'].unique()

array([nan, 'health'], dtype=object)

In [None]:
# For every hand-named file, write each dataset whose id occurs in the file name
# (substring test) to the GeoARK folder under that hand-given name.
# NOTE(review): to_csv() emits CSV text although the names in xlsx_files end in ".xlsx" - confirm intended.
for file_name in xlsx_files:
  for dataset_id, dataset in all_datasets.items():
    if dataset_id in file_name:
      dataset.to_csv("/content/drive/MyDrive/Su2024/GeoARK/" + file_name, encoding="utf-8")

In [None]:
import medspacy
from medspacy.ner import TargetRule

# Load a medspaCy pipeline and print the names of its components.
nlp = medspacy.load()
print(nlp.pipe_names)

# Register three literal target terms, all labelled CONDITION, then run a sample sentence.
nlp.get_pipe('medspacy_target_matcher').add([TargetRule('stroke', 'CONDITION'), TargetRule('diabetes', 'CONDITION'), TargetRule('pna', 'CONDITION')])
doc = nlp('Patient has hx of stroke. Mother diagnosed with diabetes. No evidence of pna.')

# Print every entity with its negated / family / historical context flags.
for ent in doc.ents:
    print(ent, ent._.is_negated, ent._.is_family, ent._.is_historical)
# Render the entities with displaCy inside the notebook.
displacy.render(doc, style="ent", jupyter=True)

['medspacy_pyrush', 'medspacy_target_matcher', 'medspacy_context']
stroke False False True
diabetes False True False
pna True False False


In [None]:
def label_pubmed_abstracts(pubmed_dataframe: pandas.DataFrame):
  """
  Tag every abstract with the "en_ner_bc5cdr_md" model and save its DISEASE entities.

  One file per article is written to the pubmed_labeled_abstracts folder, named after the
  article title (non-alphanumeric characters replaced by "_") plus "_labels.xlsx".
  Articles without any DISEASE entity produce no file.

  Input:
    pubmed_dataframe: pandas.DataFrame with "title" and "abstract" columns
  Output:
    None; results are written to disk as CSV text with columns text, label, start, end
  """
  nlp = spacy.load("en_ner_bc5cdr_md")
  output_columns = ["text", "label", "start", "end"]
  for rowNum, row in pubmed_dataframe.iterrows():
    title = row["title"]
    abstract = row["abstract"]
    # Missing values are not strings: they can neither be tagged nor turned into a file name.
    if not isinstance(title, str) or not isinstance(abstract, str):
      print(f"Skipping row {rowNum}: title or abstract is missing")
      continue

    file_name = "".join([c if c.isalnum() else "_" for c in title]) + "_labels.xlsx"

    # Collect the DISEASE entities directly. The old version concatenated one frame per entity
    # and added a whole-abstract row that the DISEASE filter always removed again.
    disease_rows = [
      {"text": entity.text, "label": str(entity.label_), "start": entity.start_char, "end": entity.end_char}
      for entity in nlp(abstract).ents
      if str(entity.label_) == "DISEASE"
    ]
    if disease_rows:
      annotated_abstracts = pandas.DataFrame(disease_rows, columns=output_columns)
      annotated_abstracts.to_csv("/content/drive/MyDrive/Su2024/GeoARK/pubmed_labeled_abstracts/" + file_name, encoding="utf-8", index=False)

In [None]:
# Write the DISEASE-entity files for the loaded PubMed abstracts.
label_pubmed_abstracts(pubmed_dataframe)

# **Extract GEO-words from Natural Language**

In [None]:
def extract_only_country_locations(prompt: str, country: str):
  """
  Run locationtagger on prompt and keep the locations it associates with country.

  Input:
    prompt: str, geospatial query
    country: str, country name as reported by locationtagger (e.g. "United States")
  Output:
    dict with the keys
      "explicit regions": regions locationtagger lists for country
      "explicit cities": cities locationtagger lists for country
      "implicit": {region: [explicit cities found in that region]}, limited to the regions
                  that a second locationtagger pass lists for country
      "other": the `other` attribute of the locationtagger result
  """
  entities = locationtagger.find_locations(text=prompt)

  explicit_regions = []
  explicit_cities = []
  region_to_cities = {}

  if country in entities.countries or country in entities.other_countries:
    explicit_regions = entities.country_regions.get(country, [])
    explicit_cities = entities.country_cities.get(country, [])
    # Group every explicit city under each region that contains a city of that name.
    for explicit_city in explicit_cities:
      for region, cities in entities.region_cities.items():
        if explicit_city in cities:
          region_to_cities.setdefault(region, []).append(explicit_city)

  reextracted_region_to_cities = {}
  if region_to_cities:
    # Second pass over the candidate regions; .get() avoids a KeyError when the
    # tagger reports no region for country this time.
    location_entities = locationtagger.find_locations(text=str(region_to_cities))
    confirmed_regions = location_entities.country_regions.get(country, [])
    reextracted_region_to_cities = {
      region: cities for region, cities in region_to_cities.items() if region in confirmed_regions
    }
  return {"explicit regions": explicit_regions, "explicit cities": explicit_cities, "implicit": reextracted_region_to_cities, "other": entities.other}


In [None]:
# Sample geospatial query; keep only the locations tagged for the United States.
prompt = """What is the cancer prevelance rate during 2019 in Washington D.C to Columbia, California, VA to Taipei to Chile?"""
extracted_locations = extract_only_country_locations(prompt, "United States")

In [None]:
# Show the extraction result.
extracted_locations

{'explicit regions': ['California', 'Washington'],
 'explicit cities': ['Washington', 'Columbia', 'California'],
 'implicit': {'Missouri': ['Washington', 'Columbia', 'California'],
  'Indiana': ['Washington'],
  'Pennsylvania': ['Washington', 'Columbia', 'California'],
  'Connecticut': ['Washington', 'Columbia'],
  'Mississippi': ['Washington', 'Columbia'],
  'Michigan': ['Washington'],
  'Utah': ['Washington'],
  'Virginia': ['Washington', 'Columbia', 'California'],
  'Wisconsin': ['Washington'],
  'Kansas': ['Washington'],
  'Georgia': ['Washington'],
  'Maine': ['Washington'],
  'Iowa': ['Washington', 'Columbia'],
  'Oklahoma': ['Washington'],
  'Arkansas': ['Washington'],
  'Maryland': ['Columbia', 'California'],
  'Tennessee': ['Columbia'],
  'California': ['Columbia'],
  'Kentucky': ['Columbia', 'California']},
 'other': ['Washington D.C', 'VA']}

In [None]:
def extract_abbreviated_locations(prompt: str, extracted_locations: dict, print_added=False) -> dict:
  """
    Add the states that appear in prompt as upper-case abbreviations (e.g. "VA", "D.C")
    to extracted_locations["explicit regions"].

    Input:
      prompt: str, geospatial query
      extracted_locations: dict, output from extract_only_country_locations(); modified in place
      print_added: bool, print a line for every state that gets added
    Output:
      extracted_locations: dict, the same object with the abbreviated states appended
      (a state that is already listed is not added a second time)
  """
  states_abbreviations = {
    'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas', 'CA': 'California',
    'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware', 'DC': 'District of Columbia',
    'D.C': 'District of Columbia', 'D.C.': 'District of Columbia', 'FL': 'Florida', 'GA': 'Georgia',
    'HI': 'Hawaii', 'ID': 'Idaho', 'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa', 'KS': 'Kansas',
    'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland', 'MA': 'Massachusetts',
    'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi', 'MO': 'Missouri', 'MT': 'Montana',
    'NE': 'Nebraska', 'NV': 'Nevada', 'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico',
    'NY': 'New York', 'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma',
    'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island', 'SC': 'South Carolina',
    'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah', 'VT': 'Vermont',
    'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia', 'WI': 'Wisconsin', 'WY': 'Wyoming',
  }
  explicit_regions = extracted_locations.setdefault("explicit regions", [])
  for abbreviation, state in states_abbreviations.items():
    # re.escape keeps the dots of "D.C" literal; the lookarounds require the abbreviation to
    # stand alone, which also covers the very start and the very end of the prompt.
    pattern = r"(?<!\w)" + re.escape(abbreviation) + r"(?!\w)"
    if not re.search(pattern, prompt):
      continue
    if state in explicit_regions:
      # "DC", "D.C" and "D.C." all map to the same state; list it once.
      continue
    explicit_regions.append(state)
    if print_added:
      print("Added", state, "to explicit regions...")
  return extracted_locations

In [None]:
# Add the states that the prompt mentions by abbreviation, then show the updated result.
extracted_locations = extract_abbreviated_locations(prompt, extracted_locations, print_added=True)
extracted_locations

Added District of Columbia ...
Added Virginia ...


{'explicit regions': ['California',
  'Washington',
  'District of Columbia',
  'Virginia'],
 'explicit cities': ['Washington', 'Columbia', 'California'],
 'implicit': {'Missouri': ['Washington', 'Columbia', 'California'],
  'Indiana': ['Washington'],
  'Pennsylvania': ['Washington', 'Columbia', 'California'],
  'Connecticut': ['Washington', 'Columbia'],
  'Mississippi': ['Washington', 'Columbia'],
  'Michigan': ['Washington'],
  'Utah': ['Washington'],
  'Virginia': ['Washington', 'Columbia', 'California'],
  'Wisconsin': ['Washington'],
  'Kansas': ['Washington'],
  'Georgia': ['Washington'],
  'Maine': ['Washington'],
  'Iowa': ['Washington', 'Columbia'],
  'Oklahoma': ['Washington'],
  'Arkansas': ['Washington'],
  'Maryland': ['Columbia', 'California'],
  'Tennessee': ['Columbia'],
  'California': ['Columbia'],
  'Kentucky': ['Columbia', 'California']},
 'other': ['Washington D.C', 'VA']}

In [None]:
def test(prompt: str, extracted_locations: dict, print_added=False) -> dict:
  """
    Experimental variant of extract_abbreviated_locations() that also pairs every abbreviated
    state with the word written directly before the abbreviation.

    Input:
      prompt: str, geospatial query
      extracted_locations: dict, output from extract_only_country_locations(); modified in place
      print_added: bool, print a line for every state that gets added
    Output:
      extracted_locations: dict, the same object with
        - the abbreviated states appended to "explicit regions" (each state listed once)
        - "explicit_regions_cities": list of {state: preceding word} dicts, created only
          when at least one such pair is found
  """
  states_abbreviations = {
    'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas', 'CA': 'California',
    'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware', 'DC': 'District of Columbia',
    'D.C': 'District of Columbia', 'D.C.': 'District of Columbia', 'FL': 'Florida', 'GA': 'Georgia',
    'HI': 'Hawaii', 'ID': 'Idaho', 'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa', 'KS': 'Kansas',
    'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland', 'MA': 'Massachusetts',
    'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi', 'MO': 'Missouri', 'MT': 'Montana',
    'NE': 'Nebraska', 'NV': 'Nevada', 'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico',
    'NY': 'New York', 'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma',
    'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island', 'SC': 'South Carolina',
    'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah', 'VT': 'Vermont',
    'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia', 'WI': 'Wisconsin', 'WY': 'Wyoming',
  }
  explicit_regions = extracted_locations.setdefault("explicit regions", [])
  for abbreviation, region in states_abbreviations.items():
    # re.escape keeps the dots of "D.C" literal; the lookarounds require a stand-alone token.
    escaped = re.escape(abbreviation)
    if not re.search(r"(?<!\w)" + escaped + r"(?!\w)", prompt):
      continue

    if region not in explicit_regions:
      explicit_regions.append(region)
      if print_added:
        print("Added", region, "to explicit regions...")

    # Pair the state with the word right before its abbreviation ("Columbia, MO" -> Columbia).
    # `region` always belongs to the abbreviation matched in this iteration.
    city_match = re.search(r"(\w+)\W+" + escaped + r"(?!\w)", prompt)
    if city_match:
      pair = {region: city_match.group(1)}
      pairs = extracted_locations.setdefault("explicit_regions_cities", [])
      if pair not in pairs:
        pairs.append(pair)
  return extracted_locations

In [None]:
# Re-extract the United States locations from the prompt and try the experimental variant on them.
extracted = extract_only_country_locations(prompt, "United States")
test(prompt, extracted)

NameError: name 'extract_only_country_locations' is not defined

In [None]:
# Run the "en_core_web_sm" pipeline over the prompt and render its named entities.
nlp = spacy.load("en_core_web_sm")
document = nlp(prompt)
# The entity visualizer is selected with style="ent" (as in the medspaCy cell); "ents" is not a displaCy style.
displacy.render(document, style="ent", jupyter=True)

# **Extract temporal entities from Natural Language**

In [None]:
def extract_temporal_entities(text: str) -> list:
  """
  Given English natural language text, detect all mentions of temporal entities.
  Incomplete dates are resolved with the settings below: prefer dates from the past,
  the first month of the year, and the first day of the month.
  input:
    text: str, geospatial query
  output:
    list of tuples [("detected entity", detected entity as datetime)];
    an empty list when nothing is detected
  """
  defaults = {"PREFER_DATES_FROM": "past", "PREFER_MONTH_OF_YEAR": "first", "PREFER_DAY_OF_MONTH": "first"}
  detected = search_dates(text, settings=defaults, languages=["en"])
  # search_dates() returns None rather than an empty list when it finds no date.
  return detected if detected is not None else []

In [None]:
# Sample query that mixes locations with a date range.
prompt = "The percentage of people in California, VA without health insurance versus the percentage of people in Columbia, MO covered under VU versus Washington DC from 2019 February to 2020 June 12, 6pm"

In [None]:
# Wrap the prompt in extra temporal phrases and list every detected date.
extract_temporal_entities("Around 2000, " + prompt + ". What about in December?")

[('2000, The', datetime.datetime(2000, 1, 1, 0, 0)),
 ('2019 February', datetime.datetime(2019, 2, 1, 0, 0)),
 ('2020 June 12, 6pm', datetime.datetime(2020, 6, 12, 18, 0)),
 ('about in December', datetime.datetime(2019, 12, 1, 0, 0))]