In [22]:
# imports
from os import listdir
from os.path import join

# 1. Frequency of given keywords in the data
## Check if these keywords exist in the data, if so, how often?

In [23]:
# 1.1 Get keywords
from typing import List


def get_keywords(path):
    with open(path, "r") as keywords_file:
        keywords_data = keywords_file.readlines()

    category_keywords = {}
    for i in range(len(keywords_data)):
        category, keywords = keywords_data[i].split(":")
        keywords = keywords.split(",")
        keywords = [keyword.strip() for keyword in keywords]
        category_keywords[category] = keywords

    return category_keywords


entity_names_path = "/Users/akhil/code/lexical_lab/companies/tempus/entity_names.txt"
category_keywords = get_keywords(entity_names_path)

In [24]:
def get_text_data(data_path) -> List[str]:
  # Yeah, I'm keeping everything in memory. It is not large, so I assume it is fine.
  list_of_text = []

  file_names = listdir(data_path)

  for file_name in file_names:
      file_path = join(data_path, file_name)

      with open(file_path, "r") as text_file:
          text = text_file.read()
          list_of_text.append(text)

  return list_of_text


data_path = "/Users/akhil/code/lexical_lab/companies/tempus/Data"
list_of_text = get_text_data(data_path)

In [None]:
# 1.2 For each keyword, find how many times it is repeated


def count_and_update(category_keywords, keyword_counts, text):
  for category in category_keywords:
    for keyword in category_keywords[category]:
      keyword_counts[category][keyword] += text.count(keyword)

def get_keyword_counts(list_of_text):
  keyword_counts = {
      category: {keyword: 0 for keyword in category_keywords[category]}
      for category in category_keywords
  }

  for text in list_of_text:
    count_and_update(category_keywords, keyword_counts, text)

  return keyword_counts


keyword_counts = get_keyword_counts(list_of_text)

In [25]:
keyword_counts

{'cancertype': {'nsclc': 0,
  'sclc': 0,
  'dcis': 0,
  'non small cell lung cancer': 0,
  'breast cancer': 13,
  'lung cancer': 10,
  'prostate cancer': 8,
  'adenocarcinoma': 8,
  'testicular cancer': 1,
  "Kaposi's sarcoma": 1,
  'colorectal cancer': 18,
  'glioblastoma': 2,
  'melanoma': 14,
  'hodgkin lymphoma': 0,
  'squamous cell carcinoma': 1},
 'medication': {'pembrolizumab': 1,
  'bicalutamide': 0,
  'Gefitinib': 0,
  'Pertuzumab': 0,
  'Giotrif': 0,
  'imfinzi': 0,
  'tagrisso': 0,
  'nivolumab': 1,
  'lapatinib': 12,
  'capecitabine': 6},
 'biomarkers': {'metex14': 0,
  'egfr': 0,
  'kras': 0,
  'braf': 0,
  'ntrk': 0,
  'nrg1': 0,
  'her2': 0,
  'her-2': 0,
  'erbb2': 0,
  'er-bb2': 0,
  'brca': 0,
  'brca1': 0,
  'brca2': 0,
  'brca3': 0,
  'alk': 9,
  'ret': 27,
  'fgfr2': 0,
  'her 2': 2,
  'pr': 489,
  'er': 1865,
  'pgr': 1,
  'met': 58},
 'procedure_therapy': {'lumpectomy': 2, 'biopsy': 6}}

### Most of these keywords have low frequency.

Biomarkers er and per got unusually high numbers because "er" is part of many common words.
And I am not checking for "er" as a token, but just getting the substring count.

This is fine because I just want to get an idea of how often these keywords are used.

# 2. What are the most used words that are not stopwords (like "the")?

2.1. tokenize the text and then find frequency.

Reference: https://blog.ekbana.com/nlp-for-beninners-using-spacy-6161cf48a229

In [47]:
import spacy
nlp = spacy.load("en_core_web_sm")

text = '. '.join(list_of_text)
text = text.replace('\n', ' ')
text = text.replace('  ', ' ')

doc = nlp(text)
del text

# Raw words
words = [token.text for token in doc[:100]]

[print(word, end=' ') for word in words]
print('\n-------------------------------------')

# Raw sentences
[print(f'--- {sentence}') for sentence in list(doc.sents)[:5]]
print()

Curwen Eliot Hodgkin was born on 19 June 1905 , the only son of Charles Ernest Hodgkin and his wife , Alice Jane ( née Brooke).[1 ] The Hodgkins were a Quaker family and were related to Roger Fry.[1 ] The scientist Thomas Hodgkin was his great - grandfather 's older brother and the abstract painter Howard Hodgkin ( 1932–2017 ) was his cousin.[1 ] Hodgkin was educated at Harrow School from 1919 to 1923 . His artistic life started in London at the Byam Shaw School of Art and at the Royal Academy Schools under Francis Ernest Jackson . 
-------------------------------------
--- Curwen Eliot Hodgkin was born on 19 June 1905, the only son of Charles Ernest Hodgkin and his wife, Alice Jane (née Brooke).[1] The Hodgkins were a Quaker family and were related to Roger
--- Fry.[1]
--- The scientist Thomas Hodgkin was his great-grandfather's older brother and the abstract painter Howard Hodgkin (1932–2017) was his
--- cousin.[1]
--- Hodgkin was educated at Harrow School from 1919 to 1923.



In [48]:
# Words without stopwords
words = [token.text for token in doc if not token.is_stop and not token.is_punct]
print (words[:50])

['Curwen', 'Eliot', 'Hodgkin', 'born', '19', 'June', '1905', 'son', 'Charles', 'Ernest', 'Hodgkin', 'wife', 'Alice', 'Jane', 'née', 'Brooke).[1', 'Hodgkins', 'Quaker', 'family', 'related', 'Roger', 'Fry.[1', 'scientist', 'Thomas', 'Hodgkin', 'great', 'grandfather', 'older', 'brother', 'abstract', 'painter', 'Howard', 'Hodgkin', '1932–2017', 'cousin.[1', 'Hodgkin', 'educated', 'Harrow', 'School', '1919', '1923', 'artistic', 'life', 'started', 'London', 'Byam', 'Shaw', 'School', 'Art', 'Royal']


In [49]:
# Lemmantized words without stopwords
words = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
print (words[:50])

['Curwen', 'Eliot', 'Hodgkin', 'bear', '19', 'June', '1905', 'son', 'Charles', 'Ernest', 'Hodgkin', 'wife', 'Alice', 'Jane', 'née', 'Brooke).[1', 'Hodgkins', 'Quaker', 'family', 'relate', 'Roger', 'fry.[1', 'scientist', 'Thomas', 'Hodgkin', 'great', 'grandfather', 'old', 'brother', 'abstract', 'painter', 'Howard', 'Hodgkin', '1932–2017', 'cousin.[1', 'Hodgkin', 'educate', 'Harrow', 'School', '1919', '1923', 'artistic', 'life', 'start', 'London', 'Byam', 'Shaw', 'School', 'Art', 'Royal']


In [52]:
from collections import Counter

word_freq = Counter(words)
common_words = word_freq.most_common(15)
[print(word_count) for word_count in common_words]
print()

('cancer', 251)
('patient', 135)
('mutation', 90)
('cell', 74)
('treatment', 72)
('cause', 69)
('include', 67)
('gene', 66)
('study', 63)
('disease', 62)
('year', 62)
('rate', 60)
('\u2009', 52)
('symptom', 50)
('test', 49)



kind of obvious, but the data (most frequent words) show that the text information is mostly related to clinical/health.

In [57]:
labels = set([w.label_ for w in doc.ents])
for label in labels:
    entities = [e for e in doc.ents if label == e.label_]
    entities = list(set(entities))
    print(label, entities)

ORG [CRISPR, BRCA2, PE, VWF, The Centers for Disease Control and Prevention, the Global Fund, RNA, Keytruda, BRCA2, Genital herpes Influenza Measles Chickenpox, Profiler, NED, MH, VWF, RRT, KRAS G12C, CT, FD, RTK, U.N., Normal Human Neonatal Epidermal Keratinocytes, AI, LES, Portal, BRAF, BCR, ARF, CRC, BRCA2, the American Heart Association journal, RRT, LES, TKI, CDC, KATHERINE, MH, MSI-H, BRAF, MH, KRAS, ARF, AMIL, the US Centers for Disease Control and Prevention, LUMAKRAS, FDA, ER, NSCLC, PDGF, NGS, pan-Erb TKI, The Cancer Genome Atlas, Annals of Oncology, study14, UTUC, Genetic Test May Help Predict Whether Prostate Cancer, VWF, The Associated Press IRVING, PROSINT, VWF, MET, NEOD, BRCA, CDC, MRC, KRAS, HPA, BRCA1, BRCA2, Drug, Medicare, Keytruda, Hydroxychloroquine, HP, CRC, VWF, VWF, Erythrocyte, CT, ECX, DDAC, ERK, BRCA1, the Cancer Control Blueprint, ITT, Irving, CIPA, ICU, NSCLC, VWF, motorcars, AMIL, PE, LUMAKRAS, LES, VWF, Rapid, VWF, CT, WES, GERD, MEK, MS, NTRK, BCR, MS, 

# Now let's train the model to learn new Named Entities

Reference: https://www.machinelearningplus.com/nlp/training-custom-ner-model-in-spacy/