# This file is *raw*.
A few experiments if you are interested in taking a look at how I tried to get insights from the data.
I wrote neat python files based off these experiments.

# 1. Frequency of given keywords in the data
## Check if these keywords exist in the data, if so, how often?

In [1]:
# imports
import re
from typing import Dict
import spacy
from os import listdir, makedirs
from os.path import join
from typing import Tuple
from typing import List, Mapping
from spacy.lang.en import English
from spacy.training import Example

In [2]:
# 1.1 Get keywords


def get_keywords(path) -> Dict[str, List[str]]:
    with open(path, "r") as keywords_file:
        keywords_data = keywords_file.readlines()

    category_keywords = {}
    for i in range(len(keywords_data)):
        category, keywords = keywords_data[i].split(":")
        keywords = keywords.split(",")
        keywords = [keyword.strip() for keyword in keywords]
        category_keywords[category] = keywords

    return category_keywords


entity_names_path = "/Users/akhil/code/lexical_lab/companies/tempus/Data/entity_names.txt"
category_keywords = get_keywords(entity_names_path)

In [3]:
def get_text_data(data_path) -> List[str]:
    # Yeah, I'm keeping everything in memory. It is not large, so I assume it is fine.
    list_of_text = []

    file_names = listdir(data_path)

    for file_name in file_names:
        # Validate filename
        if not re.match(r"j[0-9]+\.txt", file_name):
            continue

        with open(join(data_path, file_name), "r") as text_file:
            text = text_file.read()
            list_of_text.append(text)

    return list_of_text


data_path = "/Users/akhil/code/lexical_lab/companies/tempus/Data"
list_of_text = get_text_data(data_path)

In [4]:
# 1.2 For each keyword, find how many times it is repeated


def count_and_update(category_keywords, keyword_counts, text):
    for category in category_keywords:
        for keyword in category_keywords[category]:
            keyword_counts[category][keyword] += text.count(keyword)

def get_keyword_counts(list_of_text):
  keyword_counts = {
      category: {keyword: 0 for keyword in category_keywords[category]}
      for category in category_keywords
  }

  for text in list_of_text:
      count_and_update(category_keywords, keyword_counts, text)

  return keyword_counts


keyword_counts = get_keyword_counts(list_of_text)

In [5]:
keyword_counts

{'cancertype': {'nsclc': 0,
  'sclc': 0,
  'dcis': 0,
  'non small cell lung cancer': 0,
  'breast cancer': 13,
  'lung cancer': 10,
  'prostate cancer': 8,
  'adenocarcinoma': 8,
  'testicular cancer': 1,
  "Kaposi's sarcoma": 1,
  'colorectal cancer': 18,
  'glioblastoma': 2,
  'melanoma': 14,
  'hodgkin lymphoma': 0,
  'squamous cell carcinoma': 1},
 'medication': {'pembrolizumab': 1,
  'bicalutamide': 0,
  'Gefitinib': 0,
  'Pertuzumab': 0,
  'Giotrif': 0,
  'imfinzi': 0,
  'tagrisso': 0,
  'nivolumab': 1,
  'lapatinib': 12,
  'capecitabine': 6},
 'biomarkers': {'metex14': 0,
  'egfr': 0,
  'kras': 0,
  'braf': 0,
  'ntrk': 0,
  'nrg1': 0,
  'her2': 0,
  'her-2': 0,
  'erbb2': 0,
  'er-bb2': 0,
  'brca': 0,
  'brca1': 0,
  'brca2': 0,
  'brca3': 0,
  'alk': 9,
  'ret': 27,
  'fgfr2': 0,
  'her 2': 2,
  'pr': 489,
  'er': 1865,
  'pgr': 1,
  'met': 58},
 'procedure_therapy': {'lumpectomy': 2, 'biopsy': 6}}

### Most of these keywords have low frequency.

Biomarkers er and per got unusually high numbers because "er" is part of many common words.
And I am not checking for "er" as a token, but just getting the substring count.

This is fine because I just want to get an idea of how often these keywords are used.

# 2. What are the most used words that are not stopwords (like "the")?

2.1. tokenize the text and then find frequency.

Reference: https://blog.ekbana.com/nlp-for-beninners-using-spacy-6161cf48a229

In [6]:
def preprocess_text(text):
    text = text.replace('\n', ' ')
    text = text.replace('  ', ' ')
    text = re.sub(r'\[[0-9]+]', '', text)  # Removes citations of kind [1]
    return text

preprocess_text('hello.[1]')

'hello.'

In [7]:
nlp = spacy.load("en_core_web_sm")

text = '. '.join(list_of_text)
text = preprocess_text(text)

In [8]:
doc = nlp(text)
del text

In [9]:
# Raw words
words = [token.text for token in doc[:100]]

[print(word, end=' ') for word in words]
print('\n-------------------------------------')

# Raw sentences
[print(f'--- {sentence}') for sentence in list(doc.sents)[:5]]
print()

Curwen Eliot Hodgkin was born on 19 June 1905 , the only son of Charles Ernest Hodgkin and his wife , Alice Jane ( née Brooke ) . The Hodgkins were a Quaker family and were related to Roger Fry . The scientist Thomas Hodgkin was his great - grandfather 's older brother and the abstract painter Howard Hodgkin ( 1932–2017 ) was his cousin . Hodgkin was educated at Harrow School from 1919 to 1923 . His artistic life started in London at the Byam Shaw School of Art and at the Royal Academy Schools under Francis Ernest Jackson 
-------------------------------------
--- Curwen Eliot Hodgkin was born on 19 June 1905, the only son of Charles Ernest Hodgkin and his wife, Alice Jane (née Brooke).
--- The Hodgkins were a Quaker family and were related to Roger Fry.
--- The scientist Thomas Hodgkin was his great-grandfather's older brother and the abstract painter Howard Hodgkin (1932–2017) was his cousin.
--- Hodgkin was educated at Harrow School from 1919 to 1923.
--- His artistic life started i

In [10]:
# Words without stopwords
words = [token.text for token in doc if not token.is_stop and not token.is_punct]
print (words[:50])

['Curwen', 'Eliot', 'Hodgkin', 'born', '19', 'June', '1905', 'son', 'Charles', 'Ernest', 'Hodgkin', 'wife', 'Alice', 'Jane', 'née', 'Brooke', 'Hodgkins', 'Quaker', 'family', 'related', 'Roger', 'Fry', 'scientist', 'Thomas', 'Hodgkin', 'great', 'grandfather', 'older', 'brother', 'abstract', 'painter', 'Howard', 'Hodgkin', '1932–2017', 'cousin', 'Hodgkin', 'educated', 'Harrow', 'School', '1919', '1923', 'artistic', 'life', 'started', 'London', 'Byam', 'Shaw', 'School', 'Art', 'Royal']


In [11]:
# Lemmantized words without stopwords
words = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
print (words[:50])

['Curwen', 'Eliot', 'Hodgkin', 'bear', '19', 'June', '1905', 'son', 'Charles', 'Ernest', 'Hodgkin', 'wife', 'Alice', 'Jane', 'née', 'Brooke', 'Hodgkins', 'Quaker', 'family', 'relate', 'Roger', 'Fry', 'scientist', 'Thomas', 'Hodgkin', 'great', 'grandfather', 'old', 'brother', 'abstract', 'painter', 'Howard', 'Hodgkin', '1932–2017', 'cousin', 'Hodgkin', 'educate', 'Harrow', 'School', '1919', '1923', 'artistic', 'life', 'start', 'London', 'Byam', 'Shaw', 'School', 'Art', 'Royal']


In [12]:
from collections import Counter

word_freq = Counter(words)
common_words = word_freq.most_common(15)
[print(word_count) for word_count in common_words]
print()

('cancer', 254)
('patient', 136)
('mutation', 90)
('cell', 74)
('treatment', 72)
('cause', 69)
('include', 67)
('gene', 66)
('study', 63)
('disease', 62)
('year', 62)
('rate', 60)
('\u2009', 52)
('symptom', 50)
('test', 49)



kind of obvious, but the data (most frequent words) show that the text information is mostly related to clinical/health.

---

Let's check what name
Let's check what named entities the model picked up by defaultd entities the model picked up by default

In [13]:
labels = set([w.label_ for w in doc.ents])
for label in labels:
    entities = [e for e in doc.ents if label == e.label_]
    entities = list(set(entities))
    print(label, entities)

MONEY [less than 200 mL, tens of millions, nearly $700 billion]
PERSON [sorafenib.¹ Malignant, Measures Loss, Uhlen et al., Pancreatitis, EGFR, Maalox, von Willebrand, Endocrine, Main Outcomes, PrEP, Officer Robert Reeves, Ridley, Nivolumab, Cancer, Brooke, docetaxel, TABRECTA14, Plaquenil, Hydroxychloroquine, Wen, survival.4, Madison McDonald, Main Outcomes, Gastric, Behan et al., Kidney, Huang et al., Main Outcomes, Nivolumab, Main Outcomes, Dr Emmanuel, oncoEnrichR, Kaposi, Nivolumab, Archer Hammond, CRISPR, Cox, Leana Wen, Ridley, Roger Fry, Kaposi, Val, von Willebrand, von Willebrand, Kemp, Howard Hodgkin, EGFR, Nivolumab, Individual Molecular Tests, H2RAs, Main Outcomes, White, Creative Bioarray, Nivolumab, RAS)/MAPK, White, Francis Ernest Jackson, Chen et al., Normal Human Epidermal Melanocytes, Gleevec, Fungi, Enlarged liver, McDonald, Hepatitis A. Most, CDx, Tagamet, EGFR, Alice Jane, Antibiotics, Arg, Lepidochelys, V599K, TRKA, Embase, diarrhea headache heartburn, Curwen Elio

# 3. Now let's train the model to learn new Named Entities

Reference: https://www.machinelearningplus.com/nlp/training-custom-ner-model-in-spacy/

Each sentence has to be split and any containing named entities has to be written in this way:
```python
("Walmart is a leading e-commerce company", {"entities": [(0, 7, "ORG")]})
```

In [14]:
def get_entities_data_for_sentence(sentence, category_keywords):
    data: List[Tuple[int, int, str]] = []

    for category in category_keywords:
        for keyword in category_keywords[category]:
            pieces = keyword.split(' ')
            words = sentence.split(' ')
            current_index = 0
            for word in words:
                if pieces[0].lower() == word.lower():
                    end_index = current_index + len(keyword)
                    if sentence[current_index: end_index].lower() == keyword.lower():
                        data.append((current_index, end_index, category))
                current_index += len(word) + 1
    return data

In [15]:
sentencizer = English()  # just the language with no pipeline
sentencizer.add_pipe("sentencizer")
tokenizer = sentencizer.tokenizer

training_data: List[Tuple[str, Mapping[str, List[Tuple[int, int, str]]]]] = []
for text in list_of_text:
    text = preprocess_text(text)
    for sentence in sentencizer(text).sents:
        doc = tokenizer(sentence.text)
        data = get_entities_data_for_sentence(sentence.text, category_keywords)
        if data:
            training_data.append((sentence.text, {'entities': data}))

[print(x) for x in training_data]
print()

('When an abnormal, malignant growth of cells — which is called a tumor — forms in the prostate, it’s called prostate cancer.', {'entities': [(107, 122, 'cancertype')]})
('In these cases, because the cancer is made of cells from the prostate, it’s still called prostate cancer.', {'entities': [(89, 104, 'cancertype')]})
('According to the Urology Care Foundation, prostate cancer is the second-leading cause of cancer deaths for men in the United States.', {'entities': [(42, 57, 'cancertype')]})
('Your healthcare provider may also need to delay or completely stop treatment with IMFINZI if you have severe side effects.', {'entities': [(82, 89, 'medication')]})
('Erlotinib hydrochloride is approved to be used alone or with other drugs to treat: Non-small cell lung cancer that is metastatic and has certain EGFR gene mutations.', {'entities': [(98, 109, 'cancertype'), (145, 149, 'biomarkers')]})
('The use of erlotinib hydrochloride to treat non-small cell lung cancer that does not have the EG

In [16]:
for i in range(len(training_data)):
    sentence, config = training_data[i]
    doc = nlp.make_doc(sentence)
    example = Example.from_dict(doc, config)
    training_data[i] = example



In [17]:
nlp = spacy.load("en_core_web_sm")

# Getting the pipeline component
ner = nlp.get_pipe("ner")

In [18]:
for category in category_keywords:
    ner.add_label(category)

In [19]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [20]:
# Disable pipeline components you dont need to change
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec", 'transformer']
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
unaffected_pipes

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer']

In [21]:
# Import requirements
import random
from spacy.util import minibatch, compounding
from tqdm import tqdm

In [22]:
# TRAINING THE MODEL
with nlp.disable_pipes(*unaffected_pipes):

    # Training for 30 iterations
    for iteration in range(30):
        # shuffling examples  before every iteration
        random.shuffle(training_data)
        losses = {}
        epoch_loss = 0
        n_batches = 0
        # batch up the examples using spaCy's minibatch
        batches = minibatch(training_data, size=4)
        for batch in batches:
            nlp.update(
                batch,  # Batch of examples
                drop=0.5,  # dropout - make it harder to memorise data
                losses=losses,
            )
            epoch_loss += losses['ner']
            n_batches += 1
        epoch_loss /= n_batches
        print(f'Epoch: {iteration}\tLoss: {epoch_loss}')

  d_xhat = N * dY - sum_dy - dist * var ** (-1.0) * sum_dy_dist


Epoch: 0	Loss: 254.3746056325797
Epoch: 1	Loss: 152.74144408660402
Epoch: 2	Loss: 170.8404259619373
Epoch: 3	Loss: 123.22028200434019
Epoch: 4	Loss: 77.26609484289574
Epoch: 5	Loss: 64.6417690404454
Epoch: 6	Loss: 61.421162147304216
Epoch: 7	Loss: 55.738643590117206
Epoch: 8	Loss: 43.61318373446662
Epoch: 9	Loss: 34.14602403049772
Epoch: 10	Loss: 29.957658600854067
Epoch: 11	Loss: 24.937913697134007
Epoch: 12	Loss: 22.939140397281683
Epoch: 13	Loss: 18.26442534025517
Epoch: 14	Loss: 24.522137893131376
Epoch: 15	Loss: 8.736459508790267
Epoch: 16	Loss: 12.991512612958731
Epoch: 17	Loss: 10.932450152834084
Epoch: 18	Loss: 14.41187215969225
Epoch: 19	Loss: 18.63197126806981
Epoch: 20	Loss: 11.416505657967507
Epoch: 21	Loss: 25.420725724146124
Epoch: 22	Loss: 12.403011272780054
Epoch: 23	Loss: 11.664326771427472
Epoch: 24	Loss: 9.016455042132538
Epoch: 25	Loss: 10.704378663445047
Epoch: 26	Loss: 6.713856581902071
Epoch: 27	Loss: 6.382436490139685
Epoch: 28	Loss: 4.6218088123257814
Epoch: 29

In [23]:
output_model_path = '/Users/akhil/code/lexical_lab/companies/tempus/output/model2'
makedirs(output_model_path, exist_ok=True)
nlp.to_disk(output_model_path)

In [24]:
# Prediction

doc = nlp("I am suffering from asdfewga cancer")
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities [('asdfewga cancer', 'cancertype')]


# Okay, so the model is now able to predict new cancer types.

---

- The primary goal of this assessment is complete.
- It needs some more data - like more cancer types, and more examples of other entities.
- Further manual inspection of the data might also reveal more insights (where the model is failing, etc.)
- I will definitely need to split the data into training and testing, so I can quantify how good the model is training. However, the training data is too small.

In [25]:

file_names = listdir(data_path)

for file_name in file_names:
    if not re.match(r"j[0-9]+\.txt", file_name):
        continue

    with open(join(data_path, file_name), "r") as text_file:
        text = text_file.read()
        text = preprocess_text(text)
        doc = nlp(text)
        for ent in doc.ents:
            print(f'{file_name}\t{ent.label_}\t{ent.text}')

j101.txt	medication	Hodgkin
j101.txt	medication	Hodgkin
j101.txt	medication	Hodgkin
j101.txt	medication	London
j38.txt	cancertype	prostate cancer
j38.txt	cancertype	prostate cancer
j38.txt	cancertype	prostate cancer
j11.txt	medication	IMFINZI
j11.txt	medication	IMFINZI
j100.txt	cancertype	lung cancer
j100.txt	biomarkers	EGFR
j100.txt	cancertype	lung cancer
j100.txt	biomarkers	EGFR
j6.txt	biomarkers	METex14
j6.txt	biomarkers	MET
j6.txt	biomarkers	METex14
j6.txt	biomarkers	METex14
j6.txt	cancertype	NSCLC
j12.txt	cancertype	breast cancers
j12.txt	cancertype	prostate cancer
j12.txt	cancertype	Prostate Cancer
j7.txt	cancertype	lung carcinogenesis
j7.txt	cancertype	Squamous cell carcinoma
j7.txt	cancertype	adenocarcinoma
j7.txt	cancertype	lung epithelium
j7.txt	procedure_therapy	Carcinoma
j7.txt	cancertype	lung cancer
j2.txt	cancertype	breast cancer
j29.txt	biomarkers	PEPFAR
j29.txt	medication	COVID-19
j1.txt	cancertype	prostate cancer
j1.txt	procedure_therapy	biopsy
j98.txt	biomarkers	FGFR2