This notebook contains code to download the title, description and inclusion/exclsion criteria for oncology trials  using clinicaltrials.gov api.

Author - Akshay Chougule<br>
Created on - 30th May 2020<br>
<br>

In [1]:
import pandas as pd
import numpy as np
import requests
import datetime
import json
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
pd.set_option('max_colwidth', 4000)

### Preprocessing the corpus

In [9]:
import nltk
from nltk.corpus import stopwords
# Get nltk stopword list into a set.
stop_words = set(stopwords.words('english'))
str(stop_words)

'{\'being\', \'your\', \'t\', \'hadn\', \'yours\', \'to\', \'off\', "mustn\'t", \'here\', \'won\', \'our\', "you\'d", \'weren\', \'he\', \'her\', \'but\', \'ain\', \'and\', \'m\', \'a\', \'at\', "weren\'t", \'the\', \'it\', \'each\', \'who\', \'more\', \'than\', \'my\', \'o\', \'wasn\', \'we\', \'until\', \'should\', "it\'s", \'this\', \'hasn\', \'didn\', \'once\', \'or\', \'during\', "shan\'t", \'nor\', \'if\', \'himself\', "she\'s", \'me\', \'yourself\', \'doesn\', \'up\', \'all\', \'having\', \'mustn\', \'their\', \'above\', \'can\', "wasn\'t", \'between\', \'had\', \'which\', \'of\', \'am\', \'shouldn\', \'about\', "won\'t", \'those\', \'needn\', \'his\', \'re\', \'too\', \'such\', \'out\', \'y\', \'on\', \'mightn\', \'some\', \'while\', \'ma\', \'below\', \'wouldn\', \'hers\', \'haven\', \'just\', \'herself\', \'after\', \'ourselves\', \'is\', \'from\', \'then\', "mightn\'t", \'she\', \'how\', \'been\', \'shan\', \'as\', \'against\', "that\'ll", \'be\', \'couldn\', \'most\', \'in\

In [10]:
stop_words2 = set('for a ( of the ) study interventions and to in is at an must be with are but not no none has have other from as prior or except none see below study , use " one two three four five six patients before start greater than any allowed by for they since'.split())
str(stop_words2)

'{\'is\', \'interventions\', \'study\', \'from\', \'for\', \'with\', \'to\', \'must\', \'prior\', \'have\', \'six\', \'as\', \'are\', \'(\', \'five\', \'of\', \'but\', \'use\', \')\', \'be\', \'and\', \'a\', \'two\', \'in\', \'at\', \'an\', \'one\', \'not\', \'the\', \'see\', \'start\', \'four\', \'since\', \'other\', \'patients\', \'has\', \'no\', \'"\', \'three\', \'than\', \'before\', \'except\', \'below\', \'greater\', \',\', \'none\', \'by\', \'or\', \'allowed\', \'they\', \'any\'}'

In [11]:
master_stop_words = stop_words.union(stop_words2)
str(master_stop_words)

'{\'being\', \'your\', \'study\', \'t\', \'hadn\', \'yours\', \'to\', \'off\', "mustn\'t", \'here\', \'won\', \'our\', "you\'d", \'weren\', \'he\', \'her\', \'but\', \'ain\', \'and\', \'m\', \'a\', \'at\', "weren\'t", \'the\', \'it\', \'since\', \'each\', \'who\', \'more\', \'three\', \'than\', \'my\', \'o\', \'wasn\', \'we\', \'until\', \'should\', "it\'s", \'this\', \'hasn\', \'didn\', \'once\', \'or\', \'during\', "shan\'t", \'nor\', \'if\', \'himself\', "she\'s", \'me\', \'yourself\', \'doesn\', \'up\', \'all\', \'having\', \'mustn\', \'their\', \'prior\', \'above\', \'can\', "wasn\'t", \'between\', \'had\', \'five\', \'which\', \'of\', \'am\', \'shouldn\', \'about\', "won\'t", \'those\', \'needn\', \'two\', \'his\', \'re\', \'four\', \'too\', \'patients\', \'such\', \'out\', \'y\', \'on\', \'mightn\', \'some\', \'while\', \'ma\', \'below\', \'greater\', \'wouldn\', \'hers\', \'haven\', \'just\', \'herself\', \'allowed\', \'after\', \'ourselves\', \'is\', \'interventions\', \'from\

In [12]:
# Open and read in a text file.
txt_file = open("/home/ubuntu/datasets/cancer-clinical-trials/labeledEligibilitySample1000000.txt")
txt_line = txt_file.read()
txt_words = txt_line.split()
 
# stopwords found counter.
sw_found = 0
 
# If each word checked is not in stopwords list,
# then append word to a new text file.
for check_word in txt_words:
    if not check_word.lower() in master_stop_words:
        # Not found on stopword list, so remove noise and then append.
        check_word = check_word.replace('(','').replace(')','').replace('[','').replace(']','').replace('.','').replace('-','').replace(':','').replace('.','')
        appendFile = open('/home/ubuntu/datasets/cancer-clinical-trials/1M_cancer_trial_eligibility_preprocessed.txt','a')
        appendFile.write(" "+check_word)
        appendFile.close()
    else:
        # It's on the stopword list
        sw_found +=1
        #print(check_word)

print(sw_found,"stop words found and removed")
print("Saved as 'stopwords-removed.txt' ")

8943395 stop words found and removed
Saved as 'stopwords-removed.txt' 


### 1. Training on Cancer trial data

In [28]:
pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[K     |████████████████████████████████| 68 kB 2.3 MB/s eta 0:00:011
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25ldone
[?25h  Created wheel for fasttext: filename=fasttext-0.9.2-cp36-cp36m-linux_x86_64.whl size=3173570 sha256=f5897a7aa95461d44db97e2b5e627864b3ccd78ba756974385849c46cb94fe00
  Stored in directory: /home/ubuntu/.cache/pip/wheels/c3/5c/d0/4a725c6ee7df3267d818d3bc9d89bb173b94832f2b9eca6368
Successfully built fasttext
Installing collected packages: fasttext
Successfully installed fasttext-0.9.2
Note: you may need to restart the kernel to use updated packages.


In [13]:
import fasttext

In [14]:
model = fasttext.train_unsupervised('/home/ubuntu/datasets/cancer-clinical-trials/1M_cancer_trial_eligibility_preprocessed.txt')

In [15]:
model.get_word_vector("estrogen")

array([ 0.02196198,  0.12391646,  0.13404952,  0.18138213,  0.04413488,
        0.18806663,  0.25214043,  0.10602474,  0.28508407, -0.07287144,
        0.8035991 , -0.29302526,  0.62895876, -0.12890631,  0.04837237,
        0.06768012,  0.04129648, -0.5938437 ,  0.38910812,  0.50719196,
        0.90053403, -0.4737405 ,  0.142965  ,  0.773092  ,  0.56227523,
        0.06776884, -0.5678581 ,  0.5214773 ,  0.08030684, -0.6906355 ,
        0.21252692,  0.31908318, -0.03269489,  0.04802901, -0.13038303,
       -1.0319707 , -0.25387838,  1.0011411 , -0.20405711,  0.16535135,
        0.2999551 , -0.35183156, -0.17931534, -0.19283974, -0.07703391,
       -0.01197859, -0.5908381 , -0.16636941, -0.27477872, -0.3364141 ,
       -0.00383568,  0.09149738,  0.17858836, -0.18495567,  0.3213171 ,
       -0.41683763, -0.66784984, -0.22615245,  0.31174737, -0.16621633,
        0.15924199, -0.0219992 , -0.12436614, -0.24230647, -0.10695533,
       -0.3250166 , -0.335125  , -0.6373275 , -0.053503  ,  0.20

In [16]:
model.get_nearest_neighbors('estrogen') 

[(0.9248054623603821, 'progesterone'),
 (0.9014894366264343, 'estroge'),
 (0.873781144618988, 'progesterones'),
 (0.8621060848236084, 'receptor'),
 (0.8285125494003296, 'estrogens'),
 (0.8080430030822754, 'oestrogens'),
 (0.8055126667022705, 'oestrogen'),
 (0.8045371174812317, 'estrogenic'),
 (0.7995848059654236, 'recepto'),
 (0.7740491628646851, 'breast')]

In [17]:
model.get_nearest_neighbors('Pembrolizumab') 

[(0.8714774250984192, 'Nivolumab'),
 (0.794731080532074, 'Tremelimumab'),
 (0.7138570547103882, 'PDR001'),
 (0.7095555663108826, 'Docetaxel'),
 (0.705943763256073, 'Monoclonal'),
 (0.6988462805747986, 'Lenvatinib'),
 (0.6959121227264404, 'Antibodies,'),
 (0.6905092000961304, 'Cisplatin'),
 (0.6884687542915344, 'MGCD265'),
 (0.6822120547294617, 'Axitinib')]

In [18]:
model.get_nearest_neighbors('NSCLC') 

[(0.6703999638557434, 'cohort'),
 (0.6480443477630615, 'BIW8962'),
 (0.6437572240829468, '2%'),
 (0.6420202255249023, 'Pemetrexed'),
 (0.6418660879135132, 'Rociletinib'),
 (0.638372004032135, 'nsclc'),
 (0.6365253329277039, '4%'),
 (0.6157108545303345, 'PLX8394'),
 (0.6140182018280029, 'expansion'),
 (0.6118809580802917, 'Ponatinib')]

In [19]:
model.get_analogies("immunotherapy","oncology","cardiovascular")

[(0.570726752281189, 'anticancer'),
 (0.5494672060012817, 'antitumor'),
 (0.542633593082428, 'nitroureas'),
 (0.5291298031806946, 'immunostimulatory'),
 (0.527664303779602, 'immunotherapies'),
 (0.5155563354492188, 'NMS1286937'),
 (0.5106099247932434, 'khk'),
 (0.5059861540794373, 'cardiopathy'),
 (0.5022950172424316, 'cardiogenic'),
 (0.5016393661499023, 'endocrinotherapy')]

In [20]:
model.get_nearest_neighbors(['estrogen','progesterone']) 

TypeError: getNN(): incompatible function arguments. The following argument types are supported:
    1. (self: fasttext_pybind.fasttext, arg0: str, arg1: int, arg2: str) -> List[Tuple[float, str]]

Invoked with: <fasttext_pybind.fasttext object at 0x7fca847f1ce0>, ['estrogen', 'progesterone'], 10, 'strict'