In [1]:
!pip install pycrf
!pip install sklearn-crfsuite



In [2]:
import spacy
import sklearn_crfsuite
from sklearn_crfsuite import metrics

In [3]:
def process_file(filename):
    input_file=open(filename,'r')
    file_content=input_file.readlines()
    input_file.close()
    out_lines=[]# to store list of sequences
    line_content=""
    for word in file_content:
        word=word.strip()
        if word =="":#if line is empty add sent out_lines
            out_lines.append(line_content)
            line_content=''
        else:    
            if line_content:
                line_content+=" "+word # for non empty word add word to previous word with space
            else:
                line_content=word # first time no need to add space
            
    return out_lines      
    

In [4]:
train_sent=process_file('train_sent')
train_label=process_file('train_label')
test_sent=process_file('test_sent')
test_label=process_file('test_label')

In [5]:
print(train_sent[:10])
print("\n\n")
print(train_label[:10])
print("\n\n")
print(test_sent[:10])
print("\n\n")
print(test_label[:10])

['All live births > or = 23 weeks at the University of Vermont in 1995 ( n = 2395 ) were retrospectively analyzed for delivery route , indication for cesarean , gestational age , parity , and practice group ( to reflect risk status )', 'The total cesarean rate was 14.4 % ( 344 of 2395 ) , and the primary rate was 11.4 % ( 244 of 2144 )', 'Abnormal presentation was the most common indication ( 25.6 % , 88 of 344 )', "The `` corrected '' cesarean rate ( maternal-fetal medicine and transported patients excluded ) was 12.4 % ( 273 of 2194 ) , and the `` corrected '' primary rate was 9.6 % ( 190 of 1975 )", "Arrest of dilation was the most common indication in both `` corrected '' subgroups ( 23.4 and 24.6 % , respectively )", 'Cesarean rates at tertiary care hospitals should be compared with rates at community hospitals only after correcting for dissimilar patient groups or gestational age', 'In the third trimester , the amniotic fluid index ( AFI ) may be affected by maternal fluid status

In [6]:
print("TRAIN")
for i in range(20,25):
    print("Sentence:", train_sent[i])
    print("Labels:", train_label[i], "\n\n")
print("="*50)
print("="*50)
print("TEST")
for i in range(-5,-1):
    print("Sentence:", test_sent[i])
    print("Labels:", test_label[i], "\n\n")

TRAIN
Sentence: Down syndrome ( 12 cases ) and Edward syndrome ( 11 cases ) were the most common trisomies , while 4 cases of Patau syndrome were also diagnosed
Labels: D D O O O O O D D O O O O O O O O D O O O O O D D O O O 


Sentence: Down syndrome fetuses ( 41.7 % ) had prenatally detected sonographic anomalies , 63.6 % for Edward syndrome , and all fetuses with Patau syndrome ( 4 of 4 ) showed abnormal sonographic signs
Labels: D D O O O O O O O O O O O O O O D D O O O O O D D O O O O O O O O O 


Sentence: Trisomy 21 presented with the following features : hydramnios , complex malformations , pyelectasis , and duodenal atresia
Labels: D D O O O O O O O O O O O D O O D D 


Sentence: Signs observed in fetuses with trisomy 13 were : hydrocephalus , intrauterine growth retardation , oligoanhydramnios , complex malformations , severe fetal bradycardia and hydronephrosis
Labels: O O O O O D D O O O O O O O O O O O O O O O D O D 


Sentence: Fifty-three triplet pregnancies between 1986

#### count number of sentences in train and test datasets

In [7]:

print("length of train_sent",len(train_sent))
print("length of test_sent",len(test_sent))
print("length of train_label",len(train_label))
print("length of test_label",len(test_label))

length of train_sent 2599
length of test_sent 1056
length of train_label 2599
length of test_label 1056


### Extract those tokens which have NOUN or PROPN as their PoS tag and find their frequency.

In [8]:
import spacy
from collections import Counter
nlp=spacy.load('en_core_web_sm')

def extract_noun_pronoun(sentences):
    counter=Counter()
    for sentence in sentences:
        doc=nlp(sentence)
        for token in doc:
            if token.pos_=='NOUN' or token.pos_=='PROPN':
                counter[token.text]+=1
    return counter            

In [9]:
all_text=train_sent+test_sent
all_text_noun_propn_count=extract_noun_pronoun(all_text)


### Print the top 25 most common tokens with Noun or propn pos tags

In [10]:
for noun_propn in all_text_noun_propn_count.most_common(25):
    print(noun_propn)

('patients', 492)
('treatment', 281)
('%', 247)
('cancer', 200)
('therapy', 175)
('study', 154)
('disease', 142)
('cell', 140)
('lung', 116)
('group', 94)
('chemotherapy', 88)
('gene', 87)
('effects', 85)
('women', 77)
('results', 77)
('use', 75)
('risk', 71)
('cases', 71)
('surgery', 71)
('analysis', 70)
('rate', 67)
('dose', 66)
('response', 66)
('survival', 65)
('children', 64)


## Defining features for CRF

In [11]:
def word2features(sentence,pos,pos_tags):
    word=sentence[pos]
    
    #define features
    features={
        'bias':1.0,
        'word.lower()':word.lower(),
        'word[-3:]':word[-3:],
        'word[-2:]':word[-2:],
        'word.isupper()':word.isupper(),
        'word.istitle()':word.istitle(),
        'word.isdigit()':word.isdigit(),
        'pos':pos_tags[pos]
    }
    #features for the word preceding the current word
    if pos>0:
        prev_word=sentence[pos-1]
        features.update({
            '-1:word.lower()':prev_word.lower(),
            '-1:word.istitle()':prev_word.istitle(),
            '-1:word.isupper()':word.isupper(),
            '-1:pos':pos_tags[pos-1]
        })
    else:
            #Indicate that it is the 'begining of a document'
            features['BOS']=True
    #Features for the word succeeding the current word
    if pos < len(sentence) -1:
        next_word=sentence[pos+1]
        features.update({
            '+1:word.lower()':next_word.lower(),
            '+1:word.istitle()':next_word.istitle(),
            '+1:word.isupper()':word.isupper(),
            '+1:pos':pos_tags[pos+1]
        })
    else:   
        #Indicates that it is the 'end of document'
        features['EOS']=True
    return features    

## Getting the features

### Write a code/function to get the features for a sentence

In [12]:
#Define a function to get features for a sentence using the 'get features for one word' function
def sent2features(sentence):
    processed_sentence=nlp(sentence)# spacy is applied to sentence
    
    pos_tags=[]# identify pos tags
    for token in processed_sentence:
        pos_tags.append(token.pos_)
    sentence_list=sentence.split()  # list of words in sentence
    
    #calling getfeatures for oneword defined above
    return [word2features(sentence_list,pos,pos_tags) for pos in range(len(sentence_list))]

### Write a code/function to get the labels of a sentence

In [13]:
#write a code to get the labels for a sentence
def get_labels(labels):
    return labels.split()

## Define input and target variables

### Define the features values for each sentence as input variable  for CRF model in test and the train dataset 

In [14]:
X_train=[sent2features(sentence) for sentence in train_sent]
X_test=[sent2features(sentence) for sentence in test_sent]

In [15]:
print("Example train:")
display(X_train[0][0])
display(X_train[0][1])

print("="*50)
print("="*50)

print("Examples test:")
display(X_test[0][0])
display(X_test[0][1])

Example train:


{'bias': 1.0,
 'word.lower()': 'all',
 'word[-3:]': 'All',
 'word[-2:]': 'll',
 'word.isupper()': False,
 'word.istitle()': True,
 'word.isdigit()': False,
 'pos': 'DET',
 'BOS': True,
 '+1:word.lower()': 'live',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:pos': 'ADJ'}

{'bias': 1.0,
 'word.lower()': 'live',
 'word[-3:]': 'ive',
 'word[-2:]': 've',
 'word.isupper()': False,
 'word.istitle()': False,
 'word.isdigit()': False,
 'pos': 'ADJ',
 '-1:word.lower()': 'all',
 '-1:word.istitle()': True,
 '-1:word.isupper()': False,
 '-1:pos': 'DET',
 '+1:word.lower()': 'births',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:pos': 'NOUN'}

Examples test:


{'bias': 1.0,
 'word.lower()': 'furthermore',
 'word[-3:]': 'ore',
 'word[-2:]': 're',
 'word.isupper()': False,
 'word.istitle()': True,
 'word.isdigit()': False,
 'pos': 'ADV',
 'BOS': True,
 '+1:word.lower()': ',',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:pos': 'PUNCT'}

{'bias': 1.0,
 'word.lower()': ',',
 'word[-3:]': ',',
 'word[-2:]': ',',
 'word.isupper()': False,
 'word.istitle()': False,
 'word.isdigit()': False,
 'pos': 'PUNCT',
 '-1:word.lower()': 'furthermore',
 '-1:word.istitle()': True,
 '-1:word.isupper()': False,
 '-1:pos': 'ADV',
 '+1:word.lower()': 'when',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:pos': 'SCONJ'}

### Define the labels as the target variable for test and the train dataset

In [16]:
y_train = [get_labels(labels) for labels in train_label]
y_test = [get_labels(labels) for labels in test_label]

## Build the CRF Model

Parameters chosen:

- algorithm='lbfgs': The algorithm parameter specifies the training algorithm used for optimization. lbfgs stands for Limited-memory Broyden-Fletcher-Goldfarb-Shanno, which is an optimization algorithm in the family of quasi-Newton methods that approximates the Broyden-Fletcher-Goldfarb-Shanno algorithm using a limited amount of computer memory. It's a popular choice for optimization problems.

- c1=0.1 and c2=0.1: These are regularization parameters. Regularization is a technique used to prevent overfitting by adding a penalty term to the objective function. The c1 parameter is for L1 regularization and c2 is for L2 regularization. L1 can lead to sparsity (i.e., some of the feature weights will be zero), which can be beneficial if you have a lot of features and believe that many of them are irrelevant. L2 encourages the weights to be small but doesn't force them to zero. The specific values 0.1 are a common starting point, but these should be tuned for your specific problem. You might want to perform a hyperparameter search or cross-validation to find the best values.

- max_iterations=100: The max_iterations parameter specifies the maximum number of iterations for the optimizer. This means the optimization process (the learning process) will run for 100 iterations at most. If the best weights for the features are found before 100 iterations, the process will stop early.

- all_possible_transitions=True: The all_possible_transitions parameter, when set to True, creates additional transition features that are not present in the data. This can potentially improve the performance by capturing the transitions that haven't been seen in the training data but might be in the test data.


In [17]:
# Build the CRF model
from sklearn_crfsuite import CRF
 
# Define the model
crf=CRF(algorithm='lbfgs',# gradient descent using the L-BFGS method
       c1=0.1,# coefficient for l1 regularization
       c2=0.1,# coefficient for l2 regularization
       max_iterations=100,#maximum number of iterations for the optimizer
       all_possible_transitions=True)  # Whether to include transitions that are not present in the data
try:
    crf.fit(X_train, y_train)
except AttributeError:
    pass


## Evaluation

### Predict the labels of each of the tokens in each sentence of the test dataset that has been pre processed earlier.

In [18]:
y_pred = crf.predict(X_test)

### Calculate the f1 score using the actual labels and the predicted labels of the test dataset.

In [19]:
metrics.flat_f1_score(y_test, y_pred, average='weighted')

0.927000918447066

In [20]:
def extract_diseases_treatments(sentences, predicted_labels):
    """
    This function extracts diseases and their corresponding treatments from the sentences
    using the predicted labels and returns them in a dictionary.

    :param sentences: List of sentences
    :param predicted_labels: List of lists of labels for each sentence
    :return: A dictionary where keys are diseases and values are lists of treatments
    """
    disease_treatment_dict = {}  # Initialize an empty dictionary

    # For each sentence and its corresponding labels
    for sent, labels in zip(sentences, predicted_labels):
        disease = ""  # Initialize disease as an empty string
        treatment = ""  # Initialize treatment as an empty string

        words = sent.split()

        # For each word and its corresponding label
        for word, label in zip(words, labels):
            if label == 'O':  # Ignore if label is 'O'
                continue
            elif label == 'D':  # If label is 'D', add the word to disease
                disease += word.lower() + " "
            elif label == 'T':  # If label is 'T', add the word to treatment
                treatment += word.lower() + " "

        disease = disease.strip()  # Remove leading/trailing spaces
        treatment = treatment.strip()  # Remove leading/trailing spaces

        # If both disease and treatment are not empty
        if disease and treatment:
            # If disease is not in the dictionary, add it
            if disease not in disease_treatment_dict:
                disease_treatment_dict[disease] = [treatment]
            # If disease is already in the dictionary, append the treatment
            else:
                disease_treatment_dict[disease].append(treatment)

    return disease_treatment_dict

  
def find_treatments_for_disease(disease_treatment_dict, keyword):
  """
  This function looks for a disease that contains a certain keyword in the dictionary and 
  returns a dictionary with those diseases and their corresponding treatments.

  :param disease_treatment_dict: A dictionary where keys are diseases and values are lists of treatments
  :param keyword: The keyword to search for in the diseases
  :return: A dictionary where keys are diseases containing the keyword and values are lists of treatments
  """
  return {disease: treatments for disease, treatments in disease_treatment_dict.items() if keyword in disease}

In [21]:
disease_treatment_dict = extract_diseases_treatments(test_sent, y_pred)

retinoblastoma_treatments = find_treatments_for_disease(disease_treatment_dict, "retinoblastoma")
print(retinoblastoma_treatments)


{'retinoblastoma': ['radiotherapy']}


# Let's check another disease such as diseases with the word "cerebral" 

In [22]:
find_treatments_for_disease(disease_treatment_dict, "cerebral")

{'acute occlusion of the middle cerebral artery': ['thrombolytic therapy'],
 'acute cerebral ischemia': ['antiplatelet therapy'],
 'cerebral palsy': ['hyperbaric oxygen therapy']}