In [1]:
!pip install pycrf
!pip install sklearn-crfsuite
import spacy
import sklearn_crfsuite
from sklearn_crfsuite import metrics

model = spacy.load("en_core_web_sm")



In [2]:
import pandas as pd

In [3]:
# create a function t process the file and return a sentence list
def preprocess_inputfile(input_file):
    i_file = open(input_file, 'r')
    file_name = i_file.readlines()
    i_file.close()
    
    output_list = []
    
    full_sentence = " "
    
    for each_word in file_name:
        each_word = each_word.strip()
        if each_word == "":
            output_list.append(full_sentence)# to append the complete sentence to the output list
            full_sentence = "" # for new sentence start
        else:
            if full_sentence:
                full_sentence +=  " " + each_word
            else:
                full_sentence = each_word
                
    return output_list

In [4]:
train_sentences = preprocess_inputfile('C:/Users/DELL/Downloads/train_sent')
train_labels = preprocess_inputfile('C:/Users/DELL/Downloads/train_label')
test_sentences = preprocess_inputfile('C:/Users/DELL/Downloads/test_sent')
test_labels = preprocess_inputfile('C:/Users/DELL/Downloads/test_label')

In [5]:
# Print first five sentences from the processed dataset
for each_item in range(5):
    print(f"sentence {each_item+1}) is: {train_sentences[each_item]}")
    print(f"Label {each_item+1}  is: {train_labels[each_item]}")
    print("*"*100)

sentence 1) is:   All live births > or = 23 weeks at the University of Vermont in 1995 ( n = 2395 ) were retrospectively analyzed for delivery route , indication for cesarean , gestational age , parity , and practice group ( to reflect risk status )
Label 1  is:   O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O
****************************************************************************************************
sentence 2) is: The total cesarean rate was 14.4 % ( 344 of 2395 ) , and the primary rate was 11.4 % ( 244 of 2144 )
Label 2  is: O O O O O O O O O O O O O O O O O O O O O O O O O
****************************************************************************************************
sentence 3) is: Abnormal presentation was the most common indication ( 25.6 % , 88 of 344 )
Label 3  is: O O O O O O O O O O O O O O O
****************************************************************************************************
sentence 4) is: The `` corr

In [6]:
## Count the number of sentences in the processed train and test dataset 

print(f"Number of sentences in processed train dataset is: {len(train_sentences)}")
print(f"Number of sentences in processed test dataset is : {len(test_sentences)}")


Number of sentences in processed train dataset is: 2599
Number of sentences in processed test dataset is : 1056


In [7]:
## Count the number of labels in the processed train and test dataset 

print(f"Number of sentences in processed train dataset is: {len(train_labels)}")
print(f"Number of sentences in processed test dataset is : {len(test_labels)}")

Number of sentences in processed train dataset is: 2599
Number of sentences in processed test dataset is : 1056


In [8]:
# Creating a list to hold all the tokens which are either NOUN or PROPER NOUN
noun_propn_tokens_list = []

In [9]:
# Each token which is a NOUN or PROPN will be appended to the list "noun_propn_tokens_list"
for sentences in (train_sentences, test_sentences):
    for sent in sentences:
        processed_sent = model(sent)
        for each_token in processed_sent:
            if each_token.pos_ == "NOUN" or each_token.pos_ == "PROPN":
                noun_propn_tokens_list.append(each_token.text)

In [10]:
# Creating a Series to hold the tokens which are either NOUN or PROPER NOUN
df_noun_propn = pd.Series(noun_propn_tokens_list)

In [11]:
### Print the top 25 most common tokens with NOUN or PROPN PoS tags

In [12]:
# Getting then count of each token and sorting the data in top 25 most token counts
df_noun_propn.value_counts().sort_values(ascending=False).head(25)

patients        492
treatment       281
%               247
cancer          200
therapy         175
study           152
disease         141
cell            140
lung            116
group            94
chemotherapy     88
gene             87
effects          85
results          78
women            77
use              74
risk             71
cases            71
surgery          71
analysis         70
rate             67
response         66
survival         65
children         64
effect           63
dtype: int64

In [13]:
### Defining features for CRF

# Let's define the features to get the feature value for one word.

def getFeaturesForOneWord(sentence, pos, pos_tags):
  word = sentence[pos]

  features = [
    'word.lower=' + word.lower(), # serves as word id
    'word[-3:]=' + word[-3:],     # last three characters
    'word[-2:]=' + word[-2:],     # last two characters
    'word.isupper=%s' % word.isupper(),  # is the word in all uppercase
    'word.isdigit=%s' % word.isdigit(),  # is the word a number
    'word.startsWithCapital=%s' % word[0].isupper(), # is the word starting with a capital letter
    'word.pos=' + pos_tags[pos]
  ]

  #Use the previous word also while defining features
  if(pos > 0):
    prev_word = sentence[pos-1]
    features.extend([
    'prev_word.lower=' + prev_word.lower(), 
    'prev_word.isupper=%s' % prev_word.isupper(),
    'prev_word.isdigit=%s' % prev_word.isdigit(),
    'prev_word.startsWithCapital=%s' % prev_word[0].isupper(),
    'prev_word.pos=' + pos_tags[pos-1]
  ])
  # Mark the begining and the end words of a sentence correctly in the form of features.
  else:
    features.append('BEG') # feature to track begin of sentence 

  if(pos == len(sentence)-1):
    features.append('END') # feature to track end of sentence

  return features

In [14]:
### Getting the features

## Write a code/function to get the features for a sentence

In [15]:
# Function to get features for a sentence.
def getFeaturesForOneSentence(sentence):
    
    # We need to get the pos_tags to be passed to the function
    processed_sent = model(sentence)
    postags = []
    
    for each_token in processed_sent:
        postags.append(each_token.pos_)
    
    sentence_list = sentence.split()
    return [getFeaturesForOneWord(sentence_list, pos, postags) for pos in range(len(sentence_list))]

In [16]:
### Write a code/function to get the labels of a sentence

In [17]:
# Function to get the labels for a sentence.
def getLabelsInListForOneSentence(labels):
  return labels.split()

In [18]:
### Define input and target variables

In [19]:
## Define the features' values for each sentence as input variable for CRF model in test and the train dataset

In [20]:
X_train = [getFeaturesForOneSentence(sentence) for sentence in train_sentences]
X_test = [getFeaturesForOneSentence(sentence) for sentence in test_sentences]

In [21]:
## Define the labels as the target variable for test and the train dataset

In [22]:
Y_train = [getLabelsInListForOneSentence(labels) for labels in train_labels]
Y_test = [getLabelsInListForOneSentence(labels) for labels in test_labels]

In [23]:
### Build the CRF Model

In [24]:
# This is needed to not get the error AttributeError: 'CRF' object has no attribute 'keep_tempfiles'
# pip install scikit-learn==0.22.2 --user

In [25]:
# Build the CRF model.
crf = sklearn_crfsuite.CRF(max_iterations=100)
try:
    crf.fit(X_train, Y_train)
except AttributeError:
    pass

In [26]:
### Evaluation

In [27]:
## Predict the labels of each of the tokens in each sentence of the test dataset that has been pre processed earlier

In [28]:
Y_pred = crf.predict(X_test)

In [29]:
## Calculate the f1 score using the actual labels and the predicted labels of the test dataset.

In [30]:
f1_score = metrics.flat_f1_score(Y_test, Y_pred, average='weighted')
print(f"F1 score is: {round(f1_score,4)}")

F1 score is: 0.9059


In [31]:
### Identifying Diseases and Treatments using Custom NER

In [32]:
# Creating an empty dictionary to hold diseases and their corresponding treatments
D_T_dict = dict()

for i in range(len(Y_pred)):
    # Get the predicted labels of each test sentence into "val"
    val = Y_pred[i]
    
    # Empty strings to store the values of Diseases and Treatments
    Diseases = ""
    Treatments = ""
    
    # Each loop will iterate through the individual labels and focus on mapping D and T labels
    # with Diseases and Treatments within each sentence into a concatenated string
    for j in range(len(val)):
        if val[j] == 'D': # If label is D, it indicates a Disease 
            Diseases += test_sentences[i].split()[j] + " "
        elif val[j] == 'T': # If label is T, it indicates a Treatment
            Treatments += test_sentences[i].split()[j] + " "
            
    # Removes any extra whitespaces to either end of the string
    Diseases = Diseases.lstrip().rstrip()
    Treatments = Treatments.lstrip().rstrip()

    # If Diseases and Treatments are blank, ignore them
    # If Disease is not present in Dictionary, add it along with the corresponding treatment
    # If Disease is present in the Dictionary, append the treatments for that diseases with existing
    # treatments
    if Diseases != "" and Treatments != "":
        if Diseases in D_T_dict.keys():
            treat_out = list(D_T_dict[Diseases])
            treat_out.append(Treatments)
            D_T_dict[Diseases] = treat_out
        elif Diseases not in D_T_dict.keys():
            D_T_dict[Diseases] = Treatments

In [33]:
## Predict the treatment for the disease name: 'hereditary retinoblastoma'

In [34]:
D_T_dict['hereditary retinoblastoma']

'radiotherapy'