**Download Datasets**

In [87]:
# ====================================================
# Download datasets
# ====================================================
%cd /content
%rm -r project
%mkdir project
%cd project
%mkdir input
%cd input
!wget https://github.com/Ameer-eng/nbme-score-clinical-patient-notes-files/raw/main/nbme-score-clinical-patient-notes.zip
!unzip nbme-score-clinical-patient-notes.zip -d nbme-score-clinical-patient-notes
%cd ..
%ls

/content
/content/project
/content/project/input
--2022-05-02 03:38:02--  https://github.com/Ameer-eng/nbme-score-clinical-patient-notes-files/raw/main/nbme-score-clinical-patient-notes.zip
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/Ameer-eng/nbme-score-clinical-patient-notes-files/main/nbme-score-clinical-patient-notes.zip [following]
--2022-05-02 03:38:02--  https://raw.githubusercontent.com/Ameer-eng/nbme-score-clinical-patient-notes-files/main/nbme-score-clinical-patient-notes.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10305097 (9.8M) [application/zip]
Saving to: ‘nbme-score-cl

**Library**

In [88]:
import ast
import numpy as np
import pandas as pd
from collections import Counter

# Let long text columns (e.g. patient notes) show up to 200 characters before truncation.
pd.set_option("display.max_colwidth", 200)

**Data**

In [89]:
DATA_PATH = "./input/nbme-score-clinical-patient-notes/"

# Read the three competition tables in one pass, then show their shapes.
patient_notes, features, df_train = (
    pd.read_csv(DATA_PATH + file_name)
    for file_name in ("patient_notes.csv", "features.csv", "train.csv")
)
display(patient_notes.shape, features.shape, df_train.shape)

(42146, 3)

(143, 3)

(14300, 6)

In [90]:
# Raw annotation cell of the row labelled 1 (still an unparsed string at this point).
df_train.loc[1, "annotation"]

'[\'mom with "thyroid disease\']'

In [91]:
# Raw location cell of the row labelled 1 (still an unparsed string at this point).
df_train.loc[1, "location"]

"['668 693']"

In [92]:
# The annotations and locations are stored as strings representing python lists, so we must evaluate them.
# Values that are no longer strings (i.e. already parsed) are passed through unchanged, so
# re-running this cell is a no-op instead of raising inside ast.literal_eval.
for col in ('annotation', 'location'):
    df_train[col] = df_train[col].apply(
        lambda value: ast.literal_eval(value) if isinstance(value, str) else value
    )

In [93]:
# Attach the feature description and the full patient note text to every training row.
df_train = (
    df_train
    .merge(features, on=['feature_num', 'case_num'], how='left')
    .merge(patient_notes, on=['pn_num', 'case_num'], how='left')
)
df_train.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myocardial-infarction,"HPI: 17yo M presents with palpitations. Patient reports 3-4 months of intermittent episodes of ""heart beating/pounding out of my chest."" 2 days ago during a soccer game had an episode, but this ti..."
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693],Family-history-of-thyroid-disorder,"HPI: 17yo M presents with palpitations. Patient reports 3-4 months of intermittent episodes of ""heart beating/pounding out of my chest."" 2 days ago during a soccer game had an episode, but this ti..."
2,00016_002,0,16,2,[chest pressure],[203 217],Chest-pressure,"HPI: 17yo M presents with palpitations. Patient reports 3-4 months of intermittent episodes of ""heart beating/pounding out of my chest."" 2 days ago during a soccer game had an episode, but this ti..."
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]",Intermittent-symptoms,"HPI: 17yo M presents with palpitations. Patient reports 3-4 months of intermittent episodes of ""heart beating/pounding out of my chest."" 2 days ago during a soccer game had an episode, but this ti..."
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],Lightheaded,"HPI: 17yo M presents with palpitations. Patient reports 3-4 months of intermittent episodes of ""heart beating/pounding out of my chest."" 2 days ago during a soccer game had an episode, but this ti..."


**CV Split**

In [94]:
from sklearn.model_selection import StratifiedKFold, GroupKFold
K = 5
Fold = GroupKFold(n_splits=K)
# Rows are grouped by patient note number when building the folds.
groups = df_train['pn_num'].values

# Collect the validation-fold id of every row in a plain integer array first,
# then attach it to the frame in a single assignment.
fold_of_row = np.zeros(len(df_train), dtype=int)
for n, (_, val_index) in enumerate(Fold.split(df_train, df_train['location'], groups)):
    fold_of_row[val_index] = n
df_train['fold'] = fold_of_row
display(df_train.groupby('fold').size())

fold
0    2860
1    2860
2    2860
3    2860
4    2860
dtype: int64

**Helper Functions for Scoring**

In [95]:
from sklearn.metrics import f1_score

def micro_f1(preds, truths):
    """
    Micro-averaged f1 over a collection of binary arrays.

    All arrays are flattened into one long vector per side before scoring,
    so every element counts equally regardless of which array it came from.

    Args:
        preds (list of lists of ints): Predictions.
        truths (list of lists of ints): Ground truths.

    Returns:
        float: f1 score.
    """
    flat_truths = np.concatenate(truths)
    flat_preds = np.concatenate(preds)

    return f1_score(flat_truths, flat_preds)

In [96]:
def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): Spans, each given as [start, end]
            with `end` exclusive (it is used as a slice bound).
        length (int, optional): Size of the output array. Defaults to None,
            in which case the largest offset found in `spans` is used
            (0 when `spans` is empty).

    Returns:
        np array [length]: Binarized spans.
    """
    if length is None:
        # np.max raises on an empty sequence, so map "no spans" to an empty array.
        length = int(np.max(spans)) if len(spans) else 0
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1

    return binary

In [97]:
# Quick check: two 5-character spans separated by a 5-character gap.
spans_to_binary(spans=[[0, 5], [10, 15]])

array([1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1.])

In [98]:
def span_micro_f1(preds, truths):
    """
    Character-level micro f1 between predicted and ground-truth spans.

    Each (prediction, truth) pair is binarized over a common length and the
    resulting arrays are scored together. Pairs where both sides are empty
    are left out of the score.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    bin_preds, bin_truths = [], []

    for pred, truth in zip(preds, truths):
        has_pred = len(pred) > 0
        has_truth = len(truth) > 0
        if not (has_pred or has_truth):
            continue

        # Both arrays must share one length: the furthest offset seen on either side.
        pred_end = np.max(pred) if has_pred else 0
        truth_end = np.max(truth) if has_truth else 0
        length = max(pred_end, truth_end)

        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))

    return micro_f1(bin_preds, bin_truths)

In [99]:
def location_to_span(location):
    """
    Converts location strings into a flat list of [start, end] integer pairs.

    Args:
        location (list of str): Entries such as "70 91", or several
            ';'-separated segments such as "85 99;126 138".

    Returns:
        list of lists of ints: One [start, end] pair per segment, in input order.
    """
    spans = []
    for entry in location:
        # str.split returns the whole entry as a single segment when there is no ';'.
        for segment in entry.split(';'):
            spans.append(list(np.array(segment.split(' ')).astype(int)))

    return spans

In [100]:
df_train['span'] = df_train['location'].apply(location_to_span)

# Sanity-check inputs for the metric: the first 100 rows serve as predictions,
# and the "truth" keeps at most the first two spans of each row.
spans = df_train['span'].iloc[:100].tolist()
pred = spans
truth = [row_spans[:2] for row_spans in spans]

print(pred, truth, sep="\n")

[[[696, 724]], [[668, 693]], [[203, 217]], [[70, 91], [176, 183]], [[222, 258]], [], [[321, 329], [404, 413], [652, 661]], [], [], [[26, 38], [96, 118]], [[56, 69]], [[5, 9]], [[10, 11]], [], [[532, 556]], [[263, 284]], [[131, 145], [150, 168]], [], [], [], [[258, 261]], [], [[32, 46]], [[48, 66]], [[0, 6]], [[7, 8]], [[824, 844]], [[803, 822]], [], [[184, 196]], [[552, 568]], [[582, 619]], [[380, 388], [730, 738]], [[573, 576]], [], [[146, 158], [436, 448], [478, 490]], [[197, 211]], [[19, 23]], [[24, 25]], [[622, 631]], [[633, 652]], [], [[76, 84], [171, 180]], [[254, 270]], [], [[389, 396]], [[284, 303]], [], [[85, 99], [126, 138], [126, 131], [143, 151]], [[64, 75], [187, 209]], [[0, 5]], [[6, 7]], [[735, 741]], [[714, 733]], [[310, 324]], [[247, 256]], [[292, 308]], [], [[595, 603]], [], [], [[87, 101], [372, 386]], [[111, 121], [267, 277]], [[46, 57]], [[58, 62]], [[601, 620]], [[571, 599]], [[258, 277]], [[149, 161]], [], [], [], [[226, 240]], [], [[46, 57]], [[125, 135]], [[0, 

In [101]:
# Score the sample predictions against the truncated truth built in the previous cell.
span_micro_f1(preds=pred, truths=truth)

0.9797675419715884

In [102]:
def find_all(a_str, sub, overlapping=False):
    """
    Yields the start index of every occurrence of `sub` in `a_str`, left to right.

    Args:
        a_str (str): Text to search in.
        sub (str): Substring to look for. An empty substring yields nothing.
        overlapping (bool, optional): If True, the search resumes one character
            after each match, so overlapping occurrences are reported too.
            Defaults to False (resume right after the end of the match).

    Yields:
        int: Start index of a match.
    """
    if not sub:
        # str.find("") matches at the current position without advancing it,
        # which would otherwise make this generator loop forever.
        return
    step = 1 if overlapping else len(sub)
    start = a_str.find(sub)
    while start != -1:
        yield start
        start = a_str.find(sub, start + step)

list(find_all('spam spam spam spam', 'spam')) # [0, 5, 10, 15]

[0, 5, 10, 15]

In [103]:
scores = []
for fold in range(K):

    print(f"\n-------------   Fold {fold + 1} / {K}  -------------\n")

    # Hold out the current fold for validation; all other rows feed the lookup table.
    is_val = df_train['fold'] == fold
    df_train_set = df_train[~is_val].reset_index(drop=True)
    df_val_set = df_train[is_val].reset_index(drop=True)

    # (case_num, feature_num) -> unique lower-cased annotation strings seen in the training folds.
    matching_dict = df_train_set[['case_num', 'feature_num', 'annotation']].groupby(['case_num', 'feature_num']).agg(list).T.to_dict()
    matching_dict = {k: np.concatenate(v['annotation']) for k, v in matching_dict.items()}
    matching_dict = {k: np.unique([v_.lower() for v_ in v]) for k, v in matching_dict.items()}

    preds = []
    val_rows = zip(df_val_set['case_num'], df_val_set['feature_num'], df_val_set['pn_history'])
    for case_num, feature_num, note in val_rows:
        candidates = matching_dict[(case_num, feature_num)]
        text = note.lower()

        # NOTE(review): only the first occurrence of each candidate is predicted here,
        # whereas the test-inference cell collects every occurrence via find_all —
        # confirm which behaviour the CV score is meant to reflect.
        first_hits = ((text.find(c), len(c)) for c in candidates)
        spans = [[start, start + size] for start, size in first_hits if start > -1]
        preds.append(spans)

    score = span_micro_f1(preds, df_val_set['span'])
    scores.append(score)
    print(f"-> F1 score: {score :.3f}")

print(f"CV F1 score: {sum(scores) / len(scores) :.3f}")


-------------   Fold 1 / 5  -------------

-> F1 score: 0.576

-------------   Fold 2 / 5  -------------

-> F1 score: 0.596

-------------   Fold 3 / 5  -------------

-> F1 score: 0.575

-------------   Fold 4 / 5  -------------

-> F1 score: 0.589

-------------   Fold 5 / 5  -------------

-> F1 score: 0.577
CV F1 score: 0.583


**Inference on test data**

In [104]:
# Load the test rows and attach feature descriptions and patient note text,
# using the same merge keys as for the training frame.
df_test = pd.read_csv(DATA_PATH + "test.csv")
df_test = df_test.merge(features, on=['feature_num', 'case_num'], how='left')
df_test = df_test.merge(patient_notes, on=['pn_num', 'case_num'], how='left')

# (case_num, feature_num) -> unique lower-cased annotation strings from the full training set.
matching_dict = df_train[['case_num', 'feature_num', 'annotation']].groupby(['case_num', 'feature_num']).agg(list).T.to_dict()
matching_dict = {k: np.concatenate(v['annotation']) for k, v in matching_dict.items()}
matching_dict = {k: np.unique([v_.lower() for v_ in v]) for k, v in matching_dict.items()}

preds = []
for i in range(len(df_test)):
    key = (df_test['case_num'][i], df_test['feature_num'][i])

    candidates = matching_dict[key]

    text = df_test['pn_history'][i].lower()

    # Predict a span for every occurrence of every known annotation string.
    # The inner loop uses its own name (`start`) so it no longer rebinds the row index `i`.
    spans = []
    for c in candidates:
        for start in find_all(text, c):
            spans.append([start, start + len(c)])
    preds.append(spans)

In [105]:
def preds_to_location(preds):
    """
    Serializes predicted spans into location strings.

    Args:
        preds (list of lists of two-int lists): Predicted spans, one list per row.

    Returns:
        list of str: One string per row, e.g. "70 91;176 183"; an empty
        string when the row has no spans.
    """
    def encode(span):
        # "start end" for a single span.
        return " ".join(np.array(span).astype(str))

    return [";".join(encode(span) for span in pred) for pred in preds]

In [106]:
# Start from the sample submission and overwrite its location column with the
# serialized predictions, then write the result to disk and display it.
sub = pd.read_csv(DATA_PATH + 'sample_submission.csv').assign(
    location=preds_to_location(preds)
)

sub.to_csv('submission.csv', index=False)
sub

Unnamed: 0,id,location
0,00016_000,696 724
1,00016_001,668 693
2,00016_002,203 217;209 217
3,00016_003,83 90;176 183;83 91;70 82;70 91
4,00016_004,222 258


In [107]:
# Show the test frame after feature descriptions and patient notes were merged in.
display(df_test)

Unnamed: 0,id,case_num,pn_num,feature_num,feature_text,pn_history
0,00016_000,0,16,0,Family-history-of-MI-OR-Family-history-of-myocardial-infarction,"HPI: 17yo M presents with palpitations. Patient reports 3-4 months of intermittent episodes of ""heart beating/pounding out of my chest."" 2 days ago during a soccer game had an episode, but this ti..."
1,00016_001,0,16,1,Family-history-of-thyroid-disorder,"HPI: 17yo M presents with palpitations. Patient reports 3-4 months of intermittent episodes of ""heart beating/pounding out of my chest."" 2 days ago during a soccer game had an episode, but this ti..."
2,00016_002,0,16,2,Chest-pressure,"HPI: 17yo M presents with palpitations. Patient reports 3-4 months of intermittent episodes of ""heart beating/pounding out of my chest."" 2 days ago during a soccer game had an episode, but this ti..."
3,00016_003,0,16,3,Intermittent-symptoms,"HPI: 17yo M presents with palpitations. Patient reports 3-4 months of intermittent episodes of ""heart beating/pounding out of my chest."" 2 days ago during a soccer game had an episode, but this ti..."
4,00016_004,0,16,4,Lightheaded,"HPI: 17yo M presents with palpitations. Patient reports 3-4 months of intermittent episodes of ""heart beating/pounding out of my chest."" 2 days ago during a soccer game had an episode, but this ti..."
