# NBME - Score Clinical Patient Notes 
- **Framework:** Pytorch
- **Model Architecture:**
    - BERT
    - Linear(768, 512)
    - Linear(512, 512)
    - Linear(512, 1)
- **LR:** 1e-5
- **Batch Size:** 8
- **Epoch:** 3
- **Dropout:** 0.2
- **Criterion:** BCEWithLogitsLoss
- **Optimizer:** AdamW

# Tokenizer params
- **Max Lenght:** 416
- **Padding:** max_lenght
- **Truncation:** only_scond


In [129]:
from ast import literal_eval
from itertools import chain

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from torch import optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from tqdm.notebook import tqdm
from transformers import AutoModel, AutoTokenizer

# Helper Functions
### 1. Datasets Helper Function
need to merge `features.csv`, `patient_notes.csv` with `train.csv`

In [130]:
pd.set_option('display.max_colwidth', None)

In [131]:
BASE_URL = "../input/nbme-score-clinical-patient-notes"


def process_feature_text(text):
    return text.replace("-OR-", ";-").replace("-", " ")


def prepare_datasets():
    features = pd.read_csv(f"{BASE_URL}/features.csv")
    notes = pd.read_csv(f"{BASE_URL}/patient_notes.csv")
    df = pd.read_csv(f"{BASE_URL}/train.csv")
    df["annotation_list"] = [literal_eval(x) for x in df["annotation"]]
    df["location_list"] = [literal_eval(x) for x in df["location"]]

    merged = df.merge(notes, how="left")
    merged = merged.merge(features, how="left")

    merged["feature_text"] = [process_feature_text(x) for x in merged["feature_text"]]
    merged["feature_text"] = merged["feature_text"].apply(lambda x: x.lower())
    merged["pn_history"] = merged["pn_history"].apply(lambda x: x.lower())

    return merged

In [132]:
dg = prepare_datasets()

In [133]:
dg.loc[15:35,:]


Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,annotation_list,location_list,pn_history,feature_text
15,00041_002,0,41,2,['PRESSURE ON HER CHEST'],['263 284'],[PRESSURE ON HER CHEST],[263 284],"17 y/o m came to the clinic c/o heart pounding. started 2-3 mo ago. it started suddenly. does not recall any triggering events. it comes and goes, it happened 5-6 times since it started. it lasts 3-4 min, after than just goes away. he has also experiencing sob, pressure on her chest when he has this attack. he is a college student, experiencing some stress recently. \r\ndenies cough, chest pain.\r\nros neg except as above.\r\npmh none. meds aterol, for his studies, sharing w his roommate. nkda.\r\npsh/ hosp/ travel/ trauma none.\r\nfh mom has thyroid problems.\r\nsh sex active w girlfriend, no stds, using condoms. smoke none. etoh only weekends. drug only once, 1 mo ago.",chest pressure
16,00041_003,0,41,3,"['COMES AND GOES', 'HAPPENED 5-6 TIMES']","['131 145', '150 168']","[COMES AND GOES, HAPPENED 5-6 TIMES]","[131 145, 150 168]","17 y/o m came to the clinic c/o heart pounding. started 2-3 mo ago. it started suddenly. does not recall any triggering events. it comes and goes, it happened 5-6 times since it started. it lasts 3-4 min, after than just goes away. he has also experiencing sob, pressure on her chest when he has this attack. he is a college student, experiencing some stress recently. \r\ndenies cough, chest pain.\r\nros neg except as above.\r\npmh none. meds aterol, for his studies, sharing w his roommate. nkda.\r\npsh/ hosp/ travel/ trauma none.\r\nfh mom has thyroid problems.\r\nsh sex active w girlfriend, no stds, using condoms. smoke none. etoh only weekends. drug only once, 1 mo ago.",intermittent symptoms
17,00041_004,0,41,4,[],[],[],[],"17 y/o m came to the clinic c/o heart pounding. started 2-3 mo ago. it started suddenly. does not recall any triggering events. it comes and goes, it happened 5-6 times since it started. it lasts 3-4 min, after than just goes away. he has also experiencing sob, pressure on her chest when he has this attack. he is a college student, experiencing some stress recently. \r\ndenies cough, chest pain.\r\nros neg except as above.\r\npmh none. meds aterol, for his studies, sharing w his roommate. nkda.\r\npsh/ hosp/ travel/ trauma none.\r\nfh mom has thyroid problems.\r\nsh sex active w girlfriend, no stds, using condoms. smoke none. etoh only weekends. drug only once, 1 mo ago.",lightheaded
18,00041_005,0,41,5,[],[],[],[],"17 y/o m came to the clinic c/o heart pounding. started 2-3 mo ago. it started suddenly. does not recall any triggering events. it comes and goes, it happened 5-6 times since it started. it lasts 3-4 min, after than just goes away. he has also experiencing sob, pressure on her chest when he has this attack. he is a college student, experiencing some stress recently. \r\ndenies cough, chest pain.\r\nros neg except as above.\r\npmh none. meds aterol, for his studies, sharing w his roommate. nkda.\r\npsh/ hosp/ travel/ trauma none.\r\nfh mom has thyroid problems.\r\nsh sex active w girlfriend, no stds, using condoms. smoke none. etoh only weekends. drug only once, 1 mo ago.",no hair changes; no nail changes; no temperature intolerance
19,00041_006,0,41,6,[],[],[],[],"17 y/o m came to the clinic c/o heart pounding. started 2-3 mo ago. it started suddenly. does not recall any triggering events. it comes and goes, it happened 5-6 times since it started. it lasts 3-4 min, after than just goes away. he has also experiencing sob, pressure on her chest when he has this attack. he is a college student, experiencing some stress recently. \r\ndenies cough, chest pain.\r\nros neg except as above.\r\npmh none. meds aterol, for his studies, sharing w his roommate. nkda.\r\npsh/ hosp/ travel/ trauma none.\r\nfh mom has thyroid problems.\r\nsh sex active w girlfriend, no stds, using condoms. smoke none. etoh only weekends. drug only once, 1 mo ago.",adderall use
20,00041_007,0,41,7,['SOB'],['258 261'],[SOB],[258 261],"17 y/o m came to the clinic c/o heart pounding. started 2-3 mo ago. it started suddenly. does not recall any triggering events. it comes and goes, it happened 5-6 times since it started. it lasts 3-4 min, after than just goes away. he has also experiencing sob, pressure on her chest when he has this attack. he is a college student, experiencing some stress recently. \r\ndenies cough, chest pain.\r\nros neg except as above.\r\npmh none. meds aterol, for his studies, sharing w his roommate. nkda.\r\npsh/ hosp/ travel/ trauma none.\r\nfh mom has thyroid problems.\r\nsh sex active w girlfriend, no stds, using condoms. smoke none. etoh only weekends. drug only once, 1 mo ago.",shortness of breath
21,00041_008,0,41,8,[],[],[],[],"17 y/o m came to the clinic c/o heart pounding. started 2-3 mo ago. it started suddenly. does not recall any triggering events. it comes and goes, it happened 5-6 times since it started. it lasts 3-4 min, after than just goes away. he has also experiencing sob, pressure on her chest when he has this attack. he is a college student, experiencing some stress recently. \r\ndenies cough, chest pain.\r\nros neg except as above.\r\npmh none. meds aterol, for his studies, sharing w his roommate. nkda.\r\npsh/ hosp/ travel/ trauma none.\r\nfh mom has thyroid problems.\r\nsh sex active w girlfriend, no stds, using condoms. smoke none. etoh only weekends. drug only once, 1 mo ago.",caffeine use
22,00041_009,0,41,9,['HEART POUNDING'],['32 46'],[HEART POUNDING],[32 46],"17 y/o m came to the clinic c/o heart pounding. started 2-3 mo ago. it started suddenly. does not recall any triggering events. it comes and goes, it happened 5-6 times since it started. it lasts 3-4 min, after than just goes away. he has also experiencing sob, pressure on her chest when he has this attack. he is a college student, experiencing some stress recently. \r\ndenies cough, chest pain.\r\nros neg except as above.\r\npmh none. meds aterol, for his studies, sharing w his roommate. nkda.\r\npsh/ hosp/ travel/ trauma none.\r\nfh mom has thyroid problems.\r\nsh sex active w girlfriend, no stds, using condoms. smoke none. etoh only weekends. drug only once, 1 mo ago.",heart pounding; heart racing
23,00041_010,0,41,10,['STARTED 2-3 MO AGO'],['48 66'],[STARTED 2-3 MO AGO],[48 66],"17 y/o m came to the clinic c/o heart pounding. started 2-3 mo ago. it started suddenly. does not recall any triggering events. it comes and goes, it happened 5-6 times since it started. it lasts 3-4 min, after than just goes away. he has also experiencing sob, pressure on her chest when he has this attack. he is a college student, experiencing some stress recently. \r\ndenies cough, chest pain.\r\nros neg except as above.\r\npmh none. meds aterol, for his studies, sharing w his roommate. nkda.\r\npsh/ hosp/ travel/ trauma none.\r\nfh mom has thyroid problems.\r\nsh sex active w girlfriend, no stds, using condoms. smoke none. etoh only weekends. drug only once, 1 mo ago.",few months duration
24,00041_011,0,41,11,['17 Y/O'],['0 6'],[17 Y/O],[0 6],"17 y/o m came to the clinic c/o heart pounding. started 2-3 mo ago. it started suddenly. does not recall any triggering events. it comes and goes, it happened 5-6 times since it started. it lasts 3-4 min, after than just goes away. he has also experiencing sob, pressure on her chest when he has this attack. he is a college student, experiencing some stress recently. \r\ndenies cough, chest pain.\r\nros neg except as above.\r\npmh none. meds aterol, for his studies, sharing w his roommate. nkda.\r\npsh/ hosp/ travel/ trauma none.\r\nfh mom has thyroid problems.\r\nsh sex active w girlfriend, no stds, using condoms. smoke none. etoh only weekends. drug only once, 1 mo ago.",17 year


In [134]:
dg.loc[15:35,:]

dg.location_list[0][0]

'696 724'

In [135]:
dg.shape

(14300, 10)

In [136]:
features = pd.read_csv(f"{BASE_URL}/features.csv")

In [137]:
features.head()

Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myocardial-infarction
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded


In [138]:
features.shape

(143, 3)

In [139]:
features.groupby('case_num').size()

case_num
0    13
1    13
2    17
3    16
4    10
5    18
6    12
7     9
8    18
9    17
dtype: int64

In [140]:
pd.set_option('display.max_colwidth', None)

In [141]:
features[features['case_num']==0]

Unnamed: 0,feature_num,case_num,feature_text
0,0,0,Family-history-of-MI-OR-Family-history-of-myocardial-infarction
1,1,0,Family-history-of-thyroid-disorder
2,2,0,Chest-pressure
3,3,0,Intermittent-symptoms
4,4,0,Lightheaded
5,5,0,No-hair-changes-OR-no-nail-changes-OR-no-temperature-intolerance
6,6,0,Adderall-use
7,7,0,Shortness-of-breath
8,8,0,Caffeine-use
9,9,0,heart-pounding-OR-heart-racing


In [142]:
features[features['case_num']==6]

Unnamed: 0,feature_num,case_num,feature_text
87,600,6,Subjective-fevers
88,601,6,Male
89,602,6,17-year
90,603,6,Recent-upper-respiratory-symptoms
91,604,6,Worse-with-deep-breath-OR-pleuritic
92,605,6,Exercise-induced-asthma
93,606,6,Chest-pain
94,607,6,Duration-x-1-day
95,608,6,No-shortness-of-breath
96,609,6,Recent-heavy-lifting-at-work-OR-recent-rock-climbing


In [143]:
features[features['case_num']==5]

Unnamed: 0,feature_num,case_num,feature_text
69,500,5,Onset-5-years-ago
70,501,5,Female
71,502,5,No-caffeine-use
72,503,5,Associated-SOB-OR-Associated-shortness-of-breath
73,504,5,Episodes-of-heart-racing
74,505,5,Recent-visit-to-emergency-department-with-negative-workup
75,506,5,No-chest-pain
76,507,5,No-illicit-drug-use
77,508,5,Associated-nausea
78,509,5,Increased-frequency-recently


In [144]:
patient_notes = pd.read_csv(f"{BASE_URL}/patient_notes.csv")

In [145]:
patient_notes.head()

Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student health clinic complaining of heart pounding. Mr. Cleveland's mother has given verbal consent for a history, physical examination, and treatment\r\n-began 2-3 months ago,sudden,intermittent for 2 days(lasting 3-4 min),worsening,non-allev/aggrav\r\n-associated with dispnea on exersion and rest,stressed out about school\r\n-reports fe feels like his heart is jumping out of his chest\r\n-ros:denies chest pain,dyaphoresis,wt loss,chills,fever,nausea,vomiting,pedal edeam\r\n-pmh:non,meds :aderol (from a friend),nkda\r\n-fh:father had MI recently,mother has thyroid dz\r\n-sh:non-smoker,mariguana 5-6 months ago,3 beers on the weekend, basketball at school\r\n-sh:no std"
1,1,0,"17 yo male with recurrent palpitations for the past 3 mo lasting about 3 - 4 min, it happened about 5 - 6 times since the beginning. One time durign a baskeball game two days ago light headedness, pressure in the chest, catching breath, but no fainting. During teh episodes no sweating. No diarrhea, no heat intolerance, no weight loss. Has tried aterol to be able to better concentrate, has received it from his roommate. ."
2,2,0,"Dillon Cleveland is a 17 y.o. male patient with no significant PMH who presents with complaints of heart pounding. This has been going on for a few months and happens once or twice a month. He cannot think of any triggers, and it has occurred both with activity and at rest. Occasionally, it is accompanied by chest pressure but not pain that is located at the center of his chest. On one instance, he experienced chest pressure, lightheadedness and shortness of breath for 10 minutes with the heart pounding. \r\n\r\nOtherwise, he has not had shortness of breath, chest pain, anxiety, \r\n\r\nMedication: adderall twice a week as study aid (not prescribed)\r\nFH: mother - thyroid disease, father - heart attack at age 52, both living\r\nSH: no smoking; 3-4 drinks on the weekend per sitting; marijuanna once recently, no other recreational drugs"
3,3,0,"a 17 yo m c/o palpitation started 3 mos ago; \r\nNOTHING IMPROVES OR EXACERBATES THE SYMPTOMS ACCORDING TO HIM; IT CAN HAPPEN ANY TIME; MAY TAKE A FEW MINUTES; LAST TIME HAPPENED 2 DAYS AGO DURING PLAYING A GAME AND IT WAS ASSOCIATED WITH RETROSTERNAL PRESSURE LIKE DISCOMFORT; AND HE FELT LIGHTHEADED. BUT HE DID NOT LOSE CONCIOUSNESS AND DID NOT FALL. \r\nNOT ASSOCIATED WITH NAUSEA VOMITING; HEADACHE; ABDOMINAL PAIN; CHANGES IN URINATION OR BOWEL HABITS, OR TREMOR OR SKIN OR HAIR CHANGE OR INTOLERABC\r\n\r\nPMH NONE\r\nPSHH: NONE\r\nMEDS: ADEROL TO STAY AWAKE\r\nHOSP: NONE\r\nFH: MOTHER HAS THYROID DISEASE; FATHER HAS CAD X 1 YR\r\nSH: NO SMOKING; DRINKING ON WEEKENDS CAGE 0/4; ONE TIME USE OF MARIJUANA\r\n\r\n\r\n"
4,4,0,"17yo male with no pmh here for evaluation of palpitations. States for the last 3-4mo he has felt that his heart with intermittently ""beat out of his chest,"" with some associated difficulty catching his breath. States that the most recent event was 2 days ago, and during activity at a soccer game. He does not seem to note any specific precipitatinig factors at this time. He also states that he feels as if he will faint during these events, but has not lost consciousness at any point. Furthermore, he does endorse theses attacks occuring 1-2 times a month and peak at 4 mins. He denies any stressors at home. ROS: denies weight loss, fevers, recnet illness, change in bowel habits. PMH: negative, PSH negative, FHX mom with thyroid disorder, dad with heart condition and MI at 52yo. SHX no tobacco, ETOH on weekends, Marijuana tried a month ago. Med: is taking some of roommates Adderoll intermittently (last was 2 days ago prior to event). KNDA"


### 2. Tokenizer Helper Function

In [146]:
hyperparameters = {
    "max_length": 416,
    "padding": "max_length",
    "return_offsets_mapping": True,
    "truncation": "only_second",
    "model_name": "../input/huggingface-bert/bert-base-uncased",
    "dropout": 0.2,
    "lr": 1e-5,
    "test_size": 0.2,
    "seed": 1268,
    "batch_size": 8
}

In [147]:
dg.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,annotation_list,location_list,pn_history,feature_text
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724'],[dad with recent heart attcak],[696 724],"hpi: 17yo m presents with palpitations. patient reports 3-4 months of intermittent episodes of ""heart beating/pounding out of my chest."" 2 days ago during a soccer game had an episode, but this time had chest pressure and felt as if he were going to pass out (did not lose conciousness). of note patient endorses abusing adderall, primarily to study (1-3 times per week). before recent soccer game, took adderrall night before and morning of game. denies shortness of breath, diaphoresis, fevers, chills, headache, fatigue, changes in sleep, changes in vision/hearing, abdominal paun, changes in bowel or urinary habits. \r\npmhx: none\r\nrx: uses friends adderrall\r\nfhx: mom with ""thyroid disease,"" dad with recent heart attcak\r\nall: none\r\nimmunizations: up to date\r\nshx: freshmen in college. endorses 3-4 drinks 3 nights / week (on weekends), denies tabacco, endorses trying marijuana. sexually active with girlfriend x 1 year, uses condoms",family history of mi; family history of myocardial infarction
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693'],"[mom with ""thyroid disease]",[668 693],"hpi: 17yo m presents with palpitations. patient reports 3-4 months of intermittent episodes of ""heart beating/pounding out of my chest."" 2 days ago during a soccer game had an episode, but this time had chest pressure and felt as if he were going to pass out (did not lose conciousness). of note patient endorses abusing adderall, primarily to study (1-3 times per week). before recent soccer game, took adderrall night before and morning of game. denies shortness of breath, diaphoresis, fevers, chills, headache, fatigue, changes in sleep, changes in vision/hearing, abdominal paun, changes in bowel or urinary habits. \r\npmhx: none\r\nrx: uses friends adderrall\r\nfhx: mom with ""thyroid disease,"" dad with recent heart attcak\r\nall: none\r\nimmunizations: up to date\r\nshx: freshmen in college. endorses 3-4 drinks 3 nights / week (on weekends), denies tabacco, endorses trying marijuana. sexually active with girlfriend x 1 year, uses condoms",family history of thyroid disorder
2,00016_002,0,16,2,['chest pressure'],['203 217'],[chest pressure],[203 217],"hpi: 17yo m presents with palpitations. patient reports 3-4 months of intermittent episodes of ""heart beating/pounding out of my chest."" 2 days ago during a soccer game had an episode, but this time had chest pressure and felt as if he were going to pass out (did not lose conciousness). of note patient endorses abusing adderall, primarily to study (1-3 times per week). before recent soccer game, took adderrall night before and morning of game. denies shortness of breath, diaphoresis, fevers, chills, headache, fatigue, changes in sleep, changes in vision/hearing, abdominal paun, changes in bowel or urinary habits. \r\npmhx: none\r\nrx: uses friends adderrall\r\nfhx: mom with ""thyroid disease,"" dad with recent heart attcak\r\nall: none\r\nimmunizations: up to date\r\nshx: freshmen in college. endorses 3-4 drinks 3 nights / week (on weekends), denies tabacco, endorses trying marijuana. sexually active with girlfriend x 1 year, uses condoms",chest pressure
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","['70 91', '176 183']","[intermittent episodes, episode]","[70 91, 176 183]","hpi: 17yo m presents with palpitations. patient reports 3-4 months of intermittent episodes of ""heart beating/pounding out of my chest."" 2 days ago during a soccer game had an episode, but this time had chest pressure and felt as if he were going to pass out (did not lose conciousness). of note patient endorses abusing adderall, primarily to study (1-3 times per week). before recent soccer game, took adderrall night before and morning of game. denies shortness of breath, diaphoresis, fevers, chills, headache, fatigue, changes in sleep, changes in vision/hearing, abdominal paun, changes in bowel or urinary habits. \r\npmhx: none\r\nrx: uses friends adderrall\r\nfhx: mom with ""thyroid disease,"" dad with recent heart attcak\r\nall: none\r\nimmunizations: up to date\r\nshx: freshmen in college. endorses 3-4 drinks 3 nights / week (on weekends), denies tabacco, endorses trying marijuana. sexually active with girlfriend x 1 year, uses condoms",intermittent symptoms
4,00016_004,0,16,4,['felt as if he were going to pass out'],['222 258'],[felt as if he were going to pass out],[222 258],"hpi: 17yo m presents with palpitations. patient reports 3-4 months of intermittent episodes of ""heart beating/pounding out of my chest."" 2 days ago during a soccer game had an episode, but this time had chest pressure and felt as if he were going to pass out (did not lose conciousness). of note patient endorses abusing adderall, primarily to study (1-3 times per week). before recent soccer game, took adderrall night before and morning of game. denies shortness of breath, diaphoresis, fevers, chills, headache, fatigue, changes in sleep, changes in vision/hearing, abdominal paun, changes in bowel or urinary habits. \r\npmhx: none\r\nrx: uses friends adderrall\r\nfhx: mom with ""thyroid disease,"" dad with recent heart attcak\r\nall: none\r\nimmunizations: up to date\r\nshx: freshmen in college. endorses 3-4 drinks 3 nights / week (on weekends), denies tabacco, endorses trying marijuana. sexually active with girlfriend x 1 year, uses condoms",lightheaded


In [148]:
def loc_list_to_ints(loc_list):
    to_return = []
    for loc_str in loc_list:
        loc_strs = loc_str.split(";")
        for loc in loc_strs:
            start, end = loc.split()
            to_return.append((int(start), int(end)))
    return to_return


def tokenize_and_add_labels(tokenizer, data, config):
    out = tokenizer(
        data["feature_text"],
        data["pn_history"],
        truncation=config['truncation'],
        max_length=config['max_length'],
        padding=config['padding'],
        return_offsets_mapping=config['return_offsets_mapping']
    )
    labels = [0.0] * len(out["input_ids"])
    out["location_int"] = loc_list_to_ints(data["location_list"])
    out["sequence_ids"] = out.sequence_ids()

    for idx, (seq_id, offsets) in enumerate(zip(out["sequence_ids"], out["offset_mapping"])):
        if not seq_id or seq_id == 0:
            labels[idx] = -1
            continue

        token_start, token_end = offsets
        for feature_start, feature_end in out["location_int"]:
            if token_start >= feature_start and token_end <= feature_end:
                labels[idx] = 1.0
                break

    out["labels"] = labels

    return out


### 3. Predection and Score Helper Function

In [149]:
from sklearn.metrics import accuracy_score

def get_location_predictions(preds, offset_mapping, sequence_ids, test=False):
    all_predictions = []
    for pred, offsets, seq_ids in zip(preds, offset_mapping, sequence_ids):
        pred = 1 / (1 + np.exp(-pred))
        start_idx = None
        end_idx = None
        current_preds = []
        for pred, offset, seq_id in zip(pred, offsets, seq_ids):
            if seq_id is None or seq_id == 0:
                continue

            if pred > 0.5:
                if start_idx is None:
                    start_idx = offset[0]
                end_idx = offset[1]
            elif start_idx is not None:
                if test:
                    current_preds.append(f"{start_idx} {end_idx}")
                else:
                    current_preds.append((start_idx, end_idx))
                start_idx = None
        if test:
            all_predictions.append("; ".join(current_preds))
        else:
            all_predictions.append(current_preds)
            
    return all_predictions


def calculate_char_cv(predictions, offset_mapping, sequence_ids, labels):
    all_labels = []
    all_preds = []
    for preds, offsets, seq_ids, labels in zip(predictions, offset_mapping, sequence_ids, labels):

        num_chars = max(list(chain(*offsets)))
        char_labels = np.zeros(num_chars)

        for o, s_id, label in zip(offsets, seq_ids, labels):
            if s_id is None or s_id == 0:
                continue
            if int(label) == 1:
                char_labels[o[0]:o[1]] = 1

        char_preds = np.zeros(num_chars)

        for start_idx, end_idx in preds:
            char_preds[start_idx:end_idx] = 1

        all_labels.extend(char_labels)
        all_preds.extend(char_preds)

    results = precision_recall_fscore_support(all_labels, all_preds, average="binary", labels=np.unique(all_preds))
    accuracy = accuracy_score(all_labels, all_preds)
    

    return {
        "Accuracy": accuracy,
        "precision": results[0],
        "recall": results[1],
        "f1": results[2]
    }

# Dataset

In [150]:
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, config):
        self.data = data
        self.tokenizer = tokenizer
        self.config = config

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        data = self.data.iloc[idx]
        tokens = tokenize_and_add_labels(self.tokenizer, data, self.config)

        input_ids = np.array(tokens["input_ids"])
        attention_mask = np.array(tokens["attention_mask"])
        token_type_ids = np.array(tokens["token_type_ids"])

        labels = np.array(tokens["labels"])
        offset_mapping = np.array(tokens['offset_mapping'])
        sequence_ids = np.array(tokens['sequence_ids']).astype("float16")
        
        return input_ids, attention_mask, token_type_ids, labels, offset_mapping, sequence_ids

# Model
- Lets use **BERT** base Architecture
- Also Used 3 FC layers

**Comments:** 3 layers improve accuracy 2% on public score

In [151]:
import torch.nn.functional as F

class CustomModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.bert = AutoModel.from_pretrained(config['model_name'])  # BERT model
        self.dropout = nn.Dropout(p=config['dropout'])
        self.config = config
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, 512)
        self.fc3 = nn.Linear(512, 1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        logits = F.relu(self.fc1(outputs[0]))
        logits = F.relu(self.fc2(self.dropout(logits)))
        logits = self.fc3(self.dropout(logits)).squeeze(-1)
        return logits

## Hyperparameters


In [152]:
hyperparameters = {
    "max_length": 416,
    "padding": "max_length",
    "return_offsets_mapping": True,
    "truncation": "only_second",
    "model_name": "../input/huggingface-bert/bert-base-uncased",
    "dropout": 0.2,
    "lr": 5e-5,
    "test_size": 0.2,
    "seed": 1268,
    "batch_size": 8
}

# Prepare Datasets
Train and Test split: 20%

Total Data:
- Train: 11440
- Test: 2860

In [153]:
train_df = prepare_datasets()

X_train, X_test = train_test_split(train_df, test_size=hyperparameters['test_size'],
                                   random_state=hyperparameters['seed'])


print("Train size", len(X_train))
print("Test Size", len(X_test))

Train size 11440
Test Size 2860


In [154]:
tokenizer = AutoTokenizer.from_pretrained(hyperparameters['model_name'])

training_data = CustomDataset(X_train, tokenizer, hyperparameters)
train_dataloader = DataLoader(training_data, batch_size=hyperparameters['batch_size'], shuffle=True)

test_data = CustomDataset(X_test, tokenizer, hyperparameters)
test_dataloader = DataLoader(test_data, batch_size=hyperparameters['batch_size'], shuffle=False)

In [155]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

model = CustomModel(hyperparameters).to(DEVICE)
criterion = torch.nn.BCEWithLogitsLoss(reduction = "none")
optimizer = optim.AdamW(model.parameters(), lr=hyperparameters['lr'])

Some weights of the model checkpoint at ../input/huggingface-bert/bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [156]:
def train_model(model, dataloader, optimizer, criterion):
        model.train()
        train_loss = []

        for batch in tqdm(dataloader):
            optimizer.zero_grad()
            input_ids = batch[0].to(DEVICE)
            attention_mask = batch[1].to(DEVICE)
            token_type_ids = batch[2].to(DEVICE)
            labels = batch[3].to(DEVICE)

            logits = model(input_ids, attention_mask, token_type_ids)
            loss = criterion(logits, labels)
            # since, we have
            loss = torch.masked_select(loss, labels > -1.0).mean()
            train_loss.append(loss.item() * input_ids.size(0))
            loss.backward()
            # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
            # it's also improve f1 accuracy slightly
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        return sum(train_loss)/len(train_loss)

In [157]:
def eval_model(model, dataloader, criterion):
        model.eval()
        valid_loss = []
        preds = []
        offsets = []
        seq_ids = []
        valid_labels = []

        for batch in tqdm(dataloader):
            input_ids = batch[0].to(DEVICE)
            attention_mask = batch[1].to(DEVICE)
            token_type_ids = batch[2].to(DEVICE)
            labels = batch[3].to(DEVICE)
            offset_mapping = batch[4]
            sequence_ids = batch[5]

            logits = model(input_ids, attention_mask, token_type_ids)
            loss = criterion(logits, labels)
            loss = torch.masked_select(loss, labels > -1.0).mean()
            valid_loss.append(loss.item() * input_ids.size(0))

            preds.append(logits.detach().cpu().numpy())
            offsets.append(offset_mapping.numpy())
            seq_ids.append(sequence_ids.numpy())
            valid_labels.append(labels.detach().cpu().numpy())

        preds = np.concatenate(preds, axis=0)
        offsets = np.concatenate(offsets, axis=0)
        seq_ids = np.concatenate(seq_ids, axis=0)
        valid_labels = np.concatenate(valid_labels, axis=0)
        location_preds = get_location_predictions(preds, offsets, seq_ids, test=False)
        score = calculate_char_cv(location_preds, offsets, seq_ids, valid_labels)

        return sum(valid_loss)/len(valid_loss), score

In [158]:
import time

train_loss_data, valid_loss_data = [], []
score_data_list = []
valid_loss_min = np.Inf
since = time.time()
epochs = 6

In [159]:
best_loss = np.inf

for i in range(epochs):
    print("Epoch: {}/{}".format(i + 1, epochs))
    # first train model
    train_loss = train_model(model, train_dataloader, optimizer, criterion)
    train_loss_data.append(train_loss)
    print(f"Train loss: {train_loss}")
    # evaluate model
    valid_loss, score = eval_model(model, test_dataloader, criterion)
    valid_loss_data.append(valid_loss)
    score_data_list.append(score)
    print(f"Valid loss: {valid_loss}")
    print(f"Valid score: {score}")
    
    if valid_loss < best_loss:
        best_loss = valid_loss
        torch.save(model.state_dict(), "nbme_bert_v2.pth")

    
time_elapsed = time.time() - since
print('Training completed in {:.0f}m {:.0f}s'.format(
    time_elapsed // 60, time_elapsed % 60))

Epoch: 1/6


  0%|          | 0/1430 [00:00<?, ?it/s]

Train loss: 0.2989399148633189


  0%|          | 0/358 [00:00<?, ?it/s]

Valid loss: 0.13819801550175514
Valid score: {'Accuracy': 0.9928767718292614, 'precision': 0.7671881984801073, 'recall': 0.7441678952389211, 'f1': 0.7555027293537595}
Epoch: 2/6


  0%|          | 0/1430 [00:00<?, ?it/s]

Train loss: 0.11511947412273321


  0%|          | 0/358 [00:00<?, ?it/s]

Valid loss: 0.1352871195805452
Valid score: {'Accuracy': 0.9925625494846754, 'precision': 0.6937970517964207, 'recall': 0.8898042956667533, 'f1': 0.7796704618852822}
Epoch: 3/6


  0%|          | 0/1430 [00:00<?, ?it/s]

Train loss: 0.0825679362471678


  0%|          | 0/358 [00:00<?, ?it/s]

Valid loss: 0.12170894964714486
Valid score: {'Accuracy': 0.9933325011072597, 'precision': 0.7384700359016846, 'recall': 0.8502876304454658, 'f1': 0.7904439428141459}
Epoch: 4/6


  0%|          | 0/1430 [00:00<?, ?it/s]

Train loss: 0.06452174109302061


  0%|          | 0/358 [00:00<?, ?it/s]

Valid loss: 0.1270681007762526
Valid score: {'Accuracy': 0.9934440821030923, 'precision': 0.7422511824494314, 'recall': 0.8528604052843061, 'f1': 0.7937208269999059}
Epoch: 5/6


  0%|          | 0/1430 [00:00<?, ?it/s]

Train loss: 0.05133669059936187


  0%|          | 0/358 [00:00<?, ?it/s]

Valid loss: 0.14102070625014618
Valid score: {'Accuracy': 0.9925026976095161, 'precision': 0.6948277437631363, 'recall': 0.8791952129043448, 'f1': 0.7762138709883238}
Epoch: 6/6


  0%|          | 0/1430 [00:00<?, ?it/s]

KeyboardInterrupt: 

### Experimets:
- exp 1
Params: Base bert with 1FC, epoch 5, lr 1e-5

{'Accuracy': 0.9922235313632376, 'precision': 0.699022058288238, 'recall': 0.8327118203104674, 'f1': 0.7600327168148598}

- exp 2:
Params: Base bert with 2FC, epoch 2, lr 1e-5

{'Accuracy': 0.9931995444417273, 'precision': 0.755762387079113, 'recall': 0.7980805365247304, 'f1': 0.7763452047860748}

- exp 3:
params: 2FC, epoch 2, lr 1e-5 with gradient clip
{'Accuracy': 0.9932764968526464, 'precision': 0.7633003963601853, 'recall': 0.7905067499205042, 'f1': 0.7766653886025079}

- exp 4: 3FC, epoch 2, 1e-5 with gradient clip

{'Accuracy': 0.9933637095850213, 'precision': 0.7576469952442715, 'recall': 0.8105397045645073, 'f1': 0.7832013519364255}


In [2]:
dg

NameError: name 'dg' is not defined

In [1]:
from matplotlib import pyplot as plt

plt.plot(train_loss_data, label="Training loss")
plt.plot(valid_loss_data, label="validation loss")
plt.legend(frameon=False)

NameError: name 'train_loss_data' is not defined

In [None]:
import pandas as pd

score_df = pd.DataFrame.from_dict(score_data_list)
score_df.head()

# Prepare For Testing

Load best model

In [None]:
model.load_state_dict(torch.load("nbme_bert_v2.pth", map_location = DEVICE))

In [None]:
def create_test_df():
    feats = pd.read_csv(f"{BASE_URL}/features.csv")
    notes = pd.read_csv(f"{BASE_URL}/patient_notes.csv")
    test = pd.read_csv(f"{BASE_URL}/test.csv")

    merged = test.merge(notes, how = "left")
    merged = merged.merge(feats, how = "left")

    def process_feature_text(text):
        return text.replace("-OR-", ";-").replace("-", " ")
    
    merged["feature_text"] = [process_feature_text(x) for x in merged["feature_text"]]
    
    return merged


class SubmissionDataset(Dataset):
    def __init__(self, data, tokenizer, config):
        self.data = data
        self.tokenizer = tokenizer
        self.config = config
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        example = self.data.loc[idx]
        tokenized = self.tokenizer(
            example["feature_text"],
            example["pn_history"],
            truncation = self.config['truncation'],
            max_length = self.config['max_length'],
            padding = self.config['padding'],
            return_offsets_mapping = self.config['return_offsets_mapping']
        )
        tokenized["sequence_ids"] = tokenized.sequence_ids()

        input_ids = np.array(tokenized["input_ids"])
        attention_mask = np.array(tokenized["attention_mask"])
        token_type_ids = np.array(tokenized["token_type_ids"])
        offset_mapping = np.array(tokenized["offset_mapping"])
        sequence_ids = np.array(tokenized["sequence_ids"]).astype("float16")

        return input_ids, attention_mask, token_type_ids, offset_mapping, sequence_ids


test_df = create_test_df()

submission_data = SubmissionDataset(test_df, tokenizer, hyperparameters)
submission_dataloader = DataLoader(submission_data, batch_size=hyperparameters['batch_size'], shuffle=False)

In [None]:
model.eval()
preds = []
offsets = []
seq_ids = []

for batch in tqdm(submission_dataloader):
    input_ids = batch[0].to(DEVICE)
    attention_mask = batch[1].to(DEVICE)
    token_type_ids = batch[2].to(DEVICE)
    offset_mapping = batch[3]
    sequence_ids = batch[4]

    logits = model(input_ids, attention_mask, token_type_ids)
    
    preds.append(logits.detach().cpu().numpy())
    offsets.append(offset_mapping.numpy())
    seq_ids.append(sequence_ids.numpy())

preds = np.concatenate(preds, axis=0)
offsets = np.concatenate(offsets, axis=0)
seq_ids = np.concatenate(seq_ids, axis=0)

In [None]:
location_preds = get_location_predictions(preds, offsets, seq_ids, test=True)

In [None]:
len(location_preds), len(test_df)

In [None]:
test_df["location"] = location_preds

In [None]:
test_df[["id", "location"]].to_csv("submission.csv", index = False)
pd.read_csv("submission.csv").head()

# Special Credits
- [tomohiroh](https://www.kaggle.com/tomohiroh/nbme-bert-for-beginners)
- [gazu468](https://www.kaggle.com/gazu468/nbme-details-eda)
