In [53]:
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModel
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from tqdm import tqdm
import pandas as pd
import torch

# Download the NLTK 'punkt_tab' tokenizer resource; presumably this is what
# word_tokenize / sent_tokenize load when the length features are computed.
# NOTE(review): the saved output under this cell reports downloading 'punkt',
# not 'punkt_tab', so the cell looks edited since it was last run here.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\josep\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


## Load Medical Reports and Cancer Type Data

In [54]:
# Raw inputs, read from paths relative to the notebook's working directory.
# Later cells read the 'text' and 'patient_filename' columns of `reports`, and
# the 'patient_id' and 'cancer_type' columns of `cancer_type`.
reports = pd.read_csv(r'../../data/tcga_pathology/raw/tcga_pathology_reports.csv')
cancer_type = pd.read_csv(r'../../data/tcga_pathology/raw/tcga_patient_to_cancer_type.csv')

## Length-Based Feature Engineering

In [56]:
# Surface-level size descriptors of each report, stored as new columns on
# `reports`. num_chars counts every character of the raw text (whitespace and
# punctuation included); the other two count NLTK tokens / sentences.
reports['num_chars'] = reports['text'].apply(len)
reports['num_words'] = reports['text'].apply(lambda doc: len(word_tokenize(doc)))
reports['num_sentences'] = reports['text'].apply(lambda doc: len(sent_tokenize(doc)))

# Characters per token. A report that tokenizes to zero words (for example one
# holding only whitespace) would otherwise divide a positive character count by
# zero and store +inf; masking the zero denominators makes the result NaN.
reports['avg_word_length'] = (
    reports['num_chars'] / reports['num_words'].where(reports['num_words'] > 0)
)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\josep\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


## Convert Medical Text Into BERT Text Embeddings

In [40]:
# Checkpoint used for both the tokenizer and the encoder.
model_name = 'emilyalsentzer/Bio_ClinicalBERT'

tokenizer = AutoTokenizer.from_pretrained(model_name)
# eval() returns the module itself, so it can be chained onto the load; it puts
# the encoder in evaluation mode for the inference-only use below.
model = AutoModel.from_pretrained(model_name).eval()

def get_embedding(text):
    """Embed `text` by mean-pooling the encoder's last hidden state.

    The text is tokenized with the module-level `tokenizer` (truncated to at
    most 512 tokens, so anything beyond that does not contribute), passed
    through the module-level `model` without gradient tracking, and the
    per-token vectors are averaged using the attention mask as weights so
    that masked-out positions are excluded from the mean.

    Returns:
        A NumPy array of the pooled vector(s) with all size-1 axes squeezed
        out; for a single input string that removes the batch axis.
    """
    encoded = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

    with torch.no_grad():
        hidden = model(**encoded).last_hidden_state

        # Broadcast the (batch, tokens) mask across the hidden axis so it can
        # weight every component of every token vector.
        weights = encoded['attention_mask'].unsqueeze(-1).expand(hidden.size()).float()
        token_total = (hidden * weights).sum(1)
        # Clamp keeps the denominator non-zero if a row has no attended tokens.
        token_count = weights.sum(1).clamp(min=1e-9)
        pooled = token_total / token_count

    return pooled.squeeze().numpy()

# Register tqdm's pandas integration, which adds `progress_apply` to pandas
# Series and DataFrame objects.
tqdm.pandas()

# One encoder forward pass per report. progress_apply produces the same result
# as apply but shows a progress bar; the plain `.apply` used before never
# triggered the integration registered above.
reports['embeddings'] = reports['text'].progress_apply(get_embedding)


## Flatten Text Embeddings & Merge Datasets

In [41]:
# One row per report, one column per embedding component. The frame is built on
# `reports`' own index so that the index-aligned patient-id assignment below
# pairs every embedding with the report it came from, even if `reports` does
# not carry a default 0..n-1 index (e.g. after filtering or reordering).
# NOTE(review): only the embeddings are carried forward; the length features
# added to `reports` earlier in this notebook are not part of `reports_emb`.
# Confirm that is intended.
reports_emb = pd.DataFrame(
    [emb.flatten() for emb in reports['embeddings'].values],
    index=reports.index,
)

# Patient id is the part of the filename before the first '.'.
reports_emb['patient_id'] = reports['patient_filename'].str.split('.').str[0]

# Default (inner) join: reports whose patient id has no row in `cancer_type`
# are dropped from `data`.
data = reports_emb.merge(cancer_type, on='patient_id')

## Encode Cancer Type

In [43]:
# Learn the cancer-type vocabulary first, then map every row to its integer id.
# `le` is kept at module level so the fitted mapping remains available.
le = LabelEncoder().fit(data['cancer_type'])
data['cancer_labels'] = le.transform(data['cancer_type'])

In [46]:
# Table to export: everything in `data` except the patient identifier and the
# string cancer type (the integer `cancer_labels` column stays). drop() returns
# a new frame, so `data` itself is left untouched.
final_data = data.drop(columns=['patient_id', 'cancer_type'])

In [47]:
# Write `final_data` to the processed-data folder as CSV; index=False leaves
# the DataFrame's row index out of the file.
final_data.to_csv(r'../../data/tcga_pathology/processed/tcga_pathology_reports.csv', index=False)