In [2]:
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModel
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from tqdm import tqdm
import pandas as pd
import torch

nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\jhowert\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

## Load Medical Reports and Cancer Type Data

In [3]:
reports = pd.read_csv(r'../../data/tcga_pathology/raw/tcga_pathology_reports.csv')
cancer_type = pd.read_csv(r'../../data/tcga_pathology/raw/tcga_patient_to_cancer_type.csv')

## Length Based Feature Engineering

In [4]:
reports['num_chars'] = reports['text'].apply(lambda x: len(x))
reports['num_words'] = reports['text'].apply(lambda x: len(word_tokenize(x)))
reports['num_sentences'] = reports['text'].apply(lambda x: len(sent_tokenize(x)))
reports['avg_word_length'] = reports['num_chars'] / reports['num_words']

In [17]:
reports['patient_id'] = reports['patient_filename'].str.split('.').str[0]
combined = reports.merge(cancer_type, on='patient_id')

np.int64(0)

In [38]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# cancer_types list you want to cover
cancer_types = ['READ', 'KIRC', 'PCPG', 'BRCA', 'LUAD', 'LUSC', 'UCS',
                'COAD', 'KIRP', 'LAML', 'BLCA', 'UCEC', 'TGCT', 'UVM',
                'SARC', 'THYM', 'PRAD', 'MESO', 'HNSC', 'OV', 'ACC', 'GBM',
                'STAD', 'PAAD', 'CESC', 'KICH', 'THCA', 'DLBC', 'ESCA',
                'SKCM', 'CHOL', 'LGG', 'LIHC']

top_words_per_cancer = {}
top_ngrams_per_cancer = {}

for ctype in cancer_types:
    subset = combined[combined['cancer_type'] == ctype]

    if subset.empty:
        # Create empty DataFrames if no reports exist
        top_words_per_cancer[ctype] = pd.DataFrame(columns=['word', 'count'])
        top_ngrams_per_cancer[ctype] = pd.DataFrame(columns=['ngram', 'count'])
        continue

    # ---- Unigrams ----
    vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,1))
    X_counts = vectorizer.fit_transform(subset['text'])
    words = vectorizer.get_feature_names_out()
    counts = X_counts.sum(axis=0).A1
    word_freq = pd.DataFrame({'word': words, 'count': counts}).sort_values(by='count', ascending=False)
    top_words_per_cancer[ctype] = word_freq.head(20)

    # ---- Bigrams / Trigrams ----
    vectorizer = CountVectorizer(stop_words='english', ngram_range=(2,3))
    X_ngrams = vectorizer.fit_transform(subset['text'])
    ngrams = vectorizer.get_feature_names_out()
    counts = X_ngrams.sum(axis=0).A1
    ngram_freq = pd.DataFrame({'ngram': ngrams, 'count': counts}).sort_values(by='count', ascending=False)
    top_ngrams_per_cancer[ctype] = ngram_freq.head(20)

# Example: print top 5 words and n-grams for each type
for ctype in cancer_types:
    print(f"--- {ctype} ---")
    print("Top words:", top_words_per_cancer[ctype]['word'].tolist()[:5])
    print("Top n-grams:", top_ngrams_per_cancer[ctype]['ngram'].tolist()[:5])
    print()

--- READ ---
Top words: ['tumor', 'cm', 'margin', 'lymph', 'resection']
Top n-grams: ['lymph nodes', 'lymph node', 'moderately differentiated', 'distal margin', 'resection margin']

--- KIRC ---
Top words: ['tumor', 'renal', 'cm', 'kidney', 'tissue']
Top n-grams: ['lymph nodes', 'adrenal gland', 'renal cell', 'renal vein', 'cell carcinoma']

--- PCPG ---
Top words: ['adrenal', 'tumor', 'cm', 'specimen', 'tissue']
Top n-grams: ['adrenal gland', 'right adrenal', 'medical record', 'measuring cm', 'left adrenal']

--- BRCA ---
Top words: ['cm', 'lymph', 'node', 'breast', 'tumor']
Top n-grams: ['lymph node', 'lymph nodes', 'sentinel lymph', 'left breast', 'sentinel lymph node']

--- LUAD ---
Top words: ['lymph', 'node', 'cm', 'tumor', 'lung']
Top n-grams: ['lymph node', 'lymph nodes', 'upper lobe', 'frozen section', 'measuring cm']

--- LUSC ---
Top words: ['lymph', 'node', 'cm', 'tumor', 'nodes']
Top n-grams: ['lymph node', 'lymph nodes', 'frozen section', 'upper lobe', 'cell carcinoma']



In [39]:
for ctype in cancer_types:
    df = top_ngrams_per_cancer[ctype]
    print(f"Top words for {ctype} are:\n{df.ngram}")

Top words for READ are:
12579                      lymph nodes
12520                       lymph node
14488        moderately differentiated
7202                     distal margin
20172                 resection margin
22096                      soft tissue
17277                         pink tan
24635                       tumor free
13886                     measuring cm
18603                  proximal margin
18958                    radial margin
14918               muscularis propria
4555                         cm distal
22427                specimen received
19759                   regional lymph
20010          representative sections
4676                         cm length
13003                        margin cm
6864     differentiated adenocarcinoma
20220                resection margins
Name: ngram, dtype: object
Top words for KIRC are:
49729                lymph nodes
6639               adrenal gland
71181                 renal cell
72084                 renal vein
15705        

In [40]:
# Collect all unique top words
all_top_words = set()
for df in top_words_per_cancer.values():
    all_top_words.update(df['word'].tolist())

# Collect all unique top n-grams
all_top_ngrams = set()
for df in top_ngrams_per_cancer.values():
    all_top_ngrams.update(df['ngram'].tolist())


In [42]:
import re

for word in all_top_words:
    pattern = r'\b' + re.escape(word) + r'\b'
    combined[f'word_{word}'] = combined['text'].str.contains(pattern, case=False, regex=True).astype(int)

for ngram in all_top_ngrams:
    pattern = re.escape(ngram)
    combined[f'ngram_{ngram}'] = combined['text'].str.contains(pattern, case=False, regex=True).astype(int)

  combined[f'word_{word}'] = combined['text'].str.contains(pattern, case=False, regex=True).astype(int)
  combined[f'word_{word}'] = combined['text'].str.contains(pattern, case=False, regex=True).astype(int)
  combined[f'word_{word}'] = combined['text'].str.contains(pattern, case=False, regex=True).astype(int)
  combined[f'word_{word}'] = combined['text'].str.contains(pattern, case=False, regex=True).astype(int)
  combined[f'word_{word}'] = combined['text'].str.contains(pattern, case=False, regex=True).astype(int)
  combined[f'word_{word}'] = combined['text'].str.contains(pattern, case=False, regex=True).astype(int)
  combined[f'word_{word}'] = combined['text'].str.contains(pattern, case=False, regex=True).astype(int)
  combined[f'word_{word}'] = combined['text'].str.contains(pattern, case=False, regex=True).astype(int)
  combined[f'word_{word}'] = combined['text'].str.contains(pattern, case=False, regex=True).astype(int)
  combined[f'word_{word}'] = combined['text'].str.contains(patte

In [43]:
combined.keys()

Index(['patient_filename', 'text', 'num_chars', 'num_words', 'num_sentences',
       'avg_word_length', 'patient_id', 'cancer_type', 'word_level',
       'word_temporal',
       ...
       'ngram_cystic duct', 'ngram_left kidney', 'ngram_left breast',
       'ngram_received fresh labeled', 'ngram_right upper',
       'ngram_right frontal', 'ngram_cm length', 'ngram_partial nephrectomy',
       'ngram_outer quadrant', 'ngram_measures cm'],
      dtype='object', length=490)

## Convert Medical Text Into Bert Text Embeddings

In [40]:
model_name = 'emilyalsentzer/Bio_ClinicalBERT'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()

def get_embedding(text):
    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state

        mask = inputs['attention_mask'].unsqueeze(-1).expand(embeddings.size()).float()
        summed = torch.sum(embeddings * mask, 1)
        counts = torch.clamp(mask.sum(1), min=1e-9)
        mean_pooled = summed / counts

    return mean_pooled.squeeze().numpy()

tqdm.pandas()

reports['embeddings'] = reports['text'].apply(get_embedding)


## Flatten Text Embeddings & Merge Datasets

In [41]:
reports_emb = pd.DataFrame([emb.flatten() for emb in reports.embeddings.values])
reports_emb['patient_id'] = reports['patient_filename'].str.split('.').str[0]
data = reports_emb.merge(cancer_type, on='patient_id')

## Encode Cancer Type

In [43]:
le = LabelEncoder()
data['cancer_labels'] = le.fit_transform(data['cancer_type'])

In [46]:
final_data = pd.DataFrame(data)
final_data.drop(columns=['patient_id', 'cancer_type',], inplace=True)

In [47]:
final_data.to_csv(r'../../data/tcga_pathology/processed/tcga_pathology_reports.csv', index=False)