In [None]:
# Install NLTK into the kernel's environment, then import it.
%pip install nltk
import nltk

# Ensure nltk data is downloaded
nltk.download('punkt')
nltk.download('punkt_tab')  # Include this line to target the specific missing tokenizer resource

# Smoke test: tokenize one sentence and print the tokens to confirm that
# word_tokenize can find the tokenizer resources downloaded above.
from nltk.tokenize import word_tokenize
test_sentence = "This is a test sentence."
tokens = word_tokenize(test_sentence)
print("Tokens:", tokens)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Tokens: ['This', 'is', 'a', 'test', 'sentence', '.']


In [None]:
# %pip uninstall torch torchtext -y

# Install exact, pinned versions of torch and torchtext.
# NOTE(review): a later cell runs `pip install --upgrade torch`, which replaces
# the torch version pinned here -- confirm which version is actually intended.
%pip install torch==2.0.0 torchtext==0.15.1


Collecting torch==2.0.0
  Downloading torch-2.0.0-cp310-cp310-manylinux1_x86_64.whl.metadata (24 kB)
Collecting torchtext==0.15.1
  Downloading torchtext-0.15.1-cp310-cp310-manylinux1_x86_64.whl.metadata (7.4 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==2.0.0)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==2.0.0)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cuda-cupti-cu11==11.7.101 (from torch==2.0.0)
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu11==8.5.0.96 (from torch==2.0.0)
  Downloading nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu11==11.10.3.66 (from torch==2.0.0)
  Downloading nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Co

In [None]:
# Install torchmetrics and run a minimal BLEU example.
%pip install torchmetrics
from torchmetrics.text import BLEUScore
# preds is a list of candidate sentences (plain strings); target holds, for each
# candidate, a list of reference sentences (also plain strings).
preds = ['the cat is on the mat']
target = [['there is a cat on the mat', 'a cat is on the mat']]
bleu = BLEUScore()
# Last expression of the cell: the resulting score tensor is displayed as output.
bleu(preds, target)

Collecting torchmetrics
  Downloading torchmetrics-1.6.1-py3-none-any.whl.metadata (21 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.11.9-py3-none-any.whl.metadata (5.2 kB)
Downloading torchmetrics-1.6.1-py3-none-any.whl (927 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m927.3/927.3 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.11.9-py3-none-any.whl (28 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.11.9 torchmetrics-1.6.1


tensor(0.7598)

In [None]:
import os
import nltk
from torchmetrics.text import BLEUScore
from nltk.tokenize import word_tokenize

# Ensure nltk resources are available.
# The first cell of this notebook had to fetch 'punkt_tab' in addition to 'punkt'
# for word_tokenize to work, so download both here as well; this keeps the cell
# runnable on its own instead of relying on that earlier cell having been run.
nltk.download('punkt')
nltk.download('punkt_tab')

def load_references(reference_folder: str) -> list[list[str]]:
    """
    Load reference texts from the ``.txt`` files in a folder and tokenize them.

    Files are visited in sorted filename order so that the i-th reference
    lines up with the i-th prediction supplied by the caller. Entries whose
    name does not end in ``.txt`` are skipped.

    Args:
        reference_folder: Directory that contains one reference per ``.txt`` file.

    Returns:
        One token list per file: the file content is stripped, lower-cased and
        passed through ``word_tokenize``.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    references = []
    for filename in sorted(os.listdir(reference_folder)):  # Ensure files are processed in order
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(reference_folder, filename)
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default locale encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read().strip()
        references.append(word_tokenize(content.lower()))  # Lower-case, then tokenize
    return references

def calculate_average_bleu(references: list[list[str]], predictions: list[str]) -> tuple[float, list[float]]:
    """
    Compute a BLEU score for every reference/prediction pair and their mean.

    Args:
        references: One token list per reference (already lower-cased and tokenized).
        predictions: One raw prediction string per reference, in the same order.
            Pairs are formed with ``zip``, so surplus items on either side are ignored.

    Returns:
        ``(average_bleu, bleu_scores)`` where ``bleu_scores`` holds one float per
        pair and ``average_bleu`` is their arithmetic mean (``0.0`` if there are
        no pairs).

    Side effects:
        Prints the tokens, the BLEU score and the 1- to 4-gram overlap for each pair.
    """
    bleu_scores = []
    bleu_calculator = BLEUScore(n_gram=4, smooth=True)  # Enable smoothing for BLEU calculation

    for idx, (ref_tokens, pred) in enumerate(zip(references, predictions)):
        # Tokenize the prediction
        pred_tokens = word_tokenize(pred.lower())

        # Debugging: Output reference and prediction tokens
        print(f"\nPair {idx + 1}:")
        print(f"Reference Tokens: {ref_tokens}")
        print(f"Prediction Tokens: {pred_tokens}")

        # BLEUScore takes a batch of candidate *sentences* (strings) and, per
        # candidate, a list of reference *sentences*. Passing the raw token lists
        # makes every token count as its own sentence, which is why the score
        # came out as 0 even when n-grams clearly overlapped. Re-join the NLTK
        # tokens with single spaces so the metric's whitespace split recovers
        # exactly the same tokens.
        pred_sentence = " ".join(pred_tokens)
        ref_sentence = " ".join(ref_tokens)
        bleu = bleu_calculator([pred_sentence], [[ref_sentence]]).item()  # Extract scalar value
        print(f"BLEU Score: {bleu:.4f}")

        # Debugging: Display n-gram overlaps.
        # The "match count" is the number of *distinct* shared n-grams over the
        # total number of n-grams in the prediction; it is a diagnostic only and
        # is not the clipped precision BLEU itself uses.
        for n in range(1, 5):  # Check 1-gram to 4-gram matches
            ref_ngrams = list(nltk.ngrams(ref_tokens, n))
            pred_ngrams = list(nltk.ngrams(pred_tokens, n))
            overlap = set(pred_ngrams) & set(ref_ngrams)
            print(f"{n}-Gram Overlap: {overlap}")
            print(f"{n}-Gram Match Count: {len(overlap)}/{len(pred_ngrams)}")

        bleu_scores.append(bleu)

    # Calculate the average BLEU score
    average_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0.0
    return average_bleu, bleu_scores

def main(reference_folder: str, predictions: list[str]):
    """
    Load the references from ``reference_folder`` and score ``predictions`` with BLEU.

    Raises ``ValueError`` when the number of loaded references differs from the
    number of predictions. Returns the ``(average, per_pair_scores)`` pair
    produced by ``calculate_average_bleu``.
    """
    references = load_references(reference_folder)

    # Guard: every prediction needs exactly one reference to be paired with.
    if len(predictions) != len(references):
        raise ValueError(f"Number of references ({len(references)}) does not match number of predictions ({len(predictions)})")

    mean_score, per_pair_scores = calculate_average_bleu(references, predictions)
    return mean_score, per_pair_scores

# Example Usage
# Predictions are paired with the reference files by position: main() loads the
# .txt files in sorted filename order and raises ValueError if the counts differ.
predictions = [
    "The lungs are clear. The cardiomediastinal silhouette and hilar contours are normal. The pleural surfaces are normal without effusion or pneumothorax.",
    "The patient is status post median sternotomy and CABG. The heart size is normal. The mediastinal and hilar contours are unremarkable. The pulmonary vascularity is normal. The lungs are clear. No pleural effusion or pneumothorax is present. No acute osseous abnormalities are seen."
]

# Path to your reference folder
reference_folder = "/content/findings"  # Update with the actual path

# Calculate BLEU scores
average_bleu, all_bleu_scores = main(reference_folder, predictions)

# Report the per-pair scores followed by their mean.
print(f"\nIndividual BLEU Scores: {all_bleu_scores}")
print(f"Average BLEU Score: {average_bleu:.4f}")



Pair 1:
Reference Tokens: ['the', 'cardiac', 'silhouette', 'and', 'mediastinum', 'size', 'are', 'within', 'normal', 'limits', '.', 'there', 'is', 'no', 'pulmonary', 'edema', '.', 'there', 'is', 'no', 'focal', 'consolidation', '.', 'there', 'are', 'no', 'xxxx', 'of', 'a', 'pleural', 'effusion', '.', 'there', 'is', 'no', 'evidence', 'of', 'pneumothorax', '.']
Prediction Tokens: ['the', 'lungs', 'are', 'clear', '.', 'the', 'cardiomediastinal', 'silhouette', 'and', 'hilar', 'contours', 'are', 'normal', '.', 'the', 'pleural', 'surfaces', 'are', 'normal', 'without', 'effusion', 'or', 'pneumothorax', '.']
BLEU Score: 0.0000
1-Gram Overlap: {('pleural',), ('silhouette',), ('pneumothorax',), ('and',), ('normal',), ('the',), ('are',), ('effusion',), ('.',)}
1-Gram Match Count: 9/24
2-Gram Overlap: {('silhouette', 'and'), ('pneumothorax', '.')}
2-Gram Match Count: 2/23
3-Gram Overlap: set()
3-Gram Match Count: 0/22
4-Gram Overlap: set()
4-Gram Match Count: 0/21

Pair 2:
Reference Tokens: ['borde

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
pip install rouge_score



In [None]:
import os
from rouge_score import rouge_scorer
from nltk.tokenize import word_tokenize

def load_references(reference_folder: str) -> list[str]:
    """
    Load reference texts from the ``.txt`` files in the given folder.

    Files are visited in sorted filename order so that the i-th reference
    lines up with the i-th prediction supplied by the caller. Entries whose
    name does not end in ``.txt`` are skipped.

    Args:
        reference_folder: Directory that contains one reference per ``.txt`` file.

    Returns:
        The stripped content of each file, one string per file.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    references = []
    for filename in sorted(os.listdir(reference_folder)):  # Ensure files are processed in order
        if not filename.endswith(".txt"):  # Only process .txt files
            continue
        file_path = os.path.join(reference_folder, filename)
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default locale encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            references.append(file.read().strip())  # Add reference as a string
    return references

def calculate_rouge_l_score(references: list[str], predictions: list[str]) -> tuple[float, list[float]]:
    """
    Score each reference/prediction pair with ROUGE-L.

    Pairs are formed positionally with ``zip``. For every pair the precision,
    recall and F-measure are printed; only the F-measure is collected.

    Returns:
        ``(average, f_measures)``: the mean ROUGE-L F-measure (``0.0`` when there
        are no pairs) and the list of per-pair F-measures.
    """
    # Stemming is switched on so inflected forms of a word can still match.
    rouge_l = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    f_measures = []

    for idx, (ref, pred) in enumerate(zip(references, predictions)):
        # RougeScorer.score is called as (target, prediction).
        score = rouge_l.score(ref, pred)["rougeL"]
        f_measures.append(score.fmeasure)

        # Per-pair report
        print(f"\nPair {idx + 1}:")
        print(f"Reference: {ref}")
        print(f"Prediction: {pred}")
        print(f"ROUGE-L Precision: {score.precision:.4f}")
        print(f"ROUGE-L Recall: {score.recall:.4f}")
        print(f"ROUGE-L F-Measure: {score.fmeasure:.4f}")

    # No pairs at all: report 0.0 rather than dividing by zero.
    if not f_measures:
        return 0.0, f_measures
    return sum(f_measures) / len(f_measures), f_measures

def main(reference_folder: str, predictions: list[str]):
    """
    Load the references from ``reference_folder`` and score ``predictions`` with ROUGE-L.

    Raises ``ValueError`` when the number of loaded references differs from the
    number of predictions. Returns the ``(average, per_pair_scores)`` pair
    produced by ``calculate_rouge_l_score``.
    """
    references = load_references(reference_folder)

    # Guard: every prediction needs exactly one reference to be paired with.
    if len(predictions) != len(references):
        raise ValueError(f"Number of references ({len(references)}) does not match number of predictions ({len(predictions)})")

    mean_score, per_pair_scores = calculate_rouge_l_score(references, predictions)
    return mean_score, per_pair_scores

# Example Usage
# Predictions are paired with the reference files by position: main() loads the
# .txt files in sorted filename order and raises ValueError if the counts differ.
predictions = [
    "The lungs are clear. The cardiomediastinal silhouette and hilar contours are normal. The pleural surfaces are normal without effusion or pneumothorax.",
    "The patient is status post median sternotomy and CABG. The heart size is normal. The mediastinal and hilar contours are unremarkable. The pulmonary vascularity is normal. The lungs are clear. No pleural effusion or pneumothorax is present. No acute osseous abnormalities are seen."
]

reference_folder = "/content/findings"  # Folder containing .txt files, one reference per file

# Calculate ROUGE-L scores
average_rouge_l, all_rouge_l_scores = main(reference_folder, predictions)

# Report the per-pair F-measures followed by their mean.
print(f"\nIndividual ROUGE-L Scores: {all_rouge_l_scores}")
print(f"Average ROUGE-L Score: {average_rouge_l:.4f}")



Pair 1:
Reference: The cardiac silhouette and mediastinum size are within normal limits. There is no pulmonary edema. There is no focal consolidation. There are no XXXX of a pleural effusion. There is no evidence of pneumothorax.
Prediction: The lungs are clear. The cardiomediastinal silhouette and hilar contours are normal. The pleural surfaces are normal without effusion or pneumothorax.
ROUGE-L Precision: 0.3810
ROUGE-L Recall: 0.2353
ROUGE-L F-Measure: 0.2909

Pair 2:
Reference: Borderline cardiomegaly. Midline sternotomy XXXX. Enlarged pulmonary arteries. Clear lungs. Inferior XXXX XXXX XXXX.
Prediction: The patient is status post median sternotomy and CABG. The heart size is normal. The mediastinal and hilar contours are unremarkable. The pulmonary vascularity is normal. The lungs are clear. No pleural effusion or pneumothorax is present. No acute osseous abnormalities are seen.
ROUGE-L Precision: 0.0698
ROUGE-L Recall: 0.2143
ROUGE-L F-Measure: 0.1053

Individual ROUGE-L Scores

In [None]:
!pip install --upgrade torch
!pip install --upgrade transformers


Collecting torch
  Downloading torch-2.5.1-cp310-cp310-manylinux1_x86_64.whl.metadata (28 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)




In [None]:
# %pip uninstall bert_score
# Install the bert_score package (unpinned) into the kernel's environment.
%pip install bert_score
import os
from bert_score import score

def load_references(reference_folder: str) -> list[str]:
    """
    Load reference texts from the ``.txt`` files in the given folder.

    Files are visited in sorted filename order so that the i-th reference
    lines up with the i-th prediction supplied by the caller. Entries whose
    name does not end in ``.txt`` are skipped.

    Args:
        reference_folder: Directory that contains one reference per ``.txt`` file.

    Returns:
        The stripped content of each file, one string per file.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    references = []
    for filename in sorted(os.listdir(reference_folder)):  # Ensure files are processed in order
        if not filename.endswith(".txt"):  # Only process .txt files
            continue
        file_path = os.path.join(reference_folder, filename)
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default locale encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            references.append(file.read().strip())  # Add reference as a string
    return references

def calculate_bert_scores(references: list[str], predictions: list[str], model: str = "microsoft/deberta-xlarge-mnli") -> tuple[float, list[float]]:
    """
    Score reference/prediction pairs with BERTScore and report the F1 values.

    ``score`` is called once for the whole batch with the predictions as
    candidates and ``model`` as ``model_type``. Only the F1 output is kept;
    precision and recall are discarded.

    Returns:
        ``(average, f1_values)``: the mean F1 (``0.0`` when there are no scores)
        and the per-pair F1 values as plain floats.
    """
    _precision, _recall, f1_values = score(predictions, references, model_type=model, lang="en", verbose=True)

    # Unwrap each element of the F1 result into a plain Python number.
    bert_scores = list(map(lambda element: element.item(), f1_values))

    if bert_scores:
        average_bert_score = sum(bert_scores) / len(bert_scores)
    else:
        average_bert_score = 0.0

    # Per-pair report
    for idx, (ref, pred, f1) in enumerate(zip(references, predictions, bert_scores)):
        print(f"\nPair {idx + 1}:")
        print(f"Reference: {ref}")
        print(f"Prediction: {pred}")
        print(f"BERTScore F1: {f1:.4f}")

    return average_bert_score, bert_scores

def main(reference_folder: str, predictions: list[str]):
    """
    Load the references from ``reference_folder`` and score ``predictions`` with BERTScore.

    Raises ``ValueError`` when the number of loaded references differs from the
    number of predictions. Returns the ``(average, per_pair_scores)`` pair
    produced by ``calculate_bert_scores``.
    """
    references = load_references(reference_folder)

    # Guard: every prediction needs exactly one reference to be paired with.
    if len(predictions) != len(references):
        raise ValueError(f"Number of references ({len(references)}) does not match number of predictions ({len(predictions)})")

    mean_score, per_pair_scores = calculate_bert_scores(references, predictions)
    return mean_score, per_pair_scores

# Example Usage
# Predictions are paired with the reference files by position: main() loads the
# .txt files in sorted filename order and raises ValueError if the counts differ.
predictions = [
    "The lungs are clear. The cardiomediastinal silhouette and hilar contours are normal. The pleural surfaces are normal without effusion or pneumothorax.",
    "The patient is status post median sternotomy and CABG. The heart size is normal. The mediastinal and hilar contours are unremarkable. The pulmonary vascularity is normal. The lungs are clear. No pleural effusion or pneumothorax is present. No acute osseous abnormalities are seen."
]

reference_folder = "/content/findings"  # Folder containing .txt files, one reference per file

# Calculate BERTScore
average_bert, all_bert_scores = main(reference_folder, predictions)

# Report the per-pair F1 values followed by their mean.
print(f"\nIndividual BERTScores: {all_bert_scores}")
print(f"Average BERTScore: {average_bert:.4f}")




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


pytorch_model.bin:   0%|          | 0.00/3.04G [00:00<?, ?B/s]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 6.89 seconds, 0.29 sentences/sec

Pair 1:
Reference: The cardiac silhouette and mediastinum size are within normal limits. There is no pulmonary edema. There is no focal consolidation. There are no XXXX of a pleural effusion. There is no evidence of pneumothorax.
Prediction: The lungs are clear. The cardiomediastinal silhouette and hilar contours are normal. The pleural surfaces are normal without effusion or pneumothorax.
BERTScore F1: 0.7309

Pair 2:
Reference: Borderline cardiomegaly. Midline sternotomy XXXX. Enlarged pulmonary arteries. Clear lungs. Inferior XXXX XXXX XXXX.
Prediction: The patient is status post median sternotomy and CABG. The heart size is normal. The mediastinal and hilar contours are unremarkable. The pulmonary vascularity is normal. The lungs are clear. No pleural effusion or pneumothorax is present. No acute osseous abnormalities are seen.
BERTScore F1: 0.5355

Individual BERTScores: [0.7309274077415466, 0.5355066657066345]
Average BERTScore: 0.6332


In [None]:
# No visible cell of this notebook imports torch, yet the class statement below
# needs torch.nn.Module at definition time; import it here so the cell also
# works on a freshly restarted kernel.
import torch


class CheXbertModel(torch.nn.Module):
    """
    Multi-label text classifier: a ``bert-base-uncased`` encoder followed by a
    single linear layer that maps the pooled output to ``num_labels`` logits.

    Args:
        num_labels: Number of output logits (defaults to 14 conditions).
    """

    def __init__(self, num_labels=14):  # Assuming 14 conditions
        # BertModel is likewise not imported by any visible cell; import it
        # where it is needed so constructing the model does not depend on
        # hidden kernel state.
        from transformers import BertModel

        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.classifier = torch.nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the raw (pre-sigmoid) logits for a tokenized batch."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.classifier(outputs.pooler_output)  # Use pooled output for classification
        return logits

# BertTokenizer is not imported in any visible cell; import it here so this cell
# does not depend on hidden kernel state.
from transformers import BertTokenizer

CHEXBERT_PATH = "chexbert.pth"  # Update this path
model = CheXbertModel(num_labels=14)  # Assuming 14 conditions
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints from a source you trust.
checkpoint = torch.load(CHEXBERT_PATH, map_location=torch.device("cpu"))
# strict=False skips every key that does not match the model instead of raising,
# so a checkpoint with a different layout would load "successfully" while leaving
# the weights untouched. Surface any mismatch so the scores are not trusted blindly.
load_result = model.load_state_dict(checkpoint, strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print(
        "Warning: checkpoint does not fully match CheXbertModel "
        f"({len(load_result.missing_keys)} missing, "
        f"{len(load_result.unexpected_keys)} unexpected keys); "
        "unmatched weights keep their initial values."
    )
model.eval()

# Define tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

from torch.nn import Sigmoid

def get_predictions(text, model):
    """
    Run ``model`` on ``text`` and return binary multi-label predictions.

    The text is encoded with the module-level ``tokenizer`` (padded, truncated
    to at most 512 tokens), the model is called without gradient tracking, and
    each logit is turned into 1 when its sigmoid exceeds 0.5 and 0 otherwise.

    Returns:
        An integer tensor of 0/1 values with the same shape as the model's logits.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    input_ids = encoded["input_ids"]
    attention_mask = encoded["attention_mask"]

    with torch.no_grad():
        raw_logits = model(input_ids=input_ids, attention_mask=attention_mask)

    probabilities = raw_logits.sigmoid()
    above_threshold = probabilities > 0.5  # Multi-label binary predictions
    return above_threshold.int()

from sklearn.metrics import f1_score

# Example data
# reference_text = ["The cardiac silhouette and mediastinum size are within normal limits. There is no pulmonary edema. There is no focal consolidation. There are no XXXX of a pleural effusion. There is no evidence of pneumothorax."]
# prediction_text = ["The lungs are clear. The cardiomediastinal silhouette and hilar contours are normal. The pleural surfaces are normal without effusion or pneumothorax."]

reference_text = ["The lungs are clear"]
prediction_text = ["The lungs are not clear"]

# Label both texts with the same classifier. Note that the "reference" labels are
# therefore model outputs too, not human ground truth: the F1 below measures how
# well the two label vectors agree.
prediction = get_predictions(prediction_text, model)
predicted_labels = prediction.tolist()

ref = get_predictions(reference_text, model)
ref_labels = ref.tolist()

print(predicted_labels)
print(ref_labels)
# Compute F1 Score
# With a single sample most labels have no positive example at all, which makes
# their per-label F1 undefined. zero_division=0 states explicitly that such
# labels count as 0 (the same value the default falls back to) and avoids the
# undefined-metric warning scikit-learn otherwise emits.
f1 = f1_score(ref_labels, predicted_labels, average="weighted", zero_division=0)  # Use "macro" or "weighted"
print(f"F1-Chexbert Score: {f1:.4f}")

  checkpoint = torch.load(CHEXBERT_PATH, map_location=torch.device("cpu"))


[[1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0]]
[[1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0]]
F1-Chexbert Score: 1.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
!pip install torch
!pip install transformers
!pip install scikit-learn
!pip install numpy
!pip install appdirs
!pip install f1chexbert



In [None]:
from f1chexbert import F1CheXbert
# Build the scorer object that the next cell calls.
# NOTE(review): in the recorded run this line raised FileNotFoundError for
# '/root/.cache/chexbert/chexbert.pth', so `f1chexbert` was never bound.
# Presumably the package expects that checkpoint in its cache directory --
# TODO confirm how it is supposed to be obtained before re-running.
f1chexbert = F1CheXbert()

FileNotFoundError: [Errno 2] No such file or directory: '/root/.cache/chexbert/chexbert.pth'

In [None]:
# Score five generated reports (hyps) against their references (refs); the two
# lists are the same length and are presumably paired by position -- TODO confirm.
# The call's result is unpacked into four values.
# NOTE(review): this depends on `f1chexbert` from the previous cell; in the
# recorded run that cell failed, so this one raised NameError.
accuracy, accuracy_not_averaged, class_report, class_report_5 = f1chexbert(
    hyps=['No pleural effusion. Normal heart size.',
          'Normal heart size.',
          'Increased mild pulmonary edema and left basal atelectasis.',
          'Bilateral lower lobe bronchiectasis with improved right lower medial lung peribronchial consolidation.',
          'Elevated left hemidiaphragm and blunting of the left costophrenic angle although no definite evidence of pleural effusion seen on the lateral view.',
          ],
    refs=['No pleural effusions.',
          'Enlarged heart.',
          'No evidence of pneumonia. Stable cardiomegaly.',
          'Bilateral lower lobe bronchiectasis with improved right lower medial lung peribronchial consolidation.',
          'No acute cardiopulmonary process. No significant interval change. Please note that peribronchovascular ground-glass opacities at the left greater than right lung bases seen on the prior chest CT of ___ were not appreciated on prior chest radiography on the same date and may still be present. Additionally, several pulmonary nodules measuring up to 3 mm are not not well appreciated on the current study-CT is more sensitive.'
          ])

NameError: name 'f1chexbert' is not defined