
**Reproduction of paper "Do We Still Need Clinical Language Models?"**


Project work for CS 598 Deep Learning for Healthcare, UIUC, Spring 2025. We are reproducing the paper [Do We Still Need Clinical Language Models?](https://arxiv.org/pdf/2302.08091).


In [None]:
print("Reproduction of the paper \'Do we still need CLinical Language Models\'")

Reproduction of the paper 'Do we still need CLinical Language Models'


In [None]:
# Connect to google drive and mount the filesystems
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Step 1: Environment Setup and Configuration

### 1.1 Import Dependencies

In [None]:
# Install required libraries
!pip install transformers



In [None]:
# Connect to Google Drive and mount the filesystem
from google.colab import drive
drive.mount('/content/drive')

# Import necessary libraries
import os
import json
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW  # Import AdamW from torch instead
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from transformers import (
    T5ForConditionalGeneration, T5Tokenizer,
    RobertaForSequenceClassification, RobertaTokenizer,
    AutoModelForSequenceClassification, AutoTokenizer,
    get_linear_schedule_with_warmup
)
import logging
import time
from tqdm.notebook import tqdm
import gc

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### 1.2 Set Random Seeds for Reproducibility

In [None]:
# Set random seeds for reproducibility
def set_seed(seed):
    """Seed Python, NumPy and PyTorch random number generators.

    When CUDA is available, all CUDA devices are seeded as well and cuDNN is
    switched to deterministic mode with auto-tuning (benchmark) disabled.

    Args:
        seed: Integer seed applied to every generator.
    """
    # CPU-side generators share the same call shape, so seed them in one pass
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed_all(seed)
    cudnn_backend = torch.backends.cudnn
    cudnn_backend.deterministic = True
    cudnn_backend.benchmark = False

# Seed every generator once, up front, with a fixed value (42)
SEED = 42
set_seed(SEED)

### 1.3 Configure Paths and Validate Directory Structure

In [None]:
# Base paths: everything lives under one project folder on the mounted Drive
BASE_DIR = '/content/drive/MyDrive/DL4H-Project'
DATA_DIR = os.path.join(BASE_DIR, 'data')
MODELS_DIR = os.path.join(BASE_DIR, 'models')
RESULTS_DIR = os.path.join(BASE_DIR, 'results')

# Dataset paths: one sub-folder of DATA_DIR per task
MEDNLI_DIR = os.path.join(DATA_DIR, 'mednli')
RADQA_DIR = os.path.join(DATA_DIR, 'radqa')
CLIP_DIR = os.path.join(DATA_DIR, 'clip')

# Model paths: pretrained checkpoints and fine-tuned outputs are kept apart
PRETRAINED_DIR = os.path.join(MODELS_DIR, 'pretrained')
FINETUNED_DIR = os.path.join(MODELS_DIR, 'finetuned')

# Constants
# Model keys used throughout the notebook (they match the keys of MODEL_SPECS)
MODEL_NAMES = [
    't5-base',
    't5-large',
    'roberta-large',
    # Originally labelled "BioClinRoBERTa", but MODEL_SPECS points this key at
    # emilyalsentzer/Bio_ClinicalBERT. NOTE(review): confirm which is intended.
    'bio-clinical-bert',
    'gatortron'
]

# Task keys; each is also the name of its folder under DATA_DIR
TASK_NAMES = ['mednli', 'radqa', 'clip']
# Training-set size labels; used as the last path component by get_dataset_path
# and get_finetuned_model_path
DATA_PERCENTAGES = ['full', '25pct', '10pct', '5pct', '1pct']

# Helper function to validate directory exists
def validate_dir(directory):
    """Report whether ``directory`` exists and is a directory.

    Args:
        directory: Path to check.

    Returns:
        True if the path is an existing directory. Otherwise a warning is
        printed and False is returned.
    """
    # isdir rather than exists: a stray regular file that happens to carry the
    # expected name must not be accepted as the required directory.
    if os.path.isdir(directory):
        return True

    print(f"Warning: Directory {directory} does not exist")
    return False

# Check every directory the notebook relies on and report the overall result
print("Validating directory structure...")
directories_valid = True

# Top-level project folders
for dir_path in (DATA_DIR, MODELS_DIR, RESULTS_DIR):
    # validate_dir is evaluated first so each missing folder prints its warning
    directories_valid = validate_dir(dir_path) and directories_valid

# One data folder per task
for task in TASK_NAMES:
    task_dir = os.path.join(DATA_DIR, task)
    directories_valid = validate_dir(task_dir) and directories_valid

summary = (
    "✅ All required directories are present"
    if directories_valid
    else "⚠️ Some directories are missing - please check the warnings above"
)
print(summary)

# Helper function to get paths
def get_dataset_path(task, percentage='full'):
    """Build the dataset directory path for a task and training-set size.

    Args:
        task: Task folder name under DATA_DIR.
        percentage: Sub-folder naming the data fraction (default ``'full'``).

    Returns:
        ``DATA_DIR/<task>/<percentage>`` as a string.
    """
    path_parts = (DATA_DIR, task, percentage)
    return os.path.join(*path_parts)

def get_pretrained_model_path(model_name):
    """Build the directory path of a pretrained model.

    Models are grouped into a ``general`` or ``clinical`` sub-folder of
    PRETRAINED_DIR; any name not listed as general is treated as clinical.

    Args:
        model_name: Model key, also used as the final path component.

    Returns:
        ``PRETRAINED_DIR/<general|clinical>/<model_name>`` as a string.
    """
    general_models = ('t5-base', 't5-large', 'roberta-large')
    domain_folder = 'general' if model_name in general_models else 'clinical'
    return os.path.join(PRETRAINED_DIR, domain_folder, model_name)

def get_finetuned_model_path(model_name, task, percentage='full'):
    """Build the directory path of a fine-tuned model.

    Args:
        model_name: Model key.
        task: Task the model was fine-tuned on.
        percentage: Data-fraction label (default ``'full'``).

    Returns:
        ``FINETUNED_DIR/<task>/<model_name>/<percentage>`` as a string.
    """
    path_parts = (FINETUNED_DIR, task, model_name, percentage)
    return os.path.join(*path_parts)

Validating directory structure...
✅ All required directories are present


### 1.4 Setup Utility Functions

In [None]:
# Configure logging
def setup_logger(name, log_file, level=logging.INFO):
    """Create (or reconfigure) a named logger that writes to a file and the console.

    The function is safe to call repeatedly with the same ``name`` (for
    example when a notebook cell is re-run): handlers attached by an earlier
    call are removed first, so each record is emitted exactly once per
    destination.

    Args:
        name: Logger name passed to ``logging.getLogger``.
        log_file: Path of the log file records are appended to.
        level: Minimum level handled by the logger (default ``logging.INFO``).

    Returns:
        The configured ``logging.Logger``.
    """
    logger = logging.getLogger(name)
    logger.setLevel(level)

    # logging.getLogger returns the same object for the same name, so handlers
    # from a previous call would otherwise pile up and duplicate every record.
    # Closing them also releases the previously opened log file.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # UTF-8 so non-ASCII log messages are written regardless of the locale
    file_handler = logging.FileHandler(log_file, encoding='utf-8')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    # Console handler so logs are visible in the notebook output
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    # This logger already has its own console handler; letting records also
    # propagate to the root logger prints each message a second time.
    logger.propagate = False

    return logger

# Create the experiment logger; its file lives at the top of the project folder.
# The module-level name `logger` is used by the helpers defined below.
log_path = os.path.join(BASE_DIR, 'experiment.log')
logger = setup_logger('experiment_logger', log_path)
logger.info("Starting experiment: Reproducing 'Do We Still Need Clinical Language Models?'")

# Memory management
def clean_memory():
    """Run the garbage collector, then release PyTorch's cached CUDA memory."""
    # Collect first so tensors that are no longer referenced are freed before
    # the CUDA cache is emptied.
    for release in (gc.collect, torch.cuda.empty_cache):
        release()

# Check hardware
def check_hardware():
    """Select the compute device and log what was found.

    Returns:
        ``torch.device("cuda")`` when CUDA is available, else
        ``torch.device("cpu")``. For CUDA, the name of device 0 and its
        allocated/reserved memory (in MB) are logged as well.
    """
    backend = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(backend)
    logger.info(f"Using device: {device}")

    # Nothing more to report on CPU
    if device.type != "cuda":
        return device

    bytes_per_mb = 1024**2
    logger.info(f"CUDA device: {torch.cuda.get_device_name(0)}")
    allocated_mb = torch.cuda.memory_allocated(0) / bytes_per_mb
    logger.info(f"CUDA memory allocated: {allocated_mb:.2f} MB")
    reserved_mb = torch.cuda.memory_reserved(0) / bytes_per_mb
    logger.info(f"CUDA memory cached: {reserved_mb:.2f} MB")

    return device

# Pick the device once at setup time (this also logs the hardware details)
device = check_hardware()

# Experiment tracking
class ExperimentTracker:
    """In-memory record of experiment runs that can be dumped to JSON.

    Every run is stored in ``self.results`` under the id
    ``"<model_name>_<task>_<data_percentage>"``. Methods that take an
    ``experiment_id`` silently do nothing when the id is unknown.
    """

    def __init__(self, base_path):
        # base_path is the directory save_results() writes into
        self.results = {}
        self.start_time = None
        self.base_path = base_path

    def start_experiment(self, model_name, task, data_percentage):
        """Register a run as "running", stamp its start time, and return its id.

        Starting a run whose id already exists replaces the earlier record.
        """
        experiment_id = "{}_{}_{}".format(model_name, task, data_percentage)
        self.results[experiment_id] = dict(
            model=model_name,
            task=task,
            data_percentage=data_percentage,
            training_metrics={},
            eval_metrics={},
            status="running",
            start_time=time.time(),
        )
        logger.info(f"Started experiment: {experiment_id}")
        return experiment_id

    def update_training_metrics(self, experiment_id, epoch, metrics):
        """Store ``metrics`` for ``epoch`` of a known experiment."""
        record = self.results.get(experiment_id)
        if record is None:
            return
        record["training_metrics"][epoch] = metrics

    def update_eval_metrics(self, experiment_id, metrics):
        """Replace the evaluation metrics of a known experiment."""
        record = self.results.get(experiment_id)
        if record is None:
            return
        record["eval_metrics"] = metrics

    def complete_experiment(self, experiment_id):
        """Mark a known experiment as completed and record its duration in seconds."""
        record = self.results.get(experiment_id)
        if record is None:
            return
        record["status"] = "completed"
        record["end_time"] = time.time()
        record["duration"] = record["end_time"] - record["start_time"]
        logger.info(f"Completed experiment: {experiment_id} (Duration: {record['duration']:.2f}s)")

    def save_results(self):
        """Write all recorded results to ``experiment_results.json`` in ``base_path``."""
        results_path = os.path.join(self.base_path, "experiment_results.json")
        with open(results_path, "w") as out_file:
            json.dump(self.results, out_file, indent=2)
        logger.info(f"Saved experiment results to {results_path}")

# Create the experiment tracker; its JSON results file is written into RESULTS_DIR
tracker = ExperimentTracker(RESULTS_DIR)

2025-04-08 18:55:57,314 - experiment_logger - INFO - Starting experiment: Reproducing 'Do We Still Need Clinical Language Models?'
INFO:experiment_logger:Starting experiment: Reproducing 'Do We Still Need Clinical Language Models?'
2025-04-08 18:55:57,701 - experiment_logger - INFO - Using device: cuda
INFO:experiment_logger:Using device: cuda
2025-04-08 18:55:57,740 - experiment_logger - INFO - CUDA device: Tesla T4
INFO:experiment_logger:CUDA device: Tesla T4
2025-04-08 18:55:57,742 - experiment_logger - INFO - CUDA memory allocated: 0.00 MB
INFO:experiment_logger:CUDA memory allocated: 0.00 MB
2025-04-08 18:55:57,743 - experiment_logger - INFO - CUDA memory cached: 0.00 MB
INFO:experiment_logger:CUDA memory cached: 0.00 MB


### 1.5 Validate Dataset Availability

In [None]:
# Check dataset availability
def _file_size_mb(file_path):
    """Return the size of ``file_path`` in megabytes (1024 * 1024 bytes)."""
    return os.path.getsize(file_path) / (1024 * 1024)


def _count_lines(file_path):
    """Count the lines of a text file without loading it all into memory."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return sum(1 for _ in f)


def _gather_file_stats(dataset, file_type, file_path):
    """Build the statistics entry for one existing dataset file.

    Returns a dict with ``file_size_mb`` plus either ``samples`` or ``ids``.
    Read errors on MedNLI/RadQA files propagate to the caller; unreadable CLIP
    CSV files are reported with a count of ``'unknown'``.
    """
    size_mb = _file_size_mb(file_path)

    if dataset == "mednli":
        # JSONL format: one line per sample
        return {'samples': _count_lines(file_path), 'file_size_mb': size_mb}

    if dataset == "radqa":
        # JSON format: count the paragraphs of every article under "data"
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        samples = sum(len(article['paragraphs']) for article in data['data'])
        return {'samples': samples, 'file_size_mb': size_mb}

    if dataset == "clip":
        # CSV format: the main file reports rows as samples, split files as ids
        count_key = 'samples' if file_type == "sentence_level" else 'ids'
        try:
            count = len(pd.read_csv(file_path))
        except Exception:
            # Best effort: the file exists but could not be parsed as CSV
            count = 'unknown'
        return {count_key: count, 'file_size_mb': size_mb}

    raise ValueError(f"Unknown dataset: {dataset}")


def check_dataset_availability():
    """Check that all required dataset files exist and log basic statistics.

    Returns:
        A tuple ``(dataset_availability, dataset_stats)``:

        * ``dataset_availability[dataset][file_type]`` is True when the file
          exists.
        * ``dataset_stats[dataset][file_type]`` holds ``file_size_mb`` and a
          ``samples`` or ``ids`` count for each existing file. If gathering
          the statistics fails, ``samples`` is ``'error'`` and ``error``
          carries the exception text. Missing files have no entry.
    """
    dataset_files = {
        "mednli": {
            "train": os.path.join(MEDNLI_DIR, "mli_train_v1.jsonl"),
            "dev": os.path.join(MEDNLI_DIR, "mli_dev_v1.jsonl"),
            "test": os.path.join(MEDNLI_DIR, "mli_test_v1.jsonl")
        },
        "radqa": {
            "train": os.path.join(RADQA_DIR, "train.json"),
            "dev": os.path.join(RADQA_DIR, "dev.json"),
            "test": os.path.join(RADQA_DIR, "test.json")
        },
        "clip": {
            # One main data file plus one ID file per split
            "sentence_level": os.path.join(CLIP_DIR, "sentence_level.csv"),
            "train_ids": os.path.join(CLIP_DIR, "train_ids.csv"),
            "val_ids": os.path.join(CLIP_DIR, "val_ids.csv"),
            "test_ids": os.path.join(CLIP_DIR, "test_ids.csv")
        }
    }

    dataset_availability = {}
    dataset_stats = {}

    for dataset, files in dataset_files.items():
        availability = dataset_availability[dataset] = {}
        file_stats = dataset_stats[dataset] = {}

        for file_type, file_path in files.items():
            availability[file_type] = os.path.exists(file_path)
            if not availability[file_type]:
                continue

            try:
                file_stats[file_type] = _gather_file_stats(dataset, file_type, file_path)
            except Exception as e:
                # Keep going: one unreadable file must not abort the whole check
                file_stats[file_type] = {
                    'samples': 'error',
                    'file_size_mb': _file_size_mb(file_path) if os.path.exists(file_path) else 0,
                    'error': str(e)
                }

    # Log availability
    all_available = True
    for dataset, availability in dataset_availability.items():
        if all(availability.values()):
            logger.info(f"✅ {dataset.upper()} dataset is fully available")
            for file_type, stats in dataset_stats[dataset].items():
                if 'samples' in stats:
                    logger.info(f"   - {file_type}: {stats['samples']} samples ({stats['file_size_mb']:.2f} MB)")
                elif 'ids' in stats:
                    logger.info(f"   - {file_type}: {stats['ids']} IDs ({stats['file_size_mb']:.2f} MB)")
                if 'error' in stats:
                    # Surface the recorded failure instead of leaving it buried in the stats
                    logger.warning(f"     Could not read {file_type}: {stats['error']}")
        else:
            all_available = False
            logger.warning(f"⚠️ {dataset.upper()} dataset is missing some files: {availability}")
            for file_type, exists in availability.items():
                if not exists:
                    logger.warning(f"   - Missing: {dataset_files[dataset][file_type]}")

    if all_available:
        print("✅ All required dataset files are available")
    else:
        print("⚠️ Some dataset files are missing - see logs for details")

    return dataset_availability, dataset_stats

# Run the check once; both results are module-level values read by later cells
dataset_availability, dataset_stats = check_dataset_availability()

2025-04-08 18:56:07,550 - experiment_logger - INFO - ✅ MEDNLI dataset is fully available
INFO:experiment_logger:✅ MEDNLI dataset is fully available
2025-04-08 18:56:07,552 - experiment_logger - INFO -    - train: 11232 samples (10.52 MB)
INFO:experiment_logger:   - train: 11232 samples (10.52 MB)
2025-04-08 18:56:07,553 - experiment_logger - INFO -    - dev: 1395 samples (1.35 MB)
INFO:experiment_logger:   - dev: 1395 samples (1.35 MB)
2025-04-08 18:56:07,555 - experiment_logger - INFO -    - test: 1422 samples (1.30 MB)
INFO:experiment_logger:   - test: 1422 samples (1.30 MB)
2025-04-08 18:56:07,558 - experiment_logger - INFO - ✅ RADQA dataset is fully available
INFO:experiment_logger:✅ RADQA dataset is fully available
2025-04-08 18:56:07,559 - experiment_logger - INFO -    - train: 1606 samples (2.94 MB)
INFO:experiment_logger:   - train: 1606 samples (2.94 MB)
2025-04-08 18:56:07,560 - experiment_logger - INFO -    - dev: 204 samples (0.39 MB)
INFO:experiment_logger:   - dev: 204 sa

✅ All required dataset files are available


### 1.6 Define Model Mapping and Specs - Updated for CLIP Structure

In [None]:
# Model specifications, keyed by the names in MODEL_NAMES.
# Per entry:
#   name        - the key itself
#   source      - Hugging Face Hub repository id to load from
#   parameters  - approximate parameter count of the checkpoint in `source`
#   type        - 'encoder-decoder' or 'encoder-only'
#   domain      - 'general' or 'clinical'
#   tokenizer / model_class - transformers classes used to load the checkpoint
MODEL_SPECS = {
    't5-base': {
        'name': 't5-base',
        # The original T5 checkpoints live under the "google-t5" organisation
        'source': 'google-t5/t5-base',
        'parameters': 220_000_000,
        'type': 'encoder-decoder',
        'domain': 'general',
        'tokenizer': T5Tokenizer,
        'model_class': T5ForConditionalGeneration
    },
    't5-large': {
        'name': 't5-large',
        'source': 'google-t5/t5-large',
        'parameters': 770_000_000,
        'type': 'encoder-decoder',
        'domain': 'general',
        'tokenizer': T5Tokenizer,
        'model_class': T5ForConditionalGeneration
    },
    'roberta-large': {
        'name': 'roberta-large',
        'source': 'roberta-large',
        'parameters': 345_000_000,
        'type': 'encoder-only',
        'domain': 'general',
        'tokenizer': RobertaTokenizer,
        'model_class': RobertaForSequenceClassification
    },
    'bio-clinical-bert': {
        'name': 'bio-clinical-bert',
        'source': 'emilyalsentzer/Bio_ClinicalBERT',
        # Bio_ClinicalBERT is a BERT-base sized model (~110M parameters), not
        # a 345M one like the large models listed here.
        'parameters': 110_000_000,
        'type': 'encoder-only',
        'domain': 'clinical',
        'tokenizer': AutoTokenizer,
        'model_class': AutoModelForSequenceClassification
    },
    'gatortron': {
        'name': 'gatortron',
        'source': 'UFNLP/gatortron-base',  # Update if this is not the correct source
        'parameters': 345_000_000,
        'type': 'encoder-only',
        'domain': 'clinical',
        'tokenizer': AutoTokenizer,
        'model_class': AutoModelForSequenceClassification
    }
}

# Task specifications, keyed by the names in TASK_NAMES.
# Per entry:
#   type        - kind of task (classification, question-answering, ...)
#   num_labels / labels - label set, for the classification tasks only
#   metrics     - names of the metrics reported for the task
#   encoder_input_format          - input template for encoder-only models
#   encoder_decoder_input_format  - input template for encoder-decoder models
#   max_length  - presumably the maximum tokenized input length; the consumer
#                 is not visible in this section - TODO confirm
# NOTE(review): the encoder templates insert "[SEP]" as literal text. Whether
# each tokenizer treats that string as its separator token depends on the
# model; verify in the code that consumes these templates.
TASK_SPECS = {
    'mednli': {
        'name': 'mednli',
        'type': 'classification',
        'num_labels': 3,
        # NOTE(review): if label ids are derived from this list's order,
        # confirm the order matches the dataset's encoding
        'labels': ['entailment', 'neutral', 'contradiction'],
        'metrics': ['accuracy'],
        'encoder_input_format': '{premise} [SEP] {hypothesis}',
        'encoder_decoder_input_format': 'mnli premise: {premise} hypothesis: {hypothesis}',
        'max_length': 256
    },
    'radqa': {
        'name': 'radqa',
        'type': 'question-answering',
        'metrics': ['f1', 'exact_match'],
        'encoder_input_format': '{question} [SEP] {context}',
        'encoder_decoder_input_format': 'question: {question} context: {context}',
        'max_length': 512
    },
    'clip': {
        'name': 'clip',
        'type': 'multi-label-classification',
        'num_labels': 7,
        # NOTE(review): confirm these names match the label columns of
        # sentence_level.csv
        'labels': [
            'appointment-related',
            'medication-related',
            'lab-related',
            'patient-instructions',
            'procedure-related',
            'imaging-related',
            'other'
        ],
        'metrics': ['micro_f1', 'macro_f1'],
        'encoder_input_format': '{sentence}',
        'encoder_decoder_input_format': 'clip: {sentence}',
        'max_length': 256
    }
}

# Check prerequisites for Phase 1
def _prerequisite_symbol(status):
    """Return the marker printed next to a prerequisite status."""
    if status is False or status in ("Not installed", "Not connected"):
        return "❌"
    if status == "Not available":
        # A GPU is recommended rather than required: flag it without failing
        return "⚠️"
    return "✅"


def _dataset_files_present(dataset):
    """Return True only if ``dataset`` has entries and every file exists."""
    files = dataset_availability.get(dataset, {})
    # all() of an empty mapping is vacuously True, which would report a
    # dataset that was never checked as fully present.
    return bool(files) and all(files.values())


def check_phase1_prerequisites():
    """Check libraries, Drive access, dataset files and GPU, and print a report.

    Reads the module-level ``dataset_availability`` mapping and ``BASE_DIR``.
    Libraries, Drive access and dataset files are required; GPU availability
    is reported but does not affect the result.

    Returns:
        True when every required prerequisite is met, otherwise False.
    """
    prerequisites = {
        "Required Libraries": {
            "torch": torch.__version__ if 'torch' in globals() else "Not installed",
            # transformers is only imported via `from transformers import ...`,
            # so test for one of the imported symbols rather than the module name
            "transformers": "Installed" if 'T5ForConditionalGeneration' in globals() else "Not installed",
            "pandas": pd.__version__ if 'pd' in globals() else "Not installed",
            "numpy": np.__version__ if 'np' in globals() else "Not installed",
            "matplotlib": "Installed" if 'plt' in globals() else "Not installed",
            "sklearn": "Installed" if 'f1_score' in globals() else "Not installed",
        },
        "Google Drive Access": "Connected" if os.path.exists(BASE_DIR) else "Not connected",
        "Dataset Files": {
            "MedNLI Files": _dataset_files_present("mednli"),
            "RadQA Files": _dataset_files_present("radqa"),
            "CLIP Files": _dataset_files_present("clip")
        },
        "GPU Availability": "Available" if torch.cuda.is_available() else "Not available"
    }

    all_prerequisites_met = (
        all(status != "Not installed" for status in prerequisites["Required Libraries"].values()) and
        prerequisites["Google Drive Access"] == "Connected" and
        all(prerequisites["Dataset Files"].values())
    )

    print("Phase 1 Prerequisites Check:")
    print("----------------------------")

    for category, items in prerequisites.items():
        if isinstance(items, dict):
            print(f"{category}:")
            for name, status in items.items():
                print(f"  {_prerequisite_symbol(status)} {name}: {status}")
        else:
            print(f"{_prerequisite_symbol(items)} {category}: {items}")

    if all_prerequisites_met:
        print("\n✅ All Phase 1 prerequisites are met!")
    else:
        print("\n⚠️ Some prerequisites are missing. Please address the issues marked with ❌")

    return all_prerequisites_met

# Run the Phase 1 check and report the outcome
phase1_ready = check_phase1_prerequisites()

if phase1_ready:
    setup_state = "complete!"
else:
    setup_state = "incomplete - see warnings above"
print("\nEnvironment setup " + setup_state)

# List everything Phase 1 expects to find
print("\nPrerequisite Files Required for Phase 1:")
print("---------------------------------------")
print("Dataset Files:")

print("1. MedNLI:")
for file_name in ("mli_train_v1.jsonl", "mli_dev_v1.jsonl", "mli_test_v1.jsonl"):
    print("   - " + os.path.join(MEDNLI_DIR, file_name))

print("2. RadQA:")
for file_name in ("train.json", "dev.json", "test.json"):
    print("   - " + os.path.join(RADQA_DIR, file_name))

print("3. CLIP: (updated file structure)")
clip_files = (
    ("sentence_level.csv", " (main data file)"),
    ("train_ids.csv", " (training split IDs)"),
    ("val_ids.csv", " (validation split IDs)"),
    ("test_ids.csv", " (test split IDs)"),
)
for file_name, note in clip_files:
    print("   - " + os.path.join(CLIP_DIR, file_name) + note)

print("\nRequired Python Libraries:")
for library in ("torch", "transformers", "pandas", "numpy", "matplotlib", "scikit-learn"):
    print("- " + library)

print("\nHardware Requirements:")
print("- GPU with CUDA support (recommended)")

Phase 1 Prerequisites Check:
----------------------------
Required Libraries:
  ✅ torch: 2.6.0+cu124
  ✅ transformers: Installed
  ✅ pandas: 2.2.2
  ✅ numpy: 2.0.2
  ✅ matplotlib: Installed
  ✅ sklearn: Installed
✅ Google Drive Access: Connected
Dataset Files:
  ✅ MedNLI Files: True
  ✅ RadQA Files: True
  ✅ CLIP Files: True
✅ GPU Availability: Available

✅ All Phase 1 prerequisites are met!

Environment setup complete!

Prerequisite Files Required for Phase 1:
---------------------------------------
Dataset Files:
1. MedNLI:
   - /content/drive/MyDrive/DL4H-Project/data/mednli/mli_train_v1.jsonl
   - /content/drive/MyDrive/DL4H-Project/data/mednli/mli_dev_v1.jsonl
   - /content/drive/MyDrive/DL4H-Project/data/mednli/mli_test_v1.jsonl
2. RadQA:
   - /content/drive/MyDrive/DL4H-Project/data/radqa/train.json
   - /content/drive/MyDrive/DL4H-Project/data/radqa/dev.json
   - /content/drive/MyDrive/DL4H-Project/data/radqa/test.json
3. CLIP: (updated file structure)
   - /content/drive/MyDri

### 1.7 Environment Verification and Setup Summary

In [None]:
# First, install any missing libraries
if "transformers" not in globals():
    print("Installing transformers library...")
    !pip install transformers
    print("Please restart runtime after installation")

# Verify environment and dataset access
def summarize_environment():
    """Print and return a summary of the runtime and the dataset statistics.

    Reads the module-level ``dataset_stats`` mapping. Counts that are not
    present there are reported as ``"unknown"``.

    Returns:
        A dict with two entries: ``"system_info"`` (Python, OS, PyTorch and
        device details) and ``"dataset_summary"`` (per-dataset counts).
    """
    import platform
    import torch

    # Device details; placeholders are used when no GPU is present
    has_gpu = torch.cuda.is_available()
    if has_gpu:
        total_bytes = torch.cuda.get_device_properties(0).total_memory
        device_details = ("GPU", torch.cuda.get_device_name(0), f"{total_bytes / 1024**3:.2f} GB")
    else:
        device_details = ("CPU", "N/A", "N/A")
    device_type, device_name, device_memory = device_details

    system_info = {
        "Python Version": platform.python_version(),
        "OS": platform.system(),
        "PyTorch Version": torch.__version__,
        "Device Type": device_type,
        "Device Name": device_name,
        "Device Memory": device_memory,
        "CUDA Version": torch.version.cuda if has_gpu else "N/A",
    }

    # (label, file_type key, count key) triples describing each summary row
    split_rows = (
        ("Train Samples", "train", "samples"),
        ("Dev Samples", "dev", "samples"),
        ("Test Samples", "test", "samples"),
    )
    clip_rows = (
        ("Total Samples", "sentence_level", "samples"),
        ("Train IDs", "train_ids", "ids"),
        ("Val IDs", "val_ids", "ids"),
        ("Test IDs", "test_ids", "ids"),
    )
    layouts = {"mednli": split_rows, "radqa": split_rows, "clip": clip_rows}

    dataset_summary = {}
    for dataset, stats in dataset_stats.items():
        rows = layouts.get(dataset)
        if rows is None:
            # Datasets without a known layout are left out of the summary
            continue
        dataset_summary[dataset] = {
            label: stats.get(file_key, {}).get(count_key, "unknown")
            for label, file_key, count_key in rows
        }

    # Print system info
    print("System Information:")
    print("-----------------")
    for key, value in system_info.items():
        print(f"{key}: {value}")

    # Print dataset summary
    print("\nDataset Summary:")
    print("---------------")
    for dataset, summary in dataset_summary.items():
        print(f"{dataset.upper()}:")
        for key, value in summary.items():
            print(f"  {key}: {value}")

    # Return everything so the caller can log it
    return {"system_info": system_info, "dataset_summary": dataset_summary}

# Summarize and log the environment, but only once Phase 1 checks have passed
if phase1_ready:
    env_summary = summarize_environment()

    # Log the summary
    logger.info("Environment setup complete")
    logger.info(f"System Information: {json.dumps(env_summary['system_info'])}")
    logger.info(f"Dataset Summary: {json.dumps(env_summary['dataset_summary'])}")

    print("\nPhase 1 Complete: Environment is ready for Phase 2 (Dataset Processing)")
    # Report the device that was actually detected; a GPU is not required for
    # Phase 1 to pass, so a fixed "T4 GPU" message can be wrong.
    detected = env_summary['system_info']
    if detected['Device Type'] == "GPU":
        print(f"Note: Using {detected['Device Name']} GPU for training and inference")
    else:
        print("Note: No GPU detected - training and inference will run on CPU")
else:
    print("\n⚠️ Please fix the issues above before proceeding to Phase 2")

Installing transformers library...


2025-04-08 19:11:55,841 - experiment_logger - INFO - Environment setup complete
INFO:experiment_logger:Environment setup complete
2025-04-08 19:11:55,844 - experiment_logger - INFO - System Information: {"Python Version": "3.11.11", "OS": "Linux", "PyTorch Version": "2.6.0+cu124", "Device Type": "GPU", "Device Name": "Tesla T4", "Device Memory": "14.74 GB", "CUDA Version": "12.4"}
INFO:experiment_logger:System Information: {"Python Version": "3.11.11", "OS": "Linux", "PyTorch Version": "2.6.0+cu124", "Device Type": "GPU", "Device Name": "Tesla T4", "Device Memory": "14.74 GB", "CUDA Version": "12.4"}
2025-04-08 19:11:55,846 - experiment_logger - INFO - Dataset Summary: {"mednli": {"Train Samples": 11232, "Dev Samples": 1395, "Test Samples": 1422}, "radqa": {"Train Samples": 1606, "Dev Samples": 204, "Test Samples": 208}, "clip": {"Total Samples": 107494, "Train IDs": 517, "Val IDs": 99, "Test IDs": 99}}
INFO:experiment_logger:Dataset Summary: {"mednli": {"Train Samples": 11232, "Dev Sa

Please restart runtime after installation
System Information:
-----------------
Python Version: 3.11.11
OS: Linux
PyTorch Version: 2.6.0+cu124
Device Type: GPU
Device Name: Tesla T4
Device Memory: 14.74 GB
CUDA Version: 12.4

Dataset Summary:
---------------
MEDNLI:
  Train Samples: 11232
  Dev Samples: 1395
  Test Samples: 1422
RADQA:
  Train Samples: 1606
  Dev Samples: 204
  Test Samples: 208
CLIP:
  Total Samples: 107494
  Train IDs: 517
  Val IDs: 99
  Test IDs: 99

Phase 1 Complete: Environment is ready for Phase 2 (Dataset Processing)
Note: Using T4 GPU for training and inference
