In [None]:
### Dependecis ###

! pip uninstall numpy
! pip install numpy==1.26.4 --break-system-packages
! pip install -r requirements_training.txt
!pip install sacremoses

In [None]:
import nltk
# Download the NLTK resources this notebook relies on: the stop-word list
# (read later via stopwords.words("english")), WordNet (presumably for the
# WordNetLemmatizer) and the Punkt models (presumably for word_tokenize).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

In [None]:
###### Data Pre-Processing ######

import os
import re
import string
import warnings
from datetime import datetime

# The MPS switches are exported BEFORE anything imports torch (transformers
# pulls it in too), so they are already in the process environment when the
# PyTorch backend initialises. Setting them after `import torch` is too late
# for them to be picked up reliably.
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

# Suppress the past_key_values deprecation warning (internal to transformers)
warnings.filterwarnings("ignore", message=".*past_key_values.*EncoderDecoderCache.*")

import matplotlib.pyplot as plt
import pandas as pd
from huggingface_hub import login
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split


###### Model Training bioGPT ######

import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    get_linear_schedule_with_warmup,
    TrainingArguments,
    Trainer
)




In [None]:
# Pick the compute backend in priority order: Apple MPS, then CUDA, then CPU.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print(f"Using device: {device}")

In [None]:
# CSV file holding the transcription dataset, resolved against the working directory.
path = "mtsamples.csv"
print("----- CVS PATH TO DATASET --------")

# Read the raw table, then show its first eleven rows beneath a blank line.
df = pd.read_csv(path)
print("\n" + str(df.head(11)))


In [None]:
# Remember the raw frame's (rows, columns) so it can be compared with later frames.
old_value = df.shape
print("---Shape of data set ---")
print(old_value)

In [None]:
# Count the missing values in each column of the raw frame (displayed as the cell output).
df.isna().sum()

In [None]:
# Work on a copy so the raw frame loaded from the CSV stays untouched.
main_df = df.copy()


In [None]:
# Show the raw frame's recorded shape next to the working copy's current shape.
print(f"This is the firts shape of our DF : {old_value}")
print(f"This is the second shape of our DF : {main_df.shape}")

In [None]:
# Build the cleaned working frame in one chain:
#   1. add a "Keywords" column in which missing keywords become empty strings,
#   2. discard rows that have no transcription,
#   3. drop the "Unnamed: 0" column.
main_df = (
    main_df
    .assign(Keywords=main_df["keywords"].fillna(''))
    .dropna(subset=['transcription'])
    .drop(columns=["Unnamed: 0"])
)

In [None]:
# Display the filled-in keyword column to check the result of fillna('').
main_df['Keywords']

In [None]:
# WordNet-based lemmatizer instance.
# NOTE(review): not referenced again in the cells visible here — confirm it is still needed.
lemmatizer = WordNetLemmatizer()

# English stop words held in a set.
# NOTE(review): not referenced again in the cells visible here — confirm it is still needed.
stop_words = set(stopwords.words("english"))

In [None]:
# Despite its name, this variable holds the dtype of the 'transcription' column, not the column itself.
transcription_column = main_df['transcription'].dtype
print(transcription_column)

In [None]:
def clean_medical_text(text):
    """Normalise one raw transcription string.

    Characters other than word characters, whitespace and the punctuation
    ``. , : ; - / ( )`` are replaced by a space, every run of whitespace is
    then collapsed to a single space, and the result is stripped.

    Args:
        text: The raw transcription. Anything that is not a ``str`` (for
            example ``None`` or a NaN float from a missing cell) is treated
            as empty.

    Returns:
        str: The cleaned text, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # Replace special formatting characters but keep the punctuation that
    # carries medical structure (section colons, dosage slashes, ...).
    text = re.sub(r"[^\w\s\.\,\:\;\-\/\(\)]", ' ', text)

    # Collapse whitespace AFTER the replacement above: every replaced character
    # leaves a space behind, so collapsing first would let runs of spaces
    # reappear in the output.
    text = re.sub(r"\s+", ' ', text)

    # The former "normalize section headers" substitution replaced each match
    # with itself and therefore changed nothing; header text is left as-is.
    return text.strip()

# Run the cleaner over every transcription and keep the result in a new "Cleaned_text" column.
main_df["Cleaned_text"] = main_df['transcription'].apply(clean_medical_text)



In [None]:
# Character length of each cleaned transcription, followed by its summary statistics.
main_df['Text_length'] = main_df["Cleaned_text"].str.len()
print(main_df['Text_length'].describe())

In [None]:
# Surface the transcriptions whose cleaned text is shorter than 50 characters.
short_texts = main_df.loc[main_df['Text_length'].lt(50)]
print(f"Very short texts: {len(short_texts)}")
print(short_texts['Cleaned_text'])

In [None]:
### Checking how many tokens this set is likely to produce ###

# Rough estimate only: about four characters per token. The exact BioGPT
# tokenizer count is computed separately further down in the notebook.
main_df["total_tokens"] = main_df["Text_length"] / 4
print("-----Total of tokens -----")
# Sum the per-row estimates so the label matches the number: len() only gave
# the row count, which describe() below already reports as "count". Single
# quotes inside the f-string keep it valid on Python versions before 3.12.
print(f"Total tokens : {int(main_df['total_tokens'].sum())}")
print(main_df["total_tokens"].describe())

In [None]:
# Keep only transcriptions with at least 100 cleaned characters. The explicit
# .copy() makes the result an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning.
main_df = main_df[main_df['Text_length'] >= 100].copy()
print(f" After filtering: {len(main_df)} rows")

In [None]:
# NOTE(review): 3840.5 looks like a value copied by hand from an earlier
# describe() output — confirm its source; it is not derived from the data here.
max_num = 3840.500000
# int() truncates towards zero, giving 3840.
max_num_int = int(max_num)
print(max_num_int)

In [None]:
# Never commit an access token to the notebook: read it from the HF_TOKEN
# environment variable instead. The token that used to be hard-coded on this
# line must be treated as leaked and revoked in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    print("HF_TOKEN is not set - skipping Hugging Face login.")

In [None]:
# Load the pretrained BioGPT tokenizer and causal-LM weights by name.
# NOTE(review): only `tokenizer` is used by the token-counting cell below;
# train_with_huggingface_trainer() loads its own model, so confirm this
# module-level `model` is really needed.
tokenizer = AutoTokenizer.from_pretrained("microsoft/BioGPT")
model = AutoModelForCausalLM.from_pretrained("microsoft/BioGPT")

In [None]:
def count_tokens(text):
    """Return how many tokens the module-level tokenizer produces for `text`.

    Special tokens are not added, so only the text's own tokens are counted.
    """
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    return len(token_ids)

# Despite its name, the "BioGPT_text" column stores the per-row token count.
main_df["BioGPT_text"] = main_df['Cleaned_text'].apply(count_tokens)

In [None]:
# 70 / 15 / 15 split: carve off 30% first, then halve that remainder into
# validation and test. Both splits are seeded and stratified by specialty.
# NOTE(review): stratification needs at least two rows per specialty in the
# frame being split — confirm no rare specialty is left with a single row.
train_df, temp_df = train_test_split(
    main_df,
    test_size=0.3,
    random_state=42,
    stratify=main_df['medical_specialty']
)

val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df["medical_specialty"]
)


# Report split files already on disk. They are written by a later cell, so on
# a fresh run all three are reported as missing. The loop reads into its own
# variable so the raw `df` loaded from mtsamples.csv is not overwritten.
for file in ['train_medical.csv', 'val_medical.csv', 'test_medical.csv']:
    if os.path.exists(file):
        existing_split = pd.read_csv(file)
        print(f"‚úÖ {file}: {len(existing_split)} rows")
    else:
        print(f"‚ùå {file}: NOT FOUND")

In [None]:
# Drop the original lowercase column now that "Keywords" carries the filled-in values.
# NOTE(review): train_df / val_df / test_df were split off before this cell,
# so they still contain the 'keywords' column.
main_df = main_df.drop(columns=['keywords'])

In [None]:
def created_train_tex(row):
    """Format one row into the text the language model is trained on.

    Args:
        row: Mapping-like record with "Keywords" and "Cleaned_text" entries.

    Returns:
        str: The transcription under a TRANSCRIPTION header, preceded by a
        KEYWORDS line when the row has non-missing, non-empty keywords.
    """
    key_words = row["Keywords"]
    text = row["Cleaned_text"]

    has_keywords = not (pd.isna(key_words) or key_words == '')
    if has_keywords:
        # Keywords go first so they can later act as a generation prompt.
        return f"KEYWORDS: {key_words}\n\nTRANSCRIPTION:\n{text}"
    return f"TRANSCRIPTION:\n{text}"


In [None]:
# Attach the formatted training text to each split. assign() returns a new
# frame, which avoids pandas' SettingWithCopyWarning when adding a column to
# the frames returned by train_test_split.
train_df = train_df.assign(Training_Text=train_df.apply(created_train_tex, axis=1))
val_df = val_df.assign(Training_Text=val_df.apply(created_train_tex, axis=1))
test_df = test_df.assign(Training_Text=test_df.apply(created_train_tex, axis=1))

# Persist each split exactly once.
train_df.to_csv('train_medical.csv', index=False)
val_df.to_csv('val_medical.csv', index=False)
test_df.to_csv('test_medical.csv', index=False)

In [None]:
class Config:
    """Static settings shared by the training and generation functions."""

    # Model
    model_name = "microsoft/biogpt"
    max_length = 512  # token length every sample is padded/truncated to

    # Training
    batch_size = 1 # ----> 4
    gradient_accumulation_steps = 4  # effective batch = batch_size * this
    learning_rate = 5e-5
    num_epochs = 3
    warmup_steps = 500

    # Paths
    train_file = "train_medical.csv"
    val_file = "val_medical.csv"
    output_dir = "./biogpt_medical_finetuned"

    # Device: same priority as the notebook's device-selection cell
    # (Apple MPS, then CUDA, then CPU), so a Mac run no longer falls back to
    # CPU here while the rest of the notebook is configured for MPS.
    device = (
        "mps" if torch.backends.mps.is_available()
        else "cuda" if torch.cuda.is_available()
        else "cpu"
    )


    # Logging
    logging_steps = 50
    save_steps = 500
    eval_steps = 500

In [None]:
# ===== DATASET CLASS =====
class MedicalTranscriptionDataset(Dataset):
    """Torch dataset that tokenizes the 'Training_Text' column of a CSV file.

    Each item is a dict of 1-D tensors of length ``max_length``:
    ``input_ids``, ``attention_mask`` and ``labels``. ``labels`` copies
    ``input_ids`` with every padded position set to -100.
    """

    def __init__(self, csv_file, tokenizer, max_length=1024):
        """Read ``csv_file`` into memory and keep the tokenizer settings.

        Args:
            csv_file: Path of a CSV containing a 'Training_Text' column.
            tokenizer: Callable tokenizer used in ``__getitem__``.
            max_length: Length every sample is padded or truncated to.
        """
        self.data = pd.read_csv(csv_file)
        self.tokenizer = tokenizer
        self.max_length = max_length
        print(f"Loaded {len(self.data)} samples from {csv_file}")

    def __len__(self):
        """Return the number of rows in the loaded CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return its tensors for causal-LM training."""
        text = self.data.iloc[idx]['Training_Text']

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )

        # squeeze(0) removes only the leading batch axis added by
        # return_tensors='pt'. A bare squeeze() would also remove the sequence
        # axis whenever it has length 1, yielding a 0-d tensor.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        # Padded positions are set to -100 so they can be excluded from the loss.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }




In [None]:
# ===== TRAINING FUNCTION =====
def train_with_huggingface_trainer():
    """Fine-tune the model named in Config with the HuggingFace Trainer.

    Prints the run configuration, loads the tokenizer, model and both datasets
    from the locations named in Config, trains, saves the model and tokenizer
    to Config.output_dir, and finally reports validation loss and perplexity.

    Returns:
        tuple: ``(trainer, eval_results)`` - the Trainer instance and the dict
        returned by its final ``evaluate()`` call.
    """
    divider = "=" * 50

    # --- Run banner ---
    print(divider)
    print("MEDICAL AI PROJECT - BioGPT Training")
    print(divider)
    print(f"Device: {Config.device}")
    print(f"Model: {Config.model_name}")
    print(f"Batch Size: {Config.batch_size}")
    print(f"Gradient Accumulation: {Config.gradient_accumulation_steps}")
    print(f"Effective Batch Size: {Config.batch_size * Config.gradient_accumulation_steps}")
    print(f"Learning Rate: {Config.learning_rate}")
    print(f"Epochs: {Config.num_epochs}")
    print(divider)

    # --- Tokenizer and model ---
    print("\nLoading tokenizer and model...")
    tokenizer = AutoTokenizer.from_pretrained(Config.model_name)

    # Fall back to the EOS token for padding when the tokenizer defines none.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(Config.model_name)
    model.config.pad_token_id = tokenizer.pad_token_id
    model.to(Config.device)

    print(f"Model loaded. Total parameters: {sum(p.numel() for p in model.parameters()):,}")

    # --- Datasets (train first, then validation) ---
    print("\nLoading datasets...")
    train_dataset, val_dataset = (
        MedicalTranscriptionDataset(csv_path, tokenizer, Config.max_length)
        for csv_path in (Config.train_file, Config.val_file)
    )

    # --- Trainer configuration ---
    trainer_settings = dict(
        output_dir=Config.output_dir,
        num_train_epochs=Config.num_epochs,
        per_device_train_batch_size=Config.batch_size,
        per_device_eval_batch_size=Config.batch_size,
        gradient_accumulation_steps=Config.gradient_accumulation_steps,
        learning_rate=Config.learning_rate,
        warmup_steps=Config.warmup_steps,
        logging_steps=Config.logging_steps,
        eval_steps=Config.eval_steps,
        save_steps=Config.save_steps,
        eval_strategy="steps",
        save_strategy="steps",
        # Keep the checkpoint with the lowest evaluation loss.
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        greater_is_better=False,
        report_to="none",
        dataloader_num_workers=0,  # MPS works better with 0 workers
        remove_unused_columns=False,
    )
    training_args = TrainingArguments(**trainer_settings)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        processing_class=tokenizer,
    )

    # --- Train ---
    print("\n" + divider)
    print("Starting training...")
    print(divider + "\n")

    train_result = trainer.train()

    # --- Persist model and tokenizer side by side ---
    print("\nSaving final model...")
    trainer.save_model(Config.output_dir)
    tokenizer.save_pretrained(Config.output_dir)

    print("\n" + divider)
    print("TRAINING COMPLETED!")
    print(divider)
    print(f"Total training time: {train_result.metrics['train_runtime']:.2f} seconds")
    print(f"Final training loss: {train_result.metrics['train_loss']:.4f}")

    # --- Final validation pass ---
    print("\nEvaluating on validation set...")
    eval_results = trainer.evaluate()
    print(f"Validation loss: {eval_results['eval_loss']:.4f}")

    # Perplexity is the exponential of the evaluation loss.
    perplexity = torch.exp(torch.tensor(eval_results['eval_loss']))
    print(f"Validation perplexity: {perplexity:.2f}")

    print(f"\nModel saved to: {Config.output_dir}")

    return trainer, eval_results


In [None]:
# ===== TEST GENERATION FUNCTION =====
def test_generation(model_path=None):
    """Sample transcriptions from a saved model for three fixed keyword prompts.

    Args:
        model_path: Directory to load the tokenizer and model from. Falls back
            to Config.output_dir when None.

    The function only prints; it returns nothing.
    """
    model_path = Config.output_dir if model_path is None else model_path
    divider = "=" * 50

    print("\n" + divider)
    print("TESTING MODEL GENERATION")
    print(divider)

    print(f"\nLoading model from: {model_path}")
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)
    model.to(Config.device)
    model.eval()

    # Prompts follow the training layout: a keyword line, then the header.
    test_prompts = [
        "KEYWORDS: hypertension, medication, follow-up\n\nTRANSCRIPTION:",
        "KEYWORDS: chest pain, cardiology consult, ECG\n\nTRANSCRIPTION:",
        "KEYWORDS: diabetes mellitus, glucose monitoring, insulin\n\nTRANSCRIPTION:",
    ]

    # Sampling settings shared by every prompt.
    sampling_kwargs = dict(
        max_length=300,
        num_return_sequences=1,
        temperature=0.8,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    print("\nGenerating sample transcriptions...\n")

    for i, prompt in enumerate(test_prompts, start=1):
        print(f"{'='*50}")
        print(f"Sample {i}:")
        print(f"{'='*50}")
        print(f"Prompt: {prompt[:50]}...")

        encoded = tokenizer(prompt, return_tensors="pt").to(Config.device)

        # No gradients are needed while sampling.
        with torch.no_grad():
            sequences = model.generate(**encoded, **sampling_kwargs)

        generated_text = tokenizer.decode(sequences[0], skip_special_tokens=True)
        print(f"\nGenerated:\n{generated_text}\n")
    print(divider)
    print("Generation test completed!")
    print(divider)

In [None]:
# ===== PRINT STATUS =====
# Confirmation banner: shows the configured device and the two calls to run next.
print("‚úÖ All classes and functions loaded successfully!")
print(f"üìç Device: {Config.device}")
print("\nüöÄ To start training, run in the NEXT cell:")
print("   trainer, eval_results = train_with_huggingface_trainer()")
print("   test_generation()")

In [None]:
# Run the full fine-tuning, keeping the Trainer and the final evaluation metrics.
trainer, eval_results = train_with_huggingface_trainer()
# Then sample from the model saved in Config.output_dir (the default path).
test_generation()