# Medzy
## Overview
This project aims to develop a machine learning model capable of interpreting doctors’ handwriting on prescriptions. By accurately detecting and translating challenging handwriting, the model will empower patients to read their prescriptions independently, making it easier for them to purchase their medications without confusion if they run out of medicine.

This model is using Hugging Face's [BART model](https://huggingface.co/docs/transformers/en/model_doc/bart) for correcting the OCR errors.

## Get PyTorch Device

### DirectML

In [2]:
import torch
import torch_directml

device = torch_directml.device()

ModuleNotFoundError: No module named 'torch_directml'

### CUDA (fallback to CPU if none)

In [1]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Loading the labels

In [2]:
import pandas as pd

train_df = pd.read_csv("./Dataset/Training/training_labels.csv", delimiter = ",")
labels = train_df["MEDICINE_NAME"].unique() # Get all unique instance of the labels

## OCR Errors Simulation

In [3]:
import re
import random

# Define common OCR misread patterns
ocr_confusions = {
    '0': 'O', 'O': '0', '1': 'I', 'I': '1', '5': 'S', 'S': '5',
    'rn': 'm', 'm': 'rn', 'l': 'I', 'I': 'l', 'B': '8', '8': 'B',
    'c': 'e', 'e': 'c', 'a': 'o', 'o': 'a', 'e': 'B'
}

def reduce_repetitions(word):
    """Reduces repeated patterns like 'Arolrolrol' to 'Arol'."""
    return re.sub(r'(.{2,})\1+', r'\1', word)

def introduce_ocr_typo(word):
    """Simulates common OCR errors in a word."""
    typo_types = ["swap", "delete", "insert", "replace", "ocr_confusion", "repeat"]
    typo = random.choice(typo_types)
    index = random.randint(0, len(word) - 1)

    if typo == "swap" and index < len(word) - 1:
        word = word[:index] + word[index+1] + word[index] + word[index+2:]
    elif typo == "delete":
        word = word[:index] + word[index+1:]
    elif typo == "insert":
        random_char = random.choice("abcdefghijklmnopqrstuvwxyz")
        word = word[:index] + random_char + word[index:]
    elif typo == "replace":
        random_char = random.choice("abcdefghijklmnopqrstuvwxyz")
        word = word[:index] + random_char + word[index+1:]
    elif typo == "ocr_confusion":
        for key, val in ocr_confusions.items():
            if key in word:
                word = word.replace(key, val, 1)  # Replace one occurrence
                break
    elif typo == "repeat":
        word = word[:index] + word[index:] + word[index:]  # Repeat part of the word

    return reduce_repetitions(word)  # Fix excessive repetition

In [4]:
# Generate Dataset with OCR distortions
dataset = [{"input": introduce_ocr_typo(label), "target": label} for label in labels for _ in range(8)]

In [5]:
dataset

[{'input': 'Acetx', 'target': 'Aceta'},
 {'input': 'Acetka', 'target': 'Aceta'},
 {'input': 'Aceat', 'target': 'Aceta'},
 {'input': 'Aceta', 'target': 'Aceta'},
 {'input': 'sAceta', 'target': 'Aceta'},
 {'input': 'Axceta', 'target': 'Aceta'},
 {'input': 'cAeta', 'target': 'Aceta'},
 {'input': 'Aecta', 'target': 'Aceta'},
 {'input': 'kAce', 'target': 'Ace'},
 {'input': 'Aee', 'target': 'Ace'},
 {'input': 'cAe', 'target': 'Ace'},
 {'input': 'Acee', 'target': 'Ace'},
 {'input': 'Alce', 'target': 'Ace'},
 {'input': 'Aee', 'target': 'Ace'},
 {'input': 'Acq', 'target': 'Ace'},
 {'input': 'Ace', 'target': 'Ace'},
 {'input': 'Altrol', 'target': 'Alatrol'},
 {'input': 'Alatrql', 'target': 'Alatrol'},
 {'input': 'lAatrol', 'target': 'Alatrol'},
 {'input': 'Alltrol', 'target': 'Alatrol'},
 {'input': 'AIatrol', 'target': 'Alatrol'},
 {'input': 'AIatrol', 'target': 'Alatrol'},
 {'input': 'kAlatrol', 'target': 'Alatrol'},
 {'input': 'Alatrtl', 'target': 'Alatrol'},
 {'input': 'fmodis', 'target': 'Am

## Model Training

In [6]:
from transformers import BartTokenizer

# Load tokenizer
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

# Tokenization function
def preprocess(example):
    inputs = tokenizer(example["input"], padding="max_length", truncation=True, max_length=32)
    targets = tokenizer(example["target"], padding="max_length", truncation=True, max_length=32)
    
    inputs["labels"] = targets["input_ids"]  # Target labels for training
    return inputs

# Apply preprocessing
tokenized_datasets = list(map(preprocess, dataset))

A data collator dynamically pads sequences to the longest in a batch, reducing unnecessary padding.

In [7]:
from transformers import DataCollatorForSeq2Seq

# Use Hugging Face's built-in data collator for sequence-to-sequence tasks
data_collator = DataCollatorForSeq2Seq(tokenizer, model=None, padding=True)

In [8]:
from transformers import BartForConditionalGeneration, Trainer, TrainingArguments

# Load pre-trained BART model
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

# Training arguments
training_args = TrainingArguments(
    output_dir="./bart_autocorrect",
    eval_strategy="no",
    save_strategy="epoch",
    per_device_train_batch_size=64,  # Adjust batch size depending on your GPU
    per_device_eval_batch_size=64,
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    logging_dir="./logs",
    save_total_limit=2,
    push_to_hub=False
)

# Define Trainer with batching
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    processing_class=tokenizer,
    data_collator=data_collator  # Enables batching
)

In [9]:
# Start training
trainer.train()

Step,Training Loss




TrainOutput(global_step=50, training_loss=8.517660522460938, metrics={'train_runtime': 34.7027, 'train_samples_per_second': 89.906, 'train_steps_per_second': 1.441, 'total_flos': 59449304678400.0, 'train_loss': 8.517660522460938, 'epoch': 5.0})

In [10]:
# Save model
model.save_pretrained("./model-output/BART")

# Save tokenizer
tokenizer.save_pretrained("./model-output/BART")

('./model-output/BART\\tokenizer_config.json',
 './model-output/BART\\special_tokens_map.json',
 './model-output/BART\\vocab.json',
 './model-output/BART\\merges.txt',
 './model-output/BART\\added_tokens.json')