<a href="https://colab.research.google.com/github/Ananyas-7/ai-dyslexia-tool/blob/main/colab_notebooks/method_1_new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [151]:
# Install required packages (run only if not already installed)
!pip install datasets nltk spacy sentence-transformers pandas
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [152]:
import random
import pandas as pd
from datasets import load_dataset
from nltk.corpus import words, brown
from collections import Counter
from difflib import get_close_matches
import spacy
from sentence_transformers import SentenceTransformer, util
import nltk

# Download required NLTK resources
nltk.download('words')
nltk.download('brown')

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [161]:
# Build dataset-specific vocabulary (from your loaded dataset)
dataset_vocab = set([w.lower() for item in dataset for w in item['text'].split()])

# Combine with NLTK vocab
combined_vocab = vocab.union(dataset_vocab)


In [153]:
# Load vocabulary from NLTK
vocab = set(words.words())

# Word frequency from Brown corpus
freq_counter = Counter(brown.words())

# Load SpaCy model for POS / grammar checks
nlp = spacy.load("en_core_web_sm")

# Load sentence-transformer for semantic similarity
model = SentenceTransformer('all-MiniLM-L6-v2')


In [154]:
# Common OCR-like substitutions
ocr_subs = {'o': '0', 'm': 'rn', 'l': '1', 'i': '1', 'e': '€', 'a': '@'}

# Function to add OCR noise (light for higher correction accuracy)
def add_ocr_noise(text, p_sub=0.05, p_drop=0.02, p_space=0.02):
    noisy_text = ''
    for char in text:
        # Substitute
        if char.lower() in ocr_subs and random.random() < p_sub:
            noisy_text += ocr_subs[char.lower()]
        # Drop
        elif random.random() < p_drop:
            continue
        else:
            noisy_text += char
        # Random space
        if random.random() < p_space:
            noisy_text += ' '
    return noisy_text


In [172]:
# Install rapidfuzz if not installed
!pip install rapidfuzz

from rapidfuzz import process, fuzz

# Normalize noisy word
def normalize_noisy_word(word):
    substitutions = {'0': 'o', '1': 'i', 'rn': 'm', '€': 'e', '@': 'a'}
    for k, v in substitutions.items():
        word = word.replace(k, v)
    return word

# Fuzzy candidate generation
def generate_candidates_fuzzy(word, vocab, limit=5, score_cutoff=80):
    results = process.extract(word, vocab, scorer=fuzz.ratio, limit=limit)
    candidates = [r[0] for r in results if r[1] >= score_cutoff]
    return candidates




In [173]:
# Correct a noisy sentence using normalization + fuzzy matching + frequency scoring
def correct_sentence(noisy_sentence):
    corrected_words = []
    words_list = noisy_sentence.split()

    for word in words_list:
        # Normalize OCR-like substitutions
        word_normalized = normalize_noisy_word(word)

        # Generate candidates using fuzzy matching
        candidates = generate_candidates_fuzzy(word_normalized, combined_vocab)

        # Pick best candidate using frequency, else keep original
        if candidates:
            best = max(candidates, key=lambda w: freq_counter[w.lower()] if w.lower() in freq_counter else 0)
            corrected_words.append(best)
        else:
            corrected_words.append(word)

    # Post-processing (optional spacing fixes)
    sentence = ' '.join(corrected_words)
    sentence = sentence.replace('M Ps', 'M Ps')

    return sentence


In [174]:
# Add OCR noise (same function as before)
def add_ocr_noise(text, p_sub=0.08, p_drop=0.03, p_space=0.03):
    noisy_text = ''
    for char in text:
        if char.lower() in ocr_subs and random.random() < p_sub:
            noisy_text += ocr_subs[char.lower()]
        elif random.random() < p_drop:
            continue
        else:
            noisy_text += char
        if random.random() < p_space:
            noisy_text += ' '
    return noisy_text

# Generate results
data = []
for item in dataset:
    clean_text = item['text']
    noisy_text = add_ocr_noise(clean_text)
    predicted_text = correct_sentence(noisy_text)
    data.append({
        'Clean Data': clean_text,
        'Noisy Data': noisy_text,
        'Predicted Data': predicted_text
    })



In [176]:
import pandas as pd
from IPython.display import display

df = pd.DataFrame(data)
display(df)

# Optional CSV export
df.to_csv("ocr_correction_results.csv", index=False)


Unnamed: 0,Clean Data,Noisy Data,Predicted Data
0,put down a resolution on the subject,put down a resolut1on on th€ sbject,put down a resolution on the subject
1,and he is to be backed by Mr. Will,and he is to be ba cked b Mr. W1ll,and he is to be bar picked b Mr Will
2,nominating any more Labour life Peers,nominating a ny more abou lfe P€ers,nomination a any more about life peers
3,M Ps tomorrow. Mr. Michael Foot has,M Ps tomorrow. M. Michael Foot a,M Ps tomorrow M. Michael Foot a
4,"Griffiths, M P for Manchester Exchange .","Griffits, M P for rnanchester Exch@ge .","griffiths, M P for manchester exchange ."
5,is to be made at a meeting of Labour,is to be m@de at a meeting of Laour,is to be made at a meeting of Lodur
6,A MOVE to stop Mr. Gaitskell from,@ rnOVE to stop Mr. Gaitskellfrom,a rnOVE to stop Mr Gaitskellfrom
7,0M P for Manchester Exchange .,0M P for Man chester Exc@nge .,0M P for Ma chester exchange .
8,A MOVE to stop Mr. Gaitskell from nominating,A MOVE tostop Mr. Gatsk€ll from nominating,A MOVE stop Mr gaitskell from nomination
9,meeting of Labour 0M Ps tommorow . Mr. Michael,meet ing ofLabour 0M Ps tommorow . Mr. Michael,meet King ofLabour 0M Ps tomorrow . Mr Michael


In [177]:
def calculate_word_accuracy(clean_texts, predicted_texts):
    total_words = 0
    correct_words = 0

    for clean, pred in zip(clean_texts, predicted_texts):
        clean_words = clean.split()
        pred_words = pred.split()
        # Compare words one by one (up to the length of the shorter sentence)
        for c, p in zip(clean_words, pred_words):
            if c == p:
                correct_words += 1
        total_words += len(clean_words)

    accuracy = (correct_words / total_words) * 100
    return accuracy


In [178]:
accuracy = calculate_word_accuracy(df['Clean Data'], df['Predicted Data'])
print(f"Word-level OCR Correction Accuracy: {accuracy:.2f}%")

Word-level OCR Correction Accuracy: 46.06%


In [179]:
sentence_accuracy = (df['Clean Data'] == df['Predicted Data']).mean() * 100
print(f"Sentence-level Accuracy: {sentence_accuracy:.2f}%")

Sentence-level Accuracy: 10.00%
