## Sinhala Dyslexic Writing-Pattern Classifier
### Part A: Sentence-level surface feature extraction and pattern inference (Rule-based)


This notebook implements the finalized sentence-level dyslexic writing-pattern inference using dominance-weighted surface error signals. Earlier heuristic and threshold-based variants are retained for analysis but are not used in downstream profiling.

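Notebook name components:

- `01` → position in the pipeline order
- `surface_feature_extraction` → what the notebook actually does
- `pattern_inference` → rule-based inference, not “classification” yet
- `v3` → documents the evolution of the logic (v1, v2, v3)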

In [4]:
import sys
import os

# The repository root sits two directory levels above the notebook's
# working directory (notebooks → repo root).
_two_levels_up = os.path.join(os.getcwd(), os.pardir, os.pardir)
PROJECT_ROOT = os.path.abspath(_two_levels_up)

# Put the root at the front of the import path so the project package can be
# imported; the membership check keeps re-runs from adding a duplicate entry.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

print("Project root added to path:", PROJECT_ROOT)


Project root added to path: d:\RP\sinhala-dyslexic-writing-pattern-classifier


In [5]:
from writing_pattern_classifier.src.pattern_rules import infer_pattern


In [6]:
# Sanity check: show the imported object to confirm the project import resolved.
print(infer_pattern)


<function infer_pattern at 0x0000021F6A55FC40>


In [7]:
from datasets import load_dataset
import pandas as pd
import difflib

from writing_pattern_classifier.src.pattern_rules import infer_pattern


In [8]:
# Load the "train" split of the paired clean/dyslexic sentence dataset
# through `datasets.load_dataset`.
dataset = load_dataset(
    "SPEAK-ASR/sinhala-dyslexia-corrected-id20percent",
    split="train"
)

# Convert to a pandas DataFrame for the rest of the notebook and preview it.
# The preview shows columns clean_sentence, dyslexic_sentence and error_type.
df = dataset.to_pandas()
df.head()


Unnamed: 0,clean_sentence,dyslexic_sentence,error_type
0,වලිකුකුළා කෑගහනවා.,වලිකුකුළා කෑගහනව,Grammar
1,අම්මා කෑම දෙනවා,අම්මා කෑම දනවා,Phonetic Confusion
2,"{""correction"": ""අපි ගමට යනවා"", ""analysis"": [{""...",අපි යනව ගමට,unknown
3,එයා එනකන් ඉඩපන්,එයා එනකන් ඉඩපන්,unknown
4,රුපියල් දෙදාහක් තියෙනවා,රුපියල් දෙදාහක් තියනව,Spoken vs Written


In [9]:
def is_valid_text(s):
    """Return True when ``s`` is usable as a sentence string.

    A value is rejected when it is not a ``str``, when it is shorter than
    three characters, or when it contains a curly brace. The dataset preview
    shows some rows carrying raw JSON-like payloads instead of plain text,
    and the brace check drops those.
    """
    return (
        isinstance(s, str)
        and len(s) >= 3
        and "{" not in s
        and "}" not in s
    )

# Keep only the pairs in which both the reference sentence and the dyslexic
# sentence pass the text validity check, then renumber the rows from zero.
clean_ok = df["clean_sentence"].apply(is_valid_text)
dys_ok = df["dyslexic_sentence"].apply(is_valid_text)
df = df[clean_ok & dys_ok].reset_index(drop=True)

df.shape


(27468, 3)

In [10]:
def char_diff_features(clean, dys):
    """Count the character-level edits that turn ``clean`` into ``dys``.

    The two strings are aligned with ``difflib.SequenceMatcher`` and the
    resulting opcodes are tallied. Alignment works on Python ``str``
    elements, i.e. Unicode code points, so a base letter and a combining
    sign attached to it are counted separately.

    Parameters
    ----------
    clean : str
        Reference (correct) sentence.
    dys : str
        Sentence as written by the dyslexic writer.

    Returns
    -------
    dict
        ``char_addition``: characters present only in ``dys`` (insert ops).
        ``char_omission``: characters present only in ``clean`` (delete ops).
        ``char_substitution``: for each replace op, the length of the longer
        of the two replaced spans.
    """
    # autojunk=False: by default difflib discards "popular" characters once
    # the second sequence has 200+ items, which makes long sentence pairs
    # collapse into one large "replace" and inflates the substitution count.
    matcher = difflib.SequenceMatcher(None, clean, dys, autojunk=False)

    counts = {
        "char_addition": 0,
        "char_omission": 0,
        "char_substitution": 0,
    }

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == "insert":
            counts["char_addition"] += j2 - j1
        elif tag == "delete":
            counts["char_omission"] += i2 - i1
        elif tag == "replace":
            # Spans may differ in length; count the longer side.
            counts["char_substitution"] += max(i2 - i1, j2 - j1)

    return counts


In [11]:
# Build one feature record per sentence pair. Zipping the two text columns
# avoids the per-row Series that `iterrows()` creates and leaves IPython's
# `_` (last output) untouched. Key order matches the original records:
# the three edit counts first, then the two sentences.
feature_rows = [
    {
        **char_diff_features(clean_text, dys_text),
        "clean_sentence": clean_text,
        "dyslexic_sentence": dys_text,
    }
    for clean_text, dys_text in zip(df["clean_sentence"], df["dyslexic_sentence"])
]

feature_df = pd.DataFrame(feature_rows)
feature_df.head()


Unnamed: 0,char_addition,char_omission,char_substitution,clean_sentence,dyslexic_sentence
0,0,2,0,වලිකුකුළා කෑගහනවා.,වලිකුකුළා කෑගහනව
1,0,1,0,අම්මා කෑම දෙනවා,අම්මා කෑම දනවා
2,0,0,0,එයා එනකන් ඉඩපන්,එයා එනකන් ඉඩපන්
3,0,2,0,රුපියල් දෙදාහක් තියෙනවා,රුපියල් දෙදාහක් තියනව
4,1,0,0,ගාල්ලට යන්න ඕනෙ,ගාල්ලට යන්න ඕනෙඩ


In [12]:
# Derive a boolean presence flag from each raw edit count (count > 0).
for flag_col, count_col in (
    ("has_addition", "char_addition"),
    ("has_omission", "char_omission"),
    ("has_substitution", "char_substitution"),
):
    feature_df[flag_col] = feature_df[count_col] > 0


In [13]:
def word_count_diff(clean, dys):
    """Return the absolute difference in whitespace-separated token counts.

    Both strings are split with ``str.split()`` (any run of whitespace is a
    separator) and the sizes of the two token lists are compared.
    """
    clean_tokens = clean.split()
    dys_tokens = dys.split()
    return abs(len(clean_tokens) - len(dys_tokens))

# Token-count gap for every sentence pair. Iterating the two text columns
# directly avoids a row-wise `apply(axis=1)` and also yields an empty column
# (instead of an empty DataFrame) when there are no rows.
feature_df["word_count_diff"] = [
    word_count_diff(clean_text, dys_text)
    for clean_text, dys_text in zip(
        feature_df["clean_sentence"], feature_df["dyslexic_sentence"]
    )
]

# A non-zero gap is treated as a spacing (word-boundary) issue.
# NOTE(review): a space that merely moves (same number of tokens on both
# sides) yields a gap of 0 and is not flagged — confirm this is intended.
feature_df["has_spacing_issue"] = feature_df["word_count_diff"] > 0


In [14]:
# Sinhala combining signs counted as diacritics: dependent vowel signs,
# anusvara, visarga and al-lakuna (hal kirima).
SINHALA_DIACRITICS = set([
    "ා","ැ","ෑ","ි","ී","ු","ූ","ෙ","ේ","ො","ෝ","ෞ","ං","ඃ","්",
    # Dependent vowel signs that were missing from the list; written as
    # escapes so the lone combining marks stay unambiguous in any editor.
    "\u0dd8",  # SINHALA VOWEL SIGN GAETTA-PILLA
    "\u0ddb",  # SINHALA VOWEL SIGN KOMBU DEKA
    "\u0ddf",  # SINHALA VOWEL SIGN GAYANUKITTA
    "\u0df2",  # SINHALA VOWEL SIGN DIGA GAETTA-PILLA
    "\u0df3",  # SINHALA VOWEL SIGN DIGA GAYANUKITTA
])

def has_diacritic_loss(clean, dys):
    """Return True when ``dys`` contains fewer diacritic signs than ``clean``.

    Each string is scanned character by character and the characters found
    in ``SINHALA_DIACRITICS`` are counted. Only the two totals are compared,
    so a sign dropped in one place and another sign added elsewhere cancel
    out and are not reported as a loss.
    """
    clean_d = sum(c in SINHALA_DIACRITICS for c in clean)
    dys_d = sum(c in SINHALA_DIACRITICS for c in dys)
    return clean_d > dys_d

# Flag every sentence pair in which the dyslexic text has fewer diacritic
# signs than the reference. Iterating the two text columns directly avoids a
# row-wise `apply(axis=1)` and also yields an empty column (instead of an
# empty DataFrame) when there are no rows.
feature_df["has_diacritic_loss"] = [
    has_diacritic_loss(clean_text, dys_text)
    for clean_text, dys_text in zip(
        feature_df["clean_sentence"], feature_df["dyslexic_sentence"]
    )
]


In [15]:
# Label every sentence pair with the project's `infer_pattern` function
# (imported from writing_pattern_classifier.src.pattern_rules). With axis=1
# the function receives one row at a time.
# NOTE(review): infer_pattern presumably reads the has_* flag columns built
# in the cells above — confirm against pattern_rules.py.
feature_df["writing_pattern"] = feature_df.apply(
    infer_pattern,
    axis=1
)


In [16]:
# Number of sentence pairs assigned to each inferred pattern label.
feature_df["writing_pattern"].value_counts()


writing_pattern
Mixed Dyslexic Pattern                 8188
No Dominant Pattern                    5966
Orthographic Instability (Strong)      4441
Phonetic Confusion (Strong)            3261
Phonetic Confusion (Moderate)          3086
Orthographic Instability (Moderate)    1287
Word Boundary Confusion (Moderate)     1216
Word Boundary Confusion (Strong)         23
Name: count, dtype: int64

In [17]:
# Spot-check a handful of labelled pairs. The fixed seed keeps the preview
# identical across Restart & Run All, and the size is capped so a frame with
# fewer than 10 rows does not make `sample` raise.
preview_cols = ["clean_sentence", "dyslexic_sentence", "writing_pattern"]
feature_df[preview_cols].sample(
    n=min(10, len(feature_df)),
    random_state=42,
)


Unnamed: 0,clean_sentence,dyslexic_sentence,writing_pattern
10272,පූසා ගෙදර ඉන්නවා,පූස රගෙදර ඉන්නව,Orthographic Instability (Strong)
14691,බයේ.,බය්යෙ.,Phonetic Confusion (Strong)
68,කඩේට ගියාම බනිස් ගේන්න.,කඩේට ගියම බනිස් ගෙනෙන්න.,Mixed Dyslexic Pattern
24547,පළවෙනි පිටුව බලන්න.,පළවෙනි පිටුව බලන්න.,No Dominant Pattern
11810,ඔහොම පලයන්න එපා.,පලයන්න එපා ඔහොම,Mixed Dyslexic Pattern
6209,බොරුව,ඩොරනව,Mixed Dyslexic Pattern
1382,පොතේ පොඩි සතෙක් හිටියා.,පොතේ පොඩි සතෙක් හිටියා.,No Dominant Pattern
26173,කඩේට යනවනම් කියන්න,කඩෙට යනවනම් කියන්න,Phonetic Confusion (Moderate)
1872,එයා කතා කරන්න බයයි.,එයා කතා කරන්න බයයි.,No Dominant Pattern
5335,සන්දෙස,ගන්දස,Mixed Dyslexic Pattern


In [None]:
# Write the per-sentence features and inferred labels to CSV (no index
# column, UTF-8). The path is relative, so the file is created in the
# notebook's current working directory.
feature_df.to_csv(
    "sentence_level_patterns.csv",
    index=False,
    encoding="utf-8"
)

# Reached only if to_csv did not raise.
print("Sentence-level pattern artifacts saved successfully.")


Sentence-level pattern artifacts saved successfully.


: 