In [1]:
# ============================================================
# Sinhala Dyslexia Binary Essay Classifier (SINGLE-CELL PIPELINE)
# ============================================================
#
# This notebook cell implements:
#   1. Sentence-level binary dyslexia detection
#   2. Essay-level aggregation using sentence predictions
#
# The model is intended as a SCREENING component, not a diagnosis.
# ============================================================


# ------------------------------------------------------------
# (Colab-only) Install required dependencies
# NOTE: Remove this line when running in VS Code / local env
# ------------------------------------------------------------
!pip install datasets pandas scikit-learn joblib


# ------------------------------------------------------------
# 0. IMPORTS
# ------------------------------------------------------------

# Regular expressions for sentence splitting
import re

# Data handling
import pandas as pd

# Model persistence (save/load trained models)
import joblib

# Hugging Face dataset loader
from datasets import load_dataset

# Scikit-learn utilities
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix


# ------------------------------------------------------------
# 1. LOAD DATASET
# ------------------------------------------------------------
# Dataset contains paired sentences:
#   - clean_sentence     : grammatically correct Sinhala
#   - dyslexic_sentence  : dyslexic-style writing errors
#
# Using paired data ensures the model learns ERROR PATTERNS,
# not vocabulary or topic differences.

dataset = load_dataset("SPEAK-ASR/sinhala-dyslexia-corrected-id20percent")

# Convert Hugging Face dataset to Pandas for easier manipulation
df = dataset["train"].to_pandas()


# ------------------------------------------------------------
# 2. CREATE BINARY CLASSIFICATION DATASET
# ------------------------------------------------------------
# Label convention:
#   1 → Dyslexic sentence
#   0 → Clean sentence

# Dyslexic samples
dys_df = pd.DataFrame({
    "text": df["dyslexic_sentence"],
    "label": 1
})

# Clean samples
clean_df = pd.DataFrame({
    "text": df["clean_sentence"],
    "label": 0
})

# Combine both classes into a single dataset
binary_df = pd.concat([dys_df, clean_df], ignore_index=True)


# ------------------------------------------------------------
# 3. TRAIN / TEST SPLIT
# ------------------------------------------------------------
# Stratified split ensures equal class distribution
# This avoids misleading accuracy due to class imbalance.

X_train, X_test, y_train, y_test = train_test_split(
    binary_df["text"],
    binary_df["label"],
    test_size=0.2,
    random_state=42,
    stratify=binary_df["label"]
)


# ------------------------------------------------------------
# 4. FEATURE EXTRACTION (CHARACTER-LEVEL TF-IDF)
# ------------------------------------------------------------
# Character-level TF-IDF is ideal for dyslexia detection because:
#   - captures spelling errors
#   - handles missing diacritics
#   - detects phonetic substitutions
#
# Word-level models fail on misspelled words.

vectorizer = TfidfVectorizer(
    analyzer="char",
    ngram_range=(2, 4)   # short character sequences
)

# Fit on training data, transform both train and test sets
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


# ------------------------------------------------------------
# 5. TRAIN BINARY CLASSIFIER
# ------------------------------------------------------------
# Logistic Regression is chosen because:
#   - produces probability scores
#   - works well with sparse TF-IDF vectors
#   - is stable, interpretable, and exam-friendly

model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)


# ------------------------------------------------------------
# 6. SENTENCE-LEVEL EVALUATION
# ------------------------------------------------------------
# Evaluate model performance on unseen test sentences

y_pred = model.predict(X_test_vec)

print("\nSENTENCE-LEVEL EVALUATION\n")
print(classification_report(y_test, y_pred))

print("CONFUSION MATRIX\n")
print(confusion_matrix(y_test, y_pred))

# NOTE:
# Recall for dyslexic sentences is more important than accuracy,
# since this model is used as a screening component.


# ------------------------------------------------------------
# 7. SAVE TRAINED MODEL AND VECTORIZER
# ------------------------------------------------------------
# These artifacts are later reused in:
#   - local demo (VS Code)
#   - Hugging Face Space
#   - microservice inference

joblib.dump(model, "dyslexia_binary_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

print("\nModel & vectorizer saved!")


# ------------------------------------------------------------
# 8. RELOAD MODEL (SIMULATES SERVICE USAGE)
# ------------------------------------------------------------
# This step ensures the model works correctly
# when loaded from disk (deployment scenario).

model = joblib.load("dyslexia_binary_model.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")


# ------------------------------------------------------------
# 9. HELPER FUNCTIONS (ESSAY-LEVEL INFERENCE)
# ------------------------------------------------------------

def split_sentences(text):
    """
    Splits an essay into sentences using punctuation.
    Language-agnostic and simple by design.
    """
    sentences = re.split(r'[.!?]', text)
    return [s.strip() for s in sentences if s.strip()]


def detect_dyslexia_essay(essay_text, sentence_threshold=0.65):
    """
    Performs essay-level dyslexia detection by:
      1. Splitting essay into sentences
      2. Predicting dyslexia probability per sentence
      3. Aggregating sentence results into an essay decision
    """

    sentences = split_sentences(essay_text)

    if not sentences:
        return {"error": "No valid sentences"}

    dyslexic_count = 0
    sentence_analysis = []

    for s in sentences:
        # Vectorize sentence
        vec = vectorizer.transform([s])

        # Predict probability of dyslexia
        prob = model.predict_proba(vec)[0][1]

        # Apply probability threshold
        is_dyslexic = prob >= sentence_threshold
        dyslexic_count += int(is_dyslexic)

        sentence_analysis.append({
            "sentence": s,
            "probability": round(float(prob), 2),
            "label": "DYSLEXIC" if is_dyslexic else "NORMAL"
        })

    # Essay-level rule:
    # If at least one sentence is dyslexic → essay flagged
    essay_label = (
        "DYSLEXIC ESSAY"
        if dyslexic_count >= 1
        else "NORMAL ESSAY"
    )

    return {
        "essay_label": essay_label,
        "total_sentences": len(sentences),
        "dyslexic_sentences": dyslexic_count,
        "sentence_analysis": sentence_analysis
    }


# ------------------------------------------------------------
# 10. MANUAL TEST CASES (SANITY CHECK)
# ------------------------------------------------------------

# Example of a clean Sinhala essay
normal_essay = """
මම අද පාසලට ගියෙමි. ගුරුතුමා අපට ගණිත පාඩම ඉගැන්වීය.
විවේක කාලයේදී මිතුරන් සමඟ කතා කළෙමි.
"""

# Example of a dyslexic-style Sinhala essay
dyslexic_essay = """
මම අද පාසල් ගිය. ගුරුතුමා අපට ගනිත පාඩම ඉගැන්වය.
විවේක කලයෙදි මිතුරන් සමග කතාකර ගිය.
"""

print("\nNORMAL ESSAY RESULT\n")
print(detect_dyslexia_essay(normal_essay))

print("\nDYSLEXIC ESSAY RESULT\n")
print(detect_dyslexia_essay(dyslexic_essay))







SENTENCE-LEVEL EVALUATION

              precision    recall  f1-score   support

           0       0.76      0.84      0.79      5528
           1       0.82      0.73      0.77      5527

    accuracy                           0.78     11055
   macro avg       0.79      0.78      0.78     11055
weighted avg       0.79      0.78      0.78     11055

CONFUSION MATRIX

[[4619  909]
 [1477 4050]]

Model & vectorizer saved!

NORMAL ESSAY RESULT

{'essay_label': 'NORMAL ESSAY', 'total_sentences': 3, 'dyslexic_sentences': 0, 'sentence_analysis': [{'sentence': 'මම අද පාසලට ගියෙමි', 'probability': 0.61, 'label': 'NORMAL'}, {'sentence': 'ගුරුතුමා අපට ගණිත පාඩම ඉගැන්වීය', 'probability': 0.47, 'label': 'NORMAL'}, {'sentence': 'විවේක කාලයේදී මිතුරන් සමඟ කතා කළෙමි', 'probability': 0.3, 'label': 'NORMAL'}]}

DYSLEXIC ESSAY RESULT

{'essay_label': 'DYSLEXIC ESSAY', 'total_sentences': 3, 'dyslexic_sentences': 1, 'sentence_analysis': [{'sentence': 'මම අද පාසල් ගිය', 'probability': 0.51, 'label': 'NO

In [None]:
# Colab-only: dataset archival for backup/sharing

# from google.colab import files

# files.download("dyslexia_binary_model.pkl")
# files.download("tfidf_vectorizer.pkl")


In [11]:
# ============================================================
# DATASET FREEZING FOR REPRODUCIBILITY (ONE-TIME STEP)
# ============================================================
#
# This cell downloads the Sinhala dyslexia dataset from
# Hugging Face Hub and saves it locally using the
# Hugging Face Arrow format.
#
# Purpose:
#   - Ensure reproducibility
#   - Enable offline training
#   - Lock a fixed dataset snapshot
#
# IMPORTANT:
#   - Run this cell ONCE
#   - After this, always use load_from_disk()
# ============================================================

from datasets import load_dataset

# Download dataset from Hugging Face Hub
dataset = load_dataset("SPEAK-ASR/sinhala-dyslexia-corrected-id20percent")

# Save dataset locally in Arrow format
dataset.save_to_disk("sinhala_dyslexia_dataset")

print("Dataset saved locally for reproducibility.")


Saving the dataset (0/1 shards):   0%|          | 0/27636 [00:00<?, ? examples/s]

Dataset saved locally for reproducibility.


In [12]:
# ============================================================
# OPTIONAL: ARCHIVE DATASET SNAPSHOT (PLATFORM-INDEPENDENT)
# ============================================================

import shutil

shutil.make_archive(
    "sinhala_dyslexia_dataset",
    "zip",
    "sinhala_dyslexia_dataset"
)

print("Dataset archived as sinhala_dyslexia_dataset.zip")



Dataset archived as sinhala_dyslexia_dataset.zip


In [9]:
# NOTE: Colab-only download utilities removed for local execution


# from google.colab import files
# files.download("sinhala_dyslexia_dataset.zip")


In [13]:
# ============================================================
# SENTENCE-LEVEL MODEL EVALUATION
# ============================================================
#
# This section evaluates the trained binary classifier
# on unseen test sentences.
#
# Evaluation is performed at the SENTENCE level because:
#   - The model is trained on individual sentences
#   - Essay-level decisions are derived later via aggregation
#
# Metrics used:
#   - Precision
#   - Recall
#   - F1-score
#   - Confusion Matrix
#
# These metrics help determine whether the model is suitable
# as a dyslexia SCREENING component.
# ============================================================

from sklearn.metrics import classification_report, confusion_matrix


# ------------------------------------------------------------
# Generate predictions for the test set
# ------------------------------------------------------------
# X_test_vec contains TF-IDF representations of sentences
# that were NOT seen during training.

y_pred = model.predict(X_test_vec)


# ------------------------------------------------------------
# Print detailed classification metrics
# ------------------------------------------------------------
print("SENTENCE-LEVEL EVALUATION")
print("-------------------------")

# classification_report provides:
#   - Precision: correctness of positive predictions
#   - Recall: ability to detect dyslexic sentences (important)
#   - F1-score: balance between precision and recall
#   - Support: number of samples per class

print(classification_report(y_test, y_pred))


# ------------------------------------------------------------
# Print confusion matrix
# ------------------------------------------------------------
# Confusion matrix layout:
#
#              Predicted
#             0        1
# True  0   [TN]     [FP]
#       1   [FN]     [TP]
#
# Where:
#   TN = Clean sentence correctly classified
#   FP = Clean sentence misclassified as dyslexic
#   FN = Dyslexic sentence missed by the model
#   TP = Dyslexic sentence correctly detected

print("CONFUSION MATRIX")
print(confusion_matrix(y_test, y_pred))


SENTENCE-LEVEL EVALUATION
-------------------------
              precision    recall  f1-score   support

           0       0.76      0.84      0.79      5528
           1       0.82      0.73      0.77      5527

    accuracy                           0.78     11055
   macro avg       0.79      0.78      0.78     11055
weighted avg       0.79      0.78      0.78     11055

CONFUSION MATRIX
[[4619  909]
 [1477 4050]]
