## Sinhala Dyslexic Writing-Pattern Classifier
### Part B: Essay-level dyslexic writing-pattern profiling (rule-based)

⚠️ Note:
This module performs rule-based classification of dyslexic writing patterns.
No supervised-learning metrics are reported, because the labels are inferred rather than ground-truth annotations.


This module does NOT perform supervised classification.
Essay-level patterns are inferred via dominance-weighted aggregation
of sentence-level surface error patterns.
Therefore, no accuracy or classification metrics are reported.


The dataset does not provide explicit essay boundaries. Therefore, essays are approximated by grouping consecutive sentences into fixed-size segments of five sentences. These segments are referred to as pseudo-essays and are used for essay-level pattern aggregation.

In [2]:
import pandas as pd

# Load the precomputed sentence-level and essay-level pattern tables.
sentence_df, essay_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "sentence_level_patterns_v3.csv",
        "essay_level_patterns_v3.csv",
    )
)

# Preview both tables; the tuple is the value the cell displays.
(sentence_df.head(), essay_df.head())


ModuleNotFoundError: No module named 'pandas'

In [None]:
# Distribution of the raw essay-level labels (before rare labels are merged).
essay_df.loc[:, "dominant_pattern"].value_counts()


Unnamed: 0_level_0,count
dominant_pattern,Unnamed: 1_level_1
Orthographic Instability,2054
Phonetic Confusion,1601
No Dominant Pattern,1365
Mixed Dyslexic Pattern,451
Mixed Essay Pattern,6
Word Boundary Confusion,1


In [None]:
# Labels treated as rare; each one is rewritten to "Mixed Dyslexic Pattern".
rare_classes = ["Word Boundary Confusion", "Mixed Essay Pattern"]

# Keep the raw label column intact and store the consolidated labels separately.
essay_df["dominant_pattern_fixed"] = essay_df["dominant_pattern"].replace(
    {rare_label: "Mixed Dyslexic Pattern" for rare_label in rare_classes}
)


In [None]:
# Distribution of the consolidated labels (rare labels merged).
essay_df.loc[:, "dominant_pattern_fixed"].value_counts()


Unnamed: 0_level_0,count
dominant_pattern_fixed,Unnamed: 1_level_1
Orthographic Instability,2054
Phonetic Confusion,1601
No Dominant Pattern,1365
Mixed Dyslexic Pattern,458


In [None]:
# Target: the consolidated dominant-pattern label.
y = essay_df["dominant_pattern_fixed"]

# Features: one column per writing pattern.
# NOTE: this notebook itself flags the experiment as label leakage, because the
# labels are derived from these same pattern counts.
X = essay_df.loc[
    :,
    [
        "Orthographic Instability",
        "Phonetic Confusion",
        "Word Boundary Confusion",
        "Mixed Dyslexic Pattern",
        "No Dominant Pattern",
    ],
]


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the essays, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 200 trees, depth capped at 8, balanced class weights, fixed seed.
clf = RandomForestClassifier(
    n_estimators=200, max_depth=8, class_weight="balanced", random_state=42
)

# Fit on the training split; the fitted estimator is the cell's displayed value.
clf.fit(X_train, y_train)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict labels for the held-out split.
y_pred = clf.predict(X_test)

# Per-class precision/recall/F1 as text, then the confusion matrix as the cell value.
print(classification_report(y_true=y_test, y_pred=y_pred))
confusion_matrix(y_true=y_test, y_pred=y_pred)


                          precision    recall  f1-score   support

  Mixed Dyslexic Pattern       1.00      1.00      1.00        92
     No Dominant Pattern       1.00      1.00      1.00       273
Orthographic Instability       1.00      1.00      1.00       411
      Phonetic Confusion       1.00      1.00      1.00       320

                accuracy                           1.00      1096
               macro avg       1.00      1.00      1.00      1096
            weighted avg       1.00      1.00      1.00      1096



array([[ 92,   0,   0,   0],
       [  0, 273,   0,   0],
       [  0,   0, 411,   0],
       [  0,   0,   0, 320]])

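❌ NOT USED IN FINAL SYSTEM

*   Reason: label leakage — the pattern counts used as features are the same counts used to derive the labels, so the perfect scores above are not meaningful.
*   Excluded output: `classification_report(y_test, y_pred)`
*   Excluded output: `confusion_matrix(y_test, y_pred)`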



In [None]:
import pandas as pd

# Sentence-level table used as the basis for the essay-level aggregation below.
feature_df = pd.read_csv("sentence_level_patterns_v3.csv")

# Show the first five rows.
feature_df.head(n=5)


Unnamed: 0,char_addition,char_omission,char_substitution,clean_sentence,dyslexic_sentence,has_addition,has_omission,has_substitution,word_count_diff,has_spacing_issue,has_diacritic_loss,writing_pattern,writing_pattern_v2,writing_pattern_v3
0,0,2,0,වලිකුකුළා කෑගහනවා.,වලිකුකුළා කෑගහනව,False,True,False,0,False,True,Orthographic Instability,Orthographic Instability,Orthographic Instability
1,0,1,0,අම්මා කෑම දෙනවා,අම්මා කෑම දනවා,False,True,False,0,False,True,Orthographic Instability,Orthographic Instability,Orthographic Instability
2,0,0,0,එයා එනකන් ඉඩපන්,එයා එනකන් ඉඩපන්,False,False,False,0,False,False,No Dominant Pattern,No Dominant Pattern,No Dominant Pattern
3,0,2,0,රුපියල් දෙදාහක් තියෙනවා,රුපියල් දෙදාහක් තියනව,False,True,False,0,False,True,Orthographic Instability,Orthographic Instability,Orthographic Instability
4,1,0,0,ගාල්ලට යන්න ඕනෙ,ගාල්ලට යන්න ඕනෙඩ,True,False,False,0,False,False,No Dominant Pattern,No Dominant Pattern,No Dominant Pattern


In [None]:
# List the columns of the sentence-level table.
feature_df.columns


Index(['char_addition', 'char_omission', 'char_substitution', 'clean_sentence',
       'dyslexic_sentence', 'has_addition', 'has_omission', 'has_substitution',
       'word_count_diff', 'has_spacing_issue', 'has_diacritic_loss',
       'writing_pattern', 'writing_pattern_v2', 'writing_pattern_v3'],
      dtype='object')

In [None]:
# Look at the first 20 sentence rows.
feature_df.iloc[:20]


Unnamed: 0,char_addition,char_omission,char_substitution,clean_sentence,dyslexic_sentence,has_addition,has_omission,has_substitution,word_count_diff,has_spacing_issue,has_diacritic_loss,writing_pattern,writing_pattern_v2,writing_pattern_v3
0,0,2,0,වලිකුකුළා කෑගහනවා.,වලිකුකුළා කෑගහනව,False,True,False,0,False,True,Orthographic Instability,Orthographic Instability,Orthographic Instability
1,0,1,0,අම්මා කෑම දෙනවා,අම්මා කෑම දනවා,False,True,False,0,False,True,Orthographic Instability,Orthographic Instability,Orthographic Instability
2,0,0,0,එයා එනකන් ඉඩපන්,එයා එනකන් ඉඩපන්,False,False,False,0,False,False,No Dominant Pattern,No Dominant Pattern,No Dominant Pattern
3,0,2,0,රුපියල් දෙදාහක් තියෙනවා,රුපියල් දෙදාහක් තියනව,False,True,False,0,False,True,Orthographic Instability,Orthographic Instability,Orthographic Instability
4,1,0,0,ගාල්ලට යන්න ඕනෙ,ගාල්ලට යන්න ඕනෙඩ,True,False,False,0,False,False,No Dominant Pattern,No Dominant Pattern,No Dominant Pattern
5,0,0,1,පැන දෙන්න,පැන ඩෙන්න,False,False,True,0,False,False,Phonetic Confusion,Phonetic Confusion,Phonetic Confusion
6,0,1,1,කළ දුරකථනය දෙනවා.,කළ දුරකථනය ඩෙනවා,False,True,True,0,False,False,Phonetic Confusion,Mixed Dyslexic Pattern,Mixed Dyslexic Pattern
7,0,0,0,උදේට කෑම කනවද,උදේට කෑම කනවද,False,False,False,0,False,False,No Dominant Pattern,No Dominant Pattern,No Dominant Pattern
8,0,0,2,පන්ති යන්න ඕනෙද,පන්ති යන්න ඔනෙඩ,False,False,True,0,False,False,Phonetic Confusion,Phonetic Confusion,Phonetic Confusion
9,0,1,0,ගානවා,ගානව,False,True,False,0,False,True,Orthographic Instability,Orthographic Instability,Orthographic Instability


In [None]:
ESSAY_SIZE = 5  # sentences per pseudo-essay (fixed design choice)

# Renumber rows 0..n-1, then give every run of ESSAY_SIZE consecutive rows
# the same essay_id via integer division of the row position.
feature_df = feature_df.reset_index(drop=True).assign(
    essay_id=lambda frame: frame.index // ESSAY_SIZE
)

feature_df.loc[:, ["essay_id", "clean_sentence", "writing_pattern_v3"]].head(15)


Unnamed: 0,essay_id,clean_sentence,writing_pattern_v3
0,0,වලිකුකුළා කෑගහනවා.,Orthographic Instability
1,0,අම්මා කෑම දෙනවා,Orthographic Instability
2,0,එයා එනකන් ඉඩපන්,No Dominant Pattern
3,0,රුපියල් දෙදාහක් තියෙනවා,Orthographic Instability
4,0,ගාල්ලට යන්න ඕනෙ,No Dominant Pattern
5,1,පැන දෙන්න,Phonetic Confusion
6,1,කළ දුරකථනය දෙනවා.,Mixed Dyslexic Pattern
7,1,උදේට කෑම කනවද,No Dominant Pattern
8,1,පන්ති යන්න ඕනෙද,Phonetic Confusion
9,1,ගානවා,Orthographic Instability


In [None]:
# Long form: per essay, how many sentences carry each v3 writing pattern.
pattern_tally = feature_df.groupby("essay_id")["writing_pattern_v3"].value_counts()

# Wide form: one row per essay, one column per pattern, 0 where a pattern is absent.
essay_pattern_counts = pattern_tally.unstack(fill_value=0)

essay_pattern_counts.head()


writing_pattern_v3,Mixed Dyslexic Pattern,No Dominant Pattern,Orthographic Instability,Phonetic Confusion,Word Boundary Confusion
essay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,2,3,0,0
1,1,1,1,2,0
2,1,1,1,2,0
3,0,1,2,2,0
4,1,1,3,0,0


In [None]:
# Pick each essay's most frequent pattern.
# Only the per-pattern count columns take part: the columns this notebook derives
# from the counts are dropped first (if present), so the cell stays re-runnable.
# Without this, a second run would call idxmax over a frame that also holds the
# string-valued "dominant_pattern" column and fail.
# Ties go to the first column in column order, because idxmax returns the first maximum.
essay_pattern_counts["dominant_pattern"] = essay_pattern_counts.drop(
    columns=["dominant_pattern", "confidence", "dominance_strength"],
    errors="ignore",
).idxmax(axis=1)

essay_pattern_counts[["dominant_pattern"]].head()


writing_pattern_v3,dominant_pattern
essay_id,Unnamed: 1_level_1
0,Orthographic Instability
1,Phonetic Confusion
2,Phonetic Confusion
3,Orthographic Instability
4,Orthographic Instability


In [None]:
# Pattern columns of the per-essay count table, in the order used when
# selecting them for the dominance and confidence computations.
pattern_cols = [
    "Orthographic Instability", "Phonetic Confusion", "Word Boundary Confusion",
    "Mixed Dyslexic Pattern", "No Dominant Pattern",
]


In [None]:
# Dominant pattern = the pattern column holding the largest count in each essay row.
essay_pattern_counts["dominant_pattern"] = essay_pattern_counts.loc[
    :, pattern_cols
].idxmax(axis="columns")


In [None]:
# Confidence = largest pattern count divided by the total of all pattern counts in the row.
essay_pattern_counts["confidence"] = (
    essay_pattern_counts[pattern_cols]
    .max(axis="columns")
    .div(essay_pattern_counts[pattern_cols].sum(axis="columns"))
)


In [None]:
# Preview the dominant pattern and its confidence for the first essays.
essay_pattern_counts.loc[:, ["dominant_pattern", "confidence"]].head()


writing_pattern_v3,dominant_pattern,confidence
essay_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Orthographic Instability,0.6
1,Phonetic Confusion,0.4
2,Phonetic Confusion,0.4
3,Orthographic Instability,0.4
4,Orthographic Instability,0.6


In [None]:
def confidence_label(c, strong_threshold=0.6, moderate_threshold=0.4):
    """Map a dominance confidence score to a qualitative strength label.

    Parameters
    ----------
    c : float
        Confidence score to classify. In this notebook it is the largest
        pattern count of an essay divided by the sum of its pattern counts.
    strong_threshold : float, default 0.6
        Scores at or above this value are labelled "Strong Dominance".
    moderate_threshold : float, default 0.4
        Scores at or above this value (but below ``strong_threshold``) are
        labelled "Moderate Dominance".

    Returns
    -------
    str
        "Strong Dominance", "Moderate Dominance" or "Weak / Mixed".
    """
    if c >= strong_threshold:
        return "Strong Dominance"
    if c >= moderate_threshold:
        return "Moderate Dominance"
    # Also reached for NaN, because every comparison with NaN is False.
    return "Weak / Mixed"

# Translate each essay's numeric confidence into its qualitative strength label.
essay_pattern_counts["dominance_strength"] = essay_pattern_counts["confidence"].map(
    confidence_label
)


In [None]:
# Share of pseudo-essays per dominant pattern (proportions rather than raw counts).
essay_pattern_counts.loc[:, "dominant_pattern"].value_counts(normalize=True)


Unnamed: 0_level_0,proportion
dominant_pattern,Unnamed: 1_level_1
Orthographic Instability,0.465498
Phonetic Confusion,0.387733
No Dominant Pattern,0.108069
Mixed Dyslexic Pattern,0.038153
Word Boundary Confusion,0.000548


In [None]:
# Summary statistics of the confidence score across all pseudo-essays.
essay_pattern_counts.loc[:, "confidence"].describe()


Unnamed: 0,confidence
count,5478.0
mean,0.526707
std,0.139891
min,0.2
25%,0.4
50%,0.6
75%,0.6
max,1.0


In [26]:
# Spot-check three pseudo-essays. The seed is fixed (same value as the other
# seeded cells in this notebook) so a re-run shows the same rows.
essay_pattern_counts.sample(3, random_state=42)


writing_pattern_v3,Mixed Dyslexic Pattern,No Dominant Pattern,Orthographic Instability,Phonetic Confusion,Word Boundary Confusion,dominant_pattern,confidence,dominance_strength
essay_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3682,0,2,3,0,0,Orthographic Instability,0.6,Strong Dominance
3717,0,2,1,2,0,Phonetic Confusion,0.4,Moderate Dominance
2976,1,1,1,2,0,Phonetic Confusion,0.4,Moderate Dominance


In [1]:
# Show the sentences that make up pseudo-essay 0, with their v3 pattern labels.
feature_df.loc[
    feature_df["essay_id"] == 0,
    ["clean_sentence", "dyslexic_sentence", "writing_pattern_v3"],
]


NameError: name 'feature_df' is not defined