In [4]:
!pip install transformers


Collecting transformers
  Downloading transformers-4.57.5-py3-none-any.whl.metadata (43 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Using cached huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Using cached regex-2025.11.3-cp310-cp310-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Using cached tokenizers-0.22.2-cp39-abi3-win_amd64.whl.metadata (7.4 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Using cached safetensors-0.7.0-cp38-abi3-win_amd64.whl.metadata (4.2 kB)
Downloading transformers-4.57.5-py3-none-any.whl (12.0 MB)
   ---------------------------------------- 0.0/12.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/12.0 MB ? eta -:--:--
    --------------------------------------- 0.3/12.0 MB ? eta -:--:--
   - -------------------------------------- 0.5/12.0 MB 1.3 MB/s eta 0:00:09
   - -------------------------------------- 0.5/

In [7]:
import os
import numpy as np
from pathlib import Path

from text_features import compute_raw_language_features
from feature_engineering import extract_language_features

In [14]:
# paths

# The project root is taken to be the parent of the current working directory.
PROJECT_ROOT = Path.cwd().parent

# Input text files and the destination folder for the generated arrays.
RAW_DATA_DIR = PROJECT_ROOT.joinpath("data", "language", "raw")
OUTPUT_DIR = PROJECT_ROOT.joinpath("data", "language", "processed")

# Files the feature matrix and label vector are written to.
X_PATH = OUTPUT_DIR.joinpath("X_features.npy")
Y_PATH = OUTPUT_DIR.joinpath("y_labels.npy")

In [18]:
# config

# Texts with fewer whitespace-separated words than this are skipped.
MIN_WORDS = 5
# Optional cap on the number of files read per class; None disables the cap.
MAX_SAMPLES_PER_CLASS = None

# Create the output folder (and any missing parents) if it is not there yet.
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# Quick sanity check of the resolved locations.
print("PROJECT_ROOT:", PROJECT_ROOT)
print("RAW_DATA_DIR exists:", RAW_DATA_DIR.exists())


PROJECT_ROOT: c:\Users\ASUS\Desktop\Dyslexia screening (Model finetuning resnet-50)
RAW_DATA_DIR exists: True


In [22]:
# DATASET BUILD

def build_dataset(verbose=True):
    """Build the feature matrix and label vector from the raw text files.

    Reads every ``*.txt`` file in ``RAW_DATA_DIR / "normal"`` (label 0) and
    ``RAW_DATA_DIR / "dyslexic_like"`` (label 1). A file is skipped when:

    * it has fewer than ``MIN_WORDS`` whitespace-separated words,
    * ``compute_raw_language_features`` returns ``None`` for it, or
    * any step raises an exception (the error is printed and processing
      continues with the next file).

    A class folder that does not exist is reported and skipped. When
    ``MAX_SAMPLES_PER_CLASS`` is truthy, only that many files (in sorted
    path order) are considered per class.

    Args:
        verbose: When True (the default) print one "Accepted" line for
            every file that contributes a sample.

    Returns:
        A tuple ``(X, y)``: ``X`` is a ``float32`` array built from the
        flattened feature vectors of the accepted files, and ``y`` is an
        ``int64`` array with the matching labels in the same order.
    """
    X = []
    y = []

    for label_name, label in [("normal", 0), ("dyslexic_like", 1)]:
        class_dir = RAW_DATA_DIR / label_name

        if not class_dir.exists():
            print(f"Missing folder: {class_dir}")
            continue

        # Directory listing order is platform dependent; sorting makes both the
        # row order and the MAX_SAMPLES_PER_CLASS truncation reproducible.
        files = sorted(class_dir.glob("*.txt"))
        if MAX_SAMPLES_PER_CLASS:
            files = files[:MAX_SAMPLES_PER_CLASS]

        print(f"\nProcessing {label_name.upper()} ({len(files)} files)")

        for file_path in files:
            try:
                # Undecodable bytes are dropped rather than failing the file.
                text = file_path.read_text(
                    encoding="utf-8",
                    errors="ignore"
                ).strip()

                word_count = len(text.split())
                if word_count < MIN_WORDS:
                    continue

                raw_features = compute_raw_language_features(text)
                if raw_features is None:
                    continue

                features = extract_language_features(raw_features)

                # Flatten first so a failure here cannot leave X and y with
                # different lengths.
                feature_vector = features.flatten()
                X.append(feature_vector)
                y.append(label)

                if verbose:
                    print(f"Accepted: {file_path.name} | words={word_count}")

            except Exception as e:
                # Best effort: report the problem and move on to the next file.
                print(f"Skipping {file_path.name}: {e}")

    X = np.array(X, dtype=np.float32)
    y = np.array(y, dtype=np.int64)

    return X, y


In [23]:
if __name__ == "__main__":
    # Build the dataset, print a short summary and persist it as .npy files.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    X, y = build_dataset()
    print("DATASET SUMMARY")
    print("X shape:", X.shape)
    print("y shape:", y.shape)
    # minlength=2 keeps a slot for both labels (0 and 1) even when one class
    # contributed no samples, so a missing class shows up as an explicit 0.
    print("Class distribution:", np.bincount(y, minlength=2))

    # Do not replace previously saved arrays with empty ones.
    if len(y) == 0:
        raise ValueError(
            f"No samples were accepted; refusing to overwrite {X_PATH} and {Y_PATH}"
        )

    np.save(X_PATH, X)
    np.save(Y_PATH, y)

    print("\nSaved:")
    print(X_PATH)
    print(Y_PATH)


Processing NORMAL (2 files)
[[ 0.          0.          0.26236426  0.18232156 16.75219941 20.46
   0.13976194  6.2166061 ]]
Accepted: dyslexic dataset info.txt | words=2046
[[ 0.          0.          0.26236426  0.18232156 10.9723912  42.74
   0.13976194  6.2166061 ]]
Accepted: language based data.txt | words=4274

Processing DYSLEXIC_LIKE (152 files)
[[0.18232156 0.18232156 0.26236426 0.18232156 3.8        0.05
  0.13976194 6.2166061 ]]
Accepted: sample_S10.txt | words=5
[[0.18232156 0.18232156 0.26236426 0.18232156 3.8        0.05
  0.13976194 6.2166061 ]]
Accepted: sample_S100.txt | words=5
[[0.18232156 0.18232156 0.26236426 0.18232156 3.2        0.05
  0.13976194 6.2166061 ]]
Accepted: sample_S101.txt | words=5
[[0.18232156 0.18232156 0.26236426 0.18232156 3.6        0.05
  0.13976194 6.2166061 ]]
Accepted: sample_S102.txt | words=5
[[0.         0.         0.26236426 0.18232156 3.2        0.05
  0.13976194 6.2166061 ]]
Accepted: sample_S103.txt | words=5
[[0.15415068 0.15415068 0.