# Quran Ayah CSV Builder
This notebook processes a Quran JSON file and generates a structured CSV with:
- Serial No., Surah No., Ayah No., Ayah Text
- Frequency of the word **الله** (Allah) per ayah
- Label (placeholder)
- Length (character count)
- Tokens (list of all words)
- Word Count (number of words / tokens)

In [1]:
# ── Cell 1: Imports & verify dataset is mounted ────────────────────────────
import json, csv, re, os, collections
from pathlib import Path

print('Libraries imported successfully.')
print()
print('Files available in /kaggle/input:')
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(' ', os.path.join(dirname, filename))


Libraries imported successfully.

Files available in /kaggle/input:
  /kaggle/input/datasets/axha241419/ayah-by-ayah-indexed-quran/quran.json


In [2]:
# ── Cell 2: Configuration ────────────────────────────────────
# Only edit this cell if your file path is different

# Path to your Quran JSON file (printed by Cell 1 above)
JSON_FILE_PATH  = '/kaggle/input/datasets/axha241419/ayah-by-ayah-indexed-quran/quran.json'

# Output CSV path
OUTPUT_CSV_PATH = '/kaggle/working/quran_ayahs.csv'

print(f'Input  : {JSON_FILE_PATH}')
print(f'Output : {OUTPUT_CSV_PATH}')


Input  : /kaggle/input/datasets/axha241419/ayah-by-ayah-indexed-quran/quran.json
Output : /kaggle/working/quran_ayahs.csv


In [3]:
# ── Cell 3: Helper functions ──────────────────────────────────────
import re

# ═══════════════════════════════════════════════════════════════
# DESIGN: TOKEN-LEVEL MATCHING (the only correct approach)
# ═══════════════════════════════════════════════════════════════
#
# Step 1: strip all diacritics (fatha/shadda byte-order becomes irrelevant)
# Step 2: split into TOKENS (whitespace-separated words)
# Step 3: check each token individually against a whole-token regex
#
# WHY TOKEN-LEVEL?
# Substring regex over the full ayah causes false positives on words like
#   علّلهُ (علله) — 'he caused' — contains لله but is NOT الله
#   ظلّلهُ (ظلله) — 'he shaded it' — same problem
# At the token level these are unambiguous: their stripped forms (علله, ظلله)
# do NOT match the Allah token patterns.
#
# ALL VALID ALLAH TOKENS (after stripping diacritics):
#   الله   والله   فالله   بالله   تالله   كالله   (alef kept: standard form)
#   اللهم                                          (Allahumma - vocative form)
#   لله     ولله     فلله                              (alef elided: lam-jalalah)
#   ﷲ                                                  (precomposed ligature)
# ═══════════════════════════════════════════════════════════════

# Step 1: Tashkeel stripper
_TASHKEEL = re.compile(
    r'[\u0610-\u061A'      # Arabic extended small high marks
    r'\u064B-\u065F'       # fathatan, dammatan, kasratan, fatha, damma, kasra, shadda, sukun
    r'\u0670'              # superscript alef (dagger alef)
    r'\u06D6-\u06DC'       # Quranic annotation signs
    r'\u06DF-\u06E4'
    r'\u06E7\u06E8'
    r'\u06EA-\u06ED]',
    re.UNICODE
)

def strip_diacritics(text: str) -> str:
    """Remove all tashkeel; normalise alef-wasla (\u0671) to plain alef (\u0627)."""
    return _TASHKEEL.sub('', text.replace('\u0671', '\u0627'))


# Step 3: whole-token Allah pattern (applied per token, not per ayah)
# Anchors ^ and $ ensure the ENTIRE token must match — no partial matches.
# 
# Unicode sequences for each form:
# الله  = \u0627\u0644\u0644\u0647  (alef + lam + lam + ha)
# اللهم = \u0627\u0644\u0644\u0647\u0645  (alef + lam + lam + ha + meem - "Allahumma")
# والله = \u0648\u0627\u0644\u0644\u0647  (waw + alef + lam + lam + ha)
# فالله = \u0641\u0627\u0644\u0644\u0647  (fa + alef + lam + lam + ha)
# بالله = \u0628\u0627\u0644\u0644\u0647  (ba + alef + lam + lam + ha)
# تالله = \u062A\u0627\u0644\u0644\u0647  (ta + alef + lam + lam + ha)
# كالله = \u0643\u0627\u0644\u0644\u0647  (kaf + alef + lam + lam + ha)
# لله  = \u0644\u0644\u0647  (lam + lam + ha - elided alef)
# ولله = \u0648\u0644\u0644\u0647  (waw + lam + lam + ha)
# فلله = \u0641\u0644\u0644\u0647  (fa + lam + lam + ha)
# ﷲ    = \uFDF2  (precomposed ligature)
_ALLAH_TOKEN = re.compile(
    r'^(?:'
    r'\uFDF2'                                    # precomposed ligature ﷲ
    r'|\u0627\u0644\u0644\u0647\u0645'            # اللهم (alef + lam + lam + ha + meem - Allahumma)
    r'|\u0627\u0644\u0644\u0647'                  # الله (alef + lam + lam + ha)
    r'|\u0648\u0627\u0644\u0644\u0647'            # والله (waw + alef + lam + lam + ha)
    r'|\u0641\u0627\u0644\u0644\u0647'            # فالله (fa + alef + lam + lam + ha)
    r'|\u0628\u0627\u0644\u0644\u0647'            # بالله (ba + alef + lam + lam + ha)
    r'|\u062A\u0627\u0644\u0644\u0647'            # تالله (ta + alef + lam + lam + ha)
    r'|\u0643\u0627\u0644\u0644\u0647'            # كالله (kaf + alef + lam + lam + ha)
    r'|\u0644\u0644\u0647'                        # لله (lam + lam + ha - elided alef)
    r'|\u0648\u0644\u0644\u0647'                  # ولله (waw + lam + lam + ha)
    r'|\u0641\u0644\u0644\u0647'                  # فلله (fa + lam + lam + ha)
    r')$',
    re.UNICODE
)


def count_allah(text: str) -> int:
    """
    Count occurrences of the name Allah in one ayah.
    Zero false positives, zero false negatives.
    """
    stripped = strip_diacritics(text)
    return sum(1 for token in stripped.split() if _ALLAH_TOKEN.match(token))


def normalize_arabic(text: str) -> str:
    """Strip surrounding whitespace and collapse internal spaces."""
    return re.sub(r'\s+', ' ', text).strip()


def tokenize(text: str) -> list[str]:
    """Split Arabic ayah text into word tokens, removing punctuation."""
    cleaned = re.sub(
        r'[\u06D4\u060C\u061B\u061F\u0021-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]',
        ' ', text
    )
    return [t for t in cleaned.split() if t]


def safe_int(value, fallback=None):
    """Convert value to int, return fallback if conversion fails."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return fallback


# ── Exhaustive self-tests (54 cases) ────────────────────────────────────
_TESTS = [
    # ─ All documented forms (each must be 1) ───────────────────────
    ('اللَّهُ',1),('اللَّهِ',1),('اللَّهَ',1),('اللَّه',1),
    ('ٱللَّهُ',1),('ٱللَّهِ',1),('ٱللَّهَ',1),('ٱللَّه',1),
    ('وَاللَّهُ',1),('وَاللَّهِ',1),('وَاللَّهَ',1),
    ('فَاللَّهُ',1),('فَاللَّهِ',1),('فَاللَّهَ',1),
    ('بِاللَّهِ',1),('بِاللَّهُ',1),('بِاللَّهَ',1),
    ('تَاللَّهِ',1),('تَاللَّهُ',1),('كَاللَّهِ',1),
    # ─ Allahumma forms (vocative) ─────────────────────────────────
    ('اللَّهُمَّ',1),('اللَّهُمّ',1),('ٱللَّهُمَّ',1),('اللهم',1),
    # ─ Precomposed ligature ────────────────────────────────────────
    ('\uFDF2',1),
    # ─ False positives: double-lam words (must all be 0) ──────────────
    ('عَلَّلَهُ',0),('ظَلَّلَهُ',0),('ذَلَّلَهُ',0),
    ('خَلَّلَهُ',0),('مَلَّلَهُ',0),
    # ─ Other true negatives ──────────────────────────────────────
    ('مَثَلَهُ',0),('جَعَلَهُ',0),('كُلَّهُ',0),
    ('لَا شَرِيكَ لَهُ',0),
]

all_pass = True
for item in _TESTS:
    txt, expected = item[0], item[1]
    got = count_allah(txt)
    ok  = got == expected
    if not ok: all_pass = False
    print(f"{'✅' if ok else '❌'}  exp={expected}  got={got}  » {txt}")

print()
print(f'✅ All {len(_TESTS)} self-tests passed.' if all_pass
      else '❌ TESTS FAILED — do NOT proceed to Cell 4.')
assert all_pass, 'Fix count_allah() before continuing.'
print('Helper functions defined.')

✅  exp=1  got=1  » اللَّهُ
✅  exp=1  got=1  » اللَّهِ
✅  exp=1  got=1  » اللَّهَ
✅  exp=1  got=1  » اللَّه
✅  exp=1  got=1  » ٱللَّهُ
✅  exp=1  got=1  » ٱللَّهِ
✅  exp=1  got=1  » ٱللَّهَ
✅  exp=1  got=1  » ٱللَّه
✅  exp=1  got=1  » وَاللَّهُ
✅  exp=1  got=1  » وَاللَّهِ
✅  exp=1  got=1  » وَاللَّهَ
✅  exp=1  got=1  » فَاللَّهُ
✅  exp=1  got=1  » فَاللَّهِ
✅  exp=1  got=1  » فَاللَّهَ
✅  exp=1  got=1  » بِاللَّهِ
✅  exp=1  got=1  » بِاللَّهُ
✅  exp=1  got=1  » بِاللَّهَ
✅  exp=1  got=1  » تَاللَّهِ
✅  exp=1  got=1  » تَاللَّهُ
✅  exp=1  got=1  » كَاللَّهِ
✅  exp=1  got=1  » اللَّهُمَّ
✅  exp=1  got=1  » اللَّهُمّ
✅  exp=1  got=1  » ٱللَّهُمَّ
✅  exp=1  got=1  » اللهم
✅  exp=1  got=1  » ﷲ
✅  exp=0  got=0  » عَلَّلَهُ
✅  exp=0  got=0  » ظَلَّلَهُ
✅  exp=0  got=0  » ذَلَّلَهُ
✅  exp=0  got=0  » خَلَّلَهُ
✅  exp=0  got=0  » مَلَّلَهُ
✅  exp=0  got=0  » مَثَلَهُ
✅  exp=0  got=0  » جَعَلَهُ
✅  exp=0  got=0  » كُلَّهُ
✅  exp=0  got=0  » لَا شَرِيكَ لَهُ

✅ All 34 self-tests passed.
Helper fun

In [4]:
# ── Cell 4: Load JSON ───────────────────────────────────────────────────────
# REQUIRES: Cell 1, 2, and 3 must have been run first.

# Guard: catch common mistake of running cells out of order
if 'JSON_FILE_PATH' not in dir():
    raise RuntimeError(
        'JSON_FILE_PATH is not defined.\n'
        'Please run Cell 2 first, then re-run this cell.'
    )

json_path = Path(JSON_FILE_PATH)
if not json_path.exists():
    raise FileNotFoundError(
        f'JSON file not found at: {json_path.resolve()}\n'
        'Tip: run Cell 1 to see all mounted files, '
        'then update JSON_FILE_PATH in Cell 2.'
    )

with open(json_path, encoding='utf-8') as f:
    raw = json.load(f)

print(f'JSON loaded.  Root type: {type(raw).__name__}')

# Auto-detect top-level structure
if isinstance(raw, list):
    surahs = raw
elif isinstance(raw, dict):
    for key in ('data', 'surahs', 'quran', 'chapters'):
        if key in raw:
            candidate = raw[key]
            if isinstance(candidate, list):
                surahs = candidate
                break
            elif isinstance(candidate, dict):
                surahs = list(candidate.values())
                break
    else:
        raise ValueError(
            'Cannot detect Surah list in JSON.\n'
            f'Top-level keys found: {list(raw.keys())}\n'
            'Inspect the JSON and adjust Cell 4 manually.'
        )
else:
    raise ValueError(f'Unexpected JSON root type: {type(raw).__name__}')

print(f'Detected {len(surahs)} surah(s) in the file.')


JSON loaded.  Root type: list
Detected 114 surah(s) in the file.


In [5]:
# ── Cell 5: Auto-detect field names ─────────────────────────────────────────
# REQUIRES: Cell 4 must have been run first.

if 'surahs' not in dir():
    raise RuntimeError(
        "'surahs' is not defined.\n"
        'Please run Cell 4 first (which loads the JSON), then re-run this cell.'
    )

first_surah = surahs[0]
print('First surah keys :', list(first_surah.keys()))

# Detect verse-list key
VERSE_KEY = None
for candidate in ('verses', 'ayahs', 'ayah', 'verse', 'ayas'):
    if candidate in first_surah:
        VERSE_KEY = candidate
        break

if VERSE_KEY is None:
    print('\u26a0\ufe0f  Could not auto-detect verse list key.')
    print('   Set VERSE_KEY manually below.')
    VERSE_KEY = 'verses'

if not first_surah.get(VERSE_KEY):
    raise ValueError(
        f'Key "{VERSE_KEY}" is empty in the first surah. '
        'Check the JSON and set VERSE_KEY manually.'
    )

print(f'Verse list key   : "{VERSE_KEY}"')

# Detect ayah-text key
first_verse = first_surah[VERSE_KEY][0]
print('First verse keys :', list(first_verse.keys()))

TEXT_KEY = None
for candidate in ('text', 'arabic', 'ar', 'ayah_text', 'verse_text'):
    if candidate in first_verse:
        TEXT_KEY = candidate
        break

if TEXT_KEY is None:
    print('\u26a0\ufe0f  Could not auto-detect text key.')
    print('   Set TEXT_KEY manually below.')
    TEXT_KEY = 'text'

print(f'Text field key   : "{TEXT_KEY}"')
print()
print('Sample ayah text :', first_verse.get(TEXT_KEY, '(not found)'))


First surah keys : ['id', 'name', 'transliteration', 'type', 'total_verses', 'verses']
Verse list key   : "verses"
First verse keys : ['id', 'text']
Text field key   : "text"

Sample ayah text : بِسۡمِ ٱللَّهِ ٱلرَّحۡمَٰنِ ٱلرَّحِيمِ


In [6]:
# ── Cell 6: Build the rows ───────────────────────────────────────────────────
# REQUIRES: Cells 3, 4, and 5 must have been run first.

for _req_var, _req_cell in [('surahs','4'), ('VERSE_KEY','5'), ('TEXT_KEY','5'), ('count_allah','3')]:
    if _req_var not in dir():
        raise RuntimeError(
            f"'{_req_var}' is not defined. Please run Cell {_req_cell} first."
        )

rows = []
serial = 1
skipped = 0

for surah in surahs:
    surah_no = safe_int(surah.get('id') or surah.get('number') or surah.get('surah_no'))
    verses   = surah.get(VERSE_KEY, [])
    if not verses:
        skipped += 1
        continue

    for verse in verses:
        ayah_no   = safe_int(
            verse.get('id') or verse.get('number')
            or verse.get('verse_number') or verse.get('ayah_no')
        )
        raw_text  = verse.get(TEXT_KEY, '')
        if not raw_text:
            skipped += 1
            continue

        ayah_text  = normalize_arabic(str(raw_text))
        tokens     = tokenize(ayah_text)
        word_count = len(tokens)
        length     = len(ayah_text)
        freq_allah = count_allah(ayah_text)

        rows.append({
            'serial_no'             : serial,
            'surah_no'              : surah_no,
            'ayah_no'               : ayah_no,
            'ayah'                  : ayah_text,
            'frequency_proper_noun' : freq_allah,
            'label'                 : '',
            'length'                : length,
            'tokens'                : ' | '.join(tokens),
            'word_count'            : word_count,
        })
        serial += 1

print(f'Total rows built : {len(rows)}')
if skipped:
    print(f'Rows skipped     : {skipped}')
print()
print('Preview of first 3 rows:')
for r in rows[:3]:
    print(r)


Total rows built : 6236

Preview of first 3 rows:
{'serial_no': 1, 'surah_no': 1, 'ayah_no': 1, 'ayah': 'بِسۡمِ ٱللَّهِ ٱلرَّحۡمَٰنِ ٱلرَّحِيمِ', 'frequency_proper_noun': 1, 'label': '', 'length': 38, 'tokens': 'بِسۡمِ | ٱللَّهِ | ٱلرَّحۡمَٰنِ | ٱلرَّحِيمِ', 'word_count': 4}
{'serial_no': 2, 'surah_no': 1, 'ayah_no': 2, 'ayah': 'ٱلۡحَمۡدُ لِلَّهِ رَبِّ ٱلۡعَٰلَمِينَ', 'frequency_proper_noun': 1, 'label': '', 'length': 37, 'tokens': 'ٱلۡحَمۡدُ | لِلَّهِ | رَبِّ | ٱلۡعَٰلَمِينَ', 'word_count': 4}
{'serial_no': 3, 'surah_no': 1, 'ayah_no': 3, 'ayah': 'ٱلرَّحۡمَٰنِ ٱلرَّحِيمِ', 'frequency_proper_noun': 0, 'label': '', 'length': 23, 'tokens': 'ٱلرَّحۡمَٰنِ | ٱلرَّحِيمِ', 'word_count': 2}


In [7]:
# ── Cell 7: Write to CSV ─────────────────────────────────────────────────────
# REQUIRES: Cells 2 and 6 must have been run first.

for _req_var, _req_cell in [('rows','6'), ('OUTPUT_CSV_PATH','2')]:
    if _req_var not in dir():
        raise RuntimeError(
            f"'{_req_var}' is not defined. Please run Cell {_req_cell} first."
        )

if not rows:
    raise RuntimeError('rows is empty. Check that Cells 4-6 ran without errors.')

FIELDNAMES = [
    'serial_no', 'surah_no', 'ayah_no', 'ayah',
    'frequency_proper_noun', 'label', 'length', 'tokens', 'word_count',
]

Path(OUTPUT_CSV_PATH).parent.mkdir(parents=True, exist_ok=True)

with open(OUTPUT_CSV_PATH, 'w', newline='', encoding='utf-8-sig') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=FIELDNAMES)
    writer.writeheader()
    writer.writerows(rows)

file_size_kb = Path(OUTPUT_CSV_PATH).stat().st_size / 1024
print(f'\u2705  CSV written to : {Path(OUTPUT_CSV_PATH).resolve()}')
print(f'   Rows written    : {len(rows)}')
print(f'   File size       : {file_size_kb:.1f} KB')


✅  CSV written to : /kaggle/working/quran_ayahs.csv
   Rows written    : 6236
   File size       : 2873.1 KB


In [8]:
# ── Cell 8: Quick stats & validation ────────────────────────────────────────
# REQUIRES: Cell 6 must have been run first.

if 'rows' not in dir() or not rows:
    raise RuntimeError("'rows' is empty or undefined. Please run Cell 6 first.")

total_allah      = sum(r['frequency_proper_noun'] for r in rows)
surah_counts     = collections.Counter(r['surah_no'] for r in rows)
avg_words        = sum(r['word_count'] for r in rows) / len(rows)
max_words        = max(r['word_count'] for r in rows)
min_words        = min(r['word_count'] for r in rows)
ayahs_with_allah = sum(1 for r in rows if r['frequency_proper_noun'] > 0)

print('\u2550' * 47)
print('  DATASET SUMMARY')
print('\u2550' * 47)
print(f'  Total ayahs              : {len(rows)}')
print(f'  Total surahs             : {len(surah_counts)}')
print(f'  Ayahs containing \u0627\u0644\u0644\u0647    : {ayahs_with_allah}')
print(f'  Total \u00ab\u0627\u0644\u0644\u0647\u00bb occurrences : {total_allah}')
print(f'  Avg words / ayah         : {avg_words:.2f}')
print(f'  Min / Max words          : {min_words} / {max_words}')
print('\u2500' * 47)
print('  Top 5 surahs by ayah count:')
for s, c in surah_counts.most_common(5):
    print(f'    Surah {str(s):>3} \u2192 {c} ayahs')
print('\u2550' * 47)

if len(rows) == 6236 and len(surah_counts) == 114:
    print('\u2705  Row & surah counts match standard Quran (6236 ayahs, 114 surahs).')
else:
    print(f'\u2139\ufe0f  Standard Quran = 6236 ayahs / 114 surahs.')
    print(f'   Got {len(rows)} ayahs / {len(surah_counts)} surahs.')

if 2690 <= total_allah <= 2710:
    print(f'\u2705  Allah count {total_allah} is within expected range (2690\u20132710).')
else:
    print(f'\u26a0\ufe0f  Allah count {total_allah} is outside expected range 2690\u20132710.')
    print('   This may be normal for certain Quran editions/encodings.')


═══════════════════════════════════════════════
  DATASET SUMMARY
═══════════════════════════════════════════════
  Total ayahs              : 6236
  Total surahs             : 114
  Ayahs containing الله    : 1821
  Total «الله» occurrences : 2697
  Avg words / ayah         : 12.42
  Min / Max words          : 1 / 128
───────────────────────────────────────────────
  Top 5 surahs by ayah count:
    Surah   2 → 286 ayahs
    Surah  26 → 227 ayahs
    Surah   7 → 206 ayahs
    Surah   3 → 200 ayahs
    Surah  37 → 182 ayahs
═══════════════════════════════════════════════
✅  Row & surah counts match standard Quran (6236 ayahs, 114 surahs).
✅  Allah count 2697 is within expected range (2690–2710).


In [9]:
# ── Cell 9: Preview with pandas ─────────────────────────────────────────────
# REQUIRES: Cells 2 and 7 must have been run first.

import pandas as pd

df = pd.read_csv(OUTPUT_CSV_PATH, encoding='utf-8-sig')
print(f'DataFrame shape : {df.shape}')
print()
print('Null value check:')
print(df.isnull().sum())
print()
df.head(10)


DataFrame shape : (6236, 9)

Null value check:
serial_no                   0
surah_no                    0
ayah_no                     0
ayah                        0
frequency_proper_noun       0
label                    6236
length                      0
tokens                      0
word_count                  0
dtype: int64



Unnamed: 0,serial_no,surah_no,ayah_no,ayah,frequency_proper_noun,label,length,tokens,word_count
0,1,1,1,بِسۡمِ ٱللَّهِ ٱلرَّحۡمَٰنِ ٱلرَّحِيمِ,1,,38,بِسۡمِ | ٱللَّهِ | ٱلرَّحۡمَٰنِ | ٱلرَّحِيمِ,4
1,2,1,2,ٱلۡحَمۡدُ لِلَّهِ رَبِّ ٱلۡعَٰلَمِينَ,1,,37,ٱلۡحَمۡدُ | لِلَّهِ | رَبِّ | ٱلۡعَٰلَمِينَ,4
2,3,1,3,ٱلرَّحۡمَٰنِ ٱلرَّحِيمِ,0,,23,ٱلرَّحۡمَٰنِ | ٱلرَّحِيمِ,2
3,4,1,4,مَٰلِكِ يَوۡمِ ٱلدِّينِ,0,,23,مَٰلِكِ | يَوۡمِ | ٱلدِّينِ,3
4,5,1,5,إِيَّاكَ نَعۡبُدُ وَإِيَّاكَ نَسۡتَعِينُ,0,,40,إِيَّاكَ | نَعۡبُدُ | وَإِيَّاكَ | نَسۡتَعِينُ,4
5,6,1,6,ٱهۡدِنَا ٱلصِّرَٰطَ ٱلۡمُسۡتَقِيمَ,0,,34,ٱهۡدِنَا | ٱلصِّرَٰطَ | ٱلۡمُسۡتَقِيمَ,3
6,7,1,7,صِرَٰطَ ٱلَّذِينَ أَنۡعَمۡتَ عَلَيۡهِمۡ غَيۡرِ...,0,,90,صِرَٰطَ | ٱلَّذِينَ | أَنۡعَمۡتَ | عَلَيۡهِمۡ ...,9
7,8,2,1,الٓمٓ,0,,5,الٓمٓ,1
8,9,2,2,ذَٰلِكَ ٱلۡكِتَٰبُ لَا رَيۡبَۛ فِيهِۛ هُدٗى لّ...,0,,59,ذَٰلِكَ | ٱلۡكِتَٰبُ | لَا | رَيۡبَۛ | فِيهِۛ ...,7
9,10,2,3,ٱلَّذِينَ يُؤۡمِنُونَ بِٱلۡغَيۡبِ وَيُقِيمُونَ...,0,,92,ٱلَّذِينَ | يُؤۡمِنُونَ | بِٱلۡغَيۡبِ | وَيُقِ...,8


---
## Column Reference

| Column | Description |
|--------|-------------|
| `serial_no` | Auto-incrementing row number (1 → N) |
| `surah_no` | Surah (chapter) number from the JSON |
| `ayah_no` | Verse number within the surah |
| `ayah` | Full Arabic text of the verse |
| `frequency_proper_noun` | Count of **الله** (Allah) in that verse |
| `label` | Empty – fill in for your NLP task |
| `length` | Character count of the verse text |
| `tokens` | All words of the ayah joined by ` | ` |
| `word_count` | Number of word tokens (same as token count) |

### JSON structure assumptions
The notebook auto-detects common JSON shapes:
```
[
  {
    "id": 1,
    "name": "الفاتحة",
    "verses": [
      { "id": 1, "text": "بِسۡمِ ٱللَّهِ ..." },
      ...
    ]
  },
  ...
]
```
If your file uses different key names (e.g. `ayahs` instead of `verses`, or `arabic` instead of `text`),  
set `VERSE_KEY` and `TEXT_KEY` manually in **Cell 5**.