In [2]:
# =========================================
# Bag of Words (BOW) from file
# =========================================

from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import os
import re

# Results directory: create it up front; exist_ok=True makes this a no-op
# when the directory is already there.
os.makedirs("results", exist_ok=True)

# Preprocessing function
def preprocess(text):
    """Normalize a string of Arabic text before tokenization.

    The steps run in this order:

    1. Remove the diacritic marks (tashkeel) and the tatweel/kashida
       character listed in the first pattern.
    2. Fold the alef variants (hamza above, hamza below, madda, wasla)
       into a bare alef.
    3. Drop a waw that sits at the very start of a word.
    4. Remove every remaining character that is neither a word character
       nor whitespace (punctuation and any leftover marks).

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string. Whitespace is left as it was.
    """
    text = re.sub(r'[ًٌٍَُِّْـ]', '', text)
    # \u0671 is alef wasla, which Uthmani-script text uses for most
    # word-initial alefs; without it those words are never unified with
    # their plain-alef spelling. The re module expands the \u escape itself.
    text = re.sub(r'[إأآ\u0671]', 'ا', text)
    # NOTE: this strips ANY word-initial waw, not only the conjunction, so a
    # word whose first root letter is waw loses it too, and a standalone waw
    # disappears entirely.
    text = re.sub(r'\bو', '', text)
    return re.sub(r'[^\w\s]', '', text)

# Read the surah from the file: strip each line once, skip the ones that
# end up empty, and run the rest through preprocess().
with open("surah.txt", "r", encoding="utf-8") as f:
    surah = [preprocess(verse) for verse in map(str.strip, f) if verse]

# BOW model: fit the vocabulary on the preprocessed lines and count each
# term per line.
# NOTE: with CountVectorizer's default token_pattern only tokens of two or
# more word characters are counted, so single-letter tokens are ignored.
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(surah)

# Dense copy of the counts (one row per entry of `surah`), the term for each
# column, and the column totals, i.e. each term's frequency over all rows.
matrix = bow_matrix.toarray()
features = vectorizer.get_feature_names_out()
word_frequencies = matrix.sum(axis=0)

# Save the results: first the total frequency of every term, then the raw
# count matrix with one output line per row.
with open("results/bow_results.txt", "w", encoding="utf-8") as f:
    f.write("Bag of Words (BOW) with Preprocessing\n\n")
    f.write("Word : Total Frequency\n")
    f.write("----------------------\n")
    for word, freq in zip(features, word_frequencies):
        f.write(f"{word} : {freq}\n")
    f.write("\nBOW Matrix (per verse):\n")
    for row in matrix:
        # str(row) obeys NumPy's print options: by default a row with more
        # than 1000 columns is abbreviated with "..." and anything wider than
        # 75 characters is wrapped onto several lines. Format explicitly so
        # every count is written and each row stays on a single line; for
        # short rows the text is the same as str(row).
        row_text = np.array2string(
            row,
            max_line_width=2**31 - 1,
            threshold=row.size + 1,
        )
        f.write(row_text + "\n")

print("BOW results saved to results/bow_results.txt")


BOW results saved to results/bow_results.txt
