In [1]:
# =========================================
# TF-IDF from file
# =========================================

from sklearn.feature_extraction.text import TfidfVectorizer
import os
import re

# Make sure the "results" output directory exists; exist_ok=True turns the
# call into a no-op when the directory is already there.
os.makedirs("results", exist_ok=True)

# Arabic combining marks (harakat U+064B-U+065F, Quranic annotation signs,
# the dagger alef U+0670) plus the tatweel U+0640.  The letter-like small
# waw/yeh (U+06E5, U+06E6) are deliberately left out because they count as
# word characters and are kept by the final cleanup.
_ARABIC_MARKS_RE = re.compile(
    r"[\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E4"
    r"\u06E7\u06E8\u06EA-\u06ED\u0640]"
)
# Alef with hamza above/below or madda, normalised to a bare alef.
_ALEF_VARIANTS_RE = re.compile(r"[إأآا]")
# A waw at the very start of a word.
_LEADING_WAW_RE = re.compile(r"\bو")
# Anything that is neither a word character nor whitespace.
_NON_WORD_RE = re.compile(r"[^\w\s]")


def preprocess(text):
    """Normalise a piece of Arabic text before vectorisation.

    Steps, in order:

    1. Remove combining marks (diacritics, Quranic annotation signs) and
       the tatweel.
    2. Replace the alef variants ``إ``, ``أ``, ``آ`` with a bare ``ا``.
    3. Drop a ``و`` that starts a word.
    4. Remove every remaining character that is neither a word character
       nor whitespace.

    Combining marks must be gone *before* step 3: they are not ``\\w``
    characters, so a mark left inside a word (e.g. the dagger alef in
    Uthmani spellings) creates a ``\\b`` boundary in front of a mid-word
    ``و`` and that letter would be deleted by mistake.

    Note: step 3 removes *any* word-initial ``و``, including one that is
    part of the word itself rather than a conjunction prefix.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    text = _ARABIC_MARKS_RE.sub('', text)
    text = _ALEF_VARIANTS_RE.sub('ا', text)
    text = _LEADING_WAW_RE.sub('', text)
    text = _NON_WORD_RE.sub('', text)
    return text

# Read the file: every non-blank line becomes one document, stripped of
# surrounding whitespace and normalised with preprocess().
with open("surah.txt", "r", encoding="utf-8") as f:
    surah = [preprocess(line.strip()) for line in f if line.strip()]

# TF-IDF model, fitted on the documents with the vectorizer's defaults.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(surah)

# Vocabulary terms (column labels) and the dense documents-by-terms matrix.
features = tfidf.get_feature_names_out()
matrix = tfidf_matrix.toarray()

with open("results/tfidf_results.txt", "w", encoding="utf-8") as f:
    f.write("TF-IDF Results - Surah Al-Fatiha\n\n")
    f.write("Word : TF-IDF Weight (total)\n")
    f.write("----------------------\n")
    # Column sums: each term's weight accumulated over all documents.
    for word, weight in zip(features, matrix.sum(axis=0)):
        f.write(f"{word} : {weight:.4f}\n")
    f.write("\nTF-IDF Matrix (per verse):\n")
    for row in matrix:
        # Convert to plain Python floats before formatting: a list of NumPy
        # scalars is rendered as "np.float64(0.1234)" entries on NumPy >= 2.0.
        f.write(str(row.round(4).tolist()) + "\n")

print("TF-IDF results saved to results/tfidf_results.txt")


TF-IDF results saved to results/tfidf_results.txt
