#### Using Dedicated Hebrew tokenizer 

In [4]:
# Import necessary modules
import sys
import os

# Add current directory to sys.path
sys.path.append(os.path.abspath(os.getcwd()))

from hebrew_tokenizer_package.tokenizer import Tokenizer

file_path = 'genesis_hebrew.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    hebrew_text = file.read()
    
# Tokenize the Hebrew text
tokenizer = Tokenizer()
tokens = list(tokenizer.tokenize(hebrew_text))


In [56]:
print(tokens)

[('HEBREW', 'רֵאשִׁית', 0, (0, 8)), ('HEBREW', 'בּרא', 1, (9, 13)), ('HEBREW', 'אֱלֹהִים', 2, (14, 22)), ('HEBREW', 'שָׁמַיִם', 3, (23, 31)), ('HEBREW', 'אֶרֶץ', 4, (32, 37)), ('HEBREW', 'אֶרֶץ', 5, (38, 43)), ('HEBREW', 'היה', 6, (44, 47)), ('HEBREW', 'תֹּהוּ', 7, (48, 54)), ('HEBREW', 'בֹּהוּ', 8, (55, 61)), ('HEBREW', 'חֹשֶׁךְ', 9, (62, 69)), ('HEBREW', 'פָּנֶה', 10, (70, 76)), ('HEBREW', 'תְּהוֺם', 11, (77, 84)), ('HEBREW', 'רוּחַ', 12, (85, 90)), ('HEBREW', 'אֱלֹהִים', 13, (91, 99)), ('HEBREW', 'רחף', 14, (100, 103)), ('HEBREW', 'פָּנֶה', 15, (104, 110)), ('HEBREW', 'מַיִם', 16, (111, 116)), ('HEBREW', 'אמר', 17, (117, 120)), ('HEBREW', 'אֱלֹהִים', 18, (121, 129)), ('HEBREW', 'היה', 19, (130, 133)), ('HEBREW', 'אוֺר', 20, (134, 138)), ('HEBREW', 'היה', 21, (139, 142)), ('HEBREW', 'אוֺר', 22, (143, 147)), ('HEBREW', 'ראה', 23, (148, 151)), ('HEBREW', 'אֱלֹהִים', 24, (152, 160)), ('HEBREW', 'אוֺר', 25, (161, 165)), ('HEBREW', 'כִּי', 26, (166, 170)), ('HEBREW', 'טוֺב', 27, (171, 175

In [7]:
from gensim import corpora

# Create a list of tokenized documents
documents = [token[1].split() for token in tokens]

# Create a dictionary from the tokenized documents
dictionary = corpora.Dictionary(documents)



# Create a bag-of-words representation of the corpus
corpus = [dictionary.doc2bow(doc) for doc in documents]


In [8]:
from gensim.models import LdaModel

# Set the number of topics
num_topics = 5

# Train the LDA model
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)

# Print the top 5 tokens with highest probability for each topic
for idx, topic in lda_model.show_topics(num_words=5, formatted=False):
    print(f"Topic {idx}:")
    for word, prob in topic:
        print(f"{word}: {prob:.4f}")
    print()


Topic 0:
יוֺסֵף: 0.1021
אָח: 0.0616
בּוא: 0.0503
עשׂה: 0.0345
ילד: 0.0271

Topic 1:
אמר: 0.1793
כִּי: 0.0766
יוֺם: 0.0460
ראה: 0.0423
אִישׁ: 0.0380

Topic 2:
אֲשֶׁר: 0.0942
אֶרֶץ: 0.0751
מִצְרַיִם: 0.0421
פַּרְעֹה: 0.0392
בַּיִת: 0.0369

Topic 3:
אָב: 0.0960
היה: 0.0626
אֱלֹהִים: 0.0533
עלה: 0.0408
נתן: 0.0343

Topic 4:
בֵּן: 0.0980
כֹּל: 0.0777
יַעֲקֹב: 0.0481
לֹא: 0.0456
גַּם: 0.0325



Topic 0:
Said: 0.1940
where: 0.1248
Joseph: 0.1237
God: 0.0595
Again: 0.0312

Topic 1:
Total: 0.0887
Because: 0.0833
Ohh: 0.0751
Jacob: 0.0549
No: 0.0521

Topic 2:
Ob: 0.1125
Home: 0.0512
Leaf: 0.0478
Make: 0.0438
Year: 0.0378

Topic 3:
Son: 0.1019
Come on: 0.0559
Egypt: 0.0511
Pharaoh: 0.0476
Man: 0.0377

Topic 4:
Country: 0.1119
was: 0.0786
See: 0.0514
Here it is: 0.0422
Also: 0.0414

Topic 0:
Joseph: 0.0985
Ob: 0.0854
Egypt: 0.0444
No: 0.0412
Day: 0.0396

Topic 1:
Country: 0.0867
Total: 0.0768
Because: 0.0722
Ohh: 0.0650
Man: 0.0358

Topic 2:
where: 0.1114
was: 0.0624
Come on: 0.0544
Jacob: 0.0488
Leaf: 0.0407

Topic 3:
Said: 0.1678
Pharaoh: 0.0450
Nathan: 0.0331
Please: 0.0302
thing: 0.0283

Topic 4:
Son: 0.0934
God: 0.0500
Home: 0.0410
See: 0.0384
Grave: 0.0284

In [9]:
from gensim.models import HdpModel

# Train the HDP model
hdp_model = HdpModel(corpus, dictionary)

# Print the topics
topics = hdp_model.show_topics()
for topic_id, topic in topics:
    print(f"Topic {topic_id}: {topic}\n")


Topic 0: 0.005*יצא + 0.004*רְאוּבֵן + 0.004*חֶמְאָה + 0.004*עֶשֶׂר + 0.003*חַטָּאת + 0.003*חֶלְקָה + 0.003*נְחֹשֶׁת + 0.003*יָוָן + 0.003*אֶשְׁכֹּל + 0.003*אֻמָּה + 0.003*פּוּט + 0.003*בֶּקַע + 0.003*ירשׁ + 0.003*שַׁעַר + 0.003*שׂוח + 0.003*עֶרֶב + 0.003*סָבִיב + 0.003*רֶוַח + 0.003*חִנָּם + 0.003*עזב

Topic 1: 0.004*יטב + 0.004*צֵל + 0.004*פּרד + 0.004*תִּמְנַע + 0.003*ידה + 0.003*סְפָר + 0.003*כֹּל + 0.003*רֵאשִׁית + 0.003*טַבַּעַת + 0.003*כּבס + 0.003*הלךְ + 0.003*אַלְמוֺדָד + 0.003*רְעוּאֵל + 0.003*אֲרָם + 0.003*עַכְבּוֺר + 0.003*נַעֲרָה + 0.003*בְּדַד + 0.002*נהל + 0.002*עֲנָה + 0.002*בֵּן

Topic 2: 0.003*הלךְ + 0.003*רפא + 0.003*גֹּמֶר + 0.003*בִּנְיָמִן + 0.003*שׁכח + 0.003*תָּמָר + 0.003*קֵץ + 0.003*גַּעְתָּם + 0.003*יצא + 0.003*אַלְמוֺדָד + 0.003*גּוֺזָל + 0.003*עֶשְׂרֵה + 0.003*רְכוּשׁ + 0.003*עֹרֶף + 0.003*מלט + 0.003*אוֺנָם + 0.003*אַךְ + 0.002*הֲלֹם + 0.002*עֵת + 0.002*מִין

Topic 3: 0.004*רָעָב + 0.004*פּצל + 0.004*נֵץ + 0.003*שְׁכֶם + 0.003*ירא + 0.003*יקע + 0.003*תּפשׂ 

Topic 0: 0.014*front + 0.012*right + 0.011*left + 0.010*back + 0.010*ninety + 0.009*right + 0.008*next + 0.007*ten E + 0.007*true + 0.007*carried + 0.007*Abraham + 0.006*Raba + 0.006*Sin + 0.006*Ad + 0.006*Dinah + 0.006*Hem + 0.006*Gam + 0.006*Yom

Topic 1: 0.011*Angel + 0.009*Desert + 0.009*Tove + 0.008*Acho + 0.008*Abimelech + 0.008*Shet + 0.008*Eir + 0.007*Ketz + 0.007*Gado L + 0.007*in the top + 0.007*bed + 0.007*dot + 0.007*har + 0.007*grace + 0.006*dor + 0.006*king + 0.006*peleg + 0.006*dudaiim + 0.006*he

Topic 2: 0.017*Nathan + 0.012*Taba + 0.011*Lamaan + 0.010*Anhanu + 0.009*Oz + 0.009*Kesef + 0.007*Shekka + 0.007*Elipaz + 0.007*Manashe + 0.007*Ev ֶN + 0.006*pen + 0.006*tsan + 0.006*Yitzchak + 0.006*Sodom + 0.006*Remesh + 0.006*Shechem + 0.005*Here + 0.005*Grave + 0.005*Altar + 0.005*Go'i

Topic 3: 0.012*eye + 0.011*sword + 0.010*na + 0.009*great + 0.009*very + 0.008*life + 0.008*language + 0.007*life + 0.007*shine + 0.0 06*ear + 0.006*rest + 0.006*Gold + 0.006*Door + 0.006*Negesh + 0.006*God + 0.006*Petah + 0.006*Adam + 0.006*Luֺt + 0.006*Ephroֺn

Topic 4: 0.015*Red + 0.014*Monday + 0.012*Two + 0.008*Ayn + 0.008*Morning + 0.008*Gedal + 0.008*Egypt + 0.008* Nasech + 0.008*Yahem + 0.007*Ephraim + 0.007*save + 0.007*eat + 0.007*purchase + 0.007*find + 0.007*talk + 0.007*Canaan + 0.007*servant + 0.007*righteous

Topic 5: 0.011*Cain + 0.010*Bena + 0.010*Snaiim + 0.009*Yachal + 0.009*Nahar + 0.008*Pharaoh + 0.008*Canaan + 0.008*Arr + 0.007*male + 0.007*accessible + 0.007*beautiful + 0.007*nathan + 0.007*animal + 0.007*family + 0.007*vulva + 0.006*path + 0.006*five + 0.006*good

Topic 6: 0.010*Oda + 0.009*Patah + 0.009*Kichor + 0.009*Hay + 0.008*Shlesh + 0.008*God + 0.008*Nahar + 0.007*Shak + 0.007*Sheni + 0.007*Bor + 0 .007* bride + 0.007* bra + 0.006*dress + 0.006*all + 0.006*pregnant + 0.006*leaf + 0.006*sab + 0.006*neck + 0.006*ulli + 0.006*fall

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import sys
import os

# Add current directory to sys.path
sys.path.append(os.path.abspath(os.getcwd()))

from hebrew_tokenizer_package.tokenizer import Tokenizer

file_path = 'genesis_hebrew.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    hebrew_text = file.read()

# Tokenize the Hebrew text
tokenizer = Tokenizer()
tokens = list(tokenizer.tokenize(hebrew_text))




documents = hebrew_text.split()  

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the documents into TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(documents)

# Define the number of components for LSA
num_components = 5

# Apply Truncated SVD to perform LSA
lsa_model = TruncatedSVD(n_components=num_components)
lsa_matrix = lsa_model.fit_transform(tfidf_matrix)

# Print the topics (components) and the top words for each component
terms = vectorizer.get_feature_names_out()
for i, component in enumerate(lsa_model.components_):
    top_terms = [terms[term_idx] for term_idx in component.argsort()[:-6:-1]]
    print(f"Component {i+1}: {', '.join(top_terms)}")


Component 1: אמר, ראה, בר, קרא, ילד
Component 2: ים, קו, עו, או, מו
Component 3: יו, קרא, ראה, הו, יהוה
Component 4: היה, עש, בר, יש, לקח
Component 5: יש, הלך, דו, עו, קרא


#### translational to english
Component 1: said, do, see, read, have
Component 2: sea, line, o, o, mo
Component 3: Yo, Natan, Lu, Yir, Hakel
Component 4: was, o, read, took, went
Component 5: Has, went, gave, boy, did

In [25]:
# Tokenize the Hebrew text into words
tokenized_text = hebrew_text.split()  # Split the text into words based on whitespace

# Print the tokenized text
print(tokenized_text)


['רֵאשִׁית', 'בּרא', 'אֱלֹהִים', 'שָׁמַיִם', 'אֶרֶץ', 'אֶרֶץ', 'היה', 'תֹּהוּ', 'בֹּהוּ', 'חֹשֶׁךְ', 'פָּנֶה', 'תְּהוֺם', 'רוּחַ', 'אֱלֹהִים', 'רחף', 'פָּנֶה', 'מַיִם', 'אמר', 'אֱלֹהִים', 'היה', 'אוֺר', 'היה', 'אוֺר', 'ראה', 'אֱלֹהִים', 'אוֺר', 'כִּי', 'טוֺב', 'בּדל', 'אֱלֹהִים', 'בַּיִן', 'אוֺר', 'בַּיִן', 'חֹשֶׁךְ', 'קרא', 'אֱלֹהִים', 'אוֺר', 'יוֺם', 'חֹשֶׁךְ', 'קרא', 'לַיְלָה', 'היה', 'עֶרֶב', 'היה', 'בֹּקֶר', 'יוֺם', 'אֶחָד', 'ף', 'אמר', 'אֱלֹהִים', 'היה', 'רָקִיעַ', 'תָּוֶךְ', 'מַיִם', 'היה', 'בּדל', 'בַּיִן', 'מַיִם', 'מַיִם', 'עשׂה', 'אֱלֹהִים', 'רָקִיעַ', 'בּדל', 'בַּיִן', 'מַיִם', 'אֲשֶׁר', 'תַּחַת', 'רָקִיעַ', 'בַּיִן', 'מַיִם', 'אֲשֶׁר', 'רָקִיעַ', 'היה', 'כֵּן', 'קרא', 'אֱלֹהִים', 'רָקִיעַ', 'שָׁמַיִם', 'היה', 'עֶרֶב', 'היה', 'בֹּקֶר', 'יוֺם', 'שֵׁנִי', 'ף', 'אמר', 'אֱלֹהִים', 'קוה', 'מַיִם', 'תַּחַת', 'שָׁמַיִם', 'מָקוֺם', 'אֶחָד', 'ראה', 'יַבָּשָׁה', 'היה', 'כֵּן', 'קרא', 'אֱלֹהִים', 'יַבָּשָׁה', 'אֶרֶץ', 'מִקְוֶה', 'מַיִם', 'קרא', 'יָם', 'ראה', 'אֱלֹהִים', 'כִּי', 'טוֺב'

In [11]:
from sklearn.decomposition import NMF

# Apply NMF to the TF-IDF matrix
nmf_model = NMF(n_components=num_topics)
nmf_matrix = nmf_model.fit_transform(tfidf_matrix)

# Print the topics
print("NMF Topics:")
for i, component in enumerate(nmf_model.components_):
    top_terms = [terms[j] for j in component.argsort()[-5:]]  # Top 5 terms per topic
    print(f"Topic {i+1}: {' '.join(top_terms)}")


NMF Topics:
Topic 1: לקח הו ילד וא אמר
Topic 2: מו או עו קו ים
Topic 3: דו וא נתן עש יו
Topic 4: ראה לקח נתן עש היה
Topic 5: אש הו וא ית יש


NMF Topics:
Topic 1: Bar O Yild Va Amer
Topic 2: Mo or O Ko Yam
Topic 3: See oh boy Natan Yu
Topic 4: Yahweh gave oh and there was
Topic 5: A lesson of yes and no

Topic: 0
Words: 0.053*"Egypt" + 0.049*"Pharaoh" + 0.047*"Day" + 0.040*"Do" + 0.035*"Serve" + 0.033*"Please" + 0.032*"Death" + 0.031*"Talk" + 0.030*"this" + 0.029*"again"
Topic: 1
Words: 0.105*"father" + 0.087*"all" + 0.073*"oh" + 0.060*"come" + 0.054*"Jacob" + 0.048*"house" + 0.045*"see" + 0.045*"leaf" + 0.027*"woman" + 0.023*"eye"
Topic: 2
Words: 0.124*"which" + 0.123*"Joseph" + 0.038*"Nathan" + 0.037*"also" + 0.036*"year" + 0.035*"he" + 0.028*"went" + 0.026*"with" + 0.024*"Jehovah" + 0.019*"Seven"
Topic: 3
Words: 0.117*"son" + 0.087*"because" + 0.073*"was" + 0.038*"turn" + 0.035*"grave" + 0.032*"read" + 0.031*"Abraham" + 0.030*"there" + 0.029*"name" + 0.027*"hand"
Topic: 4
Words: 0.217*"said" + 0.111*"land" + 0.067*"God" + 0.058*"no" + 0.046*"man" + 0.042*"here" + 0.037*"child" + 0.034*"took" + 0.024*"Israel" + 0.020*"Send"

#### Using regural tokenizer 

In [12]:
import string
from hebrew_tokenizer import tokenize
# Read Hebrew stopwords from file
hebrew_stopwords_file = "heb_stopwords.txt"
with open(hebrew_stopwords_file, 'r', encoding="utf-8") as heb_stopfile:
    hebrew_stopwords = heb_stopfile.read().splitlines()

# Read Hebrew text from file
file_path = 'genesis_hebrew.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    hebrew_text = file.read()

# Remove Hebrew stopwords
preprocessed_text = [word for word in hebrew_text.split() if word not in hebrew_stopwords]

# Print first 100 preprocessed words
print(preprocessed_text[:100])



['רֵאשִׁית', 'בּרא', 'אֱלֹהִים', 'שָׁמַיִם', 'אֶרֶץ', 'אֶרֶץ', 'תֹּהוּ', 'בֹּהוּ', 'חֹשֶׁךְ', 'פָּנֶה', 'תְּהוֺם', 'רוּחַ', 'אֱלֹהִים', 'רחף', 'פָּנֶה', 'מַיִם', 'אֱלֹהִים', 'אוֺר', 'אוֺר', 'ראה', 'אֱלֹהִים', 'אוֺר', 'כִּי', 'טוֺב', 'בּדל', 'אֱלֹהִים', 'בַּיִן', 'אוֺר', 'בַּיִן', 'חֹשֶׁךְ', 'קרא', 'אֱלֹהִים', 'אוֺר', 'יוֺם', 'חֹשֶׁךְ', 'קרא', 'לַיְלָה', 'עֶרֶב', 'בֹּקֶר', 'יוֺם', 'אֶחָד', 'ף', 'אֱלֹהִים', 'רָקִיעַ', 'תָּוֶךְ', 'מַיִם', 'בּדל', 'בַּיִן', 'מַיִם', 'מַיִם', 'עשׂה', 'אֱלֹהִים', 'רָקִיעַ', 'בּדל', 'בַּיִן', 'מַיִם', 'אֲשֶׁר', 'תַּחַת', 'רָקִיעַ', 'בַּיִן', 'מַיִם', 'אֲשֶׁר', 'רָקִיעַ', 'כֵּן', 'קרא', 'אֱלֹהִים', 'רָקִיעַ', 'שָׁמַיִם', 'עֶרֶב', 'בֹּקֶר', 'יוֺם', 'שֵׁנִי', 'ף', 'אֱלֹהִים', 'קוה', 'מַיִם', 'תַּחַת', 'שָׁמַיִם', 'מָקוֺם', 'אֶחָד', 'ראה', 'יַבָּשָׁה', 'כֵּן', 'קרא', 'אֱלֹהִים', 'יַבָּשָׁה', 'אֶרֶץ', 'מִקְוֶה', 'מַיִם', 'קרא', 'יָם', 'ראה', 'אֱלֹהִים', 'כִּי', 'טוֺב', 'אֱלֹהִים', 'דּשׁא', 'אֶרֶץ', 'דֶּשֶׁא', 'עֵשֶׂב']


In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Convert the preprocessed text into a list of strings
preprocessed_text_str = ' '.join(preprocessed_text)

from gensim import corpora

# Create a list of tokenized documents
documents = [token.split() for token in preprocessed_text]

# Create a dictionary from the tokenized documents
dictionary = corpora.Dictionary(documents)



# Create a bag-of-words representation of the corpus
corpus = [dictionary.doc2bow(doc) for doc in documents]


In [14]:
from gensim.models import LdaModel

# Set the number of topics
num_topics = 5

# Train the LDA model
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)
# Print the topics
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))


Topic: 0 
Words: 0.091*"אֲשֶׁר" + 0.037*"נתן" + 0.035*"מִצְרַיִם" + 0.035*"פַּרְעֹה" + 0.028*"לקח" + 0.026*"יָד" + 0.025*"עֶבֶד" + 0.022*"שֵׁם" + 0.022*"זֶה" + 0.021*"בַּת"
Topic: 1 
Words: 0.066*"אָח" + 0.063*"לֹא" + 0.037*"עשׂה" + 0.035*"שָׁנָה" + 0.032*"קרא" + 0.030*"גַּם" + 0.028*"עִם" + 0.027*"שָׁם" + 0.022*"שׁוב" + 0.020*"עַד"
Topic: 2 
Words: 0.088*"כִּי" + 0.081*"כֹּל" + 0.035*"הוּא" + 0.032*"פָּנֶה" + 0.028*"אַבְרָהָם" + 0.028*"יִשְׂרָאֵל" + 0.026*"עַיִן" + 0.025*"אָדוֺן" + 0.020*"יִצְחָק" + 0.017*"אַחֲרֵי"
Topic: 3 
Words: 0.080*"אֶרֶץ" + 0.077*"יוֺסֵף" + 0.074*"אָב" + 0.060*"בּוא" + 0.044*"אֱלֹהִים" + 0.042*"אִישׁ" + 0.038*"ראה" + 0.034*"בַּיִת" + 0.032*"הִנֵּה" + 0.023*"מות"
Topic: 4 
Words: 0.102*"בֵּן" + 0.058*"יַעֲקֹב" + 0.040*"יוֺם" + 0.032*"ילד" + 0.030*"אִשָּׁה" + 0.028*"הלךְ" + 0.027*"יהוה" + 0.026*"אֵלֶּה" + 0.023*"עלה" + 0.019*"עֵשָׂו"


Topic: 0
Words: 0.106*"because" + 0.106*"Joseph" + 0.097*"all" + 0.038*"turn" + 0.024*"until" + 0.024*"yitzchak" + 0.023*"left" + 0.021*"put" + 0.020*"attorney" + 0.019*"from"
Topic: 1
Words: 0.077*"come" + 0.074*"oh" + 0.039*"year" + 0.032*"Abraham" + 0.031*"with" + 0.031*"Israel" + 0.031*"Jehovah" + 0.028*"Adon" + 0.026*"Upload" + 0.023*"Send"
Topic: 2
Words: 0.131*"where" + 0.104*"land" + 0.097*"ob" + 0.067*"Jacob" + 0.049*"see" + 0.044*"house" + 0.037*"hand" + 0.031*"there" + 0.029*"eye" + 0.025*"knee"
Topic: 3
Words: 0.043*"Egypt" + 0.043*"Pharaoh" + 0.036*"did" + 0.034*"took" + 0.034*"he" + 0.032*"child" + 0.030*"servant" + 0.027*"went" + 0.026*"daughter" + 0.025*"these"
Topic: 4
Words: 0.123*"Son" + 0.075*"No" + 0.060*"God" + 0.057*"Ish" + 0.055*"Nathan" + 0.048*"Yom" + 0.044*"Behold" + 0.038*"Read" + 0.037*"woman" + 0.035*"too"

In [15]:
from gensim.models import HdpModel
from gensim.corpora import Dictionary

# Create a dictionary from the preprocessed text
dictionary = Dictionary([preprocessed_text])

# Create a corpus (bag of words) from the preprocessed text
corpus = [dictionary.doc2bow(text) for text in [preprocessed_text]]

# Apply HDP topic modeling
hdp_model = HdpModel(corpus, dictionary)

# Print the topics
topics = hdp_model.show_topics()
for topic_id, topic in topics:
    print(f"Topic #{topic_id}: {topic}")


Topic #0: 0.024*אֲשֶׁר + 0.021*בֵּן + 0.020*כֹּל + 0.018*אֶרֶץ + 0.017*כִּי + 0.013*אֱלֹהִים + 0.013*בּוא + 0.012*לֹא + 0.012*אָב + 0.010*יַעֲקֹב + 0.010*אָח + 0.010*ילד + 0.010*יהוה + 0.009*שָׁנָה + 0.009*אִישׁ + 0.009*יוֺסֵף + 0.009*עשׂה + 0.009*יוֺם + 0.009*אִשָּׁה + 0.009*נתן
Topic #1: 0.003*נחשׁ + 0.003*רְאוּמָה + 0.003*גֹּמֶר + 0.003*מָחָר + 0.003*יוֺסֵף + 0.003*מַתָּן + 0.002*רכשׁ + 0.002*רמשׂ + 0.002*דֹּתָן + 0.002*יְשׁוּעָה + 0.002*פְּצָלָה + 0.002*כּול + 0.002*אפס + 0.002*כְּתֹנֶת + 0.002*מְלֹא + 0.002*יָד + 0.002*יַלְדָּה + 0.002*תְּמוֺל + 0.002*דָּג + 0.002*רַחֲמִים
Topic #2: 0.003*רפא + 0.003*פַּס + 0.003*שׂים + 0.003*שְׁאוֺל + 0.003*תּוֺעֵבָה + 0.003*בּלל + 0.003*לקח + 0.002*רִיב + 0.002*שׁחט + 0.002*עֶצֶם + 0.002*טרף + 0.002*קִדְמָה + 0.002*מַתָּן + 0.002*רָצוֺן + 0.002*מַעֲשֶׂה + 0.002*נָפִישׁ + 0.002*נגע + 0.002*מִטָּה + 0.002*מִסְפּוֺא + 0.002*מַשְׁקֵה
Topic #3: 0.003*חֹרֶב + 0.003*לוּדִים + 0.003*קַשָּׁת + 0.003*אִשָּׁה + 0.003*שׁכם + 0.003*עֶשְׂרֵה + 0.003*חֶרְפָּה 

In [16]:
from gensim.models import LsiModel
from gensim.corpora import Dictionary

# Create a dictionary from the preprocessed text
dictionary = Dictionary([preprocessed_text])

# Create a corpus (bag of words) from the preprocessed text
corpus = [dictionary.doc2bow(text) for text in [preprocessed_text]]

# Apply LSA topic modeling
lsa_model = LsiModel(corpus, id2word=dictionary, num_topics=10)

# Print the topics
topics = lsa_model.show_topics()
for topic_id, topic in topics:
    print(f"Topic #{topic_id}: {topic}")


Topic #0: 0.323*"אֲשֶׁר" + 0.287*"בֵּן" + 0.269*"כֹּל" + 0.244*"אֶרֶץ" + 0.228*"כִּי" + 0.172*"אֱלֹהִים" + 0.171*"בּוא" + 0.167*"לֹא" + 0.164*"אָב" + 0.142*"יַעֲקֹב"


In [17]:
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
   

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_text)

# Apply SVD to the TF-IDF matrix
n_components = 20 # Number of topics
lsa_model = TruncatedSVD(n_components=n_components, random_state=42)
lsa_topic_matrix = lsa_model.fit_transform(X)
# Print the top words for each topic
terms = vectorizer.get_feature_names_out()
for i, topic in enumerate(lsa_model.components_):
    top_terms_idx = topic.argsort()[:-11:-1]
    top_terms = [terms[idx] for idx in top_terms_idx]
    print(f"Topic {i}: {' | '.join(top_terms)}")



Topic 0: ים | קו | עו | או | מו | בו | לו | טו | נו | ימ
Topic 1: יו | יצא | אן | רך | או | בו | נש | לו | את | ית
Topic 2: יש | ית | מו | יצא | חו | חיה | יא | ידע | אן | או
Topic 3: וא | חיה | מו | יא | ידע | לח | אן | קום | חו | נש
Topic 4: ילד | קום | חו | טו | בו | עלה | ידע | יא | מצא | רו
Topic 5: יהוה | ידע | חו | קום | קו | יא | מו | עלה | נתן | רו
Topic 6: הו | ית | רך | יצא | או | עלה | בר | לו | אן | ירד
Topic 7: עש | טו | בר | עלה | הו | ית | נו | או | דו | יהוה
Topic 8: נתן | אן | רך | יצא | או | לו | ידע | נש | עו | ירד
Topic 9: ראה | מצא | יצא | אן | עו | רך | את | או | חו | יא
Topic 10: לקח | ראה | עלה | מצא | דו | עו | בר | רך | טו | ירד
Topic 11: עו | דו | קו | לח | וב | מו | מע | מות | חיה | אכל
Topic 12: דו | או | אן | רך | נש | יצא | לח | לו | את | קרא
Topic 13: קרא | קו | בו | חו | רו | טו | עלה | יא | חיה | קום
Topic 14: הלך | לח | וב | קו | יר | רך | אכל | בו | מצא | מות
Topic 15: יר | לו | או | חו | יא | עלה | יצא | טו | את | מצא
Topic 16: לו | קו | מע | בו | 

Topic 0: Sea | line | O | or | Mo | in it | him | Tu | Well | day
Topic 1: Yu | came out Ann | soft | or | in it | Nes | him | the | Yat
Topic 2: There is | y | Mo | came out Hu | Animal | 11 Knowledge Ann | or
Topic 3: And | Animal | Mo | 11 Knowledge Moist | Ann | Get up Hu | Nes
Topic 4: Child | Get up Hu | Tu | in it | rose | Knowledge 11 find | Ru
Topic 5: Yahweh Knowledge Hu | Get up line | 11 Mo | rose | Nathan Ru
Topic 6: Oh | y | soft | came out or | rose | bar | him | Ann | got off
Topic 7: Moth | Tu | bar | rose | Oh | y | Well | or | Do | Jehovah
Topic 8: Nathan | Ann | soft | came out or | him | Knowledge Nes | O | got off

In [18]:
import requests
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.manifold import TSNE
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation


# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_text)
# Apply NMF to the TF-IDF matrix
n_components = 20  # Number of topics
nmf_model = NMF(n_components=n_components, random_state=42)
nmf_topic_matrix = nmf_model.fit_transform(X)




In [19]:
    # Print the topics
    feature_names = vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(nmf_model.components_):
        print(f"Topic {topic_idx + 1}:")
        top_idx = topic.argsort()[:-11:-1]
        top_terms = [feature_names[i] for i in top_idx]
        print(", ".join(top_terms))

Topic 1:
ים, קו, מו, בו, טו, נו, ימ, יל, זו, יבש
Topic 2:
יו, חו, מות, יא, וב, לח, מו, נש, אכל, מע
Topic 3:
יש, ית, אש, טו, תר, יא, יב, יבם, יבש, יג
Topic 4:
וא, רך, וב, אכל, מצא, רו, עלה, טו, בו, ית
Topic 5:
ילד, מות, חו, יא, רך, וב, לח, מו, נש, אכל
Topic 6:
יהוה, חו, יא, רך, וב, לח, נש, מו, אכל, אן
Topic 7:
הו, ית, אש, מות, מו, מע, יד, דל, רה, נפל
Topic 8:
עש, מות, חו, יא, רך, וב, לח, מו, נש, אכל
Topic 9:
נתן, חו, מות, יא, רך, וב, לח, מו, נש, אכל
Topic 10:
ראה, חו, רך, בו, ירד, עבד, תר, יאל, יב, יבם
Topic 11:
לקח, מות, יא, רך, וב, לח, מו, נש, אכל, אן
Topic 12:
עו, ים, לו, חו, יא, רך, את, עבד, יכל, הרה
Topic 13:
דו, או, חו, יא, רך, מו, את, חיה, מצא, קום
Topic 14:
קרא, חו, מות, יא, רך, וב, לח, מו, נש, אכל
Topic 15:
הלך, מות, חו, יא, רך, וב, לח, אכל, מו, נש
Topic 16:
יר, או, חו, יא, את, עלה, קום, טו, ין, עבד
Topic 17:
לו, בו, מות, אכל, מע, אן, חיה, ידע, מצא, עלה
Topic 18:
בר, חו, מות, יא, לח, מו, נש, וב, אן, מע
Topic 19:
או, ים, מות, לו, יר, וב, לח, אכל, נש, מע
Topic 20:
יצא, קו, חו, יא