In [1]:
import requests
import json
import re
import collections

In [2]:
def strip_non_hebrew(s):
    """Return `s` reduced to the Hebrew letters א-ת and plain spaces.

    The maqaf ('־') is first turned into a space so the words it joins stay
    separated; every remaining character outside the א-ת range that is not
    a space (vowel points, Latin text, punctuation, ...) is then removed.
    """
    spaced = s.replace('־', ' ')
    return re.sub(r'[^א-ת ]', '', spaced)

In [3]:
def find_lex(word):
    """Look up the lexemes the Tanach search service associates with `word`.

    Posts `word` to the `/lexemes` endpoint and reads the `allLexemes` list
    of the first entry in the JSON response. Each lexeme string is cleaned
    with `strip_non_hebrew` and duplicates are dropped.

    Args:
        word: the query string sent to the service.

    Returns:
        A list of distinct cleaned lexeme strings, in arbitrary order
        (de-duplication goes through a set). Empty if the service returns
        no entries.

    Raises:
        RuntimeError: if the service answers with a status other than 200.
    """
    search_url = 'https://tanach-search-2-2.loadbalancer.dicta.org.il/lexemes'
    data = {
        "from": 0,
        "size": 10000,
        "query": word,
    }
    r = requests.post(search_url, json=data)
    # Fail loudly here: previously a non-200 reply left `roots` unassigned
    # and surfaced as an unrelated UnboundLocalError further down.
    if r.status_code != 200:
        raise RuntimeError(
            'lexeme lookup failed with HTTP status {}'.format(r.status_code)
        )

    roots = r.json()
    if not roots:
        return []

    return list({
        strip_non_hebrew(entry['lexeme'])
        for entry in roots[0]['allLexemes']
    })

In [4]:
def search(s,num,book):
    """Run a text-analysis search against the Tanach or Talmud service.

    Args:
        s: the query string.
        num: value sent as the request's `size` field.
        book: which corpus to query; must be 'tanach' or 'talmud'.

    Returns:
        The decoded JSON response when the service answers with status 200,
        otherwise None.

    Raises:
        ValueError: if `book` is not one of the supported corpora.
    """
    endpoints = {
        'tanach': 'https://tanach-search-2-2.loadbalancer.dicta.org.il/textAnalysis',
        'talmud': 'https://talmud-search.loadbalancer.dicta.org.il/textAnalysis',
    }
    # Validate up front: an unrecognised corpus used to leave the URL
    # unassigned and crash with UnboundLocalError.
    if book not in endpoints:
        raise ValueError(
            'unknown book {!r}; expected one of {}'.format(book, sorted(endpoints))
        )

    data = {
        "from": 0,
        "size": num,
        "query": s,
    }
    r = requests.post(endpoints[book], json=data)
    if r.status_code == 200:
        return r.json()
    return None

In [5]:
# Demo query: search term, requested result size, and corpus.
word, size, book = 'פיקח', 200, 'talmud'
r_dict = search(word, size, book)

In [6]:
r_dict

[[[{'word': 'אי',
    'lexeme': 'אִי (Conjunction) ',
    'lexemeCode': 'אִי',
    'morphology': 'conj, aramaic',
    'morphologyCode': 'ffffffffffffffffffffftffffffffffffffffffffffffffffffftffffffffffffffffffffffff'},
   {'word': 'הכי',
    'lexeme': 'הָכִי (Adverb) ',
    'lexemeCode': 'הָכִי',
    'morphology': 'adv, aramaic',
    'morphologyCode': 'fffffftfffffffffffffffffffffffffffffffffffffffffffffftffffffffffffffffffffffff'},
   {'word': 'אימא',
    'lexeme': 'אמר (Verb) ',
    'lexemeCode': 'אמר[',
    'morphology': 'sing, verb, paal, fut, mas, per1, aramaic',
    'morphologyCode': 'ffffftfftffftffffffffftfffffffffffftffffffftffffffffftffffffffffffffffffffffff'},
   {'word': 'סיפא',
    'lexeme': 'סֵיפָא (Noun) ',
    'lexemeCode': 'סֵיפָא/',
    'morphology': 'noun, sing, mas, abs',
    'morphologyCode': 'ffffttffffffffffffffffffffffffffffftfftfffffffffffffffffffffffffffffffffffffff'},
   {'word': 'אם',
    'lexeme': 'אִם (Conjunction) ',
    'lexemeCode': 'אִם',
    'morpholo

In [7]:
def get_wordforms(str):
    """Fetch the word forms the Talmud search service lists for a query.

    Posts the query to the `/wordforms` endpoint, prints the HTTP status
    code, and reads the `wordForms` list of the first entry in the JSON
    response. Each `wordForm` string is cleaned with `strip_non_hebrew`.

    Args:
        str: the query string. The name shadows the builtin; it is kept so
            existing keyword-argument callers continue to work.

    Returns:
        A list of cleaned word-form strings in the order the service
        returned them. Empty if the service returns no entries.

    Raises:
        RuntimeError: if the service answers with a status other than 200.
    """
    query = str
    r_wordforms = requests.post(
        'https://talmud-search.loadbalancer.dicta.org.il/wordforms',
        json={
            "query": query
        }
    )
    print(r_wordforms.status_code)

    # Fail loudly here: previously a non-200 reply left the parsed response
    # unassigned and surfaced as an UnboundLocalError on the next line.
    if r_wordforms.status_code != 200:
        raise RuntimeError(
            'word-form lookup failed with HTTP status {}'.format(
                r_wordforms.status_code
            )
        )

    r_wordforms_dict = r_wordforms.json()
    if not r_wordforms_dict:
        return []

    return [
        strip_non_hebrew(entry['wordForm'])
        for entry in r_wordforms_dict[0]['wordForms']
    ]

In [8]:
def top_words_lex(word,size,book):
    """Find the lexeme that most often co-occurs with `word` in search results.

    Runs `search(word, size, book)`, walks every token of every result, and
    counts tokens by their `lexeme` field. A token is skipped when its
    `lexemeCode`, cleaned with `strip_non_hebrew`, is a stop word or one of
    the word forms `get_wordforms(word)` reports for the query itself.

    Args:
        word: the query string.
        size: number of results to request from the search service.
        book: corpus name passed through to `search`.

    Returns:
        A 2-tuple `(top, surface_forms)`. `top` is the `(lexeme, count)`
        pair of the most frequent remaining lexeme, and `surface_forms` is
        the list of distinct `word` values of tokens carrying that lexeme,
        in arbitrary order. When there are no results, or every token is
        filtered out, returns `(None, [])`.

    Raises:
        RuntimeError: if `search` returns None (the service did not answer
            with status 200).
    """
    stop_words = {'את','אל','כי','על','עם','לא','ואת','ויהי','ולא','עד','לך','לפני','לי','שם','אתה','הזה','הוא','אם','גם','מה','לו','היה','או','מאי','אי','הכי','אמר','רב','רבי','הא','נמי','מי','הרי','זה','היי','כל','של','בן','ל'}

    r_dict = search(word, size, book)
    if r_dict is None:
        raise RuntimeError(
            'search for {!r} in {!r} returned no response'.format(word, book)
        )

    # Set membership keeps the per-token filter cheap.
    word_forms = set(get_wordforms(word))

    # Flatten result -> mini-result -> token once; both passes below use it.
    tokens = [
        token
        for result in r_dict
        for mini_result in result
        for token in mini_result
    ]

    # The counter exists before the loop so an empty result set is handled.
    lexeme_counts = collections.Counter()
    for token in tokens:
        lex_code = strip_non_hebrew(token['lexemeCode'])
        if lex_code not in stop_words and lex_code not in word_forms:
            lexeme_counts[token['lexeme']] += 1

    if not lexeme_counts:
        return (None, [])

    top_entry = lexeme_counts.most_common(1)[0]
    top_lexeme = top_entry[0]

    # Distinct surface forms that were analysed as the top lexeme.
    list_of_tops = list({
        token['word'] for token in tokens if token['lexeme'] == top_lexeme
    })

    return (top_entry, list_of_tops)
                
        

In [9]:
# Demo: most frequent co-occurring lexeme for the query in the Talmud corpus.
word, size, book = 'פיקח', 200, 'talmud'
top_words_lex(word, size, book)

200


(('חֵרֵשׁ (Noun) ', 104),
 ['לחרש', 'חרשין', 'וחרש', 'חרש', 'אחרשין', 'החרש', 'חֵרֵשׁ', 'מחרש', 'בחרש'])