# How main works

In [331]:
import os
import sys
import dictionary; reload(dictionary)
import argparse
from ordbog import print_results, dictionaries

The 'dictionaries' we have imported is a dictionary taking as key a short-hand name of a language (fx en for English). The corresponding value is another dictionary with some data.

In [322]:
dictionaries

OrderedDict([('en',
              {'datfile': 'data/EngelskOrdbog.dat',
               'doubflag': 3,
               'gddfile': 'data/EngelskOrdbog.gdd',
               'name': 'Engelsk'}),
             ('de',
              {'datfile': 'data/TyskOrdbog.dat',
               'doubflag': 3,
               'gddfile': 'data/TyskOrdbog.gdd',
               'name': 'Tysk'}),
             ('fr',
              {'datfile': 'data/FranskOrdbog.dat',
               'doubflag': 3,
               'gddfile': 'data/FranskOrdbog.gdd',
               'name': 'Fransk'}),
             ('se',
              {'datfile': 'data/SvenskOrdbog.dat',
               'doubflag': 3,
               'gddfile': 'data/SvenskOrdbog.gdd',
               'name': 'Svensk'}),
             ('no',
              {'datfile': 'data/NorskDownload.dat',
               'doubflag': 2,
               'gddfile': 'data/NorskDownload.gdd',
               'name': 'Norsk'}),
             ('enfag',
              {'datfile': 'data/FagordbogEng

To look up a word, we must provide three variables. First, the language (shortened form). Second, whether we want to translate from Danish (0), to Danish (1) or both ways (2). Not all three options are possible for all the dictionaries. The 'doubflag' variable provides this info. Lastly, we need to give a list of the search terms we want to look up.

In [323]:
language = 'en'
translate = 2
search_terms = ['fundere']

We can translate the shortened language name to the full one

In [34]:
language_name = dictionaries[language]['name']
print(language_name)

Engelsk


We also keep a list of the translation direcitons

In [37]:
directions = [('fromDanish', 'Dansk-%s' % language_name), ('toDanish', '%s-Dansk' % language_name)]

if dictionaries[language]['doubflag'] < 2 or translate == 0:
    del directions[1]
elif translate == 1:
    del directions[0]
print(directions)

[('fromDanish', 'Dansk-Engelsk'), ('toDanish', 'Engelsk-Dansk')]


And a list of the type of lookups we want

In [38]:
tables = [('lookup', 'Opslagsord'), ('collocation_lookup', 'Ordforbindelser')]
if dictionaries[language]['doubflag'] == 1  or dictionaries[language]['doubflag'] == 3:
    tables.append(('reverse', 'Resultater'))
print(tables)

[('lookup', 'Opslagsord'), ('collocation_lookup', 'Ordforbindelser'), ('reverse', 'Resultater')]


Next we initialise a dictionary instance

In [332]:
dic = dictionary.Dictionary(dictionaries)

In [333]:
dic.spell_suggestions('spaed', True, 'lookup', 'en')

{u'spade', u'spand', u'speed'}

And use it to look up our search terms

In [41]:
search_results = dic.lookup(search_terms, language)

The result is the following, not easily readable dictionary

In [42]:
search_results

{'fromDanish': {'collocation_lookup': ['<div></div><h3>fundere over</h3><div>(gruble)</div><div>ponder on</div> ',
   '<div></div><h3>fundere p\xc3\xa5</h3><div>(basere)</div><div>base on</div> ',
   '<div></div><h3>d\xc3\xa5rligt funderet i historie</h3><div>poorly read in history</div> '],
  'lookup': ['<div><h2>fundere <font color="#605A50"><i>vb. </i></font></h2><div>(funderer; funderede; funderet)</div></div><h3>fundere<font color="#888888"> <i>vb.</i></font></h3><ol><li><div>(gruble)</div><div>ponder</div></li><li><div>(basere)</div><div>found</div><div>base</div></li></ol> '],
  'reverse': ['<div><h2>muse <font color="#605A50"><i>vb. </i></font></h2><div>[mju:z] <a href="sound://10045:m102316">[LYD]</a></div></div><h3>muse<font color="#888888"> <i>vb.</i></font></h3><div>gruble</div><div>grunde</div><div>fundere</div><div>spekulere</div> ',
   '<div><h2>ponder <font color="#605A50"><i>vb. </i></font></h2><div>[\xcb\x88p\xc9\x94nd\xc9\x99] <a href="sound://10045:p102461">[LYD]</a

We can use our print function to make the result more readable

In [43]:
print_results(search_results, search_terms, directions, tables, language)



[107m[1m[30mOpslagsord fra Dansk-Engelsk[0m[0m[0m


[34mfundere [0m[2m[3mvb. [0m[0m[2m(funderer; funderede; funderet)[0m
1. [2m(gruble)[0m ponder
2. [2m(basere)[0m found; base



[107m[1m[30mOrdforbindelser fra Dansk-Engelsk[0m[0m[0m


[34mfundere over [0m[2m(gruble)[0m ponder on

[34mfundere på [0m[2m(basere)[0m base on

[34mdårligt funderet i historie [0mpoorly read in history



[107m[1m[30mResultater fra Engelsk-Dansk[0m[0m[0m


[34mmuse [0m[2m[3mvb. [0m[0m[37m[mju:z][0m gruble; grunde; fundere; spekulere

[34mponder [0m[2m[3mvb. [0m[0m[37m[ˈpɔndə][0m 
1. overveje; fundere over; grunde over
2. [2m(uden objekt)[0m overveje; fundere; spekulere



# Looking up a term

In [44]:
import sys, os
import sqlite3
import re
from array import array
import groparser

In [45]:
search_terms = ['fundere']

In [46]:
# make * and . wildcard letters;  strip ' characters to avoid SQL injections
search_terms = [s.replace('*', '%').replace('\'', ' ').replace('.', '%')
                for s in search_terms]

In [47]:
search_terms

['fundere']

In [50]:
# open data files
db_path = dictionaries[language]['gddfile']
dat_path = dictionaries[language]['datfile']
dat_file = open(dat_path, 'rb')
db = sqlite3.connect(db_path)

In [66]:
cursor = db.execute('select * from lookup1')
names = [description[0] for description in cursor.description]
names

['entry_id_', 'word_']

In [67]:
cursor = db.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cursor.fetchall())

[(u'info',), (u'dict_setup',), (u'sound',), (u'entries1',), (u'entries2',), (u'lookup1',), (u'collocation_lookup1',), (u'reverse1',), (u'lookup2',), (u'collocation_lookup2',), (u'reverse2',)]


In [57]:
fromDanish = 1
first_term = True
table = 'lookup'

In [58]:
first_term = True
for term in search_terms:
    rows = list(db.execute('select * from %s%i where word_ like \'%s\''
                           % (table, fromDanish, term)))
    term_entry_ids = [r[0] for r in rows]
    if first_term:
        first_term = False
        entry_ids = term_entry_ids
    else:
        entry_ids = set(term_entry_ids) & set(entry_ids)

In [90]:
list(db.execute("select * from lookup1 where word_ like 'funder%'"))

[(23947, u'fundere'),
 (165705, u'funderer'),
 (165706, u'funderede'),
 (165707, u'funderende'),
 (165708, u'funderendes'),
 (165709, u'funderet')]

In [92]:
rows

[(23947, 0, 23947, 6834703, 344)]

Hmm, virker som dobbeltarbejde at slå op igen? Informationen er jo allerede hentet i rows.

In [68]:
entries = []
for entry_id in entry_ids:
    rows = list(db.execute('select * from entries%i where id_ = %d'
                           % (fromDanish, entry_id)))
    for _, entry_type, link_id, offset, nbyte in rows:
        entries.append((entry_id, offset, nbyte))

In [69]:
entries

[(23947, 6834703, 344)]

In [72]:
def extractFromFile(f, offset, nbyte):
    f.seek(offset)
    data = array('B')
    data.fromfile(f, nbyte)
    return data

In [73]:
raw_entries = []
for entry_id, offset, nbyte in entries:
    data = extractFromFile(dat_file, offset, nbyte)
    raw_entry = groparser.parse_entry(data, entry_id, offset, nbyte)
    raw_entries.append(raw_entry.split('\0')[-2])

In [74]:
raw_entries

['<div><h2>fundere <font color="#605A50"><i>vb. </i></font></h2><div>(funderer; funderede; funderet)</div></div><h3>fundere<font color="#888888"> <i>vb.</i></font></h3><ol><li><div>(gruble)</div><div>ponder</div></li><li><div>(basere)</div><div>found</div><div>base</div></li></ol> ']

In [94]:
raw_entry

'\x00\x10\x00\x14\x00\x15\x00?fundere\x00vb.\x00\x001 (gruble) ponder; 2 (basere) found; base\x00<div><h2>fundere <font color="#605A50"><i>vb. </i></font></h2><div>(funderer; funderede; funderet)</div></div><h3>fundere<font color="#888888"> <i>vb.</i></font></h3><ol><li><div>(gruble)</div><div>ponder</div></li><li><div>(basere)</div><div>found</div><div>base</div></li></ol> \x00'

# Spelling Corrector

In [156]:
import re
from collections import Counter

def words(text): return re.findall(r'\w+', text.lower())

#WORDS = Counter(words(open('big.txt').read()))

#def P(word, N=sum(WORDS.values())): 
#    "Probability of `word`."
#    return WORDS[word] / N

def correction(word): 
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)

def candidates(word): 
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or [word] or known(edits2(word)) )

def known1(term):
    return bool(list(db.execute("select * from lookup1 where word_ like '%s'" % term)))

def known(terms):
    return set(t for t in terms if known1(t))

def known_old(words): 
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)

def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = 'abcdefghijklmnopqrstuvwxyzæøå'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word): 
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

In [157]:
word = 'blp'

In [169]:
candidates(word)

{'bap', 'bip', 'bl', 'bla', 'ble', 'blu', 'bly', 'bnp'}

In [159]:
edits1(word)

{'ablp',
 'alp',
 'balp',
 'bap',
 'bblp',
 'bbp',
 'bclp',
 'bcp',
 'bdlp',
 'bdp',
 'belp',
 'bep',
 'bflp',
 'bfp',
 'bglp',
 'bgp',
 'bhlp',
 'bhp',
 'bilp',
 'bip',
 'bjlp',
 'bjp',
 'bklp',
 'bkp',
 'bl',
 'bla',
 'blap',
 'blb',
 'blbp',
 'blc',
 'blcp',
 'bld',
 'bldp',
 'ble',
 'blep',
 'blf',
 'blfp',
 'blg',
 'blgp',
 'blh',
 'blhp',
 'bli',
 'blip',
 'blj',
 'bljp',
 'blk',
 'blkp',
 'bll',
 'bllp',
 'blm',
 'blmp',
 'bln',
 'blnp',
 'blo',
 'blop',
 'blp',
 'blpa',
 'blpb',
 'blpc',
 'blpd',
 'blpe',
 'blpf',
 'blpg',
 'blph',
 'blpi',
 'blpj',
 'blpk',
 'blpl',
 'blpm',
 'blpn',
 'blpo',
 'blpp',
 'blpq',
 'blpr',
 'blps',
 'blpt',
 'blpu',
 'blpv',
 'blpw',
 'blpx',
 'blpy',
 'blpz',
 'blp\xa5',
 'blp\xa6',
 'blp\xb8',
 'blp\xc3',
 'blq',
 'blqp',
 'blr',
 'blrp',
 'bls',
 'blsp',
 'blt',
 'bltp',
 'blu',
 'blup',
 'blv',
 'blvp',
 'blw',
 'blwp',
 'blx',
 'blxp',
 'bly',
 'blyp',
 'blz',
 'blzp',
 'bl\xa5',
 'bl\xa5p',
 'bl\xa6',
 'bl\xa6p',
 'bl\xb8',
 'bl\xb8p',
 'bl\

In [219]:
list(db.execute("SELECT * FROM lookup1 WHERE word_ LIKE '_lp' or word_ LIKE 'b_p' or word_ LIKE 'bl_' or word_ LIKE '_blp' or word_ LIKE 'b_lp' or word_ LIKE 'bl_p'"))

[(4914, u'bap'),
 (7795, u'bip'),
 (7968, u'bla'),
 (8077, u'ble'),
 (8507, u'blu'),
 (8537, u'bly'),
 (8539, u'bly'),
 (8693, u'bl\xe5'),
 (8736, u'bnp')]

In [334]:
directions

[('fromDanish', 'Dansk-Engelsk'), ('toDanish', 'Engelsk-Dansk')]

In [173]:
word = "berete"

In [175]:
splits = [(word[:i], word[i:])    for i in range(len(word) + 1)]

In [176]:
splits

[('', 'berete'),
 ('b', 'erete'),
 ('be', 'rete'),
 ('ber', 'ete'),
 ('bere', 'te'),
 ('beret', 'e'),
 ('berete', '')]

In [177]:
inserts = [L + '_' + R for L, R in splits]

In [178]:
inserts

['_berete', 'b_erete', 'be_rete', 'ber_ete', 'bere_te', 'beret_e', 'berete_']

In [180]:
replaces = [L + '_' + R[1:] for L, R in splits if R]

In [181]:
replaces

['_erete', 'b_rete', 'be_ete', 'ber_te', 'bere_e', 'beret_']

In [182]:
deletes    = [L + R[1:] for L, R in splits if R]
transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]

In [183]:
deletes

['erete', 'brete', 'beete', 'berte', 'beree', 'beret']

In [184]:
transposes

['ebrete', 'breete', 'beerte', 'bertee', 'bereet']

In [189]:
' OR '.join(['word_ LIKE ' + w for w in inserts])

'word_ LIKE _berete OR word_ LIKE b_erete OR word_ LIKE be_rete OR word_ LIKE ber_ete OR word_ LIKE bere_te OR word_ LIKE beret_e OR word_ LIKE berete_'

In [193]:
def edits1(word):
    "All edits that are one edit away from `word`."
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + '_' + R[1:]           for L, R in splits if R]
    inserts    = [L + '_' + R               for L, R in splits]
    return set(deletes + transposes + replaces + inserts)

In [194]:
edits1(word)

{'_berete',
 '_erete',
 'b_erete',
 'b_rete',
 'be_ete',
 'be_rete',
 'beerte',
 'beete',
 'ber_ete',
 'ber_te',
 'bere_e',
 'bere_te',
 'beree',
 'bereet',
 'beret',
 'beret_',
 'beret_e',
 'berete_',
 'berte',
 'bertee',
 'breete',
 'brete',
 'ebrete',
 'erete'}

In [312]:
word = "dearthh"

In [313]:
ss1 = " OR ".join(["word_ LIKE '%s'" % w for w in edits1(word)])

In [318]:
list(db.execute("SELECT * FROM reverse2 WHERE " + ss1))

[]

In [302]:
suggestions = set(w for _,w in list(db.execute("SELECT * FROM lookup2 WHERE " + ss1)))

In [303]:
suggestions

{u'definitely'}

In [298]:
table

'lookup'

In [319]:
tables

[('lookup', 'Opslagsord'),
 ('collocation_lookup', 'Ordforbindelser'),
 ('reverse', 'Resultater')]

In [320]:
directions

[('fromDanish', 'Dansk-Engelsk'), ('toDanish', 'Engelsk-Dansk')]

In [341]:

def generate_spell_suggestions(dic, word, directions, tables, language):
    suggestions = set()
    for d, _ in directions:
        fromDanish = (d == "fromDanish")
        for t, _ in tables:
            suggestions = suggestions.union(dic.spell_suggestions(word, fromDanish, t, language))
    return suggestions

In [344]:
generate_spell_suggestions(dic, "spaed", directions, tables[:1], 'en')

{u'spaced', u'spade', u'spand', u'sped', u'speed'}

In [336]:
directions

[('fromDanish', 'Dansk-Engelsk'), ('toDanish', 'Engelsk-Dansk')]

In [343]:
tables[:1]

[('lookup', 'Opslagsord')]

In [346]:
directions

[('fromDanish', 'Dansk-Engelsk'), ('toDanish', 'Engelsk-Dansk')]

In [351]:
res = dic.lookup(['fdfd'], 'en')

In [358]:
res.values()[0].values()

[[], [], []]

In [374]:
not any([e for f in [d.values() for d in res.values()] for e in f])

True

In [369]:
any([[], [], [[],[]]])

True

In [377]:
any([any(d.values()) for d in res.values()])

False