In [1]:
from models.autofill import *
from models.ngram import *
from models.evaluate import *
from nltk.corpus import arcosg

In [2]:
# Split the corpus into training and test sentences, then print the full
# corpus sentence count followed by the combined size of the two splits so
# the two numbers can be compared.
training_sents, test_sents = split_data()
print(
    len(arcosg.sents()),
    len(training_sents) + len(test_sents),
    sep="\n",
)

10899
10790


In [3]:
# TrieModel built with its default arguments (no explicit word list).
trie = TrieModel()

# Run `evaluate` on two hand-written phrases, the first corpus sentence and
# the first held-out test sentence, printing each result on its own line.
for sample in (
    "tha mi a' goid",
    "a bheil thu a' faireachdainn math",
    arcosg.sents()[0],
    test_sents[0],
):
    print(evaluate(trie, sample))

(11, 8, 4, 4)
(28, 9, 6, 6)
(17, 8, 5, 5)
(17, 8, 5, 5)


In [4]:
# All sentences of the ARCOSG corpus; the demo models in the following cells
# are built from this full set (not from the train/test split).
sentences = arcosg.sents()

In [5]:
# Unigram model over the whole corpus.
unigram = UnigramModel(sents=sentences)
# Request 10 next-word suggestions; the second argument is an empty string
# (presumably "no context" — confirm against UnigramModel.next_nwords).
unigram.next_nwords(10, '')

['a', 'an', 'air', ',', 'e', '.', "a'", 'tha', 'agus', 'na']

In [6]:
# Bigram model over the whole corpus.
bigram = BigramModel(sents=sentences)
# Request 6 next-word suggestions following the single context word "a'".
bigram.next_nwords(6, ["a'"])

['dol', 'tighinn', 'feuchainn', 'ruith', 'faighinn', 'cur']

In [7]:
# Trigram model over the whole corpus.
trigram = TrigramModel(sents=sentences)
# Request 6 next-word suggestions following the two-word context "mi a'".
trigram.next_nwords(6, ['mi', "a'"])

['smaoineachadh', 'creidsinn', 'smaointinn', 'smaoineadh', 'ràdh', 'coimhead']

In [8]:
# Generic n-gram model of order 4 over the whole corpus.
fourgram = NgramModel(4, sents=sentences)
# Query once with a three-word context and once with a single-word context.
print(fourgram.next_nwords(6, ['mi', "a'", 'creidsinn']))
print(fourgram.next_nwords(10, ['tha']))

['ach', '[?]', 'gu', 'sia']
['e', 'mi', 'iad', 'an', 'i', 'sin', 'seo', 'sinn', 'esan', 'am']


In [9]:
# Linear-interpolation model with its default configuration, over the whole corpus.
lin_interp = LinearInterpolationModel(sents=sentences)
# Same two queries as used for the four-gram model, for side-by-side comparison.
print(lin_interp.next_nwords(6, ['mi', "a'", 'creidsinn']))
print(lin_interp.next_nwords(10, ['tha']))

['gu', 'ach', '[?]', 'sia', "a'", 'le']
['e', 'mi', 'iad', 'an', 'i', 'sin', 'seo', 'sinn', 'na', 'am']


In [10]:
# Backoff-interpolation model over the whole corpus, queried with the same
# two contexts as the linear-interpolation model.
# Bound to its own name: previously this reused `lin_interp`, which silently
# replaced the LinearInterpolationModel and mislabeled this one.
backoff_interp = BackoffInterpolationModel(sents=sentences)
print(backoff_interp.next_nwords(6, ['mi', "a'", 'creidsinn']))
print(backoff_interp.next_nwords(10, ['tha']))

['gu', 'ach', '[?]', 'sia', 'a', 'còig']
['e', 'mi', 'iad', 'an', 'i', 'sin', 'seo', '.', 'esan', 'sinn']


In [11]:
# Trie combined with the full-corpus bigram model built earlier.
ngram_trie = NgramTrieModel(ngram=bigram)
# Suggestions for the prefix 'm' without context ...
print(ngram_trie.suggestions('m'))
# ... for the prefix 'm' given the preceding words "tha mi a'" ...
print(ngram_trie.suggestions_context('m', ['tha', 'mi', "a'"]))
# ... and for an empty prefix given the same preceding words.
print(ngram_trie.suggestions_context('', ['tha', 'mi', "a'"]))

['mi', 'mar', 'math']
['moladh', 'mhòr-chuid', 'mhilennium']
['dol', 'tighinn', 'feuchainn']


In [12]:
# Re-run the train/test split used by the evaluation cells below.
# Alternative call kept from the original for reference:
#   split_data(test_files=['c01.txt'])
training_sents, test_sents = split_data()

# Flatten the training sentences into one list of tokens.
words = [token for sentence in training_sents for token in sentence]

In [13]:
# Trie over the training vocabulary, backed by a bigram model built from the
# training sentences only.
bigram_trie = NgramTrieModel(
    words=words,
    ngram=BigramModel(sents=training_sents),
)

# Run the evaluation harness for this one model.
result = test_model(
    bigram_trie,
    training_sents=training_sents,
    test_sents=test_sents,
)

# Shows the object's default repr (see recorded output); the individual
# metrics are printed in the next cell.
print(result)

1048/1048
<models.evaluate.Results object at 0x107b4ce50>


In [14]:
# Print, one per line: key ratio, word ratio, total time and average time
# of the single-model evaluation result.
print(
    result.get_key_ratio(),
    result.get_word_ratio(),
    result.time_total,
    result.avg_time(),
    sep="\n",
)

0.6110697227287745
0.6154918225831189
89.39919499999993
0.08530457538167932


In [15]:
# Trie-based models to compare. All share the training vocabulary `words`;
# the n-gram component of each is built from the training sentences only.
# Note: `trie` and `bigram_trie` rebind names assigned in earlier cells.
trie = TrieModel(words=words)
bigram_trie = NgramTrieModel(
    words=words,
    ngram=BigramModel(sents=training_sents),
)
trigram_trie = NgramTrieModel(
    words=words,
    ngram=TrigramModel(sents=training_sents),
)
fourgram_trie = NgramTrieModel(
    words=words,
    ngram=NgramModel(4, sents=training_sents),
)
iterp_trie = NgramTrieModel(
    words=words,
    ngram=LinearInterpolationModel(sents=training_sents),
)
back_trie = NgramTrieModel(
    words=words,
    ngram=BackoffInterpolationModel(sents=training_sents),
)

# Order matters: the reporting cell pairs results with display names by position.
trie_models = [
    trie,
    bigram_trie,
    trigram_trie,
    fourgram_trie,
    iterp_trie,
    back_trie,
]

In [16]:
# Evaluate every trie-based model on the same train/test split.
# Slow: the recorded per-model totals in the report below range from ~48 to ~1500.
trie_results = test_models(
    trie_models,
    training_sents=training_sents,
    test_sents=test_sents,
)

1048/1048 model: 5


In [17]:
# Display names, listed in the same order as the models in `trie_models`.
trie_model_names = [
    "TrieModel",
    "BigramTrie",
    "TrigramTrie",
    "FourgramTrie",
    "LinInterpTrie",
    "BackInterpTrie",
]

# Pair each result with its name by position.
# NOTE(review): assumes trie_results.values() iterates in model order — confirm
# against test_models.
# Per model: name, key ratio, word ratio, total time, average time, blank line.
for res, model_name in zip(trie_results.values(), trie_model_names):
    print(
        model_name,
        res.get_key_ratio(),
        res.get_word_ratio(),
        res.time_total,
        res.avg_time(),
        sep="\n",
        end="\n\n",
    )

TrieModel
0.5552727647275361
0.9135377120309013
633.6574779999996
0.6046349980916027

BigramTrie
0.6110697227287745
0.6154918225831189
75.9643469999983
0.07248506393129608

TrigramTrie
0.7361147455367812
0.4095007745435491
47.6790310000067
0.04549525858779265

FourgramTrie
0.7908396876450589
0.33978820081636496
47.9523580000009
0.04575606679389399

LinInterpTrie
0.4523174327320947
0.9108597545265239
1523.588838
1.4538061431297709

BackInterpTrie
0.463112340659277
0.910204432422875
1499.0921649999975
1.4304314551526693



In [18]:
# Minimal-DFA counterparts of the trie-based models: same vocabulary, same
# n-gram components built from the training sentences.
dfa = MinimalDFAModel(words=words)
bigram_dfa = NgramMinimalDFAModel(
    words=words,
    ngram=BigramModel(sents=training_sents),
)
trigram_dfa = NgramMinimalDFAModel(
    words=words,
    ngram=TrigramModel(sents=training_sents),
)
fourgram_dfa = NgramMinimalDFAModel(
    words=words,
    ngram=NgramModel(4, sents=training_sents),
)
iterp_dfa = NgramMinimalDFAModel(
    words=words,
    ngram=LinearInterpolationModel(sents=training_sents),
)
back_dfa = NgramMinimalDFAModel(
    words=words,
    ngram=BackoffInterpolationModel(sents=training_sents),
)

# Order matters: the reporting cell pairs results with display names by position.
dfa_models = [
    dfa,
    bigram_dfa,
    trigram_dfa,
    fourgram_dfa,
    iterp_dfa,
    back_dfa,
]

In [38]:
# Evaluate every minimal-DFA model on the same train/test split as the tries.
dfa_results = test_models(
    dfa_models,
    training_sents=training_sents,
    test_sents=test_sents,
)


1048/1048 model: 5


In [39]:
# Display names, listed in the same order as the models in `dfa_models`.
dfa_model_names = [
    "MinimalDFAModel",
    "BigramMinimalDFA",
    "TrigramMinimalDFA",
    "FourgramMinimalDFA",
    "LinInterpMinimalDFA",
    "BackInterpMinimalDFA",
]

# Pair each result with its name by position.
# NOTE(review): assumes dfa_results.values() iterates in model order — confirm
# against test_models.
# Per model: name, key ratio, word ratio, total time, average time, blank line.
for res, model_name in zip(dfa_results.values(), dfa_model_names):
    print(
        model_name,
        res.get_key_ratio(),
        res.get_word_ratio(),
        res.time_total,
        res.avg_time(),
        sep="\n",
        end="\n\n",
    )

MinimalDFAModel
0.5553690408527885
0.9135377120309013
505.34737899998345
0.48220169751906816

BigramMinimalDFA
0.6110455890097779
0.6156826622777753
61.4443860000174
0.058630139312993705

TrigramMinimalDFA
0.7367395149334479
0.4096916142382056
37.69783699994787
0.035971218511400636

FourgramMinimalDFA
0.7911012605747475
0.33997904051102146
37.945648999973855
0.0362076803434865

LinInterpMinimalDFA
0.45291689574210087
0.9110505942211804
1327.9304170000005
1.267109176526718

BackInterpMinimalDFA
0.4631755355940217
0.9103952721175314
1243.8826079999617
1.1869108854961468



In [40]:
def _ngram_models(max_order):
    """Build one n-gram model per order 1..max_order from `training_sents`.

    Orders 1-3 use the dedicated Unigram/Bigram/Trigram classes and higher
    orders use NgramModel(order, ...), matching the hand-written lists this
    helper replaces. The first three models are always included, so callers
    are expected to pass max_order >= 3.
    """
    models = [
        UnigramModel(sents=training_sents),
        BigramModel(sents=training_sents),
        TrigramModel(sents=training_sents),
    ]
    models.extend(
        NgramModel(order, sents=training_sents)
        for order in range(4, max_order + 1)
    )
    return models


def _interp_dfa(models, weights):
    """Wrap a LinearInterpolationModel over `models`/`weights` in an NgramMinimalDFAModel.

    `weights` maps n-gram order -> interpolation weight. The vocabulary
    (`words`) and sentences (`training_sents`) are read from the notebook
    globals at call time.
    """
    return NgramMinimalDFAModel(
        words=words,
        ngram=LinearInterpolationModel(
            models=models,
            weights=weights,
            sents=training_sents,
        ),
    )


# Component models up to order 5 and up to order 7. As before, the two lists
# hold separate instances, and each list is shared by the variants that use it.
models5 = _ngram_models(5)
models7 = _ngram_models(7)

# The weight dicts below are the single source of truth for each variant
# (each sums to 1). The duplicated trailing comments were dropped because one
# had drifted from the code (it read 2:0.007 where the code passes 2:0.07).

# Default configuration; the original note lists its weights as 1:0.1 2:0.4 3:0.5.
iterp_dfa1 = NgramMinimalDFAModel(
    words=words,
    ngram=LinearInterpolationModel(sents=training_sents),
)
# Five-gram, moderately weighted towards higher orders.
iterp_dfa2 = _interp_dfa(models5, {1: 0.03, 2: 0.07, 3: 0.2, 4: 0.3, 5: 0.4})
# Five-gram, heavily weighted towards the highest order.
iterp_dfa3 = _interp_dfa(models5, {1: 0.0001, 2: 0.0099, 3: 0.09, 4: 0.2, 5: 0.7})
# Seven-gram, moderately weighted towards higher orders.
iterp_dfa4 = _interp_dfa(
    models7,
    {1: 0.003, 2: 0.007, 3: 0.05, 4: 0.14, 5: 0.2, 6: 0.25, 7: 0.35},
)
# Seven-gram, heavily weighted towards the highest order.
iterp_dfa5 = _interp_dfa(
    models7,
    {1: 0.0001, 2: 0.0049, 3: 0.015, 4: 0.04, 5: 0.08, 6: 0.16, 7: 0.7},
)

# Order matters: the reporting cell pairs results with display names by position.
interp_models = [iterp_dfa1, iterp_dfa2, iterp_dfa3, iterp_dfa4, iterp_dfa5]

In [41]:
# Evaluate the five linear-interpolation variants on the same train/test split.
interp_results = test_models(
    interp_models,
    training_sents=training_sents,
    test_sents=test_sents,
)

1048/1048 model: 4


In [42]:
# Display labels, listed in the same order as the models in `interp_models`.
# Each label repeats the weights the corresponding model was constructed with.
# The second label previously read "2:0.007", but iterp_dfa2 is built with
# weight 0.07 for order 2, so the label is corrected to match the code.
interp_names = [
    "LinInterpMinimalDFA - Default TrigramModel\nweights: 1:0.1   2:0.4   3:0.5",
    "LinInterpMinimalDFA - FivegramModel\nweights: 1:0.03   2:0.07    3:0.2  4:0.3   5:0.4",
    "LinInterpMinimalDFA - FivegramModel\nweights: 1:0.0001   2:0.0099    3:0.09  4:0.2   5:0.7",
    "LinInterpMinimalDFA - SevengramModel\nweights: 1:0.003    2:0.007   3:0.05   4:0.14 5:0.2   6:0.25  7:0.35",
    "LinInterpMinimalDFA - SevengramModel\nweights: 1:0.0001   2:0.0049   3:0.015   4:0.04  5:0.08   6:0.16  7:0.7"]

# Pair each result with its label by position.
# NOTE(review): assumes interp_results.values() iterates in model order — confirm
# against test_models.
# Per model: label, key ratio, word ratio, total time, average time, blank line.
for r, n in zip(interp_results.values(),interp_names):
    print(n)
    print(r.get_key_ratio())
    print(r.get_word_ratio())
    print(r.time_total)
    print(r.avg_time())
    print()

LinInterpMinimalDFA - Default TrigramModel
weights: 1:0.1   2:0.4   3:0.5
0.45291689574210087
0.9110505942211804
1549.255025999977
1.4782967805343292

LinInterpMinimalDFA - FivegramModel
weights: 1:0.03   2:0.007    3:0.2  4:0.3   5:0.4
0.45406593176526
0.9100721277131336
1668.3995440000217
1.5919842977099443

LinInterpMinimalDFA - FivegramModel
weights: 1:0.0001   2:0.0099    3:0.09  4:0.2   5:0.7
0.45412014428009045
0.9097211189890333
1827.9531839999981
1.7442301374045783

LinInterpMinimalDFA - SevengramModel
weights: 1:0.003    2:0.007   3:0.05   4:0.14 5:0.2   6:0.25  7:0.35
0.45416224725342696
0.9099767078658054
1846.902947999968
1.7623119732824124

LinInterpMinimalDFA - SevengramModel
weights: 1:0.0001   2:0.0049   3:0.015   4:0.04  5:0.08   6:0.16  7:0.7
0.4541021405353115
0.9098165388363615
1908.3542290000187
1.820948691793911

