public
Description: Language Identification with Ruby: probabilistic language identification with ruby1.9
Homepage:
Clone URL: git://github.com/snifty/whatlang.git
whatlang / generate_models.rb
100644 33 lines (25 sloc) 0.52 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
#!/usr/bin/env ruby1.9
# coding: utf-8
require 'rubygems'
require 'hpricot'
 
# Parse the model index document with Hpricot.
# The block form of File.open closes the file handle as soon as the block
# returns, instead of leaving it open for the remainder of the script.
doc = File.open('models/index.xml') { |index| Hpricot(index) }
 
# Wraps a piece of text and exposes its character bigrams.
class Model
  # text - the String whose bigrams will be produced.
  def initialize(text)
    @text = text
  end

  # Produces every pair of adjacent characters in the text, in order.
  # For the text "house" the bigrams are 'ho', 'ou', 'us', 'se'.
  # Text shorter than two characters has no bigrams.
  #
  # With a block: yields each bigram String and returns nil.
  # Without a block: returns an Array of the bigram Strings (previously this
  # raised LocalJumpError because the method yielded unconditionally).
  def bigrams
    return to_enum(:bigrams).to_a unless block_given?

    @text.each_char.each_cons(2) { |pair| yield pair.join }
    # each_cons' own return value varies between Ruby versions; pin it to nil.
    nil
  end
end
 
# Smoke test: print every bigram of a sample sentence, one per line.
# bigrams hands each bigram to the block it is given, so the printing is done
# inside a block rather than by passing the method's return value to puts.
m = Model.new("the house is very large")
m.bigrams { |bigram| puts bigram }
 
# Dir.glob('models/udhr_*').each do |file|
# puts file
# end