-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcorpus.rb
52 lines (42 loc) · 961 Bytes
/
corpus.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
require 'csv'
require 'set'
require 'roo'
require './doc'
# A collection of documents together with the corpus-wide statistics needed
# for IDF weighting: how many documents each term was seen in, and the set of
# all terms seen so far.
#
# Documents are added through #read_csv or #read_xsl. Each one is wrapped in a
# Doc whose first constructor argument is the number of documents accumulated
# before it (presumably used as the document id -- confirm against Doc).
class Corpus
  attr_reader :docs, :doc_count

  # Creates an empty corpus: no documents and a document count of zero.
  def initialize
    @doc_count = 0
    @docs = []
  end

  # Loads documents from a CSV file.
  #
  # Assumes the corpus comes as a single-column CSV with one raw document per
  # row; only the first column of each row is read. Using CSV instead of raw
  # lines allows for multi-line documents.
  #
  # @param path [String] path to the CSV file
  def read_csv(path)
    CSV.foreach(path) do |row|
      accumulate(Doc.new(@doc_count, row[0]))
    end
  end

  # Loads documents from a spreadsheet opened through Roo, taking the first
  # cell of each streamed row (converted with #to_s) as the raw document.
  #
  # NOTE(review): the method name is spelled `read_xsl` in the existing
  # interface and is kept as-is so current callers keep working.
  #
  # @param path [String] path to the spreadsheet file
  def read_xsl(path)
    xlsx = Roo::Spreadsheet.open(path)
    xlsx.each_row_streaming do |row|
      accumulate(Doc.new(@doc_count, row[0].to_s))
    end
  end

  # Lazily-created table mapping each term to the number of accumulated
  # documents whose vocab contained it. Terms never seen are absent from the
  # table, so `term_counts[term]` is nil for them.
  #
  # @return [Hash] term => count
  def term_counts
    @term_counts ||= {}
  end

  # Lazily-created set of every term seen in any accumulated document.
  #
  # @return [Set]
  def vocab
    @vocab ||= Set.new
  end

  # Smoothed inverse document frequency: ln(doc_count / (df + 1)), where df is
  # the number of documents recorded for +term+.
  #
  # A term that was never seen has df = 0 and therefore yields ln(doc_count)
  # instead of raising. The result is negative for a term present in every
  # document, and -Infinity while the corpus is still empty.
  #
  # @param term [Object] a term as yielded by Doc#vocab
  # @return [Float]
  def idf(term)
    # fetch with a default keeps unseen terms from hitting `nil + 1`.
    document_frequency = term_counts.fetch(term, 0)
    Math.log(doc_count.to_f / (document_frequency + 1).to_f)
  end

  private

  # Appends +doc+ to the corpus and folds its terms into the corpus statistics.
  #
  # Each term yielded by `doc.vocab` bumps that term's count by one; this is a
  # per-document frequency provided `doc.vocab` yields each distinct term once
  # (assumed -- confirm against Doc).
  #
  # @param doc [#vocab] the document to add
  def accumulate(doc)
    @docs << doc
    @doc_count += 1
    doc.vocab.each do |term|
      term_counts[term] = term_counts.fetch(term, 0) + 1
      vocab << term
    end
  end
end