-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdoc.rb
69 lines (58 loc) · 1.52 KB
/
doc.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
require 'matrix'
require 'set'
require './toks'
# A single document: keeps its id, raw text and tokens, and can turn itself
# into a term vector so it can be compared with other documents.
#
# The +corpus+ argument taken by most methods is duck-typed. From the calls
# made in this class it must respond to +vocab+ (something with +each+ that
# yields terms in a stable order) and +idf(term)+.
class Doc
  attr_reader :raw, :toks, :id

  # @param doc_id identifier stored as-is and exposed through +id+
  # @param raw_text text handed to Toks.tokenize; the result is exposed
  #   through +toks+ (presumably an Array of token strings -- confirm in toks.rb)
  def initialize(doc_id, raw_text)
    @id = doc_id
    @raw = raw_text
    @toks = Toks.tokenize(raw_text)
  end

  # Similarity between this document and +other_doc+; currently the cosine
  # similarity of their vectors.
  #
  # @return [Numeric]
  def compare(other_doc, corpus)
    cosine_similarity(other_doc, corpus)
  end

  # Distinct tokens of this document, built once and memoized.
  #
  # @return [Set]
  def vocab
    @vocab ||= Set.new(toks)
  end

  # Raw occurrence count of every corpus term in this document.
  #
  # Keys are inserted in the order +corpus.vocab+ yields them, so the values
  # line up position-by-position with the vectors built by other documents
  # for the same corpus.
  #
  # @return [Hash] term => Integer count (0 for terms absent from this doc)
  def raw_tfs(corpus)
    tfs = {}
    corpus.vocab.each { |tok| tfs[tok] = 0 }
    # Tokens that are not in the corpus vocabulary are skipped: counting them
    # would either fail on a missing key or make this document's vector longer
    # than everyone else's.
    toks.each { |tok| tfs[tok] += 1 if tfs.key?(tok) }
    tfs
  end

  # Vector of raw term counts, one component per corpus term.
  #
  # @return [Vector]
  def vectorize_simple(corpus)
    Vector.elements(raw_tfs(corpus).values)
  end

  # Vector of weighted term frequencies, one component per corpus term.
  # Each non-zero component is (count / number of tokens in this document)
  # combined with +corpus.idf(term)+; terms that do not occur get 0.0.
  #
  # NOTE(review): the term frequency is *divided* by corpus.idf(term). Textbook
  # TF-IDF multiplies. This is only right if Corpus#idf actually returns a
  # document-frequency-like value -- confirm against the Corpus class. A zero
  # idf would also yield Infinity here.
  #
  # @return [Vector]
  def vectorize_tfidf(corpus)
    doc_len = toks.length.to_f
    weights = raw_tfs(corpus).map do |term, raw_tf|
      raw_tf.positive? ? (raw_tf / doc_len) / corpus.idf(term) : 0.0
    end
    Vector.elements(weights)
  end

  # The vector representation used for comparisons (currently the weighted one).
  #
  # @return [Vector]
  def vectorize(corpus)
    vectorize_tfidf(corpus)
  end

  # Cosine of the angle between this document's vector and +other_doc+'s.
  #
  # @return [Numeric] 0.0 when either vector has zero magnitude (for example an
  #   empty document), instead of the NaN a plain division would produce
  def cosine_similarity(other_doc, corpus)
    self_vec = vectorize(corpus)
    other_vec = other_doc.vectorize(corpus)
    denominator = self_vec.magnitude * other_vec.magnitude
    return 0.0 if denominator.zero?

    self_vec.dot(other_vec) / denominator
  end

  # Per-term contribution to the dot product of the two document vectors.
  #
  # @return [Array<Array>] [term, score] pairs sorted by ascending score
  def scored_comparison_terms(other_doc, corpus)
    contribs = {}
    corpus.vocab.each { |tok| contribs[tok] = 0.0 }
    self_vec = vectorize(corpus)
    other_vec = other_doc.vectorize(corpus)
    # Key order here matches the component order of the vectors because both
    # are derived from the iteration order of corpus.vocab.
    contribs.each_key.with_index do |term, i|
      contribs[term] = self_vec[i] * other_vec[i]
    end
    contribs.sort_by { |_term, score| score }
  end
end