public
Description: A document vector search with flexible matrix transforms. Currently supports latent semantic analysis (LSA) and term frequency-inverse document frequency (TF-IDF).
Homepage: http://github.com/josephwilk/rsemantic/wikis/home
Clone URL: git://github.com/josephwilk/rsemantic.git
rsemantic / lib / semantic / transform / tf_idf_transform.rb
100644 43 lines (33 sloc) 1.347 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
module Semantic
  module Transform
    # Re-weights a matrix in place using TF-IDF (term frequency - inverse
    # document frequency).
    #
    # As the variable names in this class indicate, each column of the matrix
    # is treated as a document and each row as a term. The matrix only has to
    # respond to +num_columns+, +columns+, +dimensions+, <tt>[row, col]</tt>
    # and <tt>[row, col] = value</tt>; each element of +columns+ has to
    # respond to +rows+.
    #
    # The class keeps no state between calls.
    class TFIDF

      # Overwrites every non-zero cell of +matrix+ with
      #
      #   (cell / sum of the cell's column) *
      #     log(number of columns / number of columns where the cell's row is > 0)
      #
      # Cells whose value converts to 0.0 are left untouched.
      #
      # matrix:: the matrix to re-weight; it is mutated.
      #
      # Returns the same +matrix+ object that was passed in.
      def self.transform(matrix)
        number_of_documents = matrix.num_columns

        # Per-call memo of "how many documents contain this term", filled on
        # first use. Counting lazily is safe even though the matrix is being
        # overwritten: a row is first counted when its first non-zero cell is
        # reached, and at that moment no cell of that row has been written yet
        # (zero cells are skipped, never written).
        document_frequencies = Hash.new do |cache, row_index|
          cache[row_index] = number_of_documents_with_term(row_index, matrix)
        end

        matrix.columns.each_with_index do |document, column_index|
          document_term_total = document.rows.inject(0.0) { |sum, count| sum + count.to_f }

          document.rows.each_with_index do |term_weight, row_index|
            next if term_weight.to_f == 0.0

            term_frequency = term_weight / document_term_total
            # NOTE(review): the .abs is carried over from the original code;
            # it only matters if the ratio could be negative - confirm whether
            # it is still needed.
            inverse_document_frequency =
              Math.log((number_of_documents / document_frequencies[row_index].to_f).abs)

            matrix[row_index, column_index] = term_frequency * inverse_document_frequency
          end
        end

        matrix
      end

      # Counts the columns of +matrix+ whose cell in row +row_index+ is
      # greater than zero.
      #
      # The count is computed from +matrix+ on every call, so the answer
      # always reflects the matrix that was actually passed in.
      #
      # row_index:: index of the row (term) to inspect.
      # matrix::    the matrix to inspect; it is not modified.
      #
      # Returns the count as an Integer.
      def self.number_of_documents_with_term(row_index, matrix)
        _rows, cols = matrix.dimensions

        (0...cols).count { |column_index| matrix[row_index, column_index] > 0 }
      end

    end
  end
end