public
Description: A document vector search with flexible matrix transforms. Currently supports latent semantic analysis (LSA) and term frequency-inverse document frequency (TF-IDF).
Homepage: http://github.com/josephwilk/rsemantic/wikis/home
Clone URL: git://github.com/josephwilk/rsemantic.git
rsemantic / lib / semantic / transform / tf_idf_transform.rb
module Semantic
  module Transform
    # Term frequency - inverse document frequency (TF-IDF) weighting for a
    # term/document matrix in which rows are terms and columns are documents.
    #
    # The class keeps no state between calls: every document-frequency count
    # is derived from the matrix passed in, so results never depend on a
    # previous transform.
    class TFIDF
      # Re-weights every non-zero cell of +matrix+ in place with
      #
      #   (term weight / sum of the document's term weights) *
      #     log(number of documents / number of documents containing the term)
      #
      # Cells whose value is zero are left untouched.
      #
      # @param matrix a matrix-like object responding to +num_columns+,
      #   +columns+ (each column responding to +rows+), +dimensions+,
      #   <tt>[row, column]</tt> and <tt>[row, column]=</tt>
      # @return the same +matrix+ object, mutated in place
      def self.transform(matrix)
        number_of_documents = matrix.num_columns

        # Per-call cache of document frequencies keyed by row index. A row's
        # count is taken the first time a non-zero cell of that row is met,
        # i.e. before any cell of that row has been overwritten, so it always
        # reflects the original (untransformed) term counts.
        document_frequencies = {}

        matrix.columns.each_with_index do |document, column_index|
          term_weights = document.rows
          document_term_total = term_weights.inject(0.0) { |sum, weight| sum + weight.to_f }

          term_weights.each_with_index do |term_weight, row_index|
            next if term_weight.to_f == 0.0

            document_frequencies[row_index] ||= number_of_documents_with_term(row_index, matrix)

            # Both operands are non-negative counts, so the ratio needs no abs.
            inverse_document_frequency =
              Math.log(number_of_documents / document_frequencies[row_index].to_f)

            matrix[row_index, column_index] =
              (term_weight / document_term_total) * inverse_document_frequency
          end
        end

        matrix
      end

      # Counts the documents (columns) in which the term at +row_index+ has a
      # weight greater than zero.
      #
      # The count is always computed from the current contents of +matrix+;
      # nothing is memoized, so calling this with different matrices (or after
      # a matrix has been modified) never returns a stale value.
      #
      # @param row_index [Integer] row of the term to look up
      # @param matrix a matrix-like object responding to +dimensions+
      #   (returning <tt>[rows, columns]</tt>) and <tt>[row, column]</tt>
      # @return [Integer] number of columns with a positive value in that row
      def self.number_of_documents_with_term(row_index, matrix)
        _rows, cols = matrix.dimensions

        (0...cols).count { |column_index| matrix[row_index, column_index] > 0 }
      end
    end
  end
end