module Semantic
  module Transform
    # Re-weights a term/document matrix with TF-IDF.
    #
    # Rows are treated as terms and columns as documents. The matrix only has
    # to respond to +num_columns+, +columns+ (each element responding to
    # +rows+), +dimensions+, +[](row, column)+ and +[]=(row, column, value)+.
    class TFIDF
      # Replaces every non-zero cell of +matrix+, in place, with
      #
      #   (weight / sum of the weights in that column) *
      #     Math.log((number of columns / documents containing the term).abs)
      #
      # Cells whose weight is zero are left untouched.
      #
      # NOTE(review): a column whose weights sum to zero, or a term whose
      # weights are never strictly positive, is not guarded against here and
      # yields non-finite values -- presumably weights are non-negative
      # counts; confirm against callers.
      #
      # @param matrix [#num_columns, #columns, #dimensions, #[], #[]=]
      # @return the same +matrix+ object, mutated
      def self.transform(matrix)
        number_of_documents = matrix.num_columns

        # Per-call memo (row index => document frequency). It must be a memo
        # and not a recount: cells are overwritten while we iterate, and a
        # term present in every document is rewritten to 0.0, which a later
        # recount would no longer see as an occurrence. The first lookup for
        # a row happens before any cell of that row has been overwritten.
        document_frequencies = {}

        matrix.columns.each_with_index do |document, column_index|
          term_weights = document.rows
          document_term_total = term_weights.inject(0.0) { |sum, weight| sum + weight.to_f }

          term_weights.each_with_index do |term_weight, row_index|
            next if term_weight.to_f == 0.0

            document_frequency = document_frequencies[row_index] ||=
              number_of_documents_with_term(row_index, matrix)

            term_frequency = term_weight / document_term_total
            inverse_document_frequency =
              Math.log((number_of_documents / document_frequency.to_f).abs)

            matrix[row_index, column_index] = term_frequency * inverse_document_frequency
          end
        end

        matrix
      end

      # Counts the columns (documents) of +matrix+ whose cell in row
      # +row_index+ is strictly greater than zero.
      #
      # The count is always taken from the matrix passed in; nothing is
      # cached between calls, so results for one matrix can never leak into
      # a call made with another.
      #
      # @param row_index [Integer] row (term) to inspect
      # @param matrix [#dimensions, #[]]
      # @return [Integer]
      def self.number_of_documents_with_term(row_index, matrix)
        _row_count, column_count = matrix.dimensions
        (0...column_count).count { |column_index| matrix[row_index, column_index] > 0 }
      end
    end
  end
end