Skip to content

Commit

Permalink
adding custom objects for freqdist and idf
Browse files Browse the repository at this point in the history
  • Loading branch information
Alexander Behrens committed Apr 11, 2016
1 parent 3152af6 commit 501d146
Show file tree
Hide file tree
Showing 8 changed files with 341 additions and 60 deletions.
105 changes: 105 additions & 0 deletions lib/components/freqDist.js
@@ -0,0 +1,105 @@
/**
* Create a FreqDist object.
*/


'use strict';


/**
 * Build a frequency distribution: a running count of how often each token
 * has been added.
 *
 * @param {*} [_text] Optional initial input; passed to `add` when truthy.
 * @returns {{add: Function, count: Function, merge: Function, range: Function,
 *            size: Function, tokens: Function}} The distribution's methods.
 */
function FreqDist(_text) {

  var hasOwn = Object.prototype.hasOwnProperty;

  // Prototype-less map so that tokens such as "hasOwnProperty" or "__proto__"
  // are counted like any other key instead of colliding with Object.prototype.
  var _cache = Object.create(null);

  // Lazily computed views of _cache; all of them are cleared by _reset().
  var _ordered;
  var _size;
  var _count;
  var _tokens;

  // Built up front (function declarations are hoisted) so `add` can return it
  // for chaining even when it is called as a detached function.
  var api = {
    add: add,
    count: count,
    merge: merge,
    range: range,
    size: size,
    tokens: tokens
  };

  if (_text) {
    add(_text);
  }

  /**
   * Add tokens to the distribution.
   *
   * @param {*} _text A single token, an array of tokens, or an object that
   *   owns a `merge(target)` method (e.g. another FreqDist) whose counts are
   *   folded into this one. `null`/`undefined` is ignored.
   * @param {number} [_count=1] Amount added per token occurrence. Not used on
   *   the `merge` path. Note: this parameter shadows the cached total above.
   * @returns {Object} This distribution, for chaining.
   */
  function add(_text, _count) {
    if (_text === null || typeof _text === 'undefined') {
      return api;
    }

    _reset();

    if (hasOwn.call(_text, 'merge') && typeof _text.merge === 'function') {
      _cache = _text.merge(_cache);
      return api;
    }

    if (typeof _count === 'undefined') {
      _count = 1;
    }

    [].concat(_text).forEach(function addToken(_token) {
      if (!hasOwn.call(_cache, _token)) {
        _cache[_token] = 0;
      }
      _cache[_token] += _count;
    });

    return api;
  }

  /**
   * Add this distribution's counts into `external` (mutated in place).
   *
   * @param {Object} external Token-to-count map to receive the counts.
   * @returns {Object} The same `external` object.
   */
  function merge(external) {
    Object.keys(_cache).forEach(function (key) {
      if (!hasOwn.call(external, key)) {
        external[key] = 0;
      }
      external[key] += _cache[key];
    });
    return external;
  }

  /**
   * @param {*} [token] Token to look up. When omitted (or null) the total of
   *   all counts is returned instead.
   * @returns {number} Count for `token` (0 if never added), or the total.
   */
  function count(token) {
    // Explicit null/undefined test so falsy tokens (0, '') can be queried.
    if (typeof token !== 'undefined' && token !== null) {
      return hasOwn.call(_cache, token) ? _cache[token] : 0;
    }
    if (typeof _count === 'undefined') {
      _count = Object.keys(_cache).reduce(function (sum, key) {
        return sum + _cache[key];
      }, 0);
    }
    return _count;
  }

  /**
   * Slice of the entries ordered by ascending count.
   *
   * @param {number} [start=0] Slice start index.
   * @param {number} [end=size()] Slice end index (exclusive).
   * @returns {Array<{name: string, count: number, tf: number}>} Entries where
   *   `tf` is the token's count divided by the total count.
   */
  function range(start, end) {
    start = (typeof start !== 'undefined') ? start : 0;
    end = (typeof end !== 'undefined') ? end : size();

    if (typeof _ordered === 'undefined') {
      var total = count();
      _ordered = tokens().map(function (token) {
        return {
          name: token,
          count: _cache[token],
          tf: _cache[token] / total
        };
      });
      // Subtraction yields 0 for ties, keeping the comparator consistent.
      _ordered.sort(function (a, b) {
        return a.count - b.count;
      });
    }

    return _ordered.slice(start, end);
  }

  /**
   * @returns {number} Number of distinct tokens.
   */
  function size() {
    if (typeof _size === 'undefined') {
      _size = Object.keys(_cache).length;
    }
    return _size;
  }

  /**
   * @returns {string[]} The distinct tokens (cached until the next `add`).
   */
  function tokens() {
    if (typeof _tokens === 'undefined') {
      _tokens = Object.keys(_cache);
    }
    return _tokens;
  }

  // Drop every derived value; called whenever the counts are about to change.
  function _reset() {
    _count = undefined;
    _ordered = undefined;
    _size = undefined;
    _tokens = undefined;
  }

  return api;

}

module.exports = FreqDist;
88 changes: 88 additions & 0 deletions lib/components/idf.js
@@ -0,0 +1,88 @@
/**
* Create an Idf object.
*/


'use strict';


// Load FreqDist straight from its own file. Requiring './' (index.js) here
// forms a circular require: index.js loads this file before it has attached
// FreqDist to its exports, so the binding would be undefined.
var FreqDist = require('./freqDist.js');


/**
 * Build an inverse-document-frequency table: for every token, track in how
 * many of the added documents it appears.
 *
 * @param {*} [freqDists] Optional initial document(s); passed to `add` when
 *   truthy.
 * @returns {{add: Function, get: Function, range: Function, size: Function,
 *            tokens: Function}} The table's methods.
 */
function Idf(freqDists) {

  var hasOwn = Object.prototype.hasOwnProperty;

  // Prototype-less map so tokens such as "hasOwnProperty" cannot collide with
  // Object.prototype members.
  var _cache = Object.create(null);

  // Lazily computed views of _cache; all of them are cleared by _reset().
  var _ordered;
  var _size;
  var _tokens;

  // Number of documents added so far.
  var documents = 0;

  // Built up front (function declarations are hoisted) so `add` can return it
  // for chaining even when it is called as a detached function.
  var api = {
    add: add,
    get: get,
    range: range,
    size: size,
    tokens: tokens
  };

  if (freqDists) {
    add(freqDists);
  }

  /**
   * Register one or more documents.
   *
   * @param {*} freqDists One document or an array of documents. A document
   *   that owns a `tokens()` method is used as is; anything else is wrapped
   *   with `FreqDist(...)` first. Each document bumps every one of its
   *   distinct tokens by one.
   * @returns {Object} This table, for chaining.
   */
  function add(freqDists) {
    _reset();

    [].concat(freqDists).forEach(function (freqDist) {
      documents += 1;

      var isFreqDist = freqDist !== null &&
        typeof freqDist !== 'undefined' &&
        hasOwn.call(freqDist, 'tokens') &&
        typeof freqDist.tokens === 'function';
      var dist = isFreqDist ? freqDist : FreqDist(freqDist);

      dist.tokens().forEach(function (token) {
        if (!hasOwn.call(_cache, token)) {
          _cache[token] = 0;
        }
        _cache[token] += 1;
      });
    });

    return api;
  }

  /**
   * @param {string} token Token to look up.
   * @returns {number} `ln(documents / documentsContainingToken)`, or 0 when
   *   the token has never been seen.
   */
  function get(token) {
    if (!hasOwn.call(_cache, token)) {
      return 0;
    }
    return Math.log(documents / _cache[token]);
  }

  /**
   * Slice of the entries ordered by ascending document count.
   *
   * @param {number} [start=0] Slice start index.
   * @param {number} [end=size()] Slice end index (exclusive).
   * @returns {Array<{name: string, count: number, idf: number}>}
   */
  function range(start, end) {
    start = (typeof start !== 'undefined') ? start : 0;
    // slice() excludes `end`, so the default must be size() to keep the last
    // entry.
    end = (typeof end !== 'undefined') ? end : size();

    if (typeof _ordered === 'undefined') {
      _ordered = tokens().map(function (token) {
        return {
          name: token,
          count: _cache[token],
          idf: get(token)
        };
      });
      // Subtraction yields 0 for ties, keeping the comparator consistent.
      _ordered.sort(function (a, b) {
        return a.count - b.count;
      });
    }

    return _ordered.slice(start, end);
  }

  /**
   * @returns {number} Number of distinct tokens.
   */
  function size() {
    if (typeof _size === 'undefined') {
      _size = tokens().length;
    }
    return _size;
  }

  /**
   * @returns {string[]} The distinct tokens (cached until the next `add`).
   */
  function tokens() {
    if (typeof _tokens === 'undefined') {
      _tokens = Object.keys(_cache);
    }
    return _tokens;
  }

  // Drop every derived value, including the ordered list, so range() never
  // serves entries computed before the latest add().
  function _reset() {
    _ordered = undefined;
    _size = undefined;
    _tokens = undefined;
  }

  return api;

}

module.exports = Idf;
2 changes: 2 additions & 0 deletions lib/components/index.js
@@ -0,0 +1,2 @@
// Entry point of the components directory: exposes the Idf and FreqDist
// factories as properties of this module's exports.
module.exports.Idf = require('./idf.js');
module.exports.FreqDist = require('./freqDist.js');
53 changes: 25 additions & 28 deletions lib/frequency.js
Expand Up @@ -10,50 +10,47 @@
* MODULES.
*/
var through2 = require('through2');
var FreqDist = require('./components').FreqDist;


/**
* FUNCTIONS.
*/
function frequency() {
var _freq = {};
function frequency(text, options) {

if (!options && Object.prototype.toString.call(text) !== '[object Array]') {
options = text;
text = '';
}

options = options || {};

if (text) {
return FreqDist(text);
}

var _freq = FreqDist();

return through2.obj(function (chunk, enc, callback) {

var _chunk = (typeof chunk === 'object' && Object.prototype.toString.call(chunk) !== '[object Array]') ? chunk.text : chunk;
if (!_chunk || Object.prototype.toString.call(_chunk) !== '[object Array]') {
return callback(new Error('Chunk is not an array ' + JSON.stringify(chunk)));
}
_chunk.forEach(function (token) {
_freq = add(_freq, token, 1);
});
return callback();
if (options.cache) {
_freq.add(_chunk);
return callback();
}
return callback(null, FreqDist(_chunk));
}, function (callback) {
var _freqArray = Object.keys(_freq).map(function (item) {
return {
token: item,
count: _freq[item]
};
});
_freqArray.sort(function (a, b) {
if (a.count > b.count) {
return -1;
} else {
return 1
}
});
this.push(_freqArray);
if (options.cache) {
this.push(_freq);
}
return callback();
});

}

function add(_freq, token, count) {
if (!_freq.hasOwnProperty(token)) {
_freq[token] = 0;
}
_freq[token] += count;
return _freq;
}


/**
* EXPORTS.
Expand Down
36 changes: 26 additions & 10 deletions lib/idf.js
Expand Up @@ -10,25 +10,41 @@
* MODULES.
*/
var through2 = require('through2');
var Idf = require('./components').Idf;


/**
* FUNCTIONS.
*/
function idf() {
function idf(text, options) {

var countTokens = {};
var sentences = 0;
if (!options && Object.prototype.toString.call(text) !== '[object Array]') {
options = text;
text = '';
}

options = options || {};

if (text) {
return Idf(text);
}

var _idf = Idf();

return through2.obj(function (chunk, enc, callback) {
sentences++;
for (var key in chunk) {
if (!countTokens.hasOwnProperty(key)) {
countTokens[key] = 0;
}
countTokens[key] += 1;

var _chunk = (chunk.hasOwnProperty('text')) ? chunk.text : chunk;
if (!_chunk || Object.prototype.toString.call(_chunk) !== '[object Array]' && !_chunk.hasOwnProperty('tokens')) {
return callback(new Error('Cannot use document ' + JSON.stringify(chunk)));
}
return callback(null, chunk);

_idf.add(_chunk);

return callback();

}, function (callback) {
this.push(_idf);
return callback();
});

}
Expand Down
4 changes: 3 additions & 1 deletion lib/index.js
Expand Up @@ -21,6 +21,7 @@ var classifiers = require('./classifiers');
var crossValidation = require('./crossValidation.js');
var filters = require('./filters');
var calculate = require('./calculate');
var components = require('./components');


/**
Expand All @@ -38,5 +39,6 @@ module.exports = {
crossValidation: crossValidation,
classifiers: classifiers,
filters: filters,
calculate: calculate
calculate: calculate,
components: components
};

0 comments on commit 501d146

Please sign in to comment.