Permalink
Browse files

Temporary fix to Database-abstraction

  • Loading branch information...
1 parent 17ae33b commit 5db3f5b638024994151e44203950db1c5d15bc89 @malthejorgensen malthejorgensen committed Nov 17, 2011
View
@@ -1 +1 @@
-conf.php
+conf.php
View
@@ -0,0 +1,6 @@
+# Tagger
+The [Tagger](http://tagger.dk) project is a library making it possible to extract relevant tags (keywords and named entities) from texts.
+
+Tags can be disambiguated and rated by relevancy.
+
+The library can be included in a webservice wrapper (https://github.com/40c/tagger-webservice) or it can be integrated into your favourite CMS (e.g. Drupal: http://drupal.org/project/tagger).
View
@@ -1,9 +0,0 @@
-This is the Tagger project.
-
-The Tagger project is a library making it possible to extract relevant tags (keywords and named entities) from texts.
-
-Tags can be diambiguated and rated by relevancy.
-
-The library can be included in a webservice wrapper (https://github.com/40c/tagger-webservice) or it can be integrated in to your faviourite CMS (ie Drupal: http://drupal.org/project/tagger)
-
-Read more on http://tagger.dk
@@ -7,19 +7,25 @@ class KeywordExtractor {
public $words;
public $tags;
+ private $constant;
+
function __construct($words) {
$this->tagger = Tagger::getTagger();
- $this->words = $words;
$this->tags = array();
+ $this->constant = 1/count($words);
+
+ $words = array_map('mb_strtolower', $words);
+ $this->words = array_count_values($words);
+
}
public function determine_keywords() {
$db_conf = $this->tagger->getConfiguration('db');
$word_relations_table = $db_conf['word_relations_table'];
$lookup_table = $db_conf['lookup_table'];
- $implode_words = implode("','", array_map('mysql_real_escape_string', $this->words));
+ $implode_words = implode("','", array_map('mysql_real_escape_string', array_keys($this->words)));
$query = "SELECT * FROM $word_relations_table WHERE word IN ('$implode_words.')";
TaggerLogManager::logDebug("Query:\n" . $query);
@@ -28,13 +34,30 @@ public function determine_keywords() {
$subjects = array();
while ($row = TaggerQueryManager::fetch($result)) {
- if(!isset($subjects[$row['tid']]['rating'])) { $subjects[$row['tid']]['rating'] = 0; }
- //if(!isset($subjects[$row->tid]['words'])) { $subjects[$row->tid]['words'] = array(); }
- $subjects[$row['tid']]['rating'] += $row['score'];
- //$subjects[$row->tid]['words'][] = array('word' => $row->word, 'rating' => $row->score);
+ if (array_key_exists(mb_strtolower($row['word']), $this->words)) {
+ if (!isset($subjects[$row['tid']]['rating'])) { $subjects[$row['tid']]['rating'] = 0; }
+ //if(!isset($subjects[$row->tid]['words'])) { $subjects[$row->tid]['words'] = array(); }
+ $subjects[$row['tid']]['rating'] += $row['score'] * $this->words[mb_strtolower($row['word'])];
+ //$subjects[$row->tid]['words'][] = array('word' => $row->word, 'rating' => $row->score);
+ }
}
- if (isset($subjects[0])) { unset($subjects[0]); }
+ $constant = $this->constant;
+ // Normalize scores
+ $normalize = function($s) use ($constant) {
+ $s['rating'] *= $constant;
+ return $s;
+ };
+ $subjects = array_map($normalize, $subjects);
+
+ // Threshold
+ $threshold = $this->tagger->getConfiguration('keyword_threshold');
+ $thresher = function($subject) use ($threshold) {
+ return $subject['rating'] > $threshold;
+ };
+ $subjects = array_filter($subjects, $thresher);
+
+ //if (isset($subjects[0])) { unset($subjects[0]); }
TaggerLogManager::logDebug("Keywords:\n" . print_r($subjects, true));
if (!empty($subjects)) {
View
@@ -18,6 +18,8 @@
'type' => $tagger_conf['db']['type'],
);
}
+ // DATABASE TABLE NAMES ARE DEFINED AT END OF THIS FILE
+
// Names and ids of your vocabularies.
$tagger_conf['vocab_names'] = array(
@@ -41,7 +41,14 @@ public function query($sql, $args) {
$c = __CLASS__;
self::$instance = new $c;
}
- $result = self::$instance->link->query(sprintf($sql, $args));
+
+ if (!empty($args)) {
+ $result = self::$instance->link->query(sprintf($sql, $args));
+ }
+ else {
+ $result = self::$instance->link->query($sql);
+ }
+
if($result) {
return $result;
} else {
View
@@ -62,6 +62,11 @@
'h3',
);
+
+ // Minimum one full keyword per 250 words
+ $tagger_conf['keyword_threshold'] = 1/250;
+
+
// Settings for logging
$tagger_conf['log_handler'] = 'Default';
$tagger_conf['logging_type'] = 'file'; // file db
@@ -1,122 +0,0 @@
-<?php
- ini_set('memory_limit', '1024M');
- ini_set('extension', 'translit.so');
-
- require_once 'lib_calc_score.php';
-
- $start = time();
-
- $link = mysql_connect('localhost', 'root', 'sniggle');
-
- if (!$link) {
- die('Could not connect: ' . mysql_error());
- }
- mysql_select_db('ny_taggerdk');
- mysql_set_charset ('utf8', $link);
- mb_internal_encoding("UTF-8");
-
- mysql_query('
- CREATE TABLE IF NOT EXISTS `wordstats` (
- `word` varchar(255) NOT NULL,
- `word_count` bigint(20) unsigned NOT NULL,
- `doc_count` bigint(20) unsigned NOT NULL,
- `word_freq` decimal(30,20) unsigned NOT NULL,
- `doc_freq` decimal(30,20) unsigned NOT NULL,
- `doc_freq_std` decimal(30,20) unsigned NOT NULL,
- PRIMARY KEY (`word`)
- ) DEFAULT CHARSET=utf8;
- ');
-
- if (!mysql_query('TRUNCATE TABLE `wordstats`;')) {
- die('Could not query:' . mysql_error());
- }
-
- $query = '
- SELECT nr.title, nr.body, cfu.field_underrubrik_value
- FROM node AS n
- JOIN node_revisions AS nr ON nr.vid = n.vid
- JOIN content_field_underrubrik AS cfu ON cfu.vid = n.vid
- WHERE n.type = "avisartikel"
- ORDER BY created DESC
- LIMIT 0, 10000;';
-
- $result = mysql_query($query);
- $doc_count = 0;
- $word_count = 0;
-
-
- $overall_frequency = array();
- while ($row = mysql_fetch_object($result)) {
-
- $frequency = count_words(strip_tags($row->title.' '.$row->field_underrubrik_value. ' '.$row->body));
-
- foreach ($frequency AS $key => $value){
- $word_count += $value['word_count'];
- if(!isset($overall_frequency[$key])) {
- $overall_frequency[$key]['word_count'] = $value['word_count'];
- $overall_frequency[$key]['doc_count'] = 1;
- $overall_frequency[$key]['doc_freq_sum'] = $value['word_freq'];
- $overall_frequency[$key]['doc_freq_squared_sum'] = pow($value['word_freq'],2);
- } else {
- $overall_frequency[$key]['word_count'] += $value['word_count'];
- $overall_frequency[$key]['doc_count'] += 1;
- $overall_frequency[$key]['doc_freq_sum'] += $value['word_freq'];
- $overall_frequency[$key]['doc_freq_squared_sum'] += pow($value['word_freq'],2);
- }
- }
- $doc_count++;
- }
-
- $counter = 0;
- $sql = "INSERT INTO wordstats (word, word_count, doc_count, word_freq, doc_freq, doc_freq_std) VALUES\n";
- foreach ($overall_frequency AS $key => $value) {
- $key = mysql_escape_string($key);
- if($counter != 0) {
- $sql .= ', ';
- }
- $word_freq = $value['word_count']/$word_count;
- $doc_freq = $value['doc_freq_sum']/$doc_count;
- $doc_freq_std = sqrt($value['doc_freq_squared_sum']/$doc_count - pow($doc_freq,2));
- $sql .= "('$key', $value[word_count], $value[doc_count], $word_freq, $doc_freq, $doc_freq_std)";
- if(++$counter == 1000) {
- $sql .= " ON DUPLICATE KEY UPDATE word_count=word_count+VALUES(word_count),
- doc_count=doc_count+VALUES(doc_count),
- word_freq=(word_count+VALUES(word_count))/$word_count,
- doc_freq=(doc_freq+VALUES(doc_freq)),
- doc_freq_std=(doc_freq_std+VALUES(doc_freq_std))/2;";
- // the last two lines in ON DUPLICATE KEY UPDATE are not correct but an approximation!
- if (!mysql_query($sql)) {
- die('Could not query, line ' . __LINE__ . ': ' . mysql_error());
- }
- $sql = "INSERT INTO wordstats (word, word_count, doc_count, word_freq, doc_freq, doc_freq_std) VALUES\n";
- $counter = 0;
- }
- }
- $sql .= " ON DUPLICATE KEY UPDATE word_count=word_count+VALUES(word_count),
- doc_count=doc_count+VALUES(doc_count),
- word_freq=(word_count+VALUES(word_count))/$word_count,
- doc_freq=(doc_freq+VALUES(doc_freq)),
- doc_freq_std=(doc_freq_std+VALUES(doc_freq_std))/2;";
- if (!mysql_query($sql)) {
- echo $sql . "\n";
- die('Could not query, line ' . __LINE__ . ': ' . mysql_error());
- }
-
- $end = time();
-
- $time = $end - $start;
-
- mysql_query('
- CREATE TABLE IF NOT EXISTS `docstats` (
- `word_count` bigint(20) unsigned NOT NULL,
- `doc_count` bigint(20) unsigned NOT NULL
- )
- ');
- mysql_query('TRUNCATE TABLE `docstats`;');
-
- mysql_query('INSERT INTO `docstats` (doc_count,word_count) VALUES ('.$doc_count.','.$word_count.');');
-
- print 'Total documents: '. $doc_count. '<br />';
- print 'Total words: '. $word_count. '<br />';
- print 'Total time: '. $time .' secs. ('. $doc_count/$time .' documents per sec. )';
-
Oops, something went wrong.

0 comments on commit 5db3f5b

Please sign in to comment.