Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Temporary fix to Database-abstraction

  • Loading branch information...
commit 5db3f5b638024994151e44203950db1c5d15bc89 1 parent 17ae33b
@malthejorgensen malthejorgensen authored
View
2  .gitignore
@@ -1 +1 @@
-conf.php
+conf.php
View
6 README.md
@@ -0,0 +1,6 @@
+# Tagger
+The [Tagger](http://tagger.dk) project is a library making it possible to extract relevant tags (keywords and named entities) from texts.
+
+Tags can be disambiguated and rated by relevancy.
+
+The library can be included in a webservice wrapper (https://github.com/40c/tagger-webservice) or it can be integrated into your favourite CMS (e.g. Drupal: http://drupal.org/project/tagger).
View
9 README.txt
@@ -1,9 +0,0 @@
-This is the Tagger project.
-
-The Tagger project is a library making it possible to extract relevant tags (keywords and named entities) from texts.
-
-Tags can be diambiguated and rated by relevancy.
-
-The library can be included in a webservice wrapper (https://github.com/40c/tagger-webservice) or it can be integrated in to your faviourite CMS (ie Drupal: http://drupal.org/project/tagger)
-
-Read more on http://tagger.dk
View
37 classes/KeywordExtractor.class.php
@@ -7,11 +7,17 @@ class KeywordExtractor {
public $words;
public $tags;
+ private $constant;
+
function __construct($words) {
$this->tagger = Tagger::getTagger();
- $this->words = $words;
$this->tags = array();
+ $this->constant = 1/count($words);
+
+ $words = array_map('mb_strtolower', $words);
+ $this->words = array_count_values($words);
+
}
public function determine_keywords() {
@@ -19,7 +25,7 @@ public function determine_keywords() {
$word_relations_table = $db_conf['word_relations_table'];
$lookup_table = $db_conf['lookup_table'];
- $implode_words = implode("','", array_map('mysql_real_escape_string', $this->words));
+ $implode_words = implode("','", array_map('mysql_real_escape_string', array_keys($this->words)));
$query = "SELECT * FROM $word_relations_table WHERE word IN ('$implode_words.')";
TaggerLogManager::logDebug("Query:\n" . $query);
@@ -28,13 +34,30 @@ public function determine_keywords() {
$subjects = array();
while ($row = TaggerQueryManager::fetch($result)) {
- if(!isset($subjects[$row['tid']]['rating'])) { $subjects[$row['tid']]['rating'] = 0; }
- //if(!isset($subjects[$row->tid]['words'])) { $subjects[$row->tid]['words'] = array(); }
- $subjects[$row['tid']]['rating'] += $row['score'];
- //$subjects[$row->tid]['words'][] = array('word' => $row->word, 'rating' => $row->score);
+ if (array_key_exists(mb_strtolower($row['word']), $this->words)) {
+ if (!isset($subjects[$row['tid']]['rating'])) { $subjects[$row['tid']]['rating'] = 0; }
+ //if(!isset($subjects[$row->tid]['words'])) { $subjects[$row->tid]['words'] = array(); }
+ $subjects[$row['tid']]['rating'] += $row['score'] * $this->words[mb_strtolower($row['word'])];
+ //$subjects[$row->tid]['words'][] = array('word' => $row->word, 'rating' => $row->score);
+ }
}
- if (isset($subjects[0])) { unset($subjects[0]); }
+ $constant = $this->constant;
+ // Normalize scores
+ $normalize = function($s) use ($constant) {
+ $s['rating'] *= $constant;
+ return $s;
+ };
+ $subjects = array_map($normalize, $subjects);
+
+ // Threshold
+ $threshold = $this->tagger->getConfiguration('keyword_threshold');
+ $thresher = function($subject) use ($threshold) {
+ return $subject['rating'] > $threshold;
+ };
+ $subjects = array_filter($subjects, $thresher);
+
+ //if (isset($subjects[0])) { unset($subjects[0]); }
TaggerLogManager::logDebug("Keywords:\n" . print_r($subjects, true));
if (!empty($subjects)) {
View
2  conf.example.php
@@ -18,6 +18,8 @@
'type' => $tagger_conf['db']['type'],
);
}
+ // DATABASE TABLE NAMES ARE DEFINED AT END OF THIS FILE
+
// Names and ids of your vocabularies.
$tagger_conf['vocab_names'] = array(
View
9 db/TaggerQueryHandler.class.php
@@ -41,7 +41,14 @@ public function query($sql, $args) {
$c = __CLASS__;
self::$instance = new $c;
}
- $result = self::$instance->link->query(sprintf($sql, $args));
+
+ if (!empty($args)) {
+ $result = self::$instance->link->query(sprintf($sql, $args));
+ }
+ else {
+ $result = self::$instance->link->query($sql);
+ }
+
if($result) {
return $result;
} else {
View
5 defaults.php
@@ -62,6 +62,11 @@
'h3',
);
+
+ // Minimum one full keyword per 250 words
+ $tagger_conf['keyword_threshold'] = 1/250;
+
+
// Settings for logging
$tagger_conf['log_handler'] = 'Default';
$tagger_conf['logging_type'] = 'file'; // file db
View
122 keyword-import/build_frequencies.php
@@ -1,122 +0,0 @@
-<?php
- ini_set('memory_limit', '1024M');
- ini_set('extension', 'translit.so');
-
- require_once 'lib_calc_score.php';
-
- $start = time();
-
- $link = mysql_connect('localhost', 'root', 'sniggle');
-
- if (!$link) {
- die('Could not connect: ' . mysql_error());
- }
- mysql_select_db('ny_taggerdk');
- mysql_set_charset ('utf8', $link);
- mb_internal_encoding("UTF-8");
-
- mysql_query('
- CREATE TABLE IF NOT EXISTS `wordstats` (
- `word` varchar(255) NOT NULL,
- `word_count` bigint(20) unsigned NOT NULL,
- `doc_count` bigint(20) unsigned NOT NULL,
- `word_freq` decimal(30,20) unsigned NOT NULL,
- `doc_freq` decimal(30,20) unsigned NOT NULL,
- `doc_freq_std` decimal(30,20) unsigned NOT NULL,
- PRIMARY KEY (`word`)
- ) DEFAULT CHARSET=utf8;
- ');
-
- if (!mysql_query('TRUNCATE TABLE `wordstats`;')) {
- die('Could not query:' . mysql_error());
- }
-
- $query = '
- SELECT nr.title, nr.body, cfu.field_underrubrik_value
- FROM node AS n
- JOIN node_revisions AS nr ON nr.vid = n.vid
- JOIN content_field_underrubrik AS cfu ON cfu.vid = n.vid
- WHERE n.type = "avisartikel"
- ORDER BY created DESC
- LIMIT 0, 10000;';
-
- $result = mysql_query($query);
- $doc_count = 0;
- $word_count = 0;
-
-
- $overall_frequency = array();
- while ($row = mysql_fetch_object($result)) {
-
- $frequency = count_words(strip_tags($row->title.' '.$row->field_underrubrik_value. ' '.$row->body));
-
- foreach ($frequency AS $key => $value){
- $word_count += $value['word_count'];
- if(!isset($overall_frequency[$key])) {
- $overall_frequency[$key]['word_count'] = $value['word_count'];
- $overall_frequency[$key]['doc_count'] = 1;
- $overall_frequency[$key]['doc_freq_sum'] = $value['word_freq'];
- $overall_frequency[$key]['doc_freq_squared_sum'] = pow($value['word_freq'],2);
- } else {
- $overall_frequency[$key]['word_count'] += $value['word_count'];
- $overall_frequency[$key]['doc_count'] += 1;
- $overall_frequency[$key]['doc_freq_sum'] += $value['word_freq'];
- $overall_frequency[$key]['doc_freq_squared_sum'] += pow($value['word_freq'],2);
- }
- }
- $doc_count++;
- }
-
- $counter = 0;
- $sql = "INSERT INTO wordstats (word, word_count, doc_count, word_freq, doc_freq, doc_freq_std) VALUES\n";
- foreach ($overall_frequency AS $key => $value) {
- $key = mysql_escape_string($key);
- if($counter != 0) {
- $sql .= ', ';
- }
- $word_freq = $value['word_count']/$word_count;
- $doc_freq = $value['doc_freq_sum']/$doc_count;
- $doc_freq_std = sqrt($value['doc_freq_squared_sum']/$doc_count - pow($doc_freq,2));
- $sql .= "('$key', $value[word_count], $value[doc_count], $word_freq, $doc_freq, $doc_freq_std)";
- if(++$counter == 1000) {
- $sql .= " ON DUPLICATE KEY UPDATE word_count=word_count+VALUES(word_count),
- doc_count=doc_count+VALUES(doc_count),
- word_freq=(word_count+VALUES(word_count))/$word_count,
- doc_freq=(doc_freq+VALUES(doc_freq)),
- doc_freq_std=(doc_freq_std+VALUES(doc_freq_std))/2;";
- // the last two lines in ON DUPLICATE KEY UPDATE are not correct but an approximation!
- if (!mysql_query($sql)) {
- die('Could not query, line ' . __LINE__ . ': ' . mysql_error());
- }
- $sql = "INSERT INTO wordstats (word, word_count, doc_count, word_freq, doc_freq, doc_freq_std) VALUES\n";
- $counter = 0;
- }
- }
- $sql .= " ON DUPLICATE KEY UPDATE word_count=word_count+VALUES(word_count),
- doc_count=doc_count+VALUES(doc_count),
- word_freq=(word_count+VALUES(word_count))/$word_count,
- doc_freq=(doc_freq+VALUES(doc_freq)),
- doc_freq_std=(doc_freq_std+VALUES(doc_freq_std))/2;";
- if (!mysql_query($sql)) {
- echo $sql . "\n";
- die('Could not query, line ' . __LINE__ . ': ' . mysql_error());
- }
-
- $end = time();
-
- $time = $end - $start;
-
- mysql_query('
- CREATE TABLE IF NOT EXISTS `docstats` (
- `word_count` bigint(20) unsigned NOT NULL,
- `doc_count` bigint(20) unsigned NOT NULL
- )
- ');
- mysql_query('TRUNCATE TABLE `docstats`;');
-
- mysql_query('INSERT INTO `docstats` (doc_count,word_count) VALUES ('.$doc_count.','.$word_count.');');
-
- print 'Total documents: '. $doc_count. '<br />';
- print 'Total words: '. $word_count. '<br />';
- print 'Total time: '. $time .' secs. ('. $doc_count/$time .' documents per sec. )';
-
View
178 keyword-import/build_related_words.php
@@ -1,178 +0,0 @@
-<?php
-
- require_once('lib_calc_score.php');
-
- // settings
- $property = 'diff_outer_doc_freq';
- $maximum_subject_add_count = 1; // how many subjects to be added at a time
-
- //touch('keyword_list_non_candidates.txt');
- $error = false;
-
- // Get selected subjects (subjects_selected.txt)
- /*if ($lines = file('subjects_selected.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES)) {
- foreach($lines as $line) {
- list($tid, $name) = explode('|', $line);
- $subjects[$tid] = mb_strtolower($name);
- }
- $c = count($lines);
- echo "Found $c subjects in subjects_selected.txt.\n";
-
- // check if the wanted subjects exists in the database
- foreach ($subjects as $tid => $name) {
- echo "$tid: $name\n";
- $query = "SELECT name FROM term_data WHERE vid = 16 AND tid = $tid";
- $result = mysql_query($query) or die(mysql_error());
- if ($row = mysql_fetch_object($result)) {
- if (mb_strtolower($row->name) == $name) {
- continue;
- }
- else {
- echo " => TID $tid is '$row->name' not '$name'.\n";
- unset($subjects[$tid]);
- $error = true;
- }
- }
- else {
- echo " => TID $tid not found.\n";
- unset($subjects[$tid]);
- $error = true;
- }
- $query = "SELECT tid FROM term_data WHERE vid = 16 AND name = '".mysql_real_escape_string($name)."'";
- $result = mysql_query($query) or die(mysql_error());
- if ($row = mysql_fetch_object($result)) {
- echo " => '$name' has TID $row->tid\n";
- $error = true;
- }
- else {
- echo " => No subject '$name' found i database.\n";
- $error = true;
- }
- }
- }
- else { */
- $query = "SELECT tid, name FROM term_data WHERE vid = 16";
- $result = mysql_query($query) or die(mysql_error());
-
- while ($row = mysql_fetch_object($result)) {
- $subjects[$row->tid] = $row->name;
- }
- //}
-
- if ($error) {
- die("Errors in subjects_selected.txt found. Exiting\n");
- }
-
- // filter subjects that:
- // * have too few articles (subject_non_candidates.txt)
- // * are already in the database (subjects_in_db.txt)
- // * or simply don't wanna have (add them yourself to subjects_non_candidates.txt)
- touch("subjects_non_candidates.txt");
- touch("subjects_in_db.$property.txt");
- $lines1 = file("subjects_non_candidates.txt", FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
- $lines2 = file("subjects_in_db.$property.txt", FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
- $lines = array_merge($lines1, $lines2);
- foreach($lines as $line) {
- list($tid, $name) = explode('|', $line);
- unset($subjects[$tid]);
- }
-
- //$subjects = array_slice($subjects, 0, $maximum_subject_add_count, true);
-
- $property_esc = mysql_real_escape_string($property);
-
- $subject_count = count($subjects);
- $new_subjects = 0;
-
- $start = time();
- echo "Trying to add $subject_count new subjects to the database...\n\n";
-
- foreach ($subjects as $tid => $name) {
- echo "Adding $name...\n";
- $name_esc = mysql_real_escape_string($name);
-
- // check if the subject already is in the database
- $query = "SELECT tid, word FROM `word_relations_$property_esc` WHERE tid = $tid";
- $result = mysql_query($query);
- if($result && mysql_num_rows($result) > 0) {
- echo "$name is already in the database. Skipping\n";
- $file = fopen("subjects_in_db.$property.txt", 'a');
- fwrite($file, $tid . '|' . $name . "\n");
- continue;
- }
-
- // Get and score the words related to this subject
- $result = subject_name_related_words($name, $property, 'underrubrik', false);
-
- $hits = $result['doc_count'];
- $freq_array = &$result['freq_array'];
-
- // too few articles found
- echo "$hits articles.";
- if ($hits < 5) {
- echo " Too few. Skipping.\n";
- $file = fopen('subjects_non_candidates.txt', 'a');
- fwrite($file, $tid . '|' . $name . '|' . $hits . "\n");
- continue;
- }
- else {
- echo '\n';
- }
-
- $file = fopen("subjects_in_db.$property.txt", 'a');
- fwrite($file, "$tid|$name|$hits\n");
-
- echo "Number of words possibly related to $name: " . count($freq_array) . "\n";
-
- // Create the table if it doesn't exist
- mysql_query("
- CREATE TABLE IF NOT EXISTS `word_relations_$property_esc` (
- `word` varchar(255) NOT NULL,
- `tid` varchar(255) NOT NULL,
- `score` decimal(30,20) unsigned NOT NULL,
- `pass` bigint(20) unsigned NOT NULL,
- KEY (`word`),
- KEY (`tid`)
- ) DEFAULT CHARSET=utf8;
- ") or die(mysql_error());
-
- $freq_array = array_filter($freq_array, function($v) use ($property) { return $v[$property] > 0.2; });
-
- $words_to_be_added = 0;
- foreach($freq_array as $value) {
- if ($words_to_be_added == 0) {
- $query = "INSERT INTO `word_relations_$property_esc` (word, tid, score, pass) VALUES\n";
- }
- else {
- $query .= ', ';
- }
- $query .= '(\''.mysql_real_escape_string($value['word']).'\','.$tid.','.$value[$property].',1)';
- $words_to_be_added++;
-
- if ($words_to_be_added == 1000) {
- if (!mysql_query($query)) {
- echo 'Could not query, line ' . __LINE__ . ': ' . mysql_error() . "<br>\n";
- echo $query . "\n";
- die();
- }
- $words_to_be_added = 0;
- }
- }
-
- if ($words_to_be_added != 0 && !mysql_query($query)) {
- echo 'Could not query, line ' . __LINE__ . ': ' . mysql_error() . "<br>\n";
- echo $query . "\n";
- die();
- }
-
- $new_subjects++;
- }
-
- $end = time();
-
- $time = $end - $start;
-
- echo "Added $new_subjects new subjects to the database.\n";
- echo 'Total time: '. $time .' secs. ('. @($time/$new_subjects) .' seconds per keywords)' . "\n";
- echo 'Total keywords in database: '. count(file("subjects_in_db.$property.txt", FILE_SKIP_EMPTY_LINES)) . "\n";
-
View
3  keyword-import/inf_generate_json.php
@@ -1,3 +0,0 @@
-<?php
- require('lib_information.php');
- inf_generate_json();
View
16 keyword-import/json_create_keywords.php
@@ -1,16 +0,0 @@
-<?php
-/* This file creates the relations between subjects and words in
- the database */
-
-if (count($argv) > 2) {
- file_put_contents('php://stderr', 'Error: Zero or one command-line argument should be given.');
- return 1;
-}
-
-require_once('lib_keyword.php');
-if (count($argv) == 2) {
- multiple_keywords_create_from_json($argv[1]);
-}
-else {
- multiple_keywords_create_from_json();
-}
View
251 keyword-import/lib_information.php
@@ -1,251 +0,0 @@
-<?php
-
- require_once('../Tagger.php');
- require_once __ROOT__ . 'db/TaggerQueryManager.class.php';
- $tagger = Tagger::getTagger();
-
-
- require_once('lib_keyword.php');
-
-
- function inf_generate_json() {
- $texts = inf_get_multiple_keywords_texts(3);
- $json = json_encode($texts);
- file_put_contents('keyword_texts.json', $json);
- }
-
- function inf_create_keywords() {
- multiple_keywords_create(inf_get_multiple_keywords_texts(3), $check = true);
- }
-
- function inf_get_multiple_keywords_texts($keyword_count = 1) {
- // $keyword_count: how many keywords to be added at a time
-
- global $tagger;
- $keyword_conf = $tagger->getConfiguration('keyword');
- $property = $keyword_conf['property'];
-
- $db_conf = $tagger->getConfiguration('db');
- $word_relations_table = $db_conf['word_relations_table'];
-
-
- touch('keywords_selected.txt');
- $error = false;
-
- // Get selected keywords (keywords_selected.txt)
- if ($lines = file('keywords_selected.txt', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES)) {
- foreach($lines as $line) {
- list($tid, $name) = explode('|', $line);
- $keywords[$tid] = mb_strtolower($name);
- }
- $c = count($lines);
- echo "Found $c keywords in keywords_selected.txt.\n";
-
- // check if the wanted keywords exists in the database
- foreach ($keywords as $tid => $name) {
- echo "$tid: $name\n";
- $query = "SELECT name FROM term_data WHERE vid = 16 AND tid = $tid";
- $result = TaggerQueryManager::query($query);
- if ($row = TaggerQueryManager::fetch($result)) {
- if (mb_strtolower($row['name']) == $name) {
- continue;
- }
- else {
- echo " => TID $tid is '".$row['name']."' not '$name'.\n";
- unset($keywords[$tid]);
- $error = true;
- }
- }
- else {
- echo " => TID $tid not found.\n";
- unset($keywords[$tid]);
- $error = true;
- }
- $query = "SELECT tid FROM term_data WHERE vid = 16 AND name = '".mysql_real_escape_string($name)."'";
- $result = TaggerQueryManager::query($query);
- if ($row = TaggerQueryManager::fetch($result)) {
- echo " => '$name' has TID ".$row['tid']."\n";
- $error = true;
- }
- else {
- echo " => No keyword '$name' found i database.\n";
- $error = true;
- }
- }
- }
- else {
- $query = "SELECT tid, name FROM term_data WHERE vid = 16";
- $result = TaggerQueryManager::query($query);
-
- while ($row = TaggerQueryManager::fetch($result)) {
- $keywords[$row['tid']] = $row['name'];
- }
- }
-
- if ($error) {
- die("Errors in keywords_selected.txt found. Exiting\n");
- }
-
- // filter keywords that:
- // * have too few articles (keyword_non_candidates.txt)
- // * are already in the database (keywords_in_db.txt)
- // * or simply don't wanna have (add them yourself to keywords_non_candidates.txt)
- touch("keywords_non_candidates.txt");
- touch("keywords_in_db.$property.txt");
- $lines1 = file("keywords_non_candidates.txt", FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
- $lines2 = file("keywords_in_db.$property.txt", FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
- $lines = array_merge($lines1, $lines2);
- foreach($lines as $line) {
- list($tid, $name) = explode('|', $line);
- unset($keywords[$tid]);
- }
-
- $keywords = array_slice($keywords, 0, $keyword_count, true);
-
- $keyword_count = count($keywords);
- $new_keywords = 0;
-
- $start = time();
-
- $tids_n_texts = array();
- foreach ($keywords as $tid => $name) {
- echo "Finding articles related to $name...\n";
-
- // Get texts related to this keyword
- $tids_n_texts[$tid] = inf_get_keyword_texts($name, $property, 'underrubrik');
- $hits = count($tids_n_texts[$tid]);
-
- // too few articles found
- echo "$hits articles.";
- if ($hits < 5) {
- unset($tids_n_texts[$tid]);
- echo " Too few. Skipping.\n\n";
- $file = fopen('keywords_non_candidates.txt', 'a');
- fwrite($file, $tid . '|' . $name . '|' . $hits . "\n");
- continue;
- }
- else {
- echo "\n\n";
- }
- }
- return $tids_n_texts;
- }
-
- function inf_get_keyword_texts($name, $property = 'all', $range = 'overskrift', $normalize = false) {
- global $total_doc_count;
-
- $timer = new Timer();
-
- $name_esc = mysql_real_escape_string($name);
-
- $articles_sql = "SELECT n.nid, nr.vid, nr.title, nr.body, cfu.field_underrubrik_value FROM `node` AS n
- JOIN `node_revisions` AS nr ON n.vid = nr.vid
- LEFT JOIN content_field_underrubrik AS cfu ON cfu.vid = n.vid
- WHERE n.type IN ('avisartikel', 'ritzau_telegram')";
- if ($range == 'underrubrik') {
- // earlier search - gave problems with the keyword 'dans' that gave articles with
- // 'dansk', 'danske' osv.
- //$articles_sql .= " AND (n.title REGEXP '[[:<:]]$name_esc'
- // OR cfu.field_underrubrik_value REGEXP '[[:<:]]$name_esc')";
-
- $articles_sql .= " AND (n.title REGEXP '[[:<:]]".$name_esc."[[:>:]]'
- OR cfu.field_underrubrik_value REGEXP '[[:<:]]".$name_esc."[[:>:]]')";
- }
- elseif ($range == 'fulltext') {
- $articles_sql .= " AND (nr.title LIKE '%$name_esc%'
- OR cfu.field_underrubrik_value REGEXP '%$name_esc%'
- OR nr.body LIKE '%$name_esc%')";
- }
- elseif ($range == 'tagged') {
- $sql = "SELECT tid FROM term_data WHERE name = '$name_esc' AND vid = 16";
- $result = mysql_query($sql) or die(mysql_error());
- $row = mysql_fetch_object($result);
- $tid = $row['tid'];
-
- $articles_sql = "SELECT n.nid, nr.vid, nr.title, nr.body, cfu.field_underrubrik_value FROM `node` AS n
- JOIN `node_revisions` AS nr ON n.vid = nr.vid
- JOIN content_field_underrubrik AS cfu ON cfu.vid = n.vid
- JOIN term_node AS tn ON tn.vid = n.vid
- WHERE n.type IN ('avisartikel', 'ritzau_telegram')
- AND tn.tid = $tid AND n.nid > 230908";
- }
- else {
- // search only in headlines
-
- // the problem with the LIKE query below is that 'astronomi' matches 'gastronomi'
- $articles_sql .= " AND n.title LIKE '%" . $name_esc . "%'";
-
- // that doesn't happen in this REGEXP query
- //$articles_sql .= " AND n.title REGEXP '[[:<:]]" . $name_esc . "'";
- //$articles_sql .= " AND n.title REGEXP '[[:<:]]" . $name_esc . "[[:>:]]'";
- }
- //echo $articles_sql;
-
- $timer->start();
- $articles_result = TaggerQueryManager::query($articles_sql);
- $timer->stop();
-
- echo "Query took " . $timer->secsElapsed() . " seconds.\n";
-
-
- $doc_count = 0;
- $word_count = 0;
- $doc_ids = array();
- $freq_array = array();
-
- $texts = array();
- while ($articles_row = TaggerQueryManager::fetch($articles_result)) {
- $doc_ids[] = $articles_row['nid'];
- $texts[] = strip_tags($articles_row['title'].' '.$articles_row['field_underrubrik_value']. ' '.$articles_row['body']);
- $doc_count++;
- }
-
- //$result = array();
- //$result['doc_ids'] = $doc_ids;
- //$result['doc_count'] = $doc_count;
-
- return $texts;
- }
-
- // Calculate word scores in article
- function score_article($id) {
- global $text;
-
- $query = '
- SELECT nr.title, nr.body, cfu.field_underrubrik_value
- FROM node AS n
- JOIN node_revisions AS nr ON nr.vid = n.vid
- JOIN content_field_underrubrik AS cfu ON cfu.vid = n.vid
- WHERE n.type = "avisartikel" AND n.nid = '. $id .'
- LIMIT 0, 1';
-
- $result = TaggerQueryManager::query($query);
- if($row = TaggerQueryManager::fetch($result)){
- print "No article with id=" . $id; exit;
- }
-
-
- $text = strip_tags($row['title'].' '.$row['field_underrubrik_value']. ' '.$row['body']);
-
- return score_text($text);
- }
-
- function get_all_texts() {
-
- $query = '
- SELECT nr.title, nr.body, cfu.field_underrubrik_value
- FROM node AS n
- JOIN node_revisions AS nr ON nr.vid = n.vid
- JOIN content_field_underrubrik AS cfu ON cfu.vid = n.vid
- WHERE n.type = "avisartikel"
- ORDER BY created DESC
- LIMIT 0, 10000;';
-
- $result = mysql_query($query);
-
- while ($row = mysql_fetch_object($result)) {
- $texts[] = count_words(strip_tags($row->title.' '.$row->field_underrubrik_value. ' '.$row->body));
- }
-
- return $texts;
- }
View
578 keyword-import/lib_keyword.php
@@ -1,578 +0,0 @@
-<?php
-
- require_once('../Tagger.php');
- require_once __ROOT__ . 'db/TaggerQueryManager.class.php';
-
- $tagger = Tagger::getTagger();
-
- $db_conf =$tagger->getConfiguration('db');
- $docstats_table = $db_conf['docstats_table'];
- $wordstats_table = $db_conf['wordstats_table'];
- $word_relations_table = $db_conf['word_relations_table'];
-
- $lookup_table = $db_conf['lookup_table'];
-
- $keyword_conf = $tagger->getConfiguration('keyword');
- $property = $keyword_conf['property'];
- $normalize = $keyword_conf['normalize'];
-
-
- // Get total number of documents and words
- $query = "SELECT * FROM $docstats_table LIMIT 0, 1";
- $result = TaggerQueryManager::query($query);
- $row = TaggerQueryManager::fetch($result);
- $total_doc_count = $row['doc_count'];
- $total_word_count = $row['word_count'];
-
- function multiple_keywords_create_from_json($filename = 'keyword_texts.json') {
- if (!is_file($filename)) {
- file_put_contents('php://stderr', "Error: No file named '$filename'.");
- return false;
- }
- $file_contents = file_get_contents($filename);
-
- $json = json_decode($file_contents, true);
-
- if ($json === NULL) {
- $err = json_errcode_to_text(json_last_error());
- file_put_contents('php://stderr', "Error: JSON $err.");
- return false;
- }
-
- multiple_keywords_create($json);
- }
-
- function multiple_keywords_create($tids_n_texts, $check = true) {
- global $tagger, $property, $docstats_table, $wordstats_table, $word_relations_table;
-
- $tids = array_keys($tids_n_texts);
-
- if ($check) {
- $error = false;
-
- foreach ($tids as $tid) {
-
- // Get keyword corresponding to $tid
- if ($name = tid_to_name($tid)) {
- $keywords[$tid] = $name;
- }
- else {
- echo "$tid: Not found.\n";
- unset($tids_n_texts[$tid]);
- $error = true;
- }
- }
- echo "$tid: $name\n";
-
-
- // filter keywords that:
- // * have too few articles (keyword_non_candidates.txt)
- // * are already in the database (keywords_in_db.txt)
- // * or simply don't wanna have (add them yourself to keywords_non_candidates.txt)
- touch("keywords_non_candidates.txt");
- touch("keywords_in_db.$property.txt");
- $lines1 = file("keywords_non_candidates.txt", FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
- $lines2 = file("keywords_in_db.$property.txt", FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
- $lines = array_merge($lines1, $lines2);
- foreach($lines as $line) {
- list($tid, $name) = explode('|', $line);
- unset($keywords[$tid]);
- }
-
-
- // Create the table if it doesn't exist
- TaggerQueryManager::query("
- CREATE TABLE IF NOT EXISTS `$word_relations_table` (
- `word` varchar(255) NOT NULL,
- `tid` varchar(255) NOT NULL,
- `score` decimal(30,20) unsigned NOT NULL,
- `pass` bigint(20) unsigned NOT NULL,
- KEY (`word`),
- KEY (`tid`)
- ) DEFAULT CHARSET=utf8;
- ");
- }
-
- if ($error) {
- die("Errors in TIDs found. Exiting\n");
- }
-
- //$keywords = array_slice($keywords, 0, $maximum_keyword_add_count, true);
-
- $property_esc = mysql_real_escape_string($property);
-
- $keyword_count = count($keywords);
- $new_keywords = 0;
-
- $start = time();
- echo "Trying to add $keyword_count new keywords to the database...\n\n";
-
- foreach ($keywords as $tid => $name) {
- echo "Adding $name...\n";
-
- $name_esc = mysql_real_escape_string($name);
-
- // Create the related words to this keyword
- $result = keyword_create($tid, $tids_n_texts[$tid], FALSE);
-
- if($result) {
- $new_keywords++;
- }
- }
- }
-
- function keyword_create($tid, $texts, $check = TRUE) {
- global $tagger, $property, $word_relations_table;
-
- if ($check) {
- // Create the table if it doesn't exist
- TaggerQueryManager::query("
- CREATE TABLE IF NOT EXISTS `$word_relations_table` (
- `word` varchar(255) NOT NULL,
- `tid` varchar(255) NOT NULL,
- `score` decimal(30,20) unsigned NOT NULL,
- `pass` bigint(20) unsigned NOT NULL,
- KEY (`word`),
- KEY (`tid`)
- ) DEFAULT CHARSET=utf8;
- ");
- }
-
- // Check if $tid is a keyword in the DB
- $name = tid_to_name($tid);
- echo "$tid: $name\n";
-
- // Check if words related to $tid are already in the DB
- $query = "SELECT tid, word FROM `$word_relations_table` WHERE tid = $tid";
- $result = TaggerQueryManager::query($query);
- if(TaggerQueryManager::fetch($result)) {
- echo "$name is already in the database. Skipping\n";
- $file = fopen("keywords_in_db.$property.txt", 'a');
- fwrite($file, $tid . '|' . $name . "\n");
- return FALSE;
- }
-
- $property_esc = mysql_real_escape_string($property);
-
-
- // Get and score the words related to this keyword
- $result = keyword_find_related_words($tid, $texts);
- $hits = $result['doc_count'];
- $freq_array = &$result['freq_array'];
-
-
- echo "Number of words possibly related to $name: " . count($freq_array) . "\n";
-
- $freq_array = array_filter($freq_array, function($v) use ($property) { return $v[$property] > 0.2; });
-
- $words_to_be_added = 0;
- foreach($freq_array as $value) {
- if ($words_to_be_added == 0) {
- $query = "INSERT INTO `$word_relations_table` (word, tid, score, pass) VALUES\n";
- }
- else {
- $query .= ', ';
- }
- $query .= '(\''.mysql_real_escape_string($value['word']).'\','.$tid.','.$value[$property].',1)';
- $words_to_be_added++;
-
- if ($words_to_be_added == 1000) {
- if (!TaggerQueryManager::query($query)) {
- // ******* TO BE REMOVED *******
- echo 'Could not query, line ' . __LINE__ . ': ' . mysql_error() . "<br>\n";
- echo $query . "\n";
- die();
- }
- $words_to_be_added = 0;
- }
- }
-
- if ($words_to_be_added != 0 && !TaggerQueryManager::query($query)) {
- // ******* TO BE REMOVED *******
- echo 'Could not query, line ' . __LINE__ . ': ' . mysql_error() . "<br>\n";
- echo $query . "\n";
- die();
- }
-
- // Added to DB
- $file = fopen("keywords_in_db.$property.txt", 'a');
- fwrite($file, "$tid|$name|$hits\n");
- }
-
- function keyword_find_related_words($tid, $texts, $prop = FALSE) {
- global $property, $normalize, $total_doc_count;
-
- if ($property === FALSE) {
- $property
- }
-
- $timer = new Timer();
-
- $doc_count = 0;
- $word_count = 0;
- $doc_ids = array();
- $freq_array = array();
-
- $timer->start();
-
- foreach ($texts as $text) {
-
- $text = strip_tags($text);
- $frequency = score_text($text);
-
- foreach ($frequency AS $key => $value) {
- $word_count += $value['word_count'];
- if (isset($freq_array[$key])) {
- $freq_array[$key]['word_count'] += $value['word_count'];
- $freq_array[$key]['doc_count']++;
- }
- else {
- $freq_array[$key]['word_count'] = $value['word_count'];
- $freq_array[$key]['doc_count'] = 1;
-
- $freq_array[$key]['word'] = $value['word'];
- $freq_array[$key]['doc_count_db'] = $value['doc_count'];
- $freq_array[$key]['word_freq_db'] = $value['word_freq_db'];
- $freq_array[$key]['doc_freq_db'] = $value['doc_freq_db'];
- $freq_array[$key]['idf'] = $value['idf'];
- }
- }
- $doc_count++;
- }
-
- if ($property == 'all') {
- $properties = array('diff', 'doc_freq', 'inner_doc_freq', 'outer_doc_freq',
- 'diff_outer_doc_freq', 'diff_outer_doc_freq_log');
- }
- else {
- $properties = array($property);
- }
-
- foreach ($freq_array as $key => &$elem) {
- $temp_elem = $elem;
-
- $temp_elem['diff'] = $temp_elem['word_count']/$word_count - $temp_elem['word_freq_db'];
-
- $temp_elem['doc_freq'] = ($temp_elem['doc_count_db']-1)/(1+$total_doc_count);
-
- // in how many related articles does this word occur? (relative to the number of related articles)
- // i.e. the percentage of related articles where this word occurs
- $temp_elem['inner_doc_freq'] = ($temp_elem['doc_count']-1)/(1+$doc_count);
- // in how many related articles does this word occur?
- // (relative to the total number of articles where this word occurs)
- // i.e. the percentage of articles in which this word occurs that are related to this keyword
- $temp_elem['outer_doc_freq'] = min(($temp_elem['doc_count']-1)/(1+$temp_elem['doc_count_db']), 1);
-
- //$temp_elem['malt_x2'] = pow($temp_elem['doc_count'],2)/(1+$temp_elem['doc_count_db']);
- //$temp_elem['diff_malt_x2'] = $temp_elem['diff'] * $temp_elem['malt_x2'];
- //$temp_elem['diff_malt_x2_log'] = log(10000*$temp_elem['diff_malt_x2'], 2);
- //$temp_elem['diff_malt_x2_sqr'] = sqrt($temp_elem['diff_malt_x2']);
-
- $temp_elem['diff_outer_doc_freq'] = $temp_elem['diff'] * $temp_elem['outer_doc_freq'] * 10000;
- $temp_elem['diff_outer_doc_freq_log'] = log10($temp_elem['diff_outer_doc_freq']+1);
-
- if ($property == 'all') {
- $elem = $temp_elem;
- foreach ($properties as $prop) {
- if (is_nan($elem[$prop])) {
- unset($freq_array[$key]);
- }
- }
- }
- else {
- if (is_nan($temp_elem[$property])) {
- unset($freq_array[$key]);
- }
- else {
- $elem[$property] = $temp_elem[$property];
- }
- }
- }
-
- if ($normalize && $doc_count > 0) {
- foreach ($properties as $prop) {
- // get the word with the highest score
- $val = max(array_map(function($value) use ($prop) { return $value[$prop]; }, $freq_array));
- $val = ($val == 0) ? 1 : $val;
- $factor = 1/$val;
-
- // divide any other score by that (the largest) score
- foreach($freq_array as &$value) {
- $value[$prop] *= $factor;
- }
- }
- }
-
- $timer->stop();
-
- echo "Calculations took " . $timer->secsElapsed() . " seconds.\n";
-
- $result = array();
- $result['doc_count'] = $doc_count;
- $result['freq_array'] = &$freq_array;
-
- return $result;
- }
-
-
-  /**
-   * Rebuild the word statistics tables from a corpus of texts.
-   *
-   * Each text is run through count_words(); the per-word totals are summed in
-   * memory and written to $wordstats_table in batches of 1000 rows. The corpus
-   * totals (documents, words) are written to $docstats_table. Both tables are
-   * created when missing and truncated first. A summary is printed at the
-   * end; a failed INSERT terminates the script.
-   *
-   * @param array $texts
-   *   The texts that make up the corpus.
-   */
-  function build_wordstats_table($texts) {
-    // The table names are not parameters, so they have to come from the global
-    // scope, the same way score_text() reads $wordstats_table.
-    // NOTE(review): assumes $docstats_table is defined globally too - confirm.
-    global $wordstats_table, $docstats_table;
-
-    ini_set('memory_limit', '1024M');
-
-    $start = time();
-
-    // Double quotes are required for the table name to be interpolated; in a
-    // single-quoted string the literal text "$wordstats_table" would be used.
-    // NOTE(review): doc_freq_std is NOT NULL without a default and is never
-    // written below, which only works when MySQL is not in strict mode.
-    TaggerQueryManager::query("
-      CREATE TABLE IF NOT EXISTS `$wordstats_table` (
-        `word` varchar(255) NOT NULL,
-        `word_count` bigint(20) unsigned NOT NULL,
-        `doc_count` bigint(20) unsigned NOT NULL,
-        `word_freq` decimal(30,20) unsigned NOT NULL,
-        `doc_freq` decimal(30,20) unsigned NOT NULL,
-        `doc_freq_std` decimal(30,20) unsigned NOT NULL,
-        PRIMARY KEY (`word`)
-      ) DEFAULT CHARSET=utf8;
-    ");
-
-    TaggerQueryManager::query("TRUNCATE TABLE `$wordstats_table`;");
-
-    $doc_count = 0;
-    $word_count = 0;
-    $overall_frequency = array();
-
-    // Sum the per-document statistics of every word over the whole corpus.
-    foreach ($texts as $text) {
-
-      $frequency = count_words($text);
-
-      foreach ($frequency AS $key => $value){
-        $word_count += $value['word_count'];
-        if(!isset($overall_frequency[$key])) {
-          $overall_frequency[$key]['word_count'] = $value['word_count'];
-          $overall_frequency[$key]['doc_count'] = 1;
-          $overall_frequency[$key]['doc_freq_sum'] = $value['word_freq'];
-        } else {
-          $overall_frequency[$key]['word_count'] += $value['word_count'];
-          $overall_frequency[$key]['doc_count'] += 1;
-          $overall_frequency[$key]['doc_freq_sum'] += $value['word_freq'];
-        }
-      }
-      $doc_count++;
-    }
-
-    // Words that differ in PHP can be equal under the table's collation, so a
-    // batch may hit an existing primary key; those rows are merged.
-    $insert_head = "INSERT INTO $wordstats_table (word, word_count, doc_count, word_freq, doc_freq) VALUES\n";
-    $insert_tail = " ON DUPLICATE KEY UPDATE word_count=word_count+VALUES(word_count),
-                  doc_count=doc_count+VALUES(doc_count),
-                  word_freq=(word_count+VALUES(word_count))/$word_count,
-                  doc_freq=(doc_count+VALUES(doc_count))/$doc_count;";
-
-    $rows = array();
-    foreach ($overall_frequency AS $key => $value) {
-      $key = mysql_real_escape_string($key);
-      $word_freq = $value['word_count']/$word_count;
-      $doc_freq = $value['doc_freq_sum']/$doc_count;
-      $rows[] = "('$key', $value[word_count], $value[doc_count], $word_freq, $doc_freq)";
-
-      // Write in batches of 1000 rows to keep each statement small.
-      if (count($rows) == 1000) {
-        if (!mysql_query($insert_head . implode(', ', $rows) . $insert_tail)) {
-          die('Could not query, line ' . __LINE__ . ': ' . mysql_error());
-        }
-        $rows = array();
-      }
-    }
-
-    // Only send the last batch when it has rows; an INSERT without any value
-    // tuple is a syntax error (empty corpus, or an exact multiple of 1000).
-    if (!empty($rows)) {
-      $sql = $insert_head . implode(', ', $rows) . $insert_tail;
-      if (!mysql_query($sql)) {
-        echo $sql . "\n";
-        die('Could not query, line ' . __LINE__ . ': ' . mysql_error());
-      }
-    }
-
-    $end = time();
-
-    $time = $end - $start;
-
-    mysql_query("
-      CREATE TABLE IF NOT EXISTS `$docstats_table` (
-        `word_count` bigint(20) unsigned NOT NULL,
-        `doc_count` bigint(20) unsigned NOT NULL
-      )
-    ");
-    mysql_query("TRUNCATE TABLE `$docstats_table`;");
-
-    mysql_query("INSERT INTO `$docstats_table` (doc_count,word_count) VALUES ($doc_count,$word_count);");
-
-    // time() has one-second resolution, so a fast run reports 0 seconds.
-    $docs_per_sec = ($time > 0) ? $doc_count/$time : $doc_count;
-
-    print 'Total documents: '. $doc_count. '<br />';
-    print 'Total words: '. $word_count. '<br />';
-    print 'Total time: '. $time .' secs. ('. $docs_per_sec .' documents per sec. )';
-  }
-
-  /**
-   * Calculate word scores for a text.
-   *
-   * The text is tokenized with count_words() and every word is combined with
-   * its row from $wordstats_table. Words without a row get zeroed statistics.
-   *
-   * @param string $text
-   *   The text to score.
-   *
-   * @return array
-   *   Keyed by word. Each entry holds word_count and word_freq (from
-   *   count_words()), word, word_freq_db, doc_count, doc_freq_db and
-   *   doc_freq_std (from the database) and the derived diff, diff_rel, idf,
-   *   tf-idf, std and std_rel scores.
-   */
-  function score_text($text) {
-    global $total_doc_count, $wordstats_table;
-
-    // entry key => wordstats column it is copied from
-    $db_fields = array(
-      'word' => 'word',
-      'word_freq_db' => 'word_freq',
-      'doc_count' => 'doc_count',
-      'doc_freq_db' => 'doc_freq',
-      'doc_freq_std' => 'doc_freq_std',
-    );
-
-    $frequency = count_words($text);
-
-    // (A static lookup cache used to be consulted here, but nothing ever wrote
-    // to it, so every word was looked up anyway.)
-    $imploded_words = implode("','", array_map('mysql_real_escape_string', array_keys($frequency)));
-
-    // Get statistics for the words in the article
-    $result = TaggerQueryManager::query("SELECT * FROM $wordstats_table WHERE word IN ('$imploded_words')");
-
-    $unmatched_database = array();
-    $unmatched_words = $frequency;
-    while ($row = TaggerQueryManager::fetch($result)) {
-      $key = mb_strtolower($row['word']);
-
-      if (array_key_exists($key, $frequency)) {
-        unset($unmatched_words[$key]);
-        foreach ($db_fields as $field => $column) {
-          $frequency[$key][$field] = $row[$column];
-        }
-      } else {
-        $unmatched_database[] = $key;
-      }
-    }
-
-    // In the database 'ideen' == 'idéen' but we can't do that in PHP
-    // without some cumbersome custom made conversion functions.
-    // So instead we loop through the nonmatches and let the database decide
-    // whether a returned word equals one of the still unmatched words.
-    foreach ($unmatched_database as $db_word) {
-      $db_word_esc = mysql_real_escape_string($db_word);
-      foreach ($unmatched_words as $word => $value) {
-        $query = "SELECT * FROM $wordstats_table WHERE word = '".$db_word_esc."' AND '".$db_word_esc."' = '".mysql_real_escape_string($word)."'";
-        $result = TaggerQueryManager::query($query);
-        if ($row = TaggerQueryManager::fetch($result)) {
-          unset($unmatched_words[$word]);
-          foreach ($db_fields as $field => $column) {
-            $frequency[$word][$field] = $row[$column];
-          }
-        }
-      }
-    }
-
-    // Entries are copied out and written back by key, so that no variable is
-    // left bound by reference to an element of the returned array.
-    foreach (array_keys($frequency) as $key) {
-      $elem = $frequency[$key];
-
-      // Words unknown to the database count as never seen.
-      if (!isset($elem['word_freq_db'])) {
-        $elem['word'] = $key;
-        $elem['word_freq_db'] = 0;
-        $elem['doc_count'] = 0;
-        $elem['doc_freq_db'] = 0;
-        $elem['doc_freq_std'] = 0;
-      }
-
-      // Frequency in this text compared to the frequency in the database;
-      // -1 marks a relative value that cannot be computed.
-      $elem['diff'] = $elem['word_freq'] - $elem['word_freq_db'];
-      $elem['diff_rel'] = ($elem['word_freq_db'] == 0) ? -1 : $elem['diff']/$elem['word_freq_db'];
-      // The 1+ keeps the division defined for words found in no document.
-      $elem['idf'] = log($total_doc_count/(1+$elem['doc_count']));
-      $elem['tf-idf'] = $elem['word_count'] * $elem['idf'];
-      $elem['std'] = $elem['word_freq'] - $elem['doc_freq_db'];
-      $elem['std_rel'] = ($elem['doc_freq_std'] == 0) ? -1 : $elem['std'] / $elem['doc_freq_std'];
-
-      $frequency[$key] = $elem;
-    }
-
-    return $frequency;
-  }
-
- require_once __ROOT__ . 'classes/Token.class.php';
- require_once __ROOT__ . 'classes/Tokenizer.class.php';
-
-  /**
-   * Get word frequencies for a text.
-   *
-   * The text is lowercased, trimmed and split with Tokenizer::split_words();
-   * tokens listed in Token::$stopwords are dropped.
-   *
-   * @param string $text
-   *   The text to analyse.
-   *
-   * @return array
-   *   Keyed by token: array('word_count' => occurrences, 'word_freq' =>
-   *   occurrences divided by the number of non-stopword tokens). Only tokens
-   *   that begin with a word character are kept.
-   */
-  function count_words($text) {
-    $tokens = array_diff(
-      Tokenizer::split_words(trim(mb_strtolower($text))),
-      Token::$stopwords
-    );
-
-    // The denominator counts every non-stopword token, including those that
-    // are filtered out of the result below.
-    $token_total = count($tokens);
-
-    mb_regex_encoding("UTF-8");
-
-    $stats = array();
-    foreach (array_count_values($tokens) as $token => $occurrences) {
-      // mb_ereg_match() only matches at the start of the string
-      if (mb_ereg_match('\w', $token)) {
-        $stats[$token] = array('word_count' => $occurrences, 'word_freq' => $occurrences/$token_total);
-      }
-    }
-
-    return $stats;
-  }
-
-  /**
-   * Look up the canonical name of a term in $lookup_table (vid 16).
-   *
-   * @param int $tid
-   *   The term id.
-   *
-   * @return string|false
-   *   The name, or FALSE when no canonical row exists for the id.
-   */
-  function tid_to_name($tid) {
-    global $lookup_table;
-
-    // The id is placed unquoted in the SQL, so force it to an integer first.
-    $tid = (int) $tid;
-
-    $query = "SELECT name FROM $lookup_table WHERE vid = 16 AND tid = $tid AND canonical = 1";
-    $result = TaggerQueryManager::query($query);
-    $row = TaggerQueryManager::fetch($result);
-
-    return $row ? $row['name'] : FALSE;
-  }
-
-  /**
-   * Translate a json_last_error() code into a readable message.
-   *
-   * @param int $errcode
-   *   One of the JSON_ERROR_* constants.
-   *
-   * @return string
-   *   The message, prefixed with ' - '; ' - Unknown error' for any other code.
-   */
-  function json_errcode_to_text($errcode) {
-    // Loose comparisons in a fixed order, exactly like a switch statement.
-    if ($errcode == JSON_ERROR_NONE) {
-      return ' - No errors';
-    }
-    if ($errcode == JSON_ERROR_DEPTH) {
-      return ' - Maximum stack depth exceeded';
-    }
-    if ($errcode == JSON_ERROR_STATE_MISMATCH) {
-      return ' - Underflow or the modes mismatch';
-    }
-    if ($errcode == JSON_ERROR_CTRL_CHAR) {
-      return ' - Unexpected control character found';
-    }
-    if ($errcode == JSON_ERROR_SYNTAX) {
-      return ' - Syntax error, malformed JSON';
-    }
-    if ($errcode == JSON_ERROR_UTF8) {
-      return ' - Malformed UTF-8 characters, possibly incorrectly encoded';
-    }
-
-    return ' - Unknown error';
-  }
-
-  /**
-   * Minimal stopwatch based on microtime(true).
-   */
-  class Timer {
-    // Timestamp of the last start() call.
-    private $starttime;
-    // Timestamp of the stop() call that ended the last run.
-    private $endtime;
-    // Whether start() has been called without a matching stop().
-    private $running;
-
-    /**
-     * Start (or restart) the measurement.
-     */
-    function start() {
-      $this->running = true;
-      $this->starttime = microtime(true);
-    }
-
-    /**
-     * Stop the measurement; does not move the end time when already stopped.
-     */
-    function stop() {
-      $was_running = $this->running;
-      $this->running = false;
-      if ($was_running) {
-        $this->endtime = microtime(true);
-      }
-    }
-
-    /**
-     * Seconds since start(): up to now while running, up to stop() otherwise.
-     */
-    function secsElapsed() {
-      $until = $this->running ? microtime(true) : $this->endtime;
-      return $until - $this->starttime;
-    }
-  }
View
83 keyword-import/single_article_find_keywords.php
@@ -1,83 +0,0 @@
-<?php
-
-
-/**
- * Turn scored words into HTML table cell contents, best score first.
- *
- * @param array $freq_array
- *   Entries with at least a 'word' key and the $property key.
- * @param string $property
- *   The score to sort by and to print.
- *
- * @return array
- *   A list of HTML snippets: the word, a line break and the score in a
- *   "rating" span.
- */
-function make_text_array($freq_array, $property) {
-  // usort() expects a negative/zero/positive integer. A boolean "less than"
-  // result cannot tell "greater" from "equal" and gives an unreliable order.
-  usort($freq_array, function($a, $b) use ($property) {
-    if ($a[$property] == $b[$property]) {
-      return 0;
-    }
-    return ($a[$property] < $b[$property]) ? 1 : -1;
-  });
-
-  // The word comes from user supplied text, so it is escaped for HTML.
-  return array_map(function($a) use ($property) {
-    return htmlspecialchars((string) $a['word'], ENT_QUOTES, 'UTF-8').'<br /><span class="rating">('.$a[$property].')</span>';
-  }, $freq_array);
-}
-
-  require_once('lib_calc_score.php');
-
-  // Always defined, because the template below prints it.
-  $text = '';
-
-  // A posted text is scored directly; without one an article id is required.
-  // The article id used to be demanded even when a text had been posted, and
-  // the scores of a posted text were then replaced by those of the article.
-  if (isset($_POST['text'])) {
-    $text = $_POST['text'];
-    $frequency = score_text($text);
-  }
-  elseif (isset($_GET['artid'])) {
-    $frequency = score_article($_GET['artid']);
-  }
-  else {
-    print "No article id!"; exit;
-  }
-
-  // One table column per scoring method, each sorted by that score.
-  $tds[0] = make_text_array($frequency, 'diff');
-  $tds[1] = make_text_array($frequency, 'diff_rel');
-  $tds[2] = make_text_array($frequency, 'idf');
-  $tds[3] = make_text_array($frequency, 'tf-idf');
-  $tds[4] = make_text_array($frequency, 'std_rel');
-  $tds[5] = make_text_array($frequency, 'std');
-
-?>
-<!DOCTYPE html>
-<html lang="en">
-<head>
- <meta charset="utf-8" />
- <title>Tagger-library manual test</title>
- <style>
- .rating { color: gray; }
- </style>
- <!--<link rel="stylesheet" type="text/css" href="reset.css" />-->
- <!--[if IE]>
- <script src="http://html5shiv.googlecode.com/svn/trunk/html5.js"></script>
- <![endif]-->
-</head>
-
-<body>
- <form action="" method="POST">
- <textarea name="text">
-    <?php /* posted text is untrusted: escape it, and tolerate it being unset */ echo isset($text) ? htmlspecialchars($text, ENT_QUOTES, 'UTF-8') : ''; ?>
- </textarea>
- </form>
-
-
- <table>
- <thead>
- <tr>
- <th>Johs</th>
- <th>Johs<br>relativ</th>
- <th>idf</th>
- <th>tf-idf</th>
- <th>Standardafvigelse<br>relativ</th>
- <th>Standardafvigelse<br>absolut</th>
- </tr>
- </thead>
- <tbody>
-<?php
-  // Row N of the table holds the N-th entry of every column in $tds.
-  $rc = count($tds[0]);
-  $cc = count($tds);
-  $row = 0;
-  while ($row < $rc) {
-    echo '<tr>';
-    $column = 0;
-    while ($column < $cc) {
-      echo '<td>'. $tds[$column][$row] .'</td>';
-      $column++;
-    }
-    echo '</tr>';
-    $row++;
-  }
-?>
- </tbody>
- </table>
-
- <br /><br />------------------<br /><br />
-
-  <?php /* posted text is untrusted: escape it, and tolerate it being unset */ print isset($text) ? htmlspecialchars($text, ENT_QUOTES, 'UTF-8') : ''; ?>
-
-</body>
-</html>
View
198 keyword-import/single_article_find_subject.php
@@ -1,198 +0,0 @@
-<?php
-
-// Name of the score used on this page; it is also the suffix of the
-// word_relations_* table that is queried further down.
-$property = 'diff_outer_doc_freq';
-// NOTE(review): this file has not included anything that opens a database
-// connection at this point (lib_calc_score.php is only required further
-// down) - confirm mysql_real_escape_string() has a connection to use here.
-$property_esc = mysql_real_escape_string($property);
-
-/**
- * Turn scored entries into HTML table cell contents, best score first.
- *
- * @param array $freq_array
- *   Entries with at least a 'word' key and the $property key.
- * @param string $property
- *   The score to sort by and to print.
- * @param int $word_count
- *   Divisor applied to the printed score (not to the sort order).
- *
- * @return array
- *   A list of HTML snippets: the word, a line break and the score in a
- *   "rating" span.
- */
-function make_text_array($freq_array, $property, $word_count = 1) {
-  // usort() expects a negative/zero/positive integer. A boolean "less than"
-  // result cannot tell "greater" from "equal" and gives an unreliable order.
-  usort($freq_array, function($a, $b) use ($property) {
-    if ($a[$property] == $b[$property]) {
-      return 0;
-    }
-    return ($a[$property] < $b[$property]) ? 1 : -1;
-  });
-
-  // The word is escaped because it ends up in the HTML output.
-  return array_map(function($a) use ($property, $word_count) {
-    return htmlspecialchars((string) $a['word'], ENT_QUOTES, 'UTF-8').'<br /><span class="rating">('.($a[$property]/$word_count).')</span>';
-  }, $freq_array);
-}
-/*
- if(!isset($_GET['artid'])){
- print "No article id!"; exit;
- }
- */
-  // Posted text; stays empty on a plain page load. Always defined because
-  // the template below prints it.
-  $text = '';
-
-  // Number of scored words; the printed subject ratings are divided by it.
-  $word_count = 1;
-  if(isset($_POST['text'])){
-    $text = $_POST['text'];
-
-    require_once('lib_calc_score.php');
-    $frequency = score_text($text);
-    $tds[] = make_text_array($frequency, 'diff');
-    $tds[] = make_text_array($frequency, 'tf-idf');
-
-    $word_count = count($frequency);
-
-    $implode_words = implode('\',\'', array_map('mysql_real_escape_string', array_keys($frequency)));
-
-    // Fetch every word/subject relation for the words of the text.
-    $query = 'SELECT * FROM word_relations_'.$property_esc.' WHERE word IN(\''.$implode_words.'\')';
-    $result = mysql_query($query);
-    if(!$result){
-      echo $query;
-      print "No words"; exit;
-    }
-
-    // tid => array('rating' => summed score, 'words' => contributing words)
-    $subjects = array();
-
-    $unmatched_database = array();
-    $unmatched_words = $frequency;
-    while ($row = mysql_fetch_object($result)) {
-      $key = mb_strtolower($row->word);
-      if(array_key_exists($key, $frequency)) {
-        unset($unmatched_words[$key]);
-
-        if(!isset($subjects[$row->tid]['rating'])) { $subjects[$row->tid]['rating'] = 0; }
-        if(!isset($subjects[$row->tid]['words'])) { $subjects[$row->tid]['words'] = array(); }
-        $subjects[$row->tid]['rating'] += $row->score;
-        $subjects[$row->tid]['words'][] = array('word' => $row->word, 'rating' => $row->score);
-
-      } else {
-        $unmatched_database[] = $key;
-      }
-    }
-
-    // In the database 'ideen' == 'idéen' but we can't do that in PHP
-    // without some cumbersome custom made conversion functions.
-    // So instead we loop through the nonmatches and let the database match the
-    // individual words that could not be matched by PHP.
-    // A word is listed once per relation row above; one attempt per distinct
-    // word is enough, because a repeat can only find what the first try found.
-    foreach (array_unique($unmatched_database) as $db_word) {
-      foreach ($unmatched_words as $word => $value) {
-        if(isset($unmatched_words[$word])) {
-          $query = 'SELECT * FROM word_relations_'.$property_esc.' WHERE word = "'.mysql_real_escape_string($db_word).'" AND "'.mysql_real_escape_string($db_word).'" = "'.mysql_real_escape_string($word).'"';
-          $result = mysql_query($query);
-          if($result === FALSE) {
-            die('Could not query, line ' . __LINE__ . ': ' . mysql_error());
-          }
-          while($row = mysql_fetch_object($result)) {
-            unset($unmatched_words[$word]);
-            if(!isset($subjects[$row->tid]['rating'])) { $subjects[$row->tid]['rating'] = 0; }
-            if(!isset($subjects[$row->tid]['words'])) { $subjects[$row->tid]['words'] = array(); }
-            $subjects[$row->tid]['rating'] += $row->score;
-            $subjects[$row->tid]['words'][] = array('word' => $row->word, 'rating' => $row->score);
-          }
-        }
-      }
-    }
-
-    // Relations stored under tid 0 are not shown.
-    if(isset($subjects[0])) { unset($subjects[0]); }
-
-    // Attach the subject name to every subject. The entries are addressed by
-    // key instead of through a by-reference loop variable: a reference that
-    // stays bound after the loop is overwritten by any later
-    // "foreach ($subjects as $value)", which corrupts the last subject.
-    foreach (array_keys($subjects) as $tid) {
-      $query = 'SELECT tid, name FROM term_data WHERE tid = '.(int) $tid;
-      $result = mysql_query($query);
-      $row = $result ? mysql_fetch_object($result) : FALSE;
-      // A term without a term_data row is shown with an empty name.
-      $subjects[$tid]['word'] = $row ? $row->name : '';
-      $td_keyword_ratings[] = make_text_array($subjects[$tid]['words'], 'rating');
-    }
-    $td_keywords[] = make_text_array($subjects, 'rating', $word_count);
-  }
-
-?>
-<!DOCTYPE html>
-<html lang="en">
-<head>
- <meta charset="utf-8" />
- <title>Tagger-library manual test</title>
- <style>
- .rating { color: gray; }
- </style>
- <!--<link rel="stylesheet" type="text/css" href="reset.css" />-->
- <!--[if IE]>
- <script src="http://html5shiv.googlecode.com/svn/trunk/html5.js"></script>
- <![endif]-->
-</head>
-
-<body>
- <form action="single_article_find_subject.php" method="POST">
- <textarea name="text" rows="20" cols="60">
-    <?php /* posted text is untrusted: escape it before it goes back into the page */ echo htmlspecialchars($text, ENT_QUOTES, 'UTF-8'); ?>
- </textarea>
- <input type="submit" value="Send">
- </form>
-
-
- <table style="float: right">
- <thead>
- <tr>
- <th>Johs</th>
- <th>tf-idf</th>
- </tr>
- </thead>
- <tbody>
-<?php
-  // Rows of the score table; $tds only exists after a text has been posted.
-  if(isset($tds)) {
-    $rc = count($tds[0]);
-    $cc = count($tds);
-    $row = 0;
-    while ($row < $rc) {
-      echo '<tr>';
-      $column = 0;
-      while ($column < $cc) {
-        echo '<td>'. $tds[$column][$row] .'</td>';
-        $column++;
-      }
-      echo '</tr>';
-      $row++;
-    }
-  }
-?>
- </tbody>
- </table>
-
- <table>
- <thead>
- <tr>
- <th>Keywords</th>
- </tr>
- </thead>
- <tbody>
-<?php
-  // Rows of the keyword table; $td_keywords only exists after a text has
-  // been posted.
-  if(isset($td_keywords)) {
-    $rc = count($td_keywords[0]);
-    $cc = count($td_keywords);
-    $row = 0;
-    while ($row < $rc) {
-      echo '<tr>';
-      $column = 0;
-      while ($column < $cc) {
-        echo '<td>'. $td_keywords[$column][$row] .'</td>';
-        $column++;
-      }
-      echo '</tr>';
-      $row++;
-    }
-  }
-?>
- </tbody>
- </table>
-
- <table>
- <thead>
- <tr>
-<?php
-  // One header cell per subject. The loop uses its own variable name so that
-  // it can never write through a by-reference $value left bound by earlier
-  // code; the name is escaped because it is database content.
-  if (isset($subjects)) {
-    foreach ($subjects as $subject) {
-      echo '<th>'.htmlspecialchars((string) $subject['word'], ENT_QUOTES, 'UTF-8').'</th>';
-    }
-  }
-?>
- </tr>
- </thead>
- <tbody>
-<?php
-  // One column per subject, listing its words. The columns differ in length,
-  // so the longest one sets the row count and missing cells are left empty.
-  if(isset($td_keyword_ratings)) {
-    $rc = max(array_map('count', $td_keyword_ratings));
-    $cc = count($td_keyword_ratings);
-    $row = 0;
-    while ($row < $rc) {
-      echo '<tr>';
-      $column = 0;
-      while ($column < $cc) {
-        echo isset($td_keyword_ratings[$column][$row])
-          ? '<td>'. $td_keyword_ratings[$column][$row] .'</td>'
-          : '<td></td>';
-        $column++;
-      }
-      echo '</tr>';
-      $row++;
-    }
-  }
-?>
- </tbody>
- </table>
-
-
- <br /><br />------------------<br /><br />
-
-  <?php /* posted text is untrusted: escape it before it goes back into the page */ print htmlspecialchars($text, ENT_QUOTES, 'UTF-8'); ?>
-
-</body>
-</html>
View
171 keyword-import/subject_compare_methods.php
@@ -1,171 +0,0 @@
-<?php
-  require_once('lib_keyword.php');
-
-  // Pick the subject: by id (?tid=) or by position in term_data (?n=).
-  // Both values go unquoted into the SQL, so they are cast to integers.
-  $start = isset($_GET['n']) ? max(0, (int) $_GET['n']) : 0;
-  // LIMIT takes (offset, row count); only one row is read below.
-  $query = "SELECT tid, name FROM term_data WHERE vid = 16 LIMIT $start,1";
-  if(isset($_GET['tid'])){
-    $tid = (int) $_GET['tid'];
-    $query = "SELECT tid, name FROM term_data WHERE vid = 16 AND tid = $tid";
-  }
-
-  echo $query . "<br>\n";
-
-  $result = TaggerQueryManager::query($query);
-  $row = TaggerQueryManager::fetch($result);
-  if (!$row) {
-    // Nothing to compare without a subject.
-    print "No subject found!"; exit;
-  }
-
-  $keyword_subject = $row['name'];
-  $keyword_subject_id = $row['tid'];
-
-  // ?normalize=true is passed on as the $normalize flag
-  $normalize = isset($_GET['normalize']) && $_GET['normalize'] == 'true';
-
-  // Article range passed to the lookup; when several flags are given the
-  // last one checked here wins.
-  $range = '';
-  if (isset($_GET['fulltext'])) {
-    $range = 'fulltext';
-  }
-  if (isset($_GET['underrubrik'])) {
-    $range = 'underrubrik';
-  }
-  if (isset($_GET['tagged'])) {
-    $range = 'tagged';
-  }
-  $result = subject_name_related_words($keyword_subject, 'all', $range, $normalize);
-
-  $doc_count = $result['doc_count'];
-  // NOTE(review): confirm subject_name_related_words() always returns
-  // 'doc_ids'; the template copes with an empty list.
-  $doc_ids = isset($result['doc_ids']) ? $result['doc_ids'] : array();
-  $freq_array = $result['freq_array'];
-
-  /**
-   * Turn scored words into HTML table cell contents, best score first.
-   *
-   * @param array $freq_array
-   *   Entries with at least a 'word' key and the $property key.
-   * @param string $property
-   *   The score to sort by and to print.
-   *
-   * @return array|false
-   *   A list of HTML snippets, or false when the first entry does not have
-   *   the property (this includes an empty $freq_array).
-   */
-  function make_text_array($freq_array, $property) {
-    $test = current($freq_array);
-    if(!isset($test[$property])) { return false; }
-
-    // Sort the related words by score. usort() expects a negative/zero/
-    // positive integer; a boolean "less than" result cannot tell "greater"
-    // from "equal" and gives an unreliable order.
-    usort($freq_array, function($a, $b) use ($property) {
-      if ($a[$property] == $b[$property]) {
-        return 0;
-      }
-      return ($a[$property] < $b[$property]) ? 1 : -1;
-    });
-
-    // The word is escaped because it ends up in the HTML output.
-    return array_map(function($a) use ($property) {
-      return htmlspecialchars((string) $a['word'], ENT_QUOTES, 'UTF-8').'<br /><span class="rating">('.$a[$property].')</span>';
-    }, $freq_array);
-  }
-
-  // One table column per scoring property, in display order (the order has to
-  // match the <th> cells of the table below).
-  // Disabled experiments that are not rendered: the "doc_count > 20%"
-  // filtered variants, the diff_malt_* and malt_x2 scores and the tf-idf
-  // family.
-  $column_properties = array(
-    'diff',
-    'diff_outer_doc_freq',
-    'diff_outer_doc_freq_log',
-    'outer_doc_freq',
-    'inner_doc_freq',
-    'doc_freq',
-    'doc_count',
-    'doc_count_db',
-  );
-  foreach ($column_properties as $column_property) {
-    $tds[] = make_text_array($freq_array, $column_property);
-  }
-
-?>
-<!DOCTYPE html>
-<html lang="en">
-<head>
- <meta charset="utf-8" />
- <title>Tagger-library manual test</title>
- <style>
- .rating { color: gray; }
- </style>
- <!--<link rel="stylesheet" type="text/css" href="reset.css" />-->
- <!--[if IE]>
- <script src="http://html5shiv.googlecode.com/svn/trunk/html5.js"></script>
- <![endif]-->
- <script src="http://ajax.googleapis.com/ajax/libs/jquery/1.6.2/jquery.min.js">
- </script>
- <script>
- $(document).ready(function(){
- //you might want to be a bit more specific than only 'td', maybe 'table.classname td' or 'table#id td'
- $('td').click(function(){
- var $this = $(this);
- //find the index of the clicked cell in the row
- var index = $this.prevAll().length;
- //go back to the parent table (here you might also want to use the more specific selector as above)
- //and in each row of that table...
- $this.parents('table').find('tr').each(function(){
- //...highlight the indexth cell
- if($(this).find('td:eq('+index+')').css('background-color') != 'yellow') {
- $(this).find('td:eq('+index+')').css('background-color', 'yellow')
- } else {
- $(this).find('td:eq('+index+')').css('background-color', 'none')
- }
- });
- });
- });
- </script>
-
-</head>
-
-<body>
- <h3>Emne: <?php echo $keyword_subject; ?></h3>
- <h4>ID: <?php echo $keyword_subject_id; ?></h4>
- <p>Antal fundne artikler: <?php echo $doc_count; ?></p>
- <p>(<?php if (!empty($doc_ids)) { echo implode(',', $doc_ids); } ?>)</p>
-
- <table>
- <thead>
- <tr>
- <th>Johs</th>
- <!--<th>Johs<br>20% grænse</th>-->
- <th>Johs<br>outer_doc_freq</th>
- <!--<th>Johs<br>Malthe-lineær<br>20% grænse</th>-->
- <th>log(Johs<br>outer_doc_freq)</th>
- <!--<th>Johs<br>Malthe-x<sup>2</sup></th>-->
- <!--<th>log8(10000/Johs<br>Malthe-x<sup>2</sup>)</th>-->
- <!--<th>Johs<br>Malthe-log</th>-->
- <!--<th>Johs<br>Malthe-x<sup>2</sup><br>idf</th>
- <th>Johs<br>Malthe-x<sup>2</sup><br>20% grænse</th>-->
- <th>outer_doc_freq</th>
- <th>inner_doc_freq</th>
- <th>doc_freq</th>
- <!--<th>Malthe-x<sup>2</sup></th>-->
- <th>Optræder i x artikler<br>relateret til dette emne</th>
- <th>Optræder i x artikler</th>
- <!--<th>tf-idf</th>
- <th>tf-idf<br>20%</th>
- <th>tf-idf<br>Malthe-lineær</th>
- <th>tf-idf<br>Malthe-x<sup>2</sup></th>-->
- <!--<th>Gennemsnit</th>-->
- </tr>
- </thead>
- <tbody>
-<?php
-  // make_text_array() returns false for a property that is not present, so
-  // the row count is taken from the longest column that really is an array
-  // instead of blindly counting the first column.
-  $rc = 0;
-  foreach ($tds as $column_cells) {
-    if (is_array($column_cells)) {
-      $rc = max($rc, count($column_cells));
-    }
-  }
-
-  for ($row = 0; $row < $rc; $row++) {
-    echo '<tr>';
-    for ($column = 0, $cc = count($tds); $column < $cc; $column++) {
-      // missing columns and short columns get an empty cell
-      if(isset($tds[$column][$row])) {
-        echo '<td>'. $tds[$column][$row] .'</td>';
-      }
-      else {
-        echo '<td></td>';
-      }
-    }
-    echo '</tr>';
-  }
-?>
- </tbody>
- </table>
-
- <br /><br />------------------<br /><br />
-
-  <?php /* $text is not assigned anywhere in this script: print it only when an include has defined it */ print isset($text) ? htmlspecialchars($text, ENT_QUOTES, 'UTF-8') : ''; ?>
-
-</body>
-</html>
Please sign in to comment.
Something went wrong with that request. Please try again.