Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

Add KeywordImporter class

  • Loading branch information...
commit a932d6ca5cb5ebf37b62babd7b2c97a741ed0b31 1 parent 711404f
@malthejorgensen malthejorgensen authored
Showing with 197 additions and 183 deletions.
  1. +197 −183 keyword-import/lib_keyword.php → classes/KeywordImporter.class.php
View
380 keyword-import/lib_keyword.php → classes/KeywordImporter.class.php
@@ -1,37 +1,64 @@
<?php
- require_once('../Tagger.php');
- require_once __ROOT__ . 'db/TaggerQueryManager.class.php';
+require_once __ROOT__ . 'db/TaggerQueryManager.class.php';
- $tagger = Tagger::getTagger();
- $options = $tagger->getConfiguration();
+require_once __ROOT__ . 'classes/Tokenizer.class.php';
+require_once __ROOT__ . 'classes/Stemmer.class.php';
- $db_conf =$tagger->getConfiguration('db');
- $docstats_table = $db_conf['docstats_table'];
- $wordstats_table = $db_conf['wordstats_table'];
- $word_relations_table = $db_conf['word_relations_table'];
+//require_once('Timer.class.php');
- $lookup_table = $db_conf['lookup_table'];
+class KeywordImporter {
- $keyword_conf = $tagger->getConfiguration('keyword');
- $property = $keyword_conf['property'];
- $normalize = $keyword_conf['normalize'];
+ /**
+ * Caches the table names and the scoring property from the Tagger
+ * configuration, then reads the corpus-wide document and word totals
+ * from the docstats table.
+ */
+ public function __construct() {
+ $tables = Tagger::getConfiguration('db');
+ $this->docstatsTable = $tables['docstats_table'];
+ $this->wordstatsTable = $tables['wordstats_table'];
+ $this->wordRelationsTable = $tables['word_relations_table'];
+ $this->lookupTable = $tables['lookup_table'];
+ $this->property = Tagger::getConfiguration('keyword', 'property');
+ // The 'normalize' keyword setting is currently not read.
- // Get total number of documents and words
- $query = "SELECT * FROM $docstats_table LIMIT 0, 1";
- $result = TaggerQueryManager::query($query);
- $row = TaggerQueryManager::fetch($result);
- $total_doc_count = $row['doc_count'];
- $total_word_count = $row['word_count'];
- function json_load($filename) {
+ // Only the first row of the docstats table is read; it holds both totals.
+ $sql = 'SELECT * FROM ' . $this->docstatsTable . ' LIMIT 0, 1';
+ $totals = TaggerQueryManager::fetch(TaggerQueryManager::query($sql));
+ $this->totalDocCount = $totals['doc_count'];
+ $this->totalWordCount = $totals['word_count'];
+ }
+
+ /**
+ * Loads keywords from a JSON file and passes them to createKeywords().
+ *
+ * @param string $filename
+ * Path of the JSON file to read.
+ */
+ public function jsonCreateKeywords($filename = 'keywords.json') {
+ $this->createKeywords($this->jsonLoad($filename));
+ }
+
+ /**
+ * Loads texts from a JSON file and builds the word statistics from them.
+ *
+ * The top-level entries of the file are merged into a single array of
+ * texts; the top-level keys are ignored.
+ *
+ * @param string $filename
+ * Path of the JSON file to read.
+ *
+ * @return array
+ * Whatever createWordstats() returns for the merged texts.
+ */
+ public function jsonCreateWordstats($filename = 'keyword_texts.json') {
+ $all_texts = array();
+ foreach ($this->jsonLoad($filename) as $texts_of_entry) {
+ $all_texts = array_merge($all_texts, $texts_of_entry);
+ }
+ return $this->createWordstats($all_texts);
+ }
+
+ /**
+ * Loads texts from a JSON file and passes them to createWordRelations().
+ *
+ * @param string $filename
+ * Path of the JSON file to read.
+ */
+ public function jsonCreateWordRelations($filename = 'keyword_texts.json') {
+ $this->createWordRelations($this->jsonLoad($filename));
+ }
+
+
+ private function jsonLoad($filename) {
if (!is_file($filename)) {
throw new Exception("No file named '$filename'.");
}
$file_contents = file_get_contents($filename);
- $json = json_decode($file_contents, true);
+ $json = json_decode($file_contents, TRUE);
if ($json === NULL) {
$err = json_errcode_to_text(json_last_error());
@@ -41,28 +68,41 @@ function json_load($filename) {
return $json;
}
- function json_create_wordstats($filename = 'keyword_texts.json') {
- $json = json_load($filename);
-
- function flatten(array $array) {
- $return = array();
- array_walk_recursive($array, function($a) use (&$return) { $return[] = $a; });
- return $return;
+ /**
+ * Inserts one row per keyword into the lookup table.
+ *
+ * Every row is written with vid '16' and canonical '1'.
+ *
+ * @param array $tids_keywords
+ * Keyword names keyed by tid,
+ * e.g. array(214 => 'Forest fires', ..)
+ * @param bool $check
+ * Currently unused by this method.
+ */
+ protected function createKeywords($tids_keywords, $check = TRUE) {
+ $rows = array();
+ foreach ($tids_keywords as $keyword_tid => $keyword_name) {
+ // Values are listed in the same order as the field names below.
+ $rows[] = array($keyword_tid, '16', $keyword_name, '1');
}
- return create_wordstats(flatten(array_values($json)));
+ $columns = array('tid', 'vid', 'name', 'canonical');
+ TaggerQueryManager::bufferedInsert($this->lookupTable, $columns, $rows);
}
- function json_create_keywords($filename = 'keyword_texts.json') {
- $json = json_load($filename);
-
- create_keywords($json);
- }
-
- function create_keywords($tids_n_texts, $check = true) {
- global $tagger, $property, $docstats_table, $wordstats_table, $word_relations_table;
-
- $tids = array_keys($tids_n_texts);
+ /**
+ * Fills the tagger_word_relations table with words related to keywords (tids).
+ *
+ * @param array $tids_texts
+ * Array of arrays {keys: keyword tids, values: arrays texts}
+ * e.g. array(214 => array('Forest fire consumes city','Montana gets new seaplane'), ..)
+ * where 214 is a keyword tid and the array contains text related to that keyword
+ * @param bool $check
+ * Check whether keyword exists in database.
+ *
+ */
+ protected function createWordRelations($tids_texts, $check = TRUE) {
+
+ $tids = array_keys($tids_texts);
if ($check) {
$error = false;
@@ -71,16 +111,16 @@ function create_keywords($tids_n_texts, $check = true) {
foreach ($tids as $tid) {
// Get keyword corresponding to $tid
- if ($name = tid_to_name($tid)) {
+ if ($name = $this->tidToName($tid)) {
+ echo "$tid: $name\n";
$keywords[$tid] = $name;
}
else {
echo "$tid: Not found.\n";
- unset($tids_n_texts[$tid]);
+ unset($tids_texts[$tid]);
$error = true;
}
}
- echo "$tid: $name\n";
// filter keywords that:
@@ -88,15 +128,19 @@ function create_keywords($tids_n_texts, $check = true) {
// * are already in the database (keywords_in_db.txt)
// * or simply don't wanna have (add them yourself to keywords_non_candidates.txt)
touch("keywords_non_candidates.txt");
- touch("keywords_in_db.$property.txt");
+ touch("keywords_in_db.$this->property.txt");
$lines1 = file("keywords_non_candidates.txt", FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
- $lines2 = file("keywords_in_db.$property.txt", FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
+ $lines2 = file("keywords_in_db.$this->property.txt", FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
$lines = array_merge($lines1, $lines2);
foreach($lines as $line) {
list($tid, $name) = explode('|', $line);
unset($keywords[$tid]);
}
-
+
+ $keyword_count = count($keywords);
+ }
+ else {
+ $keyword_count = count(array_keys($tid));
}
if ($error) {
@@ -105,36 +149,34 @@ function create_keywords($tids_n_texts, $check = true) {
//$keywords = array_slice($keywords, 0, $maximum_keyword_add_count, true);
- $property_esc = mysql_real_escape_string($property);
+ $property_esc = mysql_escape_string($this->property);
- $keyword_count = count($keywords);
$new_keywords = 0;
$start = time();
echo "Trying to add $keyword_count new keywords to the database...\n\n";
- foreach ($keywords as $tid => $name) {
- echo "Adding $name...\n";
-
- $name_esc = mysql_real_escape_string($name);
+ foreach ($tids_texts as $tid => $texts) {
// Create the related words to this keyword
- $result = keyword_create($tid, $tids_n_texts[$tid], FALSE);
+ if (!empty($texts)) {
+ echo "Adding $name...\n";
+ $result = $this->createRelatedWords($tid, $texts, FALSE);
- if($result) {
- $new_keywords++;
+ if($result) {
+ $new_keywords++;
+ }
}
}
}
- function keyword_create($tid, $texts, $check = TRUE) {
- global $tagger, $options, $property, $word_relations_table;
+ public function createRelatedWords($tid, $texts, $check = TRUE) {
if ($check) {
// too few articles found
- if (count($texts) < $options['keyword']['minimum_number_of_texts']) {
- throw new Exception("Too few texts (" . count($texts) . " for " . tid_to_name($tid));
+ if (count($texts) < Tagger::getConfiguration('keyword', 'minimum_number_of_texts')) {
+ throw new Exception("Too few texts (" . count($texts) . " for " . $this->tidToName($tid));
}
/*
echo "$hits articles.";
@@ -152,118 +194,98 @@ function keyword_create($tid, $texts, $check = TRUE) {
}
// Check if $tid is a keyword in the DB
- $name = tid_to_name($tid);
+ $name = $this->tidToName($tid);
echo "tid: $tid\n";
// Check if words related to $tid are already in the DB
- $query = "SELECT tid, word FROM `$word_relations_table` WHERE tid = $tid";
+ $query = "SELECT tid, word FROM `$this->wordRelationsTable` WHERE tid = $tid";
$result = TaggerQueryManager::query($query);
if(TaggerQueryManager::fetch($result)) {
echo "$name is already in the database. Skipping\n";
- $file = fopen("keywords_in_db.$property.txt", 'a');
+ $file = fopen("keywords_in_db.$this->property.txt", 'a');
fwrite($file, $tid . '|' . $name . "\n");
return FALSE;
}
- $property_esc = mysql_real_escape_string($property);
+ $property_esc = mysql_real_escape_string($this->property);
// Get and score the words related to this keyword
- $result = find_significant_words($texts);
+ $result = $this->findSignificantWords($texts);
$hits = $result['doc_count'];
$freq_array = &$result['freq_array'];
echo "Number of words possibly related to $name: " . count($freq_array) . "\n";
- $freq_array = array_filter($freq_array, function($v) use ($property) { return $v[$property] > 0.2; });
+ $freq_array = array_filter($freq_array, create_function('$v', 'return $v[\'' . $this->property . '\'] > 0.2; '));
+
+ $values_array = array();
$words_to_be_added = 0;
foreach($freq_array as $value) {
- if ($words_to_be_added == 0) {
- $query = "INSERT INTO `$word_relations_table` (word, tid, score, pass) VALUES\n";
- }
- else {
- $query .= ', ';
- }
- $query .= '(\''.mysql_real_escape_string($value['word']).'\','.$tid.','.$value[$property].',1)';
+ $values_array[] = array($value['word'], $tid, $value[$this->property],1);
$words_to_be_added++;
-
- if ($words_to_be_added == 1000) {
- if (!TaggerQueryManager::query($query)) {
- // ******* TO BE REMOVED *******
- echo 'Could not query, line ' . __LINE__ . ': ' . mysql_error() . "<br>\n";
- echo $query . "\n";
- die();
- }
- $words_to_be_added = 0;
- }
}
- if ($words_to_be_added != 0 && !TaggerQueryManager::query($query)) {
- // ******* TO BE REMOVED *******
- echo 'Could not query, line ' . __LINE__ . ': ' . mysql_error() . "<br>\n";
- echo $query . "\n";
- die();
- }
+ $fields = array('word', 'tid', 'score', 'pass');
+ TaggerQueryManager::bufferedInsert($this->wordRelationsTable, $fields, $values_array);
// Added to DB
- $file = fopen("keywords_in_db.$property.txt", 'a');
+ $file = fopen("keywords_in_db.$this->property.txt", 'a');
fwrite($file, "$tid|$name|$hits\n");
}
+ /**
+ * Replaces the stored related words of a keyword.
+ *
+ * Deletes the keyword's rows from the word relations table and rebuilds
+ * them from $texts through createRelatedWords().
+ *
+ * @param int $tid
+ * Keyword tid whose related words are rebuilt.
+ * @param array $texts
+ * Texts related to the keyword.
+ *
+ * @throws Exception
+ * When fewer texts are given than the configured
+ * 'minimum_number_of_texts'.
+ */
function keyword_update($tid, $texts) {
- global $options, $word_relations_table;
- if (count($texts) < $options['keyword']['minimum_number_of_texts']) {
- throw new Exception("Too few texts (" . count($texts) . " for " . tid_to_name($tid));
+ // Validate before deleting, so a rejected update leaves the old rows in place.
+ if (count($texts) < Tagger::getConfiguration('keyword', 'minimum_number_of_texts')) {
+ throw new Exception("Too few texts (" . count($texts) . ") for " . $this->tidToName($tid));
}
- $query = "DELETE FROM `$word_relations_table` WHERE tid = $tid";
+ $query = "DELETE FROM `$this->wordRelationsTable` WHERE tid = $tid";
TaggerQueryManager::query($query);
- keyword_create($tid, $texts);
+ // keyword_create() no longer exists as a function; it is now the
+ // createRelatedWords() method of this class.
+ $this->createRelatedWords($tid, $texts);
}
- require_once('Timer.class.php');
- function find_significant_words($texts, $prop = FALSE, $normalize = FALSE) {
- global $property, $total_doc_count;
+ function findSignificantWords($texts, $prop = FALSE, $normalize = FALSE) {
if ($prop === FALSE) {
- $prop = $property;
+ $prop = $this->property;
}
- $timer = new Timer();
+ //$timer = new Timer();
$doc_count = 0;
$word_count = 0;
$doc_ids = array();
$freq_array = array();
- $timer->start();
+ //$timer->start();
foreach ($texts as $text) {
-
- $frequency = score_text($text);
-
- foreach ($frequency AS $key => $value) {
- $word_count += $value['word_count'];
- if (isset($freq_array[$key])) {
- $freq_array[$key]['word_count'] += $value['word_count'];
- $freq_array[$key]['doc_count']++;
- }
- else {
- $freq_array[$key]['word_count'] = $value['word_count'];
- $freq_array[$key]['doc_count'] = 1;
-
- $freq_array[$key]['word'] = $value['word'];
- $freq_array[$key]['doc_count_db'] = $value['doc_count'];
- $freq_array[$key]['word_freq_db'] = $value['word_freq_db'];
- $freq_array[$key]['doc_freq_db'] = $value['doc_freq_db'];
+ if($text != '') {
+ $frequency = $this->scoreText($text);
+
+ foreach ($frequency AS $key => $value) {
+ $word_count += $value['word_count'];
+ if (isset($freq_array[$key])) {
+ $freq_array[$key]['word_count'] += $value['word_count'];
+ $freq_array[$key]['doc_count']++;
+ }
+ else {
+ $freq_array[$key]['word_count'] = $value['word_count'];
+ $freq_array[$key]['doc_count'] = 1;
+
+ $freq_array[$key]['word'] = $value['word'];
+ $freq_array[$key]['doc_count_db'] = $value['doc_count'];
+ $freq_array[$key]['word_freq_db'] = $value['word_freq_db'];
+ $freq_array[$key]['doc_freq_db'] = $value['doc_freq_db'];
+ }
}
+ $doc_count++;
}
- $doc_count++;
}
if ($prop == 'all') {
@@ -279,7 +301,7 @@ function find_significant_words($texts, $prop = FALSE, $normalize = FALSE) {
$temp_elem['diff'] = $temp_elem['word_count']/$word_count - $temp_elem['word_freq_db'];
- $temp_elem['doc_freq'] = $temp_elem['doc_count_db']/$total_doc_count;
+ $temp_elem['doc_freq'] = $temp_elem['doc_count_db']/$this->totalDocCount;
// in how many related articles does this word occur? (relative to the number of related articles)
// i.e. the percentage of related articles where this word occurs
@@ -320,7 +342,7 @@ function find_significant_words($texts, $prop = FALSE, $normalize = FALSE) {
if ($normalize && $doc_count > 0) {
foreach ($properties as $p) {
// get the word with the highest score
- $val = max(array_map(function($value) use ($p) { return $value[$p]; }, $freq_array));
+ $val = max(array_map(create_function('$value', 'return $value[$p];'), $freq_array));
$val = ($val == 0) ? 1 : $val;
$factor = 1/$val;
@@ -331,9 +353,9 @@ function find_significant_words($texts, $prop = FALSE, $normalize = FALSE) {
}
}
- $timer->stop();
+ //$timer->stop();
- echo "Calculations took " . $timer->secsElapsed() . " seconds.\n";
+ //echo "Calculations took " . $timer->secsElapsed() . " seconds.\n";
$result = array();
$result['doc_count'] = $doc_count;
@@ -343,63 +365,70 @@ function find_significant_words($texts, $prop = FALSE, $normalize = FALSE) {
}
- function create_wordstats($texts) {
- init_wordstats_table();
- list($doc_count, $word_count) = calculate_wordstats($texts);
- end_wordstats_table($doc_count, $word_count);
+ /**
+ * Rebuilds the wordstats and docstats tables from a set of texts.
+ *
+ * @param array $texts
+ * Texts whose words are counted.
+ *
+ * @return array
+ * array(document count, word count), as computed by calculateWordstats().
+ */
+ public function createWordstats($texts) {
+ // Start from an empty wordstats table.
+ TaggerQueryManager::query("TRUNCATE $this->wordstatsTable");
- create_docstats_table($doc_count, $word_count);
+ $totals = $this->calculateWordstats($texts);
+ $this->totalDocCount = $totals[0];
+ $this->totalWordCount = $totals[1];
- return array($doc_count, $word_count);
+ // Turn the per-word counts of every row into frequencies relative to the totals.
+ $frequency_sql = "UPDATE $this->wordstatsTable
+ SET doc_freq=doc_count/$this->totalDocCount,
+ word_freq=word_count/$this->totalWordCount";
+ TaggerQueryManager::query($frequency_sql);
+
+ // Replace the contents of the docstats table with the new totals.
+ TaggerQueryManager::query("TRUNCATE $this->docstatsTable");
+ $totals_sql = "INSERT INTO `$this->docstatsTable` (doc_count,word_count)
+ VALUES ($this->totalDocCount, $this->totalWordCount);";
+ TaggerQueryManager::query($totals_sql);
+
+ return array($this->totalDocCount, $this->totalWordCount);
}
- function calculate_wordstats($texts) {
+ /**
+ * Counts words over a set of texts and writes the per-word counts to the
+ * wordstats table in batches.
+ *
+ * Empty texts are skipped and do not count as documents. The accumulated
+ * counts are handed to updateWordstatsTable() after every 1000 counted
+ * documents, and once more after the loop for the remaining documents.
+ *
+ * NOTE(review): a word that occurs in more than one batch is passed to
+ * updateWordstatsTable() once per batch; confirm that it merges the counts
+ * of a word that is already stored.
+ *
+ * @param array $texts
+ * Texts whose words are counted.
+ *
+ * @return array
+ * array(number of non-empty texts, total word count).
+ */
+ public function calculateWordstats($texts) {
$doc_count = 0;
$word_count = 0;
$overall_frequency = array();
foreach ($texts as $text) {
-
- $frequency = count_words($text);
-
- foreach ($frequency AS $key => $value){
- $word_count += $value['word_count'];
- if(!isset($overall_frequency[$key])) {
- $overall_frequency[$key]['word_count'] = $value['word_count'];
- $overall_frequency[$key]['doc_count'] = 1;
- $overall_frequency[$key]['doc_freq_sum'] = $value['word_freq'];
- } else {
- $overall_frequency[$key]['word_count'] += $value['word_count'];
- $overall_frequency[$key]['doc_count'] += 1;
- $overall_frequency[$key]['doc_freq_sum'] += $value['word_freq'];
+ if ($text != '') {
+ $frequency = $this->countWords($text);
+
+ foreach ($frequency AS $key => $value){
+ $word_count += $value['word_count'];
+ if(!isset($overall_frequency[$key])) {
+ $overall_frequency[$key]['word_count'] = $value['word_count'];
+ $overall_frequency[$key]['doc_count'] = 1;
+ $overall_frequency[$key]['doc_freq_sum'] = $value['word_freq'];
+ } else {
+ $overall_frequency[$key]['word_count'] += $value['word_count'];
+ $overall_frequency[$key]['doc_count'] += 1;
+ $overall_frequency[$key]['doc_freq_sum'] += $value['word_freq'];
+ }
}
- }
- $doc_count++;
+ $doc_count++;
- if(($doc_count % 1000) == 0) {
- update_wordstats_table($overall_frequency);
- $overall_frequency = array();
+ // Write a full batch and start accumulating the next one.
+ if(($doc_count % 1000) == 0) {
+ $this->updateWordstatsTable($overall_frequency);
+ $overall_frequency = array();
+ }
}
}
+ // Write the final, partial batch; without this the counts of the last
+ // ($doc_count % 1000) documents would never reach the table.
+ if (!empty($overall_frequency)) {
+ $this->updateWordstatsTable($overall_frequency);
+ }
return array($doc_count, $word_count);
}
- function init_wordstats_table() {
- global $wordstats_table;
-
- }
-
- function update_wordstats_table($frequencies) {
- global $wordstats_table;
+ function updateWordstatsTable($frequencies) {
$counter = 0;
foreach ($frequencies AS $key => $value) {
if($counter == 0) {
- $sql = "INSERT INTO $wordstats_table (word, word_count, doc_count) VALUES\n";
+ $sql = "INSERT INTO $this->wordstatsTable (word, word_count, doc_count) VALUES\n";
}
else {
$sql .= ', ';
@@ -422,33 +451,18 @@ function update_wordstats_table($frequencies) {
TaggerQueryManager::query($sql);
}
- function end_wordstats_table($doc_count, $word_count) {
- global $wordstats_table;
- $sql = "UPDATE $wordstats_table SET doc_freq=doc_count/$doc_count, word_freq=word_count/$word_count";
- TaggerQueryManager::query($sql);
- }
-
-
- function create_docstats_table($doc_count, $word_count) {
- global $docstats_table;
-
- TaggerQueryManager::query("INSERT INTO `$docstats_table` (doc_count,word_count) VALUES ($doc_count,$word_count);");
- }
-
// Calculate word scores in text
- function score_text($text) {
- global $total_doc_count, $tagger;
+ private function scoreText($text) {
static $db_cache = array();
- $wordstats_table = $tagger->getConfiguration('db', 'wordstats_table');
- $frequency = count_words($text);
+ $frequency = $this->countWords($text);
$words_to_lookup = array_diff(array_keys($frequency), array_keys($db_cache));
$imploded_words = implode("','", array_map('mysql_real_escape_string', $words_to_lookup));
// Get statistics for the words in the article
- $result = TaggerQueryManager::query("SELECT * FROM $wordstats_table WHERE word IN ('$imploded_words')");
+ $result = TaggerQueryManager::query("SELECT * FROM $this->wordstatsTable WHERE word IN ('$imploded_words')");
$unmatched_database = array();
$unmatched_words = $frequency;
@@ -476,24 +490,24 @@ function score_text($text) {
return $frequency;
}
- require_once __ROOT__ . 'classes/Token.class.php';
- require_once __ROOT__ . 'classes/Tokenizer.class.php';
- require_once __ROOT__ . 'classes/Stemmer.class.php';
// Get word frequencies for a text
- function count_words($text) {
- $tagger = Tagger::getTagger();
- $options_keyword = $tagger->getConfiguration('keyword');
+ private function countWords($text) {
+ if ($text == '') {
+ return array();
+ }
+ if (is_array($text)) {
+ //var_dump($text);
+ }
$words = Tokenizer::split_words(trim(mb_strtolower(strip_tags($text))));
- if ($options_keyword['stemmer']) {
+ if (Tagger::getConfiguration('keyword', 'enable_stemmer')) {
foreach ($words as &$word) {
$word = Stemmer::stemWord($word);
}
}
$frequency = array_count_values($words);
- $t = new Token('t');
- $frequency = array_diff_key($frequency, $t::$stopwords);
+ $frequency = array_diff_key($frequency, Tagger::$stopwords);
$word_count = array_sum($frequency);
@@ -511,10 +525,9 @@ function count_words($text) {
return $frequency;
}
- function tid_to_name($tid) {
- global $lookup_table;
+ public function tidToName($tid) {
- $query = "SELECT name FROM $lookup_table WHERE vid = 16 AND tid = $tid AND canonical = 1";
+ $query = "SELECT name FROM $this->lookupTable WHERE vid = 16 AND tid = $tid AND canonical = 1";
$result = TaggerQueryManager::query($query);
if ($row = TaggerQueryManager::fetch($result)) {
return $row['name'];
@@ -524,7 +537,7 @@ function tid_to_name($tid) {
}
}
- function json_errcode_to_text($errcode) {
+ private function json_errcode_to_text($errcode) {
$err = '';
switch ($errcode) {
case JSON_ERROR_NONE:
@@ -553,4 +566,5 @@ function json_errcode_to_text($errcode) {
return $err;
}
+}
Please sign in to comment.
Something went wrong with that request. Please try again.