Skip to content

Commit

Permalink
Prevent zero values for idf measure in search algorithm (see contao#2172)
Browse files Browse the repository at this point in the history

Description
-----------

Our idf (inverse document frequency) measure is currently zero for terms that are present in every document, because LOG(searchCount / documentFrequency) evaluates to LOG(1) = 0 in that case. This results in divisions by zero, and the ranking contribution of other factors such as term frequency and document length is lost.

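By adding one to the searchCount, the ratio (searchCount + 1) / documentFrequency stays greater than one even for a term that occurs in every document, so the idf value is always larger than zero.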

Commits
-------

549da7c Prevent zero values for idf measure
  • Loading branch information
ausi committed Aug 19, 2020
1 parent b61086c commit fab8317
Showing 1 changed file with 4 additions and 4 deletions.
8 changes: 4 additions & 4 deletions core-bundle/src/Resources/contao/library/Contao/Search.php
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,7 @@ public static function indexPage($arrData)
tl_search_index.pid,
SQRT(SUM(POW(
(1 + LOG(relevance)) * LOG((
" . (int) $objDatabase->query("SELECT COUNT(*) as count FROM tl_search")->count . "
" . (int) ($objDatabase->query("SELECT COUNT(*) as count FROM tl_search")->count + 1) . "
) / GREATEST(1, documentFrequency)),
2
))) as vectorLength
Expand Down Expand Up @@ -573,7 +573,7 @@ public static function searchFor($strKeywords, $blnOrSearch=false, $arrPid=array

if (isset($arrWildcards[$index]))
{
$strQuery .= "+ ((1+LOG(SUM(match$index * tl_search_index.relevance))) * POW(LOG(@searchCount / @wildcardCount$index), 2) / " . (\count($arrAllKeywords) - \count($arrExcludedMatches)) . ")";
$strQuery .= "+ ((1+LOG(SUM(match$index * tl_search_index.relevance))) * POW(LOG((@searchCount + 1) / @wildcardCount$index), 2) / " . (\count($arrAllKeywords) - \count($arrExcludedMatches)) . ")";
}
else
{
Expand All @@ -592,7 +592,7 @@ public static function searchFor($strKeywords, $blnOrSearch=false, $arrPid=array

if (isset($arrWildcards[$index]))
{
$strQuery .= " + POW(LOG(@searchCount / @wildcardCount$index) / " . (\count($arrAllKeywords) - \count($arrExcludedMatches)) . ", 2)";
$strQuery .= " + POW(LOG((@searchCount + 1) / @wildcardCount$index) / " . (\count($arrAllKeywords) - \count($arrExcludedMatches)) . ", 2)";
}
else
{
Expand All @@ -604,7 +604,7 @@ public static function searchFor($strKeywords, $blnOrSearch=false, $arrPid=array
$strQuery .= " FROM (SELECT id, term";

// Calculate inverse document frequency of every matching term
$strQuery .= ", LOG(@searchCount / GREATEST(1, documentFrequency)) AS idf";
$strQuery .= ", LOG((@searchCount + 1) / GREATEST(1, documentFrequency)) AS idf";

// Store the match of every keyword and wildcard in its own column match0, match1, ...
foreach ($arrAllKeywords as $index => $strKeywordExpression)
Expand Down

0 comments on commit fab8317

Please sign in to comment.