diff --git a/PMBApi.php b/PMBApi.php index eb719f4..87aa0c5 100644 --- a/PMBApi.php +++ b/PMBApi.php @@ -1034,7 +1034,6 @@ public function MetaphoneSearch($query) # contains expanded matches ( original + all prefix matches ) if ( isset($this->final_doc_matches[$original_token]) ) { - #echo "USING FINAL DOC MATCHES \n"; $original_match_count = $this->final_doc_matches[$original_token]; $max_matches = $original_match_count; } @@ -1770,11 +1769,107 @@ public function Search($query, $offset = 0, $limit = 10) } } - $tic = 0; - + $token_order_rev = array_flip($token_order); ksort($token_order_rev); - + + #$exact_id_required_bits = 0; + + if ( !empty($exact_pairs) ) + { + # we are looking for keywords in certain order + $exact_mode = true; + + foreach ( $exact_pairs as $token_pair => $int ) + { + $t_parts = explode(" ", $token_pair); + if ( count($t_parts) === 2 ) + { + $pos1 = $token_order_rev[$t_parts[0]]; + $pos2 = $token_order_rev[$t_parts[1]]; + + if ( $pos1 + 1 === $pos2 ) + { + $exact_ids_lookup[(1 << $pos1) | (1 << $pos2)] = 1; + } + } + } + } + else + { + # no certain order required + $exact_mode = false; + } + + $disable_score_calculation = false; + if ( isset($this->non_scored_sortmodes[$this->sortmode]) && + $this->group_sort_attr !== "@score" && + $this->group_sort_attr !== "@sentiscore" && + $this->sort_attr !== "@score" ) { + + $disable_score_calculation = true; + } + + # sorting by external attribute, no keyword order requirements + if ( !$exact_mode && $disable_score_calculation && $this->matchmode !== PMB_MATCH_STRICT && $this->enabled_fields === $all_fields ) + { + $fast_external_sort = true; + } + else + { + $fast_external_sort = false; + } + + # sorting by external attribute, keyword order requirements apply + if ( ($exact_mode || $this->enabled_fields !== $all_fields) && $disable_score_calculation ) + { + $external_sort = true; + } + else + { + $external_sort = false; + } + + if ( $this->matchmode === PMB_MATCH_STRICT ) + { + $strict_match_cmp_value = 1; + } + else + { + $strict_match_cmp_value = 0; + } + + # special case: if sorting by @id and no grouping is enabled, + if ( $disable_score_calculation && $this->sort_attr === "@id" && $this->groupmode === 1 ) + { + if ( $this->sortdirection === "desc" ) + { + # descending order + $decode_descend = true; + } + else + { + # ascending order + $decode_ascend = true; + } + + # in any case, external_sort is true ( score calculation not needed ) + $external_sort = true; + + # how many results are needed to satisfy the fast sorting mode's requirements + $fast_ext_sort_req_count = $offset + $limit; + + # if we are sorting by internal @id + $id_sort_enabled = true; + } + else + { + # default sorting mode is ascending + $decode_ascend = true; + $fast_ext_sort_req_count = pow(2, 32); # fast sorting is not enabled in this normal mode + } + + $tic = 0; $sumdata = array(); $sumcounts = array(); $bin_separator = $bin_sep = pack("H*", "80"); @@ -1812,12 +1907,27 @@ public function Search($query, $offset = 0, $limit = 10) $token_positions_sql = "SUBSTR(doc_ids FROM LOCATE(:bin_sep, doc_ids)+1) as token_positions"; } + if ( $fast_external_sort ) + { + # no need to return document match position data + # just return an empty string + $token_positions_sql = "'' AS token_positions"; + } + + $max_dox_id_column = ""; + if ( isset($decode_descend) ) + { + # the max doc id value is needed + $max_dox_id_column = "max_doc_id,"; + } + if ( $this->delta_documents > 0 ) { $token_main_sql = "( SELECT token, $switch_typecase, - doc_matches, + doc_matches, + 
$max_dox_id_column SUBSTR(doc_ids, 1, LOCATE(:bin_sep, doc_ids)-1) as doc_ids, $token_positions_sql FROM PMBTokens".$this->suffix." WHERE " . implode(" OR ", $token_sql) . " @@ -1827,6 +1937,7 @@ public function Search($query, $offset = 0, $limit = 10) SELECT token, $switch_typecase, doc_matches, + $max_dox_id_column SUBSTR(doc_ids, 1, LOCATE(:bin_sep, doc_ids)-1) as doc_ids, $token_positions_sql FROM PMBTokens".$this->suffix."_delta WHERE " . implode(" OR ", $token_sql) . " @@ -1836,7 +1947,8 @@ public function Search($query, $offset = 0, $limit = 10) { $token_main_sql = "SELECT token, $switch_typecase, - doc_matches, + doc_matches, + $max_dox_id_column SUBSTR(doc_ids, 1, LOCATE(:bin_sep, doc_ids)-1) as doc_ids, $token_positions_sql FROM PMBTokens".$this->suffix." WHERE " . implode(" OR ", $token_sql); @@ -1870,12 +1982,10 @@ public function Search($query, $offset = 0, $limit = 10) { $sumdata[$token_order_rev[$row["token"]]] = array(); $sumcounts[$token_order_rev[$row["token"]]] = 0; - $summatchcounts[$token_order_rev[$row["token"]]] = array(); } $sumdata[$token_order_rev[$row["token"]]][] = $row["ID"]; # store as int because exact match $sumcounts[$token_order_rev[$row["token"]]] += $row["doc_matches"]; # store how many doc_matches this token has - $summatchcounts[$token_order_rev[$row["token"]]][] = $row["doc_matches"]; } # prefix match else @@ -1892,7 +2002,7 @@ public function Search($query, $offset = 0, $limit = 10) } $current_prefix = $prefix_grouper[crc32($row["token"])]; - if ( !empty($exact_words[$current_prefix]) ) + if ( isset($exact_words[$current_prefix]) ) { continue; } @@ -1915,13 +2025,11 @@ public function Search($query, $offset = 0, $limit = 10) { $sumdata[$token_order_rev[$keyword_pairs[$row["token"]]]] = array(); $sumcounts[$token_order_rev[$keyword_pairs[$row["token"]]]] = 0; - $summatchcounts[$token_order_rev[$keyword_pairs[$row["token"]]]] = array(); } # do not overwrite better results ! 
$sumdata[$token_order_rev[$keyword_pairs[$row["token"]]]][] = $row["ID"]; $sumcounts[$token_order_rev[$keyword_pairs[$row["token"]]]] += $row["doc_matches"]; - $summatchcounts[$token_order_rev[$keyword_pairs[$row["token"]]]][] = $row["doc_matches"]; ++$token_match_count[$keyword_pairs[$row["token"]]]; } @@ -1929,23 +2037,40 @@ public function Search($query, $offset = 0, $limit = 10) # score lookup table for fuzzy matches $score_lookup_alt[] = (float)$token_score; - $encoded_data[] = $row["doc_ids"]; - $lengths[] = strlen($row["doc_ids"]); - $encode_pointers[] = 0; - $encode_delta[] = 1; - - # document match positions - $doc_match_data[] = $row["token_positions"]; - $doc_pos_pointers[] = 0; - $avgs[] = strlen($row["token_positions"])/+$row["doc_matches"]; - $doc_lengths[] = strlen($row["token_positions"]); + $doc_id_data_len = strlen($row["doc_ids"]); # length of document id data + $token_pos_data_len = strlen($row["token_positions"]); # length of token position data + + $encoded_data[] = $row["doc_ids"]; # the actual document id data + $doc_match_data[] = $row["token_positions"]; # the actual token position data + + # if we are decoding results in "reverse" mode + if ( isset($decode_descend) ) + { + $maximum_doc_ids_vals[] = (int)$row["max_doc_id"]; + $encode_delta[] = (int)$row["max_doc_id"]; + $encode_pointers[] = $doc_id_data_len-1; + $doc_pos_pointers[] = $token_pos_data_len; + + # predecode the first char from docids & token match data + $encode_temp_docs[] = $this->hex_lookup_decode[$row["doc_ids"][$doc_id_data_len-1]]-128; + } + # if results are decoded in normal mode + else + { + $encode_delta[] = 1; + $encode_pointers[] = 0; + $doc_pos_pointers[] = 0; + $lengths[] = $doc_id_data_len; + } + + $avgs[] = $token_pos_data_len/+$row["doc_matches"]; + $doc_lengths[] = $token_pos_data_len; $undone_values[] = 0; - $token_list[] = $row["token"]; - + ++$tic; } - + foreach ( $token_order_rev as $token => $order_index ) { if ( isset($sumcounts[$order_index]) ) @@ -1953,7 +2078,7 @@ public function Search($query, $offset = 0, $limit = 10) $this->final_doc_matches[$token] = $sumcounts[$order_index]; } } - + # close cursor $tokpdo->closeCursor(); @@ -1961,7 +2086,7 @@ public function Search($query, $offset = 0, $limit = 10) $this->db_connection->setAttribute(PDO::MYSQL_ATTR_USE_BUFFERED_QUERY, true); $this->result["stats"]["payload_time"] = microtime(true) - $payload_start; - + # keyword suggestions if ( !empty($this->keyword_suggestions) ) { @@ -1998,16 +2123,22 @@ public function Search($query, $offset = 0, $limit = 10) if ( count($token_order_rev) !== count($real_token_order) ) { $sumdata_new = array(); + $sumcounts_new = array(); foreach ( $real_token_order as $index => $token ) { if ( isset($token_order_rev[$token]) ) { # duplicate results ( get the token db ids ) $sumdata_new[] = $sumdata[$token_order_rev[$token]]; + $sumcounts_new[] = $sumcounts[$token_order_rev[$token]]; } } - + + # replace old ones $sumdata = $sumdata_new; + $sumcounts = $sumcounts_new; + + unset($sumdata_new, $sumcounts_new); } } @@ -2033,22 +2164,13 @@ public function Search($query, $offset = 0, $limit = 10) $token_pairs = array(); - $i = 1; + # create a lookup array for tokens ( which group they belong ) foreach ( $sumcounts as $k_index => $doc_matches ) { foreach ( $sumdata[$k_index] as $token_id ) { $token_group_lookup[$token_id] = $k_index; } - - # for exact/strict searchmode - $prev_index = $k_index-1; - if ( $prev_index >= 0 ) - { - $exact_ids_lookup[(1<<$k_index)|(1<<$prev_index)] = 1; - } - - ++$i; } # ensure 
that all provided keywords return results @@ -2092,7 +2214,7 @@ public function Search($query, $offset = 0, $limit = 10) } } } - + # number of all inputted keywords ( even non-wanted ones ) $token_count = count($token_order_rev); @@ -2141,26 +2263,6 @@ public function Search($query, $offset = 0, $limit = 10) $total_matches = 0; $tmp_matches = 0; - if ( !empty($exact_pairs) ) - { - # we are looking for keywords in certain order - $exact_mode = true; - } - else - { - # no certain order required - $exact_mode = false; - } - - $disable_score_calculation = false; - if ( isset($this->non_scored_sortmodes[$this->sortmode]) && - $this->group_sort_attr !== "@score" && - $this->group_sort_attr !== "@sentiscore" && - $this->sort_attr !== "@score" ) { - - $disable_score_calculation = true; - } - $last_index = $token_count-1; $scored_count = 0; @@ -2170,9 +2272,23 @@ public function Search($query, $offset = 0, $limit = 10) $group_count = count($encoded_data); $t_matches = array(); - $interval = 10000; - $min_doc_id = 0; - $max_doc_id = $interval + $min_doc_id; + + if ( isset($decode_descend) ) + { + $interval = -10000; + $max_doc_id = max($encode_delta)+1; + $min_doc_id = $max_doc_id + $interval; + $max_doc_id_original = $max_doc_id; + } + else + { + # positive interval + $interval = 10000; + $min_doc_id = 0; + $max_doc_id = $interval + $min_doc_id; + } + + $vals = 0; $stop = false; $total_vals = 0; @@ -2181,7 +2297,7 @@ public function Search($query, $offset = 0, $limit = 10) helper variables + micro-optimization :-) */ - # for BM25 calculation to avoid repetitive function cals + # for BM25 calculation to avoid repetitive function calls $IDF_denominator = log(1+$this->documents_in_collection); foreach ( $sumcounts as $s_ind => $scount ) @@ -2234,38 +2350,9 @@ public function Search($query, $offset = 0, $limit = 10) { $bm25_field_scores[$w_bits] = $weighted_score_lookup[$w_bits]-$value; } - - # sorting by external attribute, no keyword order requirements - if ( !$exact_mode && $disable_score_calculation && $this->matchmode !== PMB_MATCH_STRICT && $this->enabled_fields === $all_fields ) - { - $fast_external_sort = true; - } - else - { - $fast_external_sort = false; - } - - # sorting by external attribute, keyword order requirements apply - if ( ($exact_mode || $this->enabled_fields !== $all_fields) && $disable_score_calculation ) - { - $external_sort = true; - } - else - { - $external_sort = false; - } - # is it enough if only one keyword matches ? 
- if ( $this->matchmode === PMB_MATCH_ANY ) - { - $match_any = true; - } - else - { - $match_any = false; - } - $group_token_count = array(); + # count how many infixes each token main group haves foreach ( $token_list as $group => $token ) { @@ -2278,9 +2365,7 @@ public function Search($query, $offset = 0, $limit = 10) ++$group_token_count[$token_group]; } - - $finished_groups = array(); - + foreach ( $encoded_data as $d_group => $datatata ) { $sorted_groups[$d_group] = $token_group_lookup[$token_list[$d_group]]; @@ -2288,578 +2373,23 @@ public function Search($query, $offset = 0, $limit = 10) asort($sorted_groups); - $start = microtime(true); - $loop = 0; - while ( true ) - { - $finished = 0; - $skipped = 0; - $end = 0; - ++$loop; - - foreach ( $sorted_groups as $group => $token_group ) - { - $group_bits = 1 << $token_group; - $encoded_group = $this->hex_lookup_encode[$group]; - - if ( $encode_delta[$group] >= $max_doc_id ) - { - ++$skipped; - continue; // skip this group - } - else if ( $encode_pointers[$group] > $lengths[$group] && !$undone_values[$group] ) - { - /* - $finished_main_groups[$token_group][$group] = 1; - - if ( count($finished_main_groups[$token_group]) >= $group_token_count[$token_group] ) - { - # just for curiosity at this point - }*/ - - $finished_groups[$group] = 1; - ++$skipped; - ++$end; - continue; - } - - $delta = $encode_delta[$group]; - $temp = 0; - $shift = 0; - $bin_data = &$encoded_data[$group]; # reference to document id data - $docids_len = $lengths[$group]; # length of compressed document id data - $i = $encode_pointers[$group]; # string pointer of compressed document id data - $vals = $undone_values[$group]; # how many match position values waiting to be decoded ( for this group ) - $matchpos_data = &$doc_match_data[$group]; # reference to keyword match position data - $matchpos_len = $doc_lengths[$group]; # keyword match position data length - - if ( isset($temp_doc_ids_storage[$group]) ) - { - $temp_doc_ids = $temp_doc_ids_storage[$group]; - } - else - { - $temp_doc_ids = array(); - } - - # reset undone values - $undone_values[$group] = 0; - - // decode first (min) doc_id of each result group - for ( ; $i < $docids_len ; ++$i ) - { - if ( ($bits = $this->hex_lookup_decode[$bin_data[$i]]) < 128 ) - { - # number is yet to end - $temp |= $bits << $shift*7; - ++$shift; - } - else - { - # number ends - if ( $shift ) - { - $delta = ($temp|($bits-128 << $shift*7))+$delta-1; - $shift = 0; - $temp = 0; - } - else - { - $delta = $bits-129+$delta; - } - - if ( $delta <= $max_doc_id ) - { - $temp_doc_ids[$delta] = 1; - ++$vals; - - // store value only if we are in the current seek-range - if ( isset($t_matches[$delta]) ) - { - $t_matches[$delta] |= $group_bits; - } - else - { - $t_matches[$delta] = $group_bits; - } - } - else - { - ++$finished; - - if ( empty($t_matches_awaiting[$delta][$group]) ) - { - $t_matches_awaiting[$delta][$group] = $group_bits; - } - else - { - $t_matches_awaiting[$delta][$group] |= $group_bits; - } - - break; - } - } - } - - $encode_delta[$group] = $delta; - $encode_pointers[$group] = $i+1; - - if ( $vals && $matchpos_len ) - { - $expl_s = microtime(true); - - $r = 0; - $travel = (int)($avgs[$group]*$vals); - $p = $doc_pos_pointers[$group]+$travel; - if ( $p >= $matchpos_len ) - { - $p = $matchpos_len-1; - $travel = $matchpos_len-$doc_pos_pointers[$group]; - } - - $got = substr_count($doc_match_data[$group], $bin_sep, $doc_pos_pointers[$group], $travel); - - if ( $got < $vals ) - { - $vals = $vals-$got; - while ( true ) - { - if ( 
$matchpos_data[$p] === $bin_sep ) - { - ++$p; - ++$r; - if ( $r === $vals ) - { - break; - } - } - ++$p; - if ( $p >= $matchpos_len ) - { - ++$p; - break; - } - } - --$p; - } - else - { - if ( $got === $vals ) - { - $vals = 1; - } - else - { - $vals = $got-$vals+1; - } - - if ( $matchpos_data[$p] === $bin_sep ) ++$vals; - - while ( true ) - { - if ( $matchpos_data[$p] === $bin_sep ) - { - --$p; - ++$r; - if ( $r === $vals ) - { - break; - } - } - --$p; - if ( $p <= 0 ) - { - $p=-1; - break; - } - } - ++$p; - } - - - $a = $p-$doc_pos_pointers[$group]; - $data = explode($bin_sep, substr($doc_match_data[$group], $doc_pos_pointers[$group], $a)); - $doc_pos_pointers[$group] = $p+1; - - $l = 0; - foreach ( $temp_doc_ids as $doc_id => $string ) - { - if ( !empty($loop_doc_positions[$doc_id]) ) - { - $loop_doc_positions[$doc_id] .= $bin_sep.$encoded_group.$data[$l]; - } - else - { - $loop_doc_positions[$doc_id] = $encoded_group.$data[$l]; - } - ++$l; - } - - unset($temp_doc_ids, $data); - $temp_doc_ids_storage[$group] = array(); - $exp_time = microtime(true)-$expl_s; - } - - # this group is done - if ( $i >= $docids_len ) - { - ++$end; - } - } # <---- foreach group ends - - if ( $end >= $group_count ) $stop = true; - - # all groups have finished, lets check the results - if ( $finished >= $group_count || $skipped >= $group_count || $stop ) - { - if ( $stop ) - { - if ( !empty($t_matches_awaiting) ) - { - foreach ( $t_matches_awaiting as $doc_id => $data ) - { - if ( $doc_id >= $min_doc_id && $doc_id <= $max_doc_id ) - { - foreach ( $t_matches_awaiting[$doc_id] as $group => $bits ) - { - $undone_values[$group] = 1; - $temp_doc_ids_storage[$group][$doc_id] = $bits; - - if ( !empty($t_matches[$doc_id]) ) - { - $t_matches[$doc_id] |= $bits; - } - else - { - $t_matches[$doc_id] = $bits; - } - } - - unset($t_matches_awaiting[$doc_id]); - } - } - } - } - - $t = 0; - - # get documents match position data - foreach ( $t_matches as $doc_id => $bits ) - { - if ( ($bits & $reference_bits) === $goal_bits ) - { - # skip the whole score calculation phase if we are sorting by an external attribute - # and there is no strict keyword order lookup - if ( $fast_external_sort ) - { - $temp_doc_id_sql .= ",$doc_id"; - ++$tmp_matches; - continue; - } - - $match_position_string = &$loop_doc_positions[$doc_id]; - # now calculate document scores - if ( $exact_mode ) - { - $exact_ids_lookup_copy = $exact_ids_lookup; - } - - unset($best_match_score, $phrase_data, $document_count, $sentiment_data); - - $data_len = strlen($loop_doc_positions[$doc_id]); - $phrase_score = 0; - $bm25_score = 0; - $self_score = 0; - $maxscore_total = 0; - $sentiscore = 0; - $position_storage = $last_pos_lookup; - $exact_match = false; - - $t_group = $this->hex_lookup_decode[$match_position_string[0]]; - $qind = $sorted_groups[$t_group]; - $prev_group = $qind-1; + $start = microtime(true); + $loop = 0; + $total_documents = 0; - # initialize temporary array variables for each token group - $phrase_data[$qind] = 0; # for phrase score bits - $document_count[$qind] = 0; # how many documents for this token group - $best_match_score[$qind] = $score_lookup_alt[$t_group]; # maxscore ( token quality ) - - $temp = 0; - $shift = 0; - $delta = 1; - $x = 0; - - for ( $i = 1 ; $i < $data_len ; ++$i ) - { - $bits = $this->hex_lookup_decode[$match_position_string[$i]]; - - if ( $bits === 128 ) - { - # increase document match count for the previous token group ( if sentiment analysis is on, decrement the count by one ) - $document_count[$qind] += $x - 
$this->sentiment_index; - - # zero, as in binary separator - # token changes - - ++$i; # first char will be the group - $t_group = $this->hex_lookup_decode[$match_position_string[$i]]; - $qind = $sorted_groups[$t_group]; - $prev_group = $qind-1; - - if ( !isset($best_match_score[$qind]) ) - { - # initialize temporary array variables for each token group - $phrase_data[$qind] = 0; # for phrase score bits - $document_count[$qind] = 0; # how many documents for this token group - $best_match_score[$qind] = 0; # maxscore ( token quality ) - } - - # better quality score for this result group - if ( $score_lookup_alt[$t_group] > $best_match_score[$qind] ) - { - $best_match_score[$qind] = $score_lookup_alt[$t_group]; - } - - # reset temporary variables - $temp = 0; - $shift = 0; - $delta = 1; - $x = 0; - - } - else if ( $bits < 128 ) - { - # number is yet to end - # check also gere if shift is === 0 ( then temp = bits; ) - $temp |= $bits << $shift*7; - ++$shift; - } - else - { - # 8th bit is set, number ends here ! - - if ( $x < $this->sentiment_index ) - { - $sentiscore += ($temp|($bits-128 << $shift*7))-128; - $temp = 0; - $shift = 0; - } - else - { - # otherwise this value is keyword position in document - if ( $shift ) - { - $delta = ($temp|($bits-128 << $shift*7))+$delta-1; - $shift = 0; - $temp = 0; - } - else - { - $delta = $bits-129+$delta; - } - - $field_id_bit = 1 << ($delta & $this->lsbits); - - if ( $field_id_bit & $this->enabled_fields ) - { - $field_pos = $delta >> $this->field_id_width; - - # self score match - $self_score |= $field_id_bit; - - # if there is a match in the same field - if ( $position_storage[$field_id_bit][$prev_group] === $field_pos-1 ) - { - $phrase_data[$qind] |= $field_id_bit; - - if ( $exact_mode ) - { - unset($exact_ids_lookup_copy[(1<<$qind)|(1<<$prev_group)]); - } - } - # if field_pos is 1 and token group is 0 -> exact match - else if ( $field_pos+$qind === 1 ) - { - $exact_match = true; - } - - $position_storage[$field_id_bit][$qind] = $field_pos; - } - } - - ++$x; - } - } - - if ( !$self_score ) - { - # self_score is zero => none of the keywords were found on enabled fields - # this document is not a match - continue; - } - - $document_count[$qind] += $x - $this->sentiment_index; - - if ( $exact_mode && !empty($exact_ids_lookup_copy) ) - { - # exact mode is on but document does not - # satisfy strict keyword order conditions - continue; - } - else if ( $this->matchmode === PMB_MATCH_STRICT && !$exact_match ) - { - # strict matchmode's requirements not satisfied - continue; - } - - ++$total_matches; - - # skip rest of the score calculation - # documents are ranked by an external attribute - if ( $external_sort ) - { - $temp_doc_id_sql .= ",$doc_id"; - continue; - } - - foreach ( $phrase_data as $vind => $value ) - { - $phrase_score += $weighted_score_lookup[$value]; - $maxscore_total += $best_match_score[$vind]; - - if ( $sentimode && $this->sentiweight ) - { - # if field_weights are applied also sentiment scores - $sentiscore += $bm25_field_scores[$value]; - } - - $effective_match_count = $bm25_field_scores[$value] + $document_count[$vind]; - - $bm25_score += $effective_match_count * $IDF_lookup[$vind] / ($effective_match_count+1.2); - } - - # calculate self_score - $final_self_score = $weighted_score_lookup[$self_score]; - - # is quality scoring enabled ? 
- if ( $this->quality_scoring ) - { - $score_multiplier = $maxscore_total/count($phrase_data); - } - else - { - $score_multiplier = 1; - } - - switch ( $this->rankmode ) - { - case PMB_RANK_PROXIMITY_BM25: - $this->temp_matches[$doc_id] = (int)((($phrase_score + $final_self_score) * 1000 + round((0.5 + $bm25_score / $bm25_token_count) * 999)) * $score_multiplier); - break; - - case PMB_RANK_BM25: - $this->temp_matches[$doc_id] = (int)(round((0.5 + $bm25_score / $bm25_token_count) * 999) * $score_multiplier); - break; - - case PMB_RANK_PROXIMITY: - $this->temp_matches[$doc_id] = (int)((($phrase_score + $final_self_score) * 1000) * $score_multiplier); - break; - } - - - # special case: store sentiment score if sorting/grouping by sentiment score - if ( $sentimode ) - { - $this->temp_sentiscores[$doc_id] = $sentiscore; - } - - /* - at this point, check how many temp_matches we have - if count(temp_matches) > 10000, sort and keep only 1000 best matches - */ - - if ( $total_matches % $this->temp_grouper_size === 0 ) - { - # sort results - arsort($this->temp_matches); - - /* if grouping is enabled, it should be done at this point*/ - if ( $this->groupmode > 1 ) - { - $this->GroupTemporaryResults(); - } - else - { - # keep only $this->max_results - $this->temp_matches = array_slice($this->temp_matches, 0, $this->max_results, true); - } - - if ( $sentimode ) - { - $t_sentiscores = array(); - # rewrite sentiment score data - foreach ( $this->temp_matches as $t_doc_id => $doc_score ) - { - $t_sentiscores[$t_doc_id] = $this->temp_sentiscores[$t_doc_id]; - unset($this->temp_sentiscores[$t_doc_id]); - } - $this->temp_sentiscores = $t_sentiscores; - unset($t_sentiscores); - } - } - - } - } - - $min_doc_id += $interval; - $max_doc_id += $interval; - - unset($t_matches, $loop_doc_groups, $loop_doc_positions); - $t_matches = array(); - $loop_doc_positions = array(); - $loop_doc_groups = array(); - - if ( !empty($t_matches_awaiting) ) - { - foreach ( $t_matches_awaiting as $doc_id => $data ) - { - if ( $doc_id >= $min_doc_id && $doc_id <= $max_doc_id ) - { - foreach ( $t_matches_awaiting[$doc_id] as $group => $bits ) - { - $undone_values[$group] = 1; - $temp_doc_ids_storage[$group][$doc_id] = $bits; - - if ( !empty($t_matches[$doc_id]) ) - { - $t_matches[$doc_id] |= $bits; - } - else - { - $t_matches[$doc_id] = $bits; - } - } - - unset($t_matches_awaiting[$doc_id]); - } - else if ( $doc_id < $min_doc_id ) - { - unset($t_matches_awaiting[$doc_id]); - } - } - } - else if ( $stop ) - { - break; - } - } + if ( isset($decode_ascend) ) + { + include("decode_asc.php"); + } + else + { + include("decode_desc.php"); + } - } # <------ while ( true ) ends - unset($loop_doc_positions, $loop_doc_groups, $t_matches, $t_matches_awaiting); if ( $tmp_matches ) $total_matches = $tmp_matches; - + $this->result["stats"]["processing_time"] = microtime(true) - $data_start; $this->result["total_matches"] = $total_matches; @@ -2868,7 +2398,19 @@ public function Search($query, $offset = 0, $limit = 10) # no results return $this->result; } - + else if ( isset($id_sort_enabled) && !isset($id_sort_goal) ) + { + $max_value = ($tmp_matches > $total_matches) ? 
$t_matches : $total_matches; + + if ( $max_value <= ($fast_ext_sort_req_count - $limit) ) + { + $this->result["out_of_bounds"] = 1; + return $this->result; + } + } + + $postprocessing_start = microtime(true); + /* at this point we know all matching document ids ( - minus filterby ) find out all the external attributes ( columns ) that should be fetched @@ -3313,29 +2855,22 @@ public function Search($query, $offset = 0, $limit = 10) } // special case: results are to be ordered by document id ( and grouping is disabled ) else if ( $disable_score_calculation && $this->sort_attr === "@id" && $this->groupmode === 1 ) - { + { # parse results from the document id list ( for the sql ) $temp_doc_id_sql[0] = " "; $temp_doc_id_sql = trim($temp_doc_id_sql); - - $value_count = substr_count($temp_doc_id_sql, ",") + 1; - $temp_ids_len = strlen($temp_doc_id_sql); - $item_len = $temp_ids_len / $value_count; - $required_amount = $limit + $offset; - $required_len = round($required_amount * $item_len * 1.5); + if ( $this->sortdirection === "asc" ) { # results with smallest ids first - $arr = explode(",", substr($temp_doc_id_sql, 0, $required_len)); - # sort values + $arr = explode(",", $temp_doc_id_sql); sort($arr); } else { - # results with highest ids - $arr = explode(",", substr($temp_doc_id_sql, -$required_len)); - # sort values + # results are in reverse order ( biggest @id first ) + $arr = explode(",", $temp_doc_id_sql); rsort($arr); } @@ -3579,11 +3114,11 @@ public function Search($query, $offset = 0, $limit = 10) } } + $ext_docinfo_start = microtime(true); + # fetch docinfo separately if ( $this->index_type == 1 ) { - $ext_docinfo_start = microtime(true); - $docsql = "SELECT ID as doc_id, SUBSTRING(field0, 1, 150) AS title, URL, field1 AS content, field3 as meta FROM PMBDocinfo".$this->suffix." WHERE ID IN (".implode(",", array_keys($this->result["matches"])).")"; $docpdo = $this->db_connection->query($docsql); @@ -3595,8 +3130,6 @@ public function Search($query, $offset = 0, $limit = 10) $this->result["matches"][(int)$row["doc_id"]][$column] = $column_value; } } - - $this->result["stats"]["ext_docinfo_time"] = microtime(true)-$ext_docinfo_start; } # if user has requested that original data must be included with the results else if ( $this->include_original_data && !empty($this->sql_body) ) @@ -3658,6 +3191,8 @@ public function Search($query, $offset = 0, $limit = 10) } } } + + $this->result["stats"]["ext_docinfo_time"] = microtime(true)-$ext_docinfo_start; } catch ( PDOException $e ) { diff --git a/README.txt b/README.txt index da142fa..1635f23 100644 --- a/README.txt +++ b/README.txt @@ -7,7 +7,7 @@ Published: 18.07.2017 copyright 2015-2017 Henri Ruutinen email: henri.ruutinen@gmail.com -website: http://www.hollilla.com/pickmybrain +website: http://www.pickmybra.in Overview @@ -47,7 +47,7 @@ under the GPLv3, or purchase a commercial Pickmybrain source license. If you're interested in commercial licensing, please see the Pickmybrain web site: - http://www.hollilla.com/pickmybrain + http://www.pickmybra.in Compatibility @@ -75,7 +75,7 @@ Getting Pickmybrain ------------------- The latest version is available from: -http://www.hollilla.com/pickmybrain +http://www.pickmybra.in Installation diff --git a/autostop.php b/autostop.php index 1e80569..5c3429e 100644 --- a/autostop.php +++ b/autostop.php @@ -13,6 +13,8 @@ $total_size = 0; if ( $enable_exec && $enable_ext_sorting ) { + $not_readable = array(); + for ( $i = 0 ; $i < $dist_threads ; ++$i ) { $filename = $directory . 
"/datatemp_".$index_id."_".$i.".txt"; @@ -20,6 +22,36 @@ { $total_size += filesize($filename); } + else + { + # add into another array for later inspection + $not_readable[$i] = 1; + } + } + + # some files couldn't be read - they were still open in another process + # try accessing them now + if ( !empty($not_readable) ) + { + foreach ( $not_readable as $i => $error_count ) + { + $filename = $directory . "/datatemp_".$index_id."_".$i.".txt"; + + while ( $not_readable[$i] < 10 ) + { + if ( is_readable($filename) ) + { + # everything is OK now ! + $not_readable[$i] = 10; + $total_size += filesize($filename); + } + else + { + ++$not_readable[$i]; + usleep(300000); # wait for 300ms + } + } + } } } else diff --git a/db_tokenizer.php b/db_tokenizer.php index dc07273..44c7817 100644 --- a/db_tokenizer.php +++ b/db_tokenizer.php @@ -29,6 +29,9 @@ require_once("tokenizer_functions.php"); } +# set process state on +SetProcessState($index_id, $process_number, 1); + register_shutdown_function($shutdown_function); define("CHARSET_REGEXP", "/[^" . $charset . preg_quote(implode("", $blend_chars)) . "]/u"); @@ -591,9 +594,6 @@ } } - # set process state on - SetProcessState($index_id, $process_number, 1); - # fetch data from ( external ) database $mainpdo = $ext_connection->query($main_sql_query); } @@ -883,6 +883,17 @@ $fields[$f_id] = $field; } + /* + INSERT CUSTOM FUNCTIONS HERE + */ + if ( !empty($approved_custom_functions) ) + { + foreach ( $approved_custom_functions as $f_name ) + { + + } + } + foreach ( $fields as $f_id => $field ) { $pos = 1; diff --git a/db_tokenizer_ext.php b/db_tokenizer_ext.php index d1b86dc..5f0911d 100644 --- a/db_tokenizer_ext.php +++ b/db_tokenizer_ext.php @@ -29,6 +29,9 @@ require_once("tokenizer_functions.php"); } +# set process state on +SetProcessState($index_id, $process_number, 1); + register_shutdown_function($shutdown_function); define("CHARSET_REGEXP", "/[^" . $charset . preg_quote(implode("", $blend_chars)) . 
"]/u"); @@ -545,9 +548,6 @@ } } - # set process state on - SetProcessState($index_id, $process_number, 1); - # fetch data from ( external ) database $mainpdo = $ext_connection->query($main_sql_query); } diff --git a/decode_asc.php b/decode_asc.php new file mode 100644 index 0000000..6b46a51 --- /dev/null +++ b/decode_asc.php @@ -0,0 +1,619 @@ + $token_group ) + { + $group_bits = 1 << $token_group; + $encoded_group = $this->hex_lookup_encode[$group]; + + if ( $encode_delta[$group] >= $max_doc_id ) + { + ++$skipped; + continue; // skip this group + } + else if ( $encode_pointers[$group] > $lengths[$group] && !$undone_values[$group] ) + { + ++$skipped; + ++$end; + continue; + } + + $delta = $encode_delta[$group]; + $temp = 0; + $shift = 0; + $bin_data = &$encoded_data[$group]; # reference to document id data + $docids_len = $lengths[$group]; # length of compressed document id data + $i = $encode_pointers[$group]; # string pointer of compressed document id data + $vals = $undone_values[$group]; # how many match position values waiting to be decoded ( for this group ) + $matchpos_data = &$doc_match_data[$group]; # reference to keyword match position data + $matchpos_len = $doc_lengths[$group]; # keyword match position data length + + if ( isset($temp_doc_ids_storage[$group]) ) + { + $temp_doc_ids = $temp_doc_ids_storage[$group]; + } + else + { + $temp_doc_ids = array(); + } + + # reset undone values + $undone_values[$group] = 0; + + // decode first (min) doc_id of each result group + for ( ; $i < $docids_len ; ++$i ) + { + if ( ($bits = $this->hex_lookup_decode[$bin_data[$i]]) < 128 ) + { + # number is yet to end + $temp |= $bits << $shift*7; + ++$shift; + } + else + { + # number ends + if ( $shift ) + { + $delta = ($temp|($bits-128 << $shift*7))+$delta-1; + $shift = 0; + $temp = 0; + } + else + { + $delta = $bits-129+$delta; + } + + if ( $delta <= $max_doc_id ) + { + $temp_doc_ids[$delta] = 1; + ++$vals; + + // store value only if we are in the current seek-range + if ( isset($t_matches[$delta]) ) + { + $t_matches[$delta] |= $group_bits; + } + else + { + $t_matches[$delta] = $group_bits; + } + } + else + { + ++$finished; + + if ( empty($t_matches_awaiting[$delta][$group]) ) + { + $t_matches_awaiting[$delta][$group] = $group_bits; + } + else + { + $t_matches_awaiting[$delta][$group] |= $group_bits; + } + + break; + } + } + } + + $encode_delta[$group] = $delta; + $encode_pointers[$group] = $i+1; + + if ( $vals && $matchpos_len ) + { + $r = 0; + $travel = (int)($avgs[$group]*$vals); + $p = $doc_pos_pointers[$group]+$travel; + if ( $p >= $matchpos_len ) + { + $p = $matchpos_len-1; + $travel = $matchpos_len-$doc_pos_pointers[$group]; + } + + $got = substr_count($doc_match_data[$group], $bin_sep, $doc_pos_pointers[$group], $travel); + + if ( $got < $vals ) + { + $vals = $vals-$got; + while ( true ) + { + if ( $matchpos_data[$p] === $bin_sep ) + { + ++$p; + ++$r; + if ( $r === $vals ) + { + break; + } + } + ++$p; + if ( $p >= $matchpos_len ) + { + ++$p; + break; + } + } + --$p; + } + else + { + if ( $got === $vals ) + { + $vals = 1; + } + else + { + $vals = $got-$vals+1; + } + + if ( $matchpos_data[$p] === $bin_sep ) ++$vals; + + while ( true ) + { + if ( $matchpos_data[$p] === $bin_sep ) + { + --$p; + ++$r; + if ( $r === $vals ) + { + break; + } + } + --$p; + if ( $p <= 0 ) + { + $p=-1; + break; + } + } + ++$p; + } + + + $a = $p-$doc_pos_pointers[$group]; + $data = explode($bin_sep, substr($doc_match_data[$group], $doc_pos_pointers[$group], $a)); + $doc_pos_pointers[$group] = $p+1; + + $l = 0; + 
foreach ( $temp_doc_ids as $doc_id => $string ) + { + if ( !empty($loop_doc_positions[$doc_id]) ) + { + $loop_doc_positions[$doc_id] .= $bin_sep.$encoded_group.$data[$l]; + } + else + { + $loop_doc_positions[$doc_id] = $encoded_group.$data[$l]; + } + + ++$l; + } + + unset($temp_doc_ids, $data); + $temp_doc_ids_storage[$group] = array(); + } + + # this group is done + if ( $i >= $docids_len ) + { + ++$end; + } + } # <---- foreach group ends + + if ( $end >= $group_count ) $stop = true; + + # all groups have finished, lets check the results + if ( $finished >= $group_count || $skipped >= $group_count || $stop ) + { + if ( $stop ) + { + if ( !empty($t_matches_awaiting) ) + { + foreach ( $t_matches_awaiting as $doc_id => $data ) + { + if ( $doc_id >= $min_doc_id && $doc_id <= $max_doc_id ) + { + foreach ( $t_matches_awaiting[$doc_id] as $group => $bits ) + { + $undone_values[$group] = 1; + $temp_doc_ids_storage[$group][$doc_id] = $bits; + + if ( !empty($t_matches[$doc_id]) ) + { + $t_matches[$doc_id] |= $bits; + } + else + { + $t_matches[$doc_id] = $bits; + } + } + + unset($t_matches_awaiting[$doc_id]); + } + } + } + } + + $t = 0; + $prev_tmp_matches = $tmp_matches; + $total_documents += count($t_matches); + + # get documents match position data + foreach ( $t_matches as $doc_id => $bits ) + { + if ( ($bits & $reference_bits) === $goal_bits ) + { + # skip the whole score calculation phase if we are sorting by an external attribute + # and there is no strict keyword order lookup + if ( $fast_external_sort ) + { + $temp_doc_id_sql .= ",$doc_id"; + ++$tmp_matches; + continue; + } + # now calculate document scores + else if ( $exact_mode ) + { + $exact_ids_lookup_copy = $exact_ids_lookup; + } + + # reset old variables + unset($best_match_score, $phrase_data, $document_count, $sentiment_data); + + $match_position_string = &$loop_doc_positions[$doc_id]; + $data_len = strlen($loop_doc_positions[$doc_id]); + $phrase_score = 0; + $bm25_score = 0; + $self_score = 0; + $maxscore_total = 0; + $sentiscore = 0; + $position_storage = $last_pos_lookup; + $strict_match = 0; + + $t_group = $this->hex_lookup_decode[$match_position_string[0]]; + $qind = $sorted_groups[$t_group]; + $prev_group = $qind-1; + + # initialize temporary array variables for each token group + $phrase_data[$qind] = 0; # for phrase score bits + $document_count[$qind] = 0; # how many documents for this token group + $best_match_score[$qind] = $score_lookup_alt[$t_group]; # maxscore ( token quality ) + + $temp = 0; + $shift = 0; + $delta = 1; + $x = 0; + + for ( $i = 1 ; $i < $data_len ; ++$i ) + { + $bits = $this->hex_lookup_decode[$match_position_string[$i]]; + + if ( $bits === 128 ) + { + # increase document match count for the previous token group ( if sentiment analysis is on, decrement the count by one ) + $document_count[$qind] += $x - $this->sentiment_index; + + # zero, as in binary separator + # token changes + + ++$i; # first char will be the group + $t_group = $this->hex_lookup_decode[$match_position_string[$i]]; + $qind = $sorted_groups[$t_group]; + $prev_group = $qind-1; + + if ( !isset($best_match_score[$qind]) ) + { + # initialize temporary array variables for each token group + $phrase_data[$qind] = 0; # for phrase score bits + $document_count[$qind] = 0; # how many documents for this token group + $best_match_score[$qind] = $score_lookup_alt[$t_group]; # maxscore ( token quality ) + } + # better quality score for this result group + else if ( $score_lookup_alt[$t_group] > $best_match_score[$qind] ) + { + 
$best_match_score[$qind] = $score_lookup_alt[$t_group]; + } + + # reset temporary variables + $temp = 0; + $shift = 0; + $delta = 1; + $x = 0; + + } + else if ( $bits < 128 ) + { + # number is yet to end + # check also gere if shift is === 0 ( then temp = bits; ) + $temp |= $bits << $shift*7; + ++$shift; + } + else + { + # 8th bit is set, number ends here ! + + if ( $x < $this->sentiment_index ) + { + $sentiscore += ($temp|($bits-128 << $shift*7))-128; + $temp = 0; + $shift = 0; + } + else + { + # otherwise this value is keyword position in document + if ( $shift ) + { + $delta = ($temp|($bits-128 << $shift*7))+$delta-1; + $shift = 0; + $temp = 0; + } + else + { + $delta = $bits-129+$delta; + } + + $field_id_bit = 1 << ($delta & $this->lsbits); + + if ( $field_id_bit & $this->enabled_fields ) + { + $field_pos = $delta >> $this->field_id_width; + + # self score match + $self_score |= $field_id_bit; + + # if there is a match in the same field + if ( $position_storage[$field_id_bit][$prev_group] === $field_pos-1 ) + { + $phrase_data[$qind] |= $field_id_bit; + + if ( $exact_mode ) + { + unset($exact_ids_lookup_copy[(1<<$qind)|(1<<$prev_group)]); + } + } + # if field_pos is 1 and token group is 0 -> strict match + else if ( $field_pos+$qind === 1 ) + { + $strict_match = 1; + } + + $position_storage[$field_id_bit][$qind] = $field_pos; + } + } + + ++$x; + } + } + + if ( !$self_score ) + { + # self_score is zero => none of the keywords were found on enabled fields + # this document is not a match + continue; + } + else if ( $exact_mode && !empty($exact_ids_lookup_copy) ) + { + # exact mode is on but document does not + # satisfy strict keyword order conditions + continue; + } + else if ( $strict_match_cmp_value > $strict_match ) + { + # strict matchmode's requirements not satisfied + continue; + } + + ++$total_matches; + + # skip rest of the score calculation + # documents are ranked by an external attribute + if ( $external_sort ) + { + $temp_doc_id_sql .= ",$doc_id"; + continue; + } + + # how many matches for this keyword + $document_count[$qind] += $x - $this->sentiment_index; + + foreach ( $phrase_data as $vind => $value ) + { + $phrase_score += $weighted_score_lookup[$value]; + $maxscore_total += $best_match_score[$vind]; + + if ( $sentimode && $this->sentiweight ) + { + # if field_weights are applied also sentiment scores + $sentiscore += $bm25_field_scores[$value]; + } + + $effective_match_count = $bm25_field_scores[$value] + $document_count[$vind]; + + $bm25_score += $effective_match_count * $IDF_lookup[$vind] / ($effective_match_count+1.2); + } + + # calculate self_score + $final_self_score = $weighted_score_lookup[$self_score]; + + # is quality scoring enabled ? 
+ if ( $this->quality_scoring ) + { + $score_multiplier = $maxscore_total/count($phrase_data); + } + else + { + $score_multiplier = 1; + } + + switch ( $this->rankmode ) + { + case PMB_RANK_PROXIMITY_BM25: + $this->temp_matches[$doc_id] = (int)((($phrase_score + $final_self_score) * 1000 + round((0.5 + $bm25_score / $bm25_token_count) * 999)) * $score_multiplier); + break; + + case PMB_RANK_BM25: + $this->temp_matches[$doc_id] = (int)(round((0.5 + $bm25_score / $bm25_token_count) * 999) * $score_multiplier); + break; + + case PMB_RANK_PROXIMITY: + $this->temp_matches[$doc_id] = (int)((($phrase_score + $final_self_score) * 1000) * $score_multiplier); + break; + } + + + # special case: store sentiment score if sorting/grouping by sentiment score + if ( $sentimode ) + { + $this->temp_sentiscores[$doc_id] = $sentiscore; + } + + /* + at this point, check how many temp_matches we have + if count(temp_matches) > 10000, sort and keep only 1000 best matches + */ + + if ( $total_matches % $this->temp_grouper_size === 0 ) + { + # sort results + arsort($this->temp_matches); + + /* if grouping is enabled, it should be done at this point*/ + if ( $this->groupmode > 1 ) + { + $this->GroupTemporaryResults(); + } + else + { + # keep only $this->max_results + $this->temp_matches = array_slice($this->temp_matches, 0, $this->max_results, true); + } + + if ( $sentimode ) + { + $t_sentiscores = array(); + # rewrite sentiment score data + foreach ( $this->temp_matches as $t_doc_id => $doc_score ) + { + $t_sentiscores[$t_doc_id] = $this->temp_sentiscores[$t_doc_id]; + unset($this->temp_sentiscores[$t_doc_id]); + } + $this->temp_sentiscores = $t_sentiscores; + unset($t_sentiscores); + } + } + + } + } + + # if sorting by @id is enabled and we have enough results + if ( $tmp_matches >= $fast_ext_sort_req_count || $total_matches >= $fast_ext_sort_req_count ) + { + # we have found $fast_ext_sort_req_count results + $id_sort_goal = true; + + if ( $total_matches ) $tmp_matches = $total_matches; + + # veey approximate number of results + $approximate_docs = round(($tmp_matches / $total_documents) * $this->documents_in_collection); + + # set the flag on for approximate result count + $this->result["approximate_count"] = 1; + + # the maximum amount of matches is the + $keyword_count = count($sumcounts); + $match_sum = array_sum($sumcounts); + + # any keyword === match + if ( $this->matchmode === 1 ) + { + $minimum_matches = max($sumcounts); + + if ( $approximate_docs > $match_sum ) + { + $approximate_docs = $match_sum * 0.9; + } + else if ( $approximate_docs < $minimum_matches ) + { + # any keyword matches will do + $approximate_docs = $minimum_matches; + } + } + # all keywords === match + else + { + $maximum_matches = min($sumcounts); + + if ( $approximate_docs < $tmp_matches ) + { + $approximate_docs = $tmp_matches; + } + else if ( $approximate_docs > $maximum_matches ) + { + $approximate_docs = $maximum_matches; + } + } + + if ( $approximate_docs >= 100 ) + { + $tmp_matches = (int)round($approximate_docs, -2); + } + else + { + $tmp_matches = $approximate_docs; + } + + break; + } + + $min_doc_id += $interval; + $max_doc_id += $interval; + + unset($t_matches, $loop_doc_groups, $loop_doc_positions); + $t_matches = array(); + $loop_doc_positions = array(); + $loop_doc_groups = array(); + + if ( !empty($t_matches_awaiting) ) + { + foreach ( $t_matches_awaiting as $doc_id => $data ) + { + if ( $doc_id >= $min_doc_id && $doc_id <= $max_doc_id ) + { + foreach ( $t_matches_awaiting[$doc_id] as $group => $bits ) + { + 
$undone_values[$group] = 1; + $temp_doc_ids_storage[$group][$doc_id] = $bits; + + if ( !empty($t_matches[$doc_id]) ) + { + $t_matches[$doc_id] |= $bits; + } + else + { + $t_matches[$doc_id] = $bits; + } + } + + unset($t_matches_awaiting[$doc_id]); + } + else if ( $doc_id < $min_doc_id ) + { + unset($t_matches_awaiting[$doc_id]); + } + } + } + else if ( $stop ) + { + break; + } + } # <------- # (all groups have finished, lets check the results) if block ends + + } # <---------# while ( true ) ends + +?> \ No newline at end of file diff --git a/decode_desc.php b/decode_desc.php new file mode 100644 index 0000000..6c34462 --- /dev/null +++ b/decode_desc.php @@ -0,0 +1,617 @@ + $doc_id ) + { + if ( $doc_id >= $min_doc_id && $doc_id <= $max_doc_id ) + { + $group_bits = 1 << $sorted_groups[$t_index]; + $t_matches_awaiting[$doc_id][$t_index] = $group_bits; + $temp_doc_ids_storage[$t_index][$doc_id] = 1; + + // store value only if we are in the current seek-range + if ( isset($t_matches[$doc_id]) ) + { + $t_matches[$doc_id] |= $group_bits; + } + else + { + $t_matches[$doc_id] = $group_bits; + } + + ++$undone_values[$t_index]; + + unset($maximum_doc_ids_vals[$t_index]); + } + } + } + + foreach ( $sorted_groups as $group => $token_group ) + { + $group_bits = 1 << $token_group; + $encoded_group = $this->hex_lookup_encode[$group]; + + if ( $encode_delta[$group] < $min_doc_id ) + { + ++$skipped; + continue; // skip this group + } + else if ( $encode_pointers[$group] <= 0 && !$undone_values[$group] ) + { + ++$skipped; + ++$end; + continue; + } + + $delta = $encode_delta[$group]; + $temp = $encode_temp_docs[$group]; + $bin_data = &$encoded_data[$group]; # reference to document id data + $i = $encode_pointers[$group]; # string pointer of compressed document id data + $vals = $undone_values[$group]; # how many match position values waiting to be decoded ( for this group ) + $matchpos_data = &$doc_match_data[$group]; # reference to keyword match position data + $matchpos_len = $doc_lengths[$group]; # keyword match position data length + + if ( isset($temp_doc_ids_storage[$group]) ) + { + $temp_doc_ids = $temp_doc_ids_storage[$group]; + } + else + { + $temp_doc_ids = array(); + } + + # reset undone values + $undone_values[$group] = 0; + + // decode first (min) doc_id of each result group + while ( $i ) + { + --$i; + + if ( ($next_bits = $this->hex_lookup_decode[$bin_data[$i]]) < 128 ) + { + $temp = ($temp << 7) | $next_bits; # the new bits get added as LSBs + } + else + { + $delta = $delta - $temp + 1; + $temp = $next_bits-128; + + if ( $delta >= $min_doc_id ) + { + $temp_doc_ids[$delta] = 1; + ++$vals; + + // store value only if we are in the current seek-range + if ( isset($t_matches[$delta]) ) + { + $t_matches[$delta] |= $group_bits; + } + else + { + $t_matches[$delta] = $group_bits; + } + } + else + { + ++$finished; + + if ( empty($t_matches_awaiting[$delta][$group]) ) + { + $t_matches_awaiting[$delta][$group] = $group_bits; + } + else + { + $t_matches_awaiting[$delta][$group] |= $group_bits; + } + + break; + } + } + } + + $encode_delta[$group] = $delta; + $encode_temp_docs[$group] = $temp; + $encode_pointers[$group] = $i; + + if ( $vals && $matchpos_len ) + { + $travel = (int)($avgs[$group]*$vals); + $p = $doc_pos_pointers[$group]-$travel; # maybe something funny here ? 
len 100 , travel 4 => start 96 + + if ( $p < 0 ) + { + $p = 0; # cannot be smaller than 0 + $travel = $doc_pos_pointers[$group]; # to get to 0 from doc_pos_pointers, doc_pos_pointers chars must be travelled + if ( $travel < 1 ) + { + $travel = 1; + } + } + + $got = substr_count($doc_match_data[$group], $bin_sep, $p, $travel); + $balance = $got-$vals; + + if ( $balance >= 0 ) + { + # increment the pointer ( forward towards end of the string) + do + { + if ( $matchpos_data[$p] === $bin_sep ) + { + --$balance; + } + ++$p; + + } while ( $balance !== -1 && $p !== $matchpos_len ); + } + else + { + # balance is negative, we got less values than needed + if ( $p ) + { + do + { + --$p; + if ( $p !== -1 && $matchpos_data[$p] === $bin_sep ) + { + ++$balance; + } + } while ( $balance !== 0 && $p !== -1 ); + + # go forward to the first non binary separator char + ++$p; + } + } + + $travel_len = $doc_pos_pointers[$group] - $p; + if ( !$travel_len ) $travel_len = 1; + + $data = explode($bin_sep, substr($doc_match_data[$group], $p, $travel_len)); + $doc_pos_pointers[$group] = $p-1; + + $l = $vals-1; + foreach ( $temp_doc_ids as $doc_id => $string ) + { + if ( !empty($loop_doc_positions[$doc_id]) ) + { + $loop_doc_positions[$doc_id] .= $bin_sep.$encoded_group.$data[$l]; + } + else + { + $loop_doc_positions[$doc_id] = $encoded_group.$data[$l]; + } + + --$l; + } + + unset($temp_doc_ids, $data); + $temp_doc_ids_storage[$group] = array(); + } + + # this group is done + if ( $i <= 0 ) + { + ++$end; + } + } # <---- foreach group ends + + if ( $end >= $group_count ) $stop = true; + + # all groups have finished, lets check the results + if ( $finished >= $group_count || $skipped >= $group_count || $stop ) + { + if ( $stop ) + { + if ( !empty($t_matches_awaiting) ) + { + foreach ( $t_matches_awaiting as $doc_id => $data ) + { + if ( $doc_id >= $min_doc_id && $doc_id <= $max_doc_id ) + { + foreach ( $t_matches_awaiting[$doc_id] as $group => $bits ) + { + $undone_values[$group] = 1; + $temp_doc_ids_storage[$group][$doc_id] = $bits; + + if ( !empty($t_matches[$doc_id]) ) + { + $t_matches[$doc_id] |= $bits; + } + else + { + $t_matches[$doc_id] = $bits; + } + } + + unset($t_matches_awaiting[$doc_id]); + } + } + } + } + + $t = 0; + $prev_tmp_matches = $tmp_matches; + $total_documents += count($t_matches); + + # get documents match position data + foreach ( $t_matches as $doc_id => $bits ) + { + if ( ($bits & $reference_bits) === $goal_bits ) + { + # skip the whole score calculation phase if we are sorting by an external attribute + # and there is no strict keyword order lookup + if ( $fast_external_sort ) + { + $temp_doc_id_sql .= ",$doc_id"; + ++$tmp_matches; + continue; + } + else if ( $exact_mode ) + { + $exact_ids_lookup_copy = $exact_ids_lookup; + } + + # reset old variables + unset($best_match_score, $phrase_data, $document_count, $sentiment_data); + + $match_position_string = &$loop_doc_positions[$doc_id]; + $data_len = strlen($loop_doc_positions[$doc_id]); + $phrase_score = 0; + $bm25_score = 0; + $self_score = 0; + $maxscore_total = 0; + $sentiscore = 0; + $position_storage = $last_pos_lookup; + $strict_match = 0; + + $t_group = $this->hex_lookup_decode[$match_position_string[0]]; + $qind = $sorted_groups[$t_group]; + $prev_group = $qind-1; + + # initialize temporary array variables for each token group + $phrase_data[$qind] = 0; # for phrase score bits + $document_count[$qind] = 0; # how many documents for this token group + $best_match_score[$qind] = $score_lookup_alt[$t_group]; # maxscore ( token quality ) + 
+ $temp = 0; + $shift = 0; + $delta = 1; + $x = 0; + + + for ( $i = 1 ; $i < $data_len ; ++$i ) + { + $bits = $this->hex_lookup_decode[$match_position_string[$i]]; + + if ( $bits === 128 ) + { + # increase document match count for the previous token group ( if sentiment analysis is on, decrement the count by one ) + $document_count[$qind] += $x - $this->sentiment_index; + + # zero, as in binary separator + # token changes + + ++$i; # first char will be the group + $t_group = $this->hex_lookup_decode[$match_position_string[$i]]; + $qind = $sorted_groups[$t_group]; + $prev_group = $qind-1; + + if ( !isset($best_match_score[$qind]) ) + { + # initialize temporary array variables for each token group + $phrase_data[$qind] = 0; # for phrase score bits + $document_count[$qind] = 0; # how many documents for this token group + $best_match_score[$qind] = $score_lookup_alt[$t_group]; # maxscore ( token quality ) + } + # better quality score for this result group + else if ( $score_lookup_alt[$t_group] > $best_match_score[$qind] ) + { + $best_match_score[$qind] = $score_lookup_alt[$t_group]; + } + + # reset temporary variables + $temp = 0; + $shift = 0; + $delta = 1; + $x = 0; + + } + else if ( $bits < 128 ) + { + # number is yet to end + # check also gere if shift is === 0 ( then temp = bits; ) + $temp |= $bits << $shift*7; + ++$shift; + } + else + { + # 8th bit is set, number ends here ! + + if ( $x < $this->sentiment_index ) + { + $sentiscore += ($temp|($bits-128 << $shift*7))-128; + $temp = 0; + $shift = 0; + } + else + { + # otherwise this value is keyword position in document + if ( $shift ) + { + $delta = ($temp|($bits-128 << $shift*7))+$delta-1; + $shift = 0; + $temp = 0; + } + else + { + $delta = $bits-129+$delta; + } + + $field_id_bit = 1 << ($delta & $this->lsbits); + + if ( $field_id_bit & $this->enabled_fields ) + { + $field_pos = $delta >> $this->field_id_width; + + # self score match + $self_score |= $field_id_bit; + + # if there is a match in the same field + if ( $position_storage[$field_id_bit][$prev_group] === $field_pos-1 ) + { + $phrase_data[$qind] |= $field_id_bit; + + if ( $exact_mode ) + { + unset($exact_ids_lookup_copy[(1<<$qind)|(1<<$prev_group)]); + } + } + # if field_pos is 1 and token group is 0 -> strict match + else if ( $field_pos+$qind === 1 ) + { + $strict_match = 1; + } + + $position_storage[$field_id_bit][$qind] = $field_pos; + } + } + + ++$x; + } + } + + if ( !$self_score ) + { + # self_score is zero => none of the keywords were found on enabled fields + # this document is not a match + continue; + } + else if ( $exact_mode && !empty($exact_ids_lookup_copy) ) + { + # exact mode is on but document does not + # satisfy strict keyword order conditions + continue; + } + else if ( $strict_match_cmp_value > $strict_match ) + { + # strict matchmode's requirements not satisfied + continue; + } + + ++$total_matches; + + # skip rest of the score calculation + # documents are ranked by an external attribute + if ( $external_sort ) + { + $temp_doc_id_sql .= ",$doc_id"; + continue; + } + + # how many matches for this keyword + $document_count[$qind] += $x - $this->sentiment_index; + + foreach ( $phrase_data as $vind => $value ) + { + $phrase_score += $weighted_score_lookup[$value]; + $maxscore_total += $best_match_score[$vind]; + + if ( $sentimode && $this->sentiweight ) + { + # if field_weights are applied also sentiment scores + $sentiscore += $bm25_field_scores[$value]; + } + + $effective_match_count = $bm25_field_scores[$value] + $document_count[$vind]; + + $bm25_score += 
$effective_match_count * $IDF_lookup[$vind] / ($effective_match_count+1.2); + } + + # calculate self_score + $final_self_score = $weighted_score_lookup[$self_score]; + + # is quality scoring enabled ? + if ( $this->quality_scoring ) + { + $score_multiplier = $maxscore_total/count($phrase_data); + } + else + { + $score_multiplier = 1; + } + + switch ( $this->rankmode ) + { + case PMB_RANK_PROXIMITY_BM25: + $this->temp_matches[$doc_id] = (int)((($phrase_score + $final_self_score) * 1000 + round((0.5 + $bm25_score / $bm25_token_count) * 999)) * $score_multiplier); + break; + + case PMB_RANK_BM25: + $this->temp_matches[$doc_id] = (int)(round((0.5 + $bm25_score / $bm25_token_count) * 999) * $score_multiplier); + break; + + case PMB_RANK_PROXIMITY: + $this->temp_matches[$doc_id] = (int)((($phrase_score + $final_self_score) * 1000) * $score_multiplier); + break; + } + + + # special case: store sentiment score if sorting/grouping by sentiment score + if ( $sentimode ) + { + $this->temp_sentiscores[$doc_id] = $sentiscore; + } + + /* + at this point, check how many temp_matches we have + if count(temp_matches) > 10000, sort and keep only 1000 best matches + */ + + if ( $total_matches % $this->temp_grouper_size === 0 ) + { + # sort results + arsort($this->temp_matches); + + /* if grouping is enabled, it should be done at this point*/ + if ( $this->groupmode > 1 ) + { + $this->GroupTemporaryResults(); + } + else + { + # keep only $this->max_results + $this->temp_matches = array_slice($this->temp_matches, 0, $this->max_results, true); + } + + if ( $sentimode ) + { + $t_sentiscores = array(); + # rewrite sentiment score data + foreach ( $this->temp_matches as $t_doc_id => $doc_score ) + { + $t_sentiscores[$t_doc_id] = $this->temp_sentiscores[$t_doc_id]; + unset($this->temp_sentiscores[$t_doc_id]); + } + $this->temp_sentiscores = $t_sentiscores; + unset($t_sentiscores); + } + } + + } + } + + # if sorting by @id is enabled and we have enough results + if ( ($tmp_matches >= $fast_ext_sort_req_count || $total_matches >= $fast_ext_sort_req_count) ) + { + # we have found $fast_ext_sort_req_count results + $id_sort_goal = true; + + if ( $total_matches ) $tmp_matches = $total_matches; + + # very approximate number of results + $approximate_docs = ($tmp_matches / $total_documents) * $this->documents_in_collection; + + # set the flag on for approximate result count + $this->result["approximate_count"] = 1; + + # the maximum amount of matches is the + $keyword_count = count($sumcounts); + $match_sum = array_sum($sumcounts); + + # any keyword + if ( $this->matchmode === 1 ) + { + $minimum_matches = max($sumcounts); + + if ( $approximate_docs > $match_sum ) + { + $approximate_docs = $match_sum * 0.9; + } + else if ( $approximate_docs < $minimum_matches ) + { + # any keyword matches will do + $approximate_docs = $minimum_matches; + } + } + else + { + $maximum_matches = min($sumcounts); + # all keywords must match + if ( $approximate_docs < $tmp_matches ) + { + $approximate_docs = $tmp_matches; + } + else if ( $approximate_docs > $maximum_matches ) + { + $approximate_docs = $maximum_matches; + } + } + + if ( $approximate_docs >= 100 ) + { + $tmp_matches = (int)round($approximate_docs, -2); + } + else + { + $tmp_matches = $approximate_docs; + } + + break; + } + + $min_doc_id += $interval; + $max_doc_id += $interval; + + unset($t_matches, $loop_doc_groups, $loop_doc_positions); + $t_matches = array(); + $loop_doc_positions = array(); + $loop_doc_groups = array(); + + if ( !empty($t_matches_awaiting) ) + { + foreach ( 
$t_matches_awaiting as $doc_id => $data ) + { + if ( $doc_id >= $min_doc_id && $doc_id <= $max_doc_id ) + { + foreach ( $t_matches_awaiting[$doc_id] as $group => $bits ) + { + $undone_values[$group] = 1; + $temp_doc_ids_storage[$group][$doc_id] = $bits; + + if ( !empty($t_matches[$doc_id]) ) + { + $t_matches[$doc_id] |= $bits; + } + else + { + $t_matches[$doc_id] = $bits; + } + } + + unset($t_matches_awaiting[$doc_id]); + } + else if ( $doc_id > $max_doc_id ) + { + unset($t_matches_awaiting[$doc_id]); + } + } + } + else if ( $stop ) + { + break; + } + } # <------- # (all groups have finished, lets check the results) if block ends + + } # <---------# while ( true ) ends + +?> \ No newline at end of file diff --git a/finalization.php b/finalization.php index b06e526..616b074 100644 --- a/finalization.php +++ b/finalization.php @@ -22,10 +22,11 @@ # delete old docinfo table and replace it with the new one $connection->query("DROP TABLE PMBDocinfo$index_suffix"); $connection->query("ALTER TABLE $docinfo_target_table RENAME TO PMBDocinfo$index_suffix"); - # delete old docinfo table and replace it with the new one + # delete old PMBTokens table and replace it with the new one $connection->query("DROP TABLE PMBTokens$index_suffix"); $connection->query("ALTER TABLE PMBTokens".$index_suffix."_temp RENAME TO PMBTokens$index_suffix"); - # delete old docinfo table and replace it with the new one + $connection->query("ALTER TABLE PMBTokens$index_suffix ADD INDEX(metaphone, doc_matches)"); # add metaphone index + # delete old PMBPrefixes table and replace it with the new one $connection->query("DROP TABLE PMBPrefixes$index_suffix"); $connection->query("ALTER TABLE PMBPrefixes".$index_suffix."_temp RENAME TO PMBPrefixes$index_suffix"); @@ -38,6 +39,7 @@ $connection->beginTransaction(); $connection->query("DROP TABLE IF EXISTS PMBTokens".$index_suffix."_delta"); $connection->query("ALTER TABLE PMBTokens".$index_suffix."_temp RENAME TO PMBTokens".$index_suffix."_delta"); + $connection->query("ALTER TABLE PMBTokens".$index_suffix."_delta ADD INDEX(metaphone, doc_matches)"); # add metaphone index $connection->commit(); $connection->beginTransaction(); @@ -49,10 +51,20 @@ $connection->query("DELETE FROM PMBDocinfo".$index_suffix." WHERE ID > (SELECT max_id FROM PMBIndexes WHERE ID = $index_id)"); $connection->query("INSERT INTO PMBDocinfo".$index_suffix." SELECT * FROM PMBDocinfo".$index_suffix."_delta"); $connection->query("DROP TABLE PMBDocinfo".$index_suffix."_delta"); - + $latest_rotation_sql = ""; } + else + { + # just add the metaphone index + # we are here if we are starting from a scratch or if we are merging to indexes + + echo "Creating a metaphone index...\n"; + $connection->query("ALTER TABLE PMBTokens".$index_suffix." ADD INDEX(metaphone, doc_matches)"); + + } + # delta-indexing is not enable, delete possible old values if ( empty($delta_indexing) ) { $connection->query("DROP TABLE IF EXISTS PMBTokens".$index_suffix."_delta"); diff --git a/livesearch.php b/livesearch.php index 438f6ad..0f5ce23 100644 --- a/livesearch.php +++ b/livesearch.php @@ -11,7 +11,7 @@ ini_set("display_errors", 1); error_reporting(E_ALL); -set_time_limit(10); +set_time_limit(3); /* livesearch.php @@ -75,16 +75,27 @@ } $result = $pickmybrain->Search($_GET["q"], $offset, $slots); - + if ( !empty($result["matches"]) ) { + $about = ""; + if ( isset($result["approximate_count"]) ) + { + $about = "About"; + } + $pagenumber = ""; if ( $offset > 0 ) { $pagenumber = "Page " . (round($offset/$slots)+1) . 
" of "; + if ( isset($result["approximate_count"]) ) + { + $about = ""; + $pagenumber .= " about "; + } } - echo "
$pagenumber" . $result["total_matches"]." results ( ".round(ceil($result["query_time"]*1000)/1000, 3)." seconds )
"; + echo "
$about $pagenumber" . $result["total_matches"]." results ( ".round(ceil($result["query_time"]*1000)/1000, 3)." seconds )
"; if ( !empty($result["did_you_mean"]) ) { diff --git a/prefix_composer_ext.php b/prefix_composer_ext.php index db37c3a..6c5f62d 100644 --- a/prefix_composer_ext.php +++ b/prefix_composer_ext.php @@ -59,7 +59,6 @@ register_shutdown_function("shutdown", $index_id, $process_number); } - # dialect processing if ( $dialect_processing ) { diff --git a/prefix_compressor_ext.php b/prefix_compressor_ext.php index 3ab39fe..6d9b0ac 100644 --- a/prefix_compressor_ext.php +++ b/prefix_compressor_ext.php @@ -51,7 +51,7 @@ require "data_partitioner.php"; # launch sister processes here if multiprocessing is turned on! -if ( $dist_threads > 1 && $process_number === 0 && empty($temp_disable_multiprocessing) ) +if ( $dist_threads > 1 && $process_number === 0 && empty($temp_disable_multiprocessing) && $data_size > 0 ) { # launch sister-processes for ( $x = 1 ; $x < $dist_threads ; ++$x ) @@ -131,6 +131,15 @@ ) ENGINE=INNODB $innodb_row_format_sql"); } } + + + # if we are at the main process + # and there is no data to compress, skip the rest of the file + if ( $process_number === 0 && $data_size === 0 ) + { + echo "Skipping prefix compression, nothing to compress! \n"; + return; + } $insert_time = 0; $statistic_total_time = 0; diff --git a/prefix_compressor_merger_ext.php b/prefix_compressor_merger_ext.php index 6d97320..15510e3 100644 --- a/prefix_compressor_merger_ext.php +++ b/prefix_compressor_merger_ext.php @@ -51,7 +51,7 @@ require "data_partitioner.php"; # launch sister processes here if multiprocessing is turned on! -if ( $dist_threads > 1 && $process_number === 0 && empty($temp_disable_multiprocessing) ) +if ( $dist_threads > 1 && $process_number === 0 && empty($temp_disable_multiprocessing) && $data_size > 0 ) { # launch sister-processes for ( $x = 1 ; $x < $dist_threads ; ++$x ) @@ -132,6 +132,15 @@ ) ENGINE=INNODB $innodb_row_format_sql"); } + + + # if we are at the main process + # and there is no data to compress, skip the rest of the file + if ( $process_number === 0 && $data_size === 0 ) + { + echo "Skipping prefix compression, nothing to compress! \n"; + return; + } $insert_time = 0; $statistic_total_time = 0; diff --git a/sentiment/finnish/ignore.php b/sentiment/finnish/ignore.php index 05958b1..7e29e7e 100644 --- a/sentiment/finnish/ignore.php +++ b/sentiment/finnish/ignore.php @@ -1,17 +1,17 @@ 1, "paskan" => 1, "paskanaama" => 1, +"paskanaaman" => 1, +"paskanaamat" => 1, +"paskanaamoja" => 1, +"paskanaamoilla" => 1, +"paskanaamoilta" => 1, +"paskanaamoille" => 1, "paskasitte" => 1, "paska-aivo" => 1, "paska-aivot" => 1, diff --git a/sentiment/finnish/neutral.php b/sentiment/finnish/neutral.php index 5232543..4caa077 100644 --- a/sentiment/finnish/neutral.php +++ b/sentiment/finnish/neutral.php @@ -1,17 +1,17 @@ query("DROP INDEX metaphone ON $target_table"); + } + catch ( PDOException $e ) + { + + } } else { @@ -120,6 +129,7 @@ token varbinary(40) NOT NULL, metaphone smallint(5) unsigned DEFAULT 0, doc_matches int(8) unsigned NOT NULL, + max_doc_id int(8) unsigned NOT NULL, doc_ids mediumblob NOT NULL, PRIMARY KEY (checksum,token) ) ENGINE=INNODB DEFAULT CHARSET=utf8;"); @@ -309,7 +319,7 @@ { $metaphone = $Metaphones->token_to_int16($min_token); - $insert_sql .= ",($min_checksum,".$connection->quote($min_token).",$metaphone,$document_count,".$connection->quote($doc_id_string . $token_data_string).")"; + $insert_sql .= ",($min_checksum,".$connection->quote($min_token).",$metaphone,$document_count,$min_doc_id,".$connection->quote($doc_id_string . 
$token_data_string).")"; ++$x; ++$w; @@ -324,7 +334,7 @@ { $token_insert_time_start = microtime(true); $insert_sql[0] = " "; - $ins = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, doc_ids) VALUES $insert_sql"); + $ins = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, max_doc_id, doc_ids) VALUES $insert_sql"); $token_insert_time += (microtime(true)-$token_insert_time_start); $w = 0; ++$insert_counter; @@ -390,7 +400,7 @@ { $token_insert_time_start = microtime(true); $insert_sql[0] = " "; - $ins = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, doc_ids) VALUES $insert_sql"); + $ins = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, max_doc_id, doc_ids) VALUES $insert_sql"); $token_insert_time += (microtime(true)-$token_insert_time_start); # reset write buffer @@ -458,7 +468,7 @@ if ( $w >= $write_buffer_len || memory_get_usage() > $memory_usage_limit ) { $ins_sql[0] = " "; - $inspdo = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, doc_ids) VALUES $ins_sql"); + $inspdo = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, max_doc_id, doc_ids) VALUES $ins_sql"); unset($ins_sql); $ins_sql = ""; $w = 0; @@ -472,7 +482,7 @@ } } - $ins_sql .= ",(".$row["checksum"].",".$connection->quote($row["token"]).",".$row["metaphone"].",".$row["doc_matches"].",".$connection->quote($row["doc_ids"]).")"; + $ins_sql .= ",(".$row["checksum"].",".$connection->quote($row["token"]).",".$row["metaphone"].",".$row["doc_matches"].",".$row["max_doc_id"].",".$connection->quote($row["doc_ids"]).")"; ++$w; } @@ -482,7 +492,7 @@ if ( !empty($ins_sql) ) { $ins_sql[0] = " "; - $inspdo = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, doc_ids) VALUES $ins_sql"); + $inspdo = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, max_doc_id, doc_ids) VALUES $ins_sql"); unset($ins_sql); $ins_sql = ""; $insert_counter = 0; diff --git a/token_compressor_ext.php b/token_compressor_ext.php index 24bad2a..da5954e 100644 --- a/token_compressor_ext.php +++ b/token_compressor_ext.php @@ -52,7 +52,7 @@ require "data_partitioner.php"; # launch sister processes here if multiprocessing is turned on! 
-if ( $dist_threads > 1 && $process_number === 0 && empty($temp_disable_multiprocessing) ) +if ( $dist_threads > 1 && $process_number === 0 && empty($temp_disable_multiprocessing) && $data_size > 0 ) { # launch sister-processes for ( $x = 1 ; $x < $dist_threads ; ++$x ) @@ -106,6 +106,7 @@ token varbinary(40) NOT NULL, metaphone smallint(5) unsigned DEFAULT 0, doc_matches int(8) unsigned NOT NULL, + max_doc_id int(8) unsigned NOT NULL, doc_ids mediumblob NOT NULL ) ENGINE=MYISAM DEFAULT CHARSET=utf8;"); } @@ -114,6 +115,14 @@ if ( empty($replace_index) && $clean_slate ) { $target_table = "PMBTokens$index_suffix"; + try + { + $connection->query("DROP INDEX metaphone ON $target_table"); + } + catch ( PDOException $e ) + { + + } } else { @@ -125,12 +134,22 @@ token varbinary(40) NOT NULL, metaphone smallint(5) unsigned DEFAULT 0, doc_matches int(8) unsigned NOT NULL, + max_doc_id int(8) unsigned NOT NULL, doc_ids mediumblob NOT NULL, PRIMARY KEY (checksum,token) ) ENGINE=INNODB DEFAULT CHARSET=utf8;"); } } + + # if we are at the main process + # and there is no data to compress, skip the rest of the file + if ( $process_number === 0 && $data_size === 0 ) + { + echo "Skipping token compression, nothing to compress! \n"; + return; + } + # for fetching data $rows = 0; $write_buffer_len = 250; @@ -348,7 +367,7 @@ { $metaphone = $Metaphones->token_to_int16($min_token); - $insert_sql .= ",($min_checksum,".$connection->quote($min_token).",$metaphone,$document_count,".$connection->quote($doc_id_string . $token_data_string).")"; + $insert_sql .= ",($min_checksum,".$connection->quote($min_token).",$metaphone,$document_count,$min_doc_id,".$connection->quote($doc_id_string . $token_data_string).")"; ++$x; ++$w; @@ -363,7 +382,7 @@ { $token_insert_time_start = microtime(true); $insert_sql[0] = " "; - $ins = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, doc_ids) VALUES $insert_sql"); + $ins = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, max_doc_id, doc_ids) VALUES $insert_sql"); $token_insert_time += (microtime(true)-$token_insert_time_start); $w = 0; ++$insert_counter; @@ -441,7 +460,7 @@ { $token_insert_time_start = microtime(true); $insert_sql[0] = " "; - $ins = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, doc_ids) VALUES $insert_sql"); + $ins = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, max_doc_id, doc_ids) VALUES $insert_sql"); $token_insert_time += (microtime(true)-$token_insert_time_start); # reset write buffer @@ -456,7 +475,7 @@ echo $string; - file_put_contents("/var/www/localsearch/errorlog_".$process_number.".txt", $string); + file_put_contents("$directory/errorlog_".$process_number.".txt", $string); } @@ -507,7 +526,7 @@ if ( $w >= $write_buffer_len || memory_get_usage() > $memory_usage_limit ) { $ins_sql[0] = " "; - $inspdo = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, doc_ids) VALUES $ins_sql"); + $inspdo = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, max_doc_id, doc_ids) VALUES $ins_sql"); unset($ins_sql); $ins_sql = ""; $w = 0; @@ -521,7 +540,7 @@ } } - $ins_sql .= ",(".$row["checksum"].",".$connection->quote($row["token"]).",".$row["metaphone"].",".$row["doc_matches"].",".$connection->quote($row["doc_ids"]).")"; + $ins_sql .= 
",(".$row["checksum"].",".$connection->quote($row["token"]).",".$row["metaphone"].",".$row["doc_matches"].",".$row["max_doc_id"].",".$connection->quote($row["doc_ids"]).")"; ++$w; } @@ -532,7 +551,7 @@ if ( !empty($ins_sql) ) { $ins_sql[0] = " "; - $inspdo = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, doc_ids) VALUES $ins_sql"); + $inspdo = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, max_doc_id, doc_ids) VALUES $ins_sql"); unset($ins_sql); $ins_sql = ""; $insert_counter = 0; diff --git a/token_compressor_merger.php b/token_compressor_merger.php index e40563c..084db7b 100644 --- a/token_compressor_merger.php +++ b/token_compressor_merger.php @@ -102,6 +102,7 @@ token varbinary(40) NOT NULL, metaphone smallint(5) unsigned DEFAULT 0, doc_matches int(8) unsigned NOT NULL, + max_doc_id int(8) unsigned NOT NULL, doc_ids mediumblob NOT NULL ) ENGINE=MYISAM DEFAULT CHARSET=utf8;"); } @@ -123,6 +124,7 @@ token varbinary(40) NOT NULL, metaphone smallint(5) unsigned DEFAULT 0, doc_matches int(8) unsigned NOT NULL, + max_doc_id int(8) unsigned NOT NULL, doc_ids mediumblob NOT NULL, PRIMARY KEY(checksum, token) ) ENGINE=INNODB DEFAULT CHARSET=utf8;"); @@ -330,6 +332,7 @@ ".$connection->quote($oldrow["token"]).", ".$oldrow["metaphone"].", ".$oldrow["doc_matches"].", + ".$oldrow["max_doc_id"].", ".$connection->quote($oldrow["doc_ids"]).")"; ++$x; ++$w; @@ -338,7 +341,7 @@ { $token_insert_time_start = microtime(true); $insert_sql[0] = " "; - $ins = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, doc_ids) VALUES $insert_sql"); + $ins = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, max_doc_id, doc_ids) VALUES $insert_sql"); $token_insert_time += (microtime(true)-$token_insert_time_start); $w = 0; ++$insert_counter; @@ -382,6 +385,7 @@ ".$connection->quote($oldrow["token"]).", ".$oldrow["metaphone"].", ".($oldrow["doc_matches"]+$document_count).", + ".$min_doc_id.", ".$connection->quote(MergeCompressedData($old_doc_ids, $doc_id_string, $hex_lookup_decode, $hex_lookup_encode) . substr($oldrow["doc_ids"], $pos) . $token_data_string).")"; ++$combinations; ++$x; @@ -417,6 +421,7 @@ ".$connection->quote($oldrow["token"]).", ".$oldrow["metaphone"].", ".($oldrow["doc_matches"]+$document_count).", + ".$min_doc_id.", ".$connection->quote(MergeCompressedData($old_doc_ids, $doc_id_string, $hex_lookup_decode, $hex_lookup_encode) . substr($oldrow["doc_ids"], $pos) . $token_data_string).")"; ++$combinations; ++$x; @@ -437,6 +442,7 @@ ".$connection->quote($min_token).", $metaphone, $document_count, + $min_doc_id, ".$connection->quote($doc_id_string . $token_data_string).")"; ++$x; ++$w; @@ -467,6 +473,7 @@ ".$connection->quote($min_token).", $metaphone, $document_count, + $min_doc_id, ".$connection->quote($doc_id_string . $token_data_string).")"; ++$x; ++$w; @@ -483,7 +490,7 @@ { $token_insert_time_start = microtime(true); $insert_sql[0] = " "; - $ins = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, doc_ids) VALUES $insert_sql"); + $ins = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, max_doc_id, doc_ids) VALUES $insert_sql"); $token_insert_time += (microtime(true)-$token_insert_time_start); $w = 0; ++$insert_counter; @@ -552,6 +559,7 @@ ".$connection->quote($oldrow["token"]).", ".$oldrow["metaphone"]." 
".$oldrow["doc_matches"].", + ".$oldrow["max_doc_id"].", ".$connection->quote($oldrow["doc_ids"]).")"; } @@ -563,6 +571,7 @@ ".$connection->quote($oldrow["token"]).", ".$oldrow["metaphone"]." ".$oldrow["doc_matches"].", + ".$oldrow["max_doc_id"].", ".$connection->quote($oldrow["doc_ids"]).")"; ++$x; ++$w; @@ -571,7 +580,7 @@ { $token_insert_time_start = microtime(true); $insert_sql[0] = " "; - $ins = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, doc_ids) VALUES $insert_sql"); + $ins = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, max_doc_id, doc_ids) VALUES $insert_sql"); $token_insert_time += (microtime(true)-$token_insert_time_start); $w = 0; ++$insert_counter; @@ -596,7 +605,7 @@ { $token_insert_time_start = microtime(true); $insert_sql[0] = " "; - $ins = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, doc_ids) VALUES $insert_sql"); + $ins = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, max_doc_id, doc_ids) VALUES $insert_sql"); $token_insert_time += (microtime(true)-$token_insert_time_start); # reset write buffer @@ -677,7 +686,7 @@ if ( $w >= $write_buffer_len || memory_get_usage() > $memory_usage_limit ) { $ins_sql[0] = " "; - $inspdo = $connection->query("INSERT INTO $target_table ( checksum, token, metaphone, doc_matches, doc_ids ) VALUES $ins_sql"); + $inspdo = $connection->query("INSERT INTO $target_table ( checksum, token, metaphone, doc_matches, max_doc_id, doc_ids ) VALUES $ins_sql"); unset($ins_sql); $ins_sql = ""; $w = 0; @@ -695,6 +704,7 @@ ".$connection->quote($row["token"]).", ".$row["metaphone"].", ".$row["doc_matches"].", + ".$row["max_doc_id"].", ".$connection->quote($row["doc_ids"]).")"; ++$w; } @@ -705,7 +715,7 @@ if ( !empty($ins_sql) ) { $ins_sql[0] = " "; - $inspdo = $connection->query("INSERT INTO $target_table ( checksum, token, metaphone, doc_matches, doc_ids ) VALUES $ins_sql"); + $inspdo = $connection->query("INSERT INTO $target_table ( checksum, token, metaphone, doc_matches, max_doc_id, doc_ids ) VALUES $ins_sql"); unset($ins_sql); $ins_sql = ""; $insert_counter = 0; @@ -721,9 +731,10 @@ $drop_start = microtime(true); $connection->beginTransaction(); - # remove the old table and rename the new one + # remove the old table and rename the new one $connection->query("DROP TABLE $clean_slate_target"); $connection->query("ALTER TABLE $target_table RENAME TO $clean_slate_target"); + #$connection->query("ALTER TABLE $clean_slate_target ADD INDEX(metaphone, doc_matches)"); # add metaphone index $connection->commit(); $drop_end = microtime(true) - $drop_start; diff --git a/token_compressor_merger_ext.php b/token_compressor_merger_ext.php index e62ff47..5ceb51a 100644 --- a/token_compressor_merger_ext.php +++ b/token_compressor_merger_ext.php @@ -52,7 +52,7 @@ require "data_partitioner.php"; # launch sister processes here if multiprocessing is turned on! 
-if ( $dist_threads > 1 && $process_number === 0 && empty($temp_disable_multiprocessing) ) +if ( $dist_threads > 1 && $process_number === 0 && empty($temp_disable_multiprocessing) && $data_size > 0 ) { # launch sister-processes for ( $x = 1 ; $x < $dist_threads ; ++$x ) @@ -108,6 +108,7 @@ token varbinary(40) NOT NULL, metaphone smallint(5) unsigned DEFAULT 0, doc_matches int(8) unsigned NOT NULL, + max_doc_id int(8) unsigned NOT NULL, doc_ids mediumblob NOT NULL ) ENGINE=MYISAM DEFAULT CHARSET=utf8;"); } @@ -123,11 +124,18 @@ token varbinary(40) NOT NULL, metaphone smallint(5) unsigned DEFAULT 0, doc_matches int(8) unsigned NOT NULL, + max_doc_id int(8) unsigned NOT NULL, doc_ids mediumblob NOT NULL, PRIMARY KEY(checksum, token) ) ENGINE=INNODB DEFAULT CHARSET=utf8;"); } + + if ( $process_number === 0 && $data_size === 0 ) + { + echo "Skipping token compression, nothing to compress! \n"; + return; + } $rows = 0; $write_buffer_len = 250; @@ -357,6 +365,7 @@ ".$connection->quote($oldrow["token"]).", ".$oldrow["metaphone"].", ".$oldrow["doc_matches"].", + ".$oldrow["max_doc_id"].", ".$connection->quote($oldrow["doc_ids"]).")"; ++$x; ++$w; @@ -365,7 +374,7 @@ { $token_insert_time_start = microtime(true); $insert_sql[0] = " "; - $ins = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, doc_ids) VALUES $insert_sql"); + $ins = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, max_doc_id, doc_ids) VALUES $insert_sql"); $token_insert_time += (microtime(true)-$token_insert_time_start); $w = 0; ++$insert_counter; @@ -395,8 +404,6 @@ } } - - # if these rows are to be combined if ( $oldrow && $min_checksum == $oldrow["checksum"] ) { @@ -411,6 +418,7 @@ ".$connection->quote($oldrow["token"]).", ".$oldrow["metaphone"].", ".($oldrow["doc_matches"]+$document_count).", + ".$min_doc_id.", ".$connection->quote(MergeCompressedData($old_doc_ids, $doc_id_string, $hex_lookup_decode, $hex_lookup_encode) . substr($oldrow["doc_ids"], $pos) . $token_data_string).")"; ++$combinations; ++$x; @@ -446,6 +454,7 @@ ".$connection->quote($oldrow["token"]).", ".$oldrow["metaphone"].", ".($oldrow["doc_matches"]+$document_count).", + ".$min_doc_id.", ".$connection->quote(MergeCompressedData($old_doc_ids, $doc_id_string, $hex_lookup_decode, $hex_lookup_encode) . substr($oldrow["doc_ids"], $pos) . $token_data_string).")"; ++$combinations; ++$x; @@ -467,6 +476,7 @@ ".$connection->quote($min_token).", $metaphone, $document_count, + $min_doc_id, ".$connection->quote($doc_id_string . $token_data_string).")"; ++$x; ++$w; @@ -494,6 +504,7 @@ ".$connection->quote($min_token).", $metaphone, $document_count, + $min_doc_id, ".$connection->quote($doc_id_string . 
$token_data_string).")"; ++$x; ++$w; @@ -510,7 +521,7 @@ { $token_insert_time_start = microtime(true); $insert_sql[0] = " "; - $ins = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, doc_ids) VALUES $insert_sql"); + $ins = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, max_doc_id, doc_ids) VALUES $insert_sql"); $token_insert_time += (microtime(true)-$token_insert_time_start); $w = 0; ++$insert_counter; @@ -589,6 +600,7 @@ ".$connection->quote($row_storage["token"]).", ".$row_storage["metaphone"].", ".$row_storage["doc_matches"].", + ".$row_storage["max_doc_id"].", ".$connection->quote($row_storage["doc_ids"]).")"; unset($row_storage); } @@ -599,6 +611,7 @@ ".$connection->quote($oldrow["token"]).", ".$oldrow["metaphone"].", ".$oldrow["doc_matches"].", + ".$oldrow["max_doc_id"].", ".$connection->quote($oldrow["doc_ids"]).")"; unset($oldrow); } @@ -611,6 +624,7 @@ ".$connection->quote($oldrow["token"]).", ".$oldrow["metaphone"].", ".$oldrow["doc_matches"].", + ".$oldrow["max_doc_id"].", ".$connection->quote($oldrow["doc_ids"]).")"; ++$x; ++$w; @@ -619,7 +633,7 @@ { $token_insert_time_start = microtime(true); $insert_sql[0] = " "; - $ins = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, doc_ids) VALUES $insert_sql"); + $ins = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, max_doc_id, doc_ids) VALUES $insert_sql"); $token_insert_time += (microtime(true)-$token_insert_time_start); $w = 0; ++$insert_counter; @@ -644,7 +658,7 @@ { $token_insert_time_start = microtime(true); $insert_sql[0] = " "; - $ins = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, doc_ids) VALUES $insert_sql"); + $ins = $connection->query("INSERT INTO $target_table (checksum, token, metaphone, doc_matches, max_doc_id, doc_ids) VALUES $insert_sql"); $token_insert_time += (microtime(true)-$token_insert_time_start); # reset write buffer @@ -720,7 +734,7 @@ if ( $w >= $write_buffer_len || memory_get_usage() > $memory_usage_limit ) { $ins_sql[0] = " "; - $inspdo = $connection->query("INSERT INTO $target_table ( checksum, token, metaphone, doc_matches, doc_ids ) VALUES $ins_sql"); + $inspdo = $connection->query("INSERT INTO $target_table ( checksum, token, metaphone, doc_matches, max_doc_id, doc_ids ) VALUES $ins_sql"); unset($ins_sql); $ins_sql = ""; $w = 0; @@ -738,6 +752,7 @@ ".$connection->quote($row["token"]).", ".$row["metaphone"].", ".$row["doc_matches"].", + ".$row["max_doc_id"].", ".$connection->quote($row["doc_ids"]).")"; ++$w; } @@ -749,7 +764,7 @@ if ( !empty($ins_sql) ) { $ins_sql[0] = " "; - $inspdo = $connection->query("INSERT INTO $target_table ( checksum, token, metaphone, doc_matches, doc_ids ) VALUES $ins_sql"); + $inspdo = $connection->query("INSERT INTO $target_table ( checksum, token, metaphone, doc_matches, max_doc_id, doc_ids ) VALUES $ins_sql"); unset($ins_sql); $ins_sql = ""; $insert_counter = 0; @@ -769,6 +784,7 @@ # remove the old table and rename the new one $connection->query("DROP TABLE $clean_slate_target"); $connection->query("ALTER TABLE $target_table RENAME TO $clean_slate_target"); + #$connection->query("ALTER TABLE $clean_slate_target ADD INDEX(metaphone, doc_matches)"); # add metaphone index $connection->commit(); $drop_end = microtime(true) - $drop_start; diff --git a/tokenizer_functions.php b/tokenizer_functions.php index b8fb105..e99f9cc 100644 --- a/tokenizer_functions.php +++ 
b/tokenizer_functions.php @@ -1763,7 +1763,16 @@ function check_tables($index_id, &$log = "") # metaphone column is missing ! $log .= "Metaphone column definition is missing, updating table...\n"; echo "Metaphone column definition is missing, updating table...\n"; - $alter_sql = "ALTER TABLE PMBTokens$index_suffix ADD metaphone smallint(5) unsigned DEFAULT 0 AFTER token, ADD INDEX (metaphone, doc_matches)"; + $alter_sql[] = "ADD metaphone smallint(5) unsigned DEFAULT 0 AFTER token, ADD INDEX (metaphone, doc_matches)"; + } + + # if max_doc_id is not defined + if ( stripos($data_string, "max_doc_id") === false ) + { + # max_doc_id column is missing ! + $log .= "Maximum document id column definition is missing, updating table...\n"; + echo "Maximum document id column definition is missing, updating table...\n"; + $alter_sql[] = "ADD max_doc_id int(8) unsigned NOT NULL AFTER doc_matches"; } } } @@ -1773,7 +1782,7 @@ function check_tables($index_id, &$log = "") { try { - $connection->query($alter_sql); + $connection->query("ALTER TABLE PMBTokens$index_suffix " . implode(", ", $alter_sql)); echo "PMBTokens$index_suffix table definition updated successfully.\n"; $log .= "PMBTokens$index_suffix table definition updated successfully.\n"; @@ -1836,6 +1845,7 @@ function create_tables($index_id, $index_type, &$created_tables = array(), &$dat token varbinary(40) NOT NULL, metaphone smallint(5) unsigned DEFAULT 0, doc_matches int(8) unsigned NOT NULL, + max_doc_id int(8) unsigned NOT NULL, doc_ids mediumblob NOT NULL, PRIMARY KEY (checksum, token), KEY metaphone (metaphone,doc_matches) diff --git a/web_tokenizer.php b/web_tokenizer.php index 19ff0f9..d4d0135 100644 --- a/web_tokenizer.php +++ b/web_tokenizer.php @@ -29,6 +29,9 @@ require_once("tokenizer_functions.php"); } +# set process state on +SetProcessState($index_id, $process_number, 1); + register_shutdown_function($shutdown_function); $suffix_list = array(); @@ -179,7 +182,6 @@ } # update current indexing state to true ( 1 ) - SetProcessState($index_id, $process_number, 1); SetIndexingState(1, $index_id); $upd_state = $connection->prepare("UPDATE PMBIndexes SET indexing_started = UNIX_TIMESTAMP() WHERE ID = ?"); diff --git a/web_tokenizer_ext.php b/web_tokenizer_ext.php index 54aabbe..7df2acd 100644 --- a/web_tokenizer_ext.php +++ b/web_tokenizer_ext.php @@ -29,6 +29,9 @@ require_once("tokenizer_functions.php"); } +# set process state on +SetProcessState($index_id, $process_number, 1); + register_shutdown_function($shutdown_function); $suffix_list = array(); @@ -177,7 +180,6 @@ } # update current indexing state to true ( 1 ) - SetProcessState($index_id, $process_number, 1); SetIndexingState(1, $index_id); $upd_state = $connection->prepare("UPDATE PMBIndexes SET indexing_started = UNIX_TIMESTAMP() WHERE ID = ?");
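
The position-decoding loop added to PMBApi.php above walks a byte stream in which each value is split into 7-bit groups: bytes below 128 carry more payload, a byte with the high bit set ends the value, and the byte 128 itself separates token groups. Decoded values are deltas, and the low bits of each absolute value select the field while the remaining bits give the position inside that field. A minimal standalone sketch of that scheme follows; the function name is illustrative, and the offset corrections and the 0x80 group separator of the real PMB format are left out for clarity.

<?php
# Illustrative decoder: 7-bit varint groups, a set high bit terminates a value,
# values are deltas of ( field_id | position << field_id_width ).
function decode_positions(string $bytes, int $field_id_width): array
{
    $positions = array();
    $temp  = 0;  # partial value being assembled
    $shift = 0;  # number of 7-bit groups read so far
    $value = 0;  # running sum for delta decoding
    $len   = strlen($bytes);

    for ( $i = 0 ; $i < $len ; ++$i )
    {
        $byte = ord($bytes[$i]);

        if ( $byte < 128 )
        {
            # the value continues in the next byte
            $temp |= $byte << ($shift * 7);
            ++$shift;
        }
        else
        {
            # high bit set: this byte ends the value
            $value += $temp | (($byte - 128) << ($shift * 7));

            # low bits select the field, the rest is the position inside it
            $field_id  = $value & ((1 << $field_id_width) - 1);
            $field_pos = $value >> $field_id_width;
            $positions[] = array($field_id, $field_pos);

            $temp  = 0;
            $shift = 0;
        }
    }

    return $positions;
}

# e.g. decode_positions(chr(0x84) . chr(0x06) . chr(0x81), 1)
# -> field 0 position 2, then field 0 position 69 ( delta 134 on top of 4 )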
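
The scoring part of the same hunk accumulates a saturating BM25-style term per token group ( effective_match_count * IDF / ( effective_match_count + 1.2 ) ) and then squeezes the sum into a 0..999 range so it can be packed next to the proximity score. A compact sketch of that calculation, with simplified names and the per-field weighting from bm25_field_scores folded into the counts:

<?php
# BM25-style score as accumulated in the Search() hunk above; 1.2 plays the
# role of the usual k1 parameter, and the result is normalised to 0..999.
function bm25_like_score(array $match_counts, array $idf_lookup, int $token_count): int
{
    $bm25 = 0.0;

    foreach ( $match_counts as $group => $count )
    {
        # saturating term frequency: grows with the count but levels off
        $bm25 += $count * $idf_lookup[$group] / ($count + 1.2);
    }

    return (int)round((0.5 + $bm25 / $token_count) * 999);
}

# e.g. bm25_like_score(array(3, 1), array(0.8, 0.4), 2) -> 876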
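
When the fast @id-sorted path has collected enough matches for the requested page, the loop above breaks out early, extrapolates the total match count from the share of the collection scanned so far, clamps the estimate with the per-keyword match counts ( sumcounts ), rounds it to the nearest hundred when it is at least 100, and flags the result as approximate. An illustrative version of that estimate; the function and parameter names are not part of the PMB API, and the clamping is simplified slightly compared to the patch:

<?php
# Illustrative approximate result count used by the early termination above.
function approximate_match_count(
    int $matches_found,        # matches seen before the scan stopped
    int $documents_scanned,    # documents covered by the scanned id range
    int $collection_size,      # documents in the whole collection
    array $per_keyword_counts, # how many documents each keyword matches
    bool $match_all            # true: all keywords must match, false: any keyword
): int
{
    $estimate = ($matches_found / $documents_scanned) * $collection_size;

    if ( $match_all )
    {
        # bounded by the rarest keyword and by what was already seen
        $estimate = min(max($estimate, $matches_found), min($per_keyword_counts));
    }
    else
    {
        # bounded by the most common keyword and by the summed counts
        $estimate = min(max($estimate, max($per_keyword_counts)), array_sum($per_keyword_counts));
    }

    # round for display: "About 3500 results"
    return $estimate >= 100 ? (int)round($estimate, -2) : (int)$estimate;
}

# e.g. approximate_match_count(250, 100000, 2000000, array(12000, 3500), true) -> 3500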
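
The finalization.php and token compressor hunks follow a common bulk-load pattern: the (metaphone, doc_matches) secondary index is dropped, or simply never created on the _temp table, before the mass INSERTs, and added back once the table has been filled and renamed; the DROP is wrapped in a try/catch because the index may not exist yet. A minimal sketch of that pattern around an arbitrary loader callback; the function name is illustrative, and it assumes the PDO connection is in PDO::ERRMODE_EXCEPTION mode, as the try/catch blocks in the compressor hunks imply.

<?php
# Sketch of the drop-index / bulk-load / re-add-index pattern used above.
function bulk_load_without_index(PDO $connection, string $table, callable $load_rows): void
{
    try
    {
        # a secondary index slows down mass inserts; drop it if it exists
        $connection->query("DROP INDEX metaphone ON $table");
    }
    catch ( PDOException $e )
    {
        # the index was not there to begin with - nothing to do
    }

    # run the actual INSERT batches
    $load_rows($connection, $table);

    # rebuild the composite index once, after all rows are in place
    $connection->query("ALTER TABLE $table ADD INDEX(metaphone, doc_matches)");
}

Creating the index once after the load is generally much cheaper than maintaining it row by row while the compressor streams its batched INSERTs into the table.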