Skip to content
This repository has been archived by the owner on Nov 25, 2020. It is now read-only.

Commit

Permalink
Finish ElasticSearch implementation / Refactor some method to common …
Browse files Browse the repository at this point in the history
…parent with Lucene.
  • Loading branch information
cdujeu committed Apr 16, 2015
1 parent d20c545 commit b790df3
Show file tree
Hide file tree
Showing 4 changed files with 156 additions and 86 deletions.
72 changes: 72 additions & 0 deletions core/src/plugins/core.index/class.AbstractSearchEngineIndexer.php
Expand Up @@ -8,6 +8,19 @@

abstract class AbstractSearchEngineIndexer extends AJXP_AbstractMetaSource {

/**
 * Delegates to the parent implementation, then, when the HIDE_MYSHARES_SECTION
 * option is strictly true and the node is a "client_configs" node, removes the
 * first "AjxpPane::navigation_scroller" component_config found under it.
 *
 * @param DOMNode $contribNode Contribution node (received by reference).
 */
public function parseSpecificContributions(&$contribNode){
    parent::parseSpecificContributions($contribNode);

    $hideSection = ($this->getFilteredOption("HIDE_MYSHARES_SECTION") === true);
    if(!$hideSection || $contribNode->nodeName != "client_configs"){
        return;
    }

    // Look up the navigation scroller configuration relative to the contribution node.
    $xpath = new DOMXPath($contribNode->ownerDocument);
    $scrollers = $xpath->query('component_config[@className="AjxpPane::navigation_scroller"]', $contribNode);
    if($scrollers->length){
        $contribNode->removeChild($scrollers->item(0));
    }
}

/**
* @param AJXP_Node $ajxpNode
* @return null|string
Expand Down Expand Up @@ -63,4 +76,63 @@ protected function extractIndexableContent($ajxpNode){
return null;
}

/**
 * Rewrites the symbolic parts of a Lucene-style query into concrete values.
 *
 * 1. The first AJXP_SEARCH_RANGE_* keyword found in the query (checked in the
 *    order TODAY, YESTERDAY, LAST_WEEK, LAST_MONTH, LAST_YEAR) is replaced by a
 *    "[Ymd TO Ymd]" date range.
 * 2. Every "ajxp_bytesize:[a TO b]" clause of the AND-separated query has both
 *    bounds passed through AJXP_Utils::convertBytes() and cast to integers.
 *    Clauses without a "key:value" shape, or whose value is not a "[a TO b]"
 *    range, are left untouched.
 *
 * @param String $query
 * @return String The rewritten query
 */
protected function filterSearchRangesKeywords($query)
{
    // Read the clock once so both bounds of a range always refer to the same day.
    $now   = time();
    $month = (int) date("n", $now);
    $day   = (int) date("j", $now);
    $year  = (int) date("Y", $now);

    // keyword => array(start month offset, start day offset, start year offset, end day offset)
    $ranges = array(
        "AJXP_SEARCH_RANGE_TODAY"      => array( 0,  0,  0,  0),
        "AJXP_SEARCH_RANGE_YESTERDAY"  => array( 0, -1,  0, -1),
        "AJXP_SEARCH_RANGE_LAST_WEEK"  => array( 0, -7,  0,  0),
        "AJXP_SEARCH_RANGE_LAST_MONTH" => array(-1,  0,  0,  0),
        "AJXP_SEARCH_RANGE_LAST_YEAR"  => array( 0,  0, -1,  0),
    );
    foreach ($ranges as $keyword => $offsets) {
        if (strpos($query, $keyword) === false) {
            continue;
        }
        list($monthOffset, $dayOffset, $yearOffset, $endDayOffset) = $offsets;
        // mktime() normalises out-of-range day/month values (e.g. day 0 or month 0).
        $t1 = date("Ymd", mktime(0, 0, 0, $month + $monthOffset, $day + $dayOffset, $year + $yearOffset));
        $t2 = date("Ymd", mktime(0, 0, 0, $month, $day + $endDayOffset, $year));
        $query = str_replace($keyword, "[$t1 TO $t2]", $query);
        // Only the first matching keyword is expanded.
        break;
    }

    // Collect the rewritten form of each ajxp_bytesize range clause.
    $replacements = array();
    foreach (array_map("trim", explode("AND", $query)) as $clause) {
        $parts = explode(":", $clause, 2);
        if (count($parts) < 2 || $parts[0] !== "ajxp_bytesize") {
            // Not a "key:value" clause, or not the size field.
            continue;
        }
        if (!preg_match('/\[(.*) TO (.*)\]/', $parts[1], $matches)) {
            // Not a range expression: nothing to convert.
            continue;
        }
        $replacements[$clause] = "ajxp_bytesize:[".intval(AJXP_Utils::convertBytes($matches[1]))." TO ".intval(AJXP_Utils::convertBytes($matches[2]))."]";
    }
    if (!empty($replacements)) {
        $query = strtr($query, $replacements);
    }

    return $query;
}

/**
 * Builds an identifier made of the repository id, followed — when the
 * "repository_specific_keywords" option is not empty — by "-" and the filtered
 * option value in which "," becomes "-" and "/" becomes "__".
 *
 * @param String $repositoryId
 * @param String $userId Passed to AJXP_VarsFilter::filter() along with the option value
 * @return string
 */
protected function buildSpecificId($repositoryId, $userId = null){
    $keywords = $this->getFilteredOption("repository_specific_keywords");
    if (empty($keywords)) {
        $suffix = "";
    } else {
        $resolved = AJXP_VarsFilter::filter($keywords, $userId);
        $suffix = "-".str_replace(array(",", "/"), array("-", "__"), $resolved);
    }
    return $repositoryId.$suffix;
}

}
117 changes: 82 additions & 35 deletions core/src/plugins/index.elasticsearch/class.AjxpElasticSearch.php
Expand Up @@ -62,7 +62,7 @@ class AjxpElasticSearch extends AbstractSearchEngineIndexer

public function init($options)
{
parent::init($options);;
parent::init($options);
$metaFields = $this->getFilteredOption("index_meta_fields");
$specKey = $this->getFilteredOption("repository_specific_keywords");
if (!empty($metaFields)) {
Expand Down Expand Up @@ -113,7 +113,7 @@ public function indexationIndexNode($node){
* @param AJXP_Node $parentNode
*/
public function indexationStarts($parentNode){
$this->loadIndex($parentNode->getRepositoryId(), true);
$this->loadIndex($parentNode->getRepositoryId(), true, $parentNode->getUser());
}

/**
Expand Down Expand Up @@ -145,6 +145,42 @@ public function applyAction($actionName, $httpVars, $fileVars)
throw new Exception($messages["index.lucene.7"]);
}

$textQuery = $httpVars["query"];
if($this->getFilteredOption("AUTO_WILDCARD") === true && strlen($textQuery) > 0 && ctype_alnum($textQuery)){
if($textQuery[0] == '"' && $textQuery[strlen($textQuery)-1] == '"'){
$textQuery = substr($textQuery, 1, -1);
}else if($textQuery[strlen($textQuery)-1] != "*" ){
$textQuery.="*";
}
}


$this->currentIndex->open();
$fieldQuery = new Elastica\Query\QueryString();
$fieldQuery->setAllowLeadingWildcard(false);
$fieldQuery->setFuzzyMinSim(0.8);

if($textQuery == "*"){

$fields = array("ajxp_node");
$fieldQuery->setQuery("yes");
$fieldQuery->setFields($fields);

}else if(strpos($textQuery, ":") !== false){

// USE LUCENE DSL DIRECTLY (key1:value1 AND key2:value2...)
$textQuery = str_replace("ajxp_meta_ajxp_document_content:","body:", $textQuery);
$textQuery = $this->filterSearchRangesKeywords($textQuery);
$fieldQuery->setQuery($textQuery);

} else{

$fields = array("basename","ajxp_meta_*", "node_*","body");
$fieldQuery->setQuery($textQuery);
$fieldQuery->setFields($fields);

}

/*
TODO : READAPT QUERY WITH EACH FIELD
if ((isSet($this->metaFields) || $this->indexContent) && isSet($httpVars["fields"])) {
Expand All @@ -165,22 +201,8 @@ public function applyAction($actionName, $httpVars, $fileVars)
$this->logDebug("Query : $query");
} else {
*/
$this->currentIndex->open();
$query = $httpVars["query"];
$fieldQuery = new Elastica\Query\QueryString();

//}
//$this->setDefaultAnalyzer();
if ($query == "*") {
$fields = array("ajxp_node");
$fieldQuery->setQuery("yes");
} else {
$fields = array("basename","ajxp_meta_*", "node_*","body");
$fieldQuery->setQuery($query);
}
$fieldQuery->setFields($fields);
$fieldQuery->setAllowLeadingWildcard(false);
$fieldQuery->setFuzzyMinSim(0.8);
/*
We create this object search because it'll allow us to fetch the number of results we want at once.
We just have to set some parameters, the query type and the size of the result set.
Expand All @@ -196,14 +218,20 @@ public function applyAction($actionName, $httpVars, $fileVars)
\Elastica\Search::OPTION_SEARCH_TYPE => \Elastica\Search::OPTION_SEARCH_TYPE_QUERY_THEN_FETCH,
\Elastica\Search::OPTION_SIZE => $maxResults);

$this->logDebug(__FUNCTION__,"Executing query: ", $query);
$this->logDebug(__FUNCTION__,"Executing query: ", $textQuery);
$fullQuery = new Elastica\Query();
$fullQuery->setQuery($fieldQuery);

// ADD SCOPE FILTER
$term = new Elastica\Filter\Term();
$term->setTerm("ajxp_scope", "shared");
$fullQuery->setPostFilter($term);
$qb = new Elastica\QueryBuilder();
$fullQuery = new Elastica\Query();
$fullQuery->setQuery(
$qb->query()->filtered(
$fieldQuery,
$qb->filter()->bool()
->addMust(new Elastica\Filter\Term(array("ajxp_scope" => "shared")))
)
);


$result = $search->search($fullQuery, $searchOptions);
$this->logDebug(__FUNCTION__,"Search finished. ");
Expand Down Expand Up @@ -236,7 +264,7 @@ public function applyAction($actionName, $httpVars, $fileVars)

$scope = "user";
try {
$this->loadIndex(ConfService::getRepository()->getId(), false);
$this->loadIndex($repoId, false);
} catch (Exception $ex) {
throw new Exception($messages["index.lucene.7"]);
}
Expand Down Expand Up @@ -351,7 +379,7 @@ public function recursiveIndexation($url)
*/
public function updateNodeIndexMeta($node)
{
$this->loadIndex(ConfService::getRepository()->getId());
$this->loadIndex($node->getRepositoryId(), true, $node->getUser());
if (AuthService::usersEnabled() && AuthService::getLoggedUser()!=null) {

$query = new Elastica\Query\Term();
Expand Down Expand Up @@ -387,9 +415,9 @@ public function updateNodeIndexMeta($node)
public function updateNodeIndex($oldNode, $newNode = null, $copy = false, $recursive = false)
{
if($oldNode == null){
$this->loadIndex($newNode->getRepositoryId());
$this->loadIndex($newNode->getRepositoryId(), true, $newNode->getUser());
}else{
$this->loadIndex($oldNode->getRepositoryId());
$this->loadIndex($oldNode->getRepositoryId(), true, $oldNode->getUser());
}

if ($oldNode != null && $copy == false) {
Expand All @@ -412,14 +440,21 @@ public function updateNodeIndex($oldNode, $newNode = null, $copy = false, $recur
// Make sure it does not already exists anyway
$newDocId = $this->getIndexedDocumentId($newNode);
if ($newDocId != null) {
$this->currentType->deleteById($newDocId);
try{
$this->currentType->deleteById($newDocId);
}catch (Elastica\Exception\NotFoundException $eEx){
$this->logError(__FUNCTION__, "Trying to delete a non existing document");
}
$childrenHits = $this->getIndexedChildrenDocuments($newNode);

if ($childrenHits != null) {
$childrenHits = $childrenHits->getResults();

foreach ($childrenHits as $hit) {
$this->currentType->deleteById($hit->getId());
try{
$this->currentType->deleteById($hit->getId());
}catch (Elastica\Exception\NotFoundException $eEx){
$this->logError(__FUNCTION__, "Trying to delete a non existing document");
}
}
}
}
Expand Down Expand Up @@ -473,6 +508,14 @@ public function createIndexedDocument($ajxpNode)
$data["ajxp_node"] = "yes";
$data["ajxp_scope"] = "shared";
$data["serialized_metadata"] = base64_encode(serialize($ajxpNode->metadata));
$data["ajxp_modiftime"] = date("Ymd", $ajxpNode->ajxp_modiftime);
$data["ajxp_bytesize"] = $ajxpNode->bytesize;
$ajxpMime = $ajxpNode->ajxp_mime;
if (empty($ajxpMime)) {
$data["ajxp_mime"] = pathinfo($ajxpNode->getLabel(), PATHINFO_EXTENSION);
} else {
$data["ajxp_mime"] = $ajxpNode->ajxp_mime;
}

if (isSet($ajxpNode->indexableMetaKeys["shared"])) {
foreach ($ajxpNode->indexableMetaKeys["shared"] as $sharedField) {
Expand Down Expand Up @@ -540,9 +583,10 @@ protected function dataToMappingProperties($data){
$mapping_properties[$key] = array("type" => "string", "index" => "not_analyzed");
} else if($key == "serialized_metadata"){
$mapping_properties[$key] = array("type" => "string" /*, "index" => "no" */);
} else if ($key == "ajxp_bytesize"){
$mapping_properties[$key] = array("type" => "long");
} else {
$type = gettype($value);

if ($type != "integer" && $type != "boolean" && $type != "double") {
$type = "string";
}
Expand Down Expand Up @@ -599,23 +643,26 @@ public function getIndexedChildrenDocuments($ajxpNode)
* load the index into the class parameter currentIndex
* @param Integer $repositoryId
* @param bool $create
* @param null $resolveUserId
*/
protected function loadIndex($repositoryId, $create = true)
protected function loadIndex($repositoryId, $create = true, $resolveUserId = null)
{
$this->currentIndex = $this->client->getIndex($repositoryId);
$specificId = $this->buildSpecificId($repositoryId, $resolveUserId);

$this->currentIndex = $this->client->getIndex($specificId);

/* if the cache directory for the repository index is not created we do create it */
$iPath = (defined('AJXP_SHARED_CACHE_DIR')?AJXP_SHARED_CACHE_DIR:AJXP_CACHE_DIR)."/indexes/".$repositoryId;
$iPath = (defined('AJXP_SHARED_CACHE_DIR')?AJXP_SHARED_CACHE_DIR:AJXP_CACHE_DIR)."/indexes/".$specificId;
if(!is_dir($iPath)) mkdir($iPath,0755, true);

if (!$this->currentIndex->exists() && $create) {
if ($create && !$this->currentIndex->exists()) {
$this->currentIndex->create();
}

$this->currentType = new Elastica\Type($this->currentIndex, "type_".$repositoryId);
$this->currentType = new Elastica\Type($this->currentIndex, "type_".$specificId);

/* we fetch the last id we used to create a document and set the variable nextId */
$this->lastIdPath = (defined('AJXP_SHARED_CACHE_DIR')?AJXP_SHARED_CACHE_DIR:AJXP_CACHE_DIR)."/indexes/".$repositoryId."/last_id";
$this->lastIdPath = (defined('AJXP_SHARED_CACHE_DIR')?AJXP_SHARED_CACHE_DIR:AJXP_CACHE_DIR)."/indexes/".$specificId."/last_id";
if (file_exists($this->lastIdPath)) {
$file = fopen($this->lastIdPath, "r");
$this->nextId = floatval(fgets($file)) + 1;
Expand Down
2 changes: 1 addition & 1 deletion core/src/plugins/index.elasticsearch/manifest.xml
Expand Up @@ -23,7 +23,7 @@
<global_param name="PARSE_CONTENT_TXT" type="string" label="CONF_MESSAGE[Text files]" description="CONF_MESSAGE[List of extensions to consider as Text file and parse content]" mandatory="true" default="txt"/>
<global_param name="UNOCONV" type="string" label="CONF_MESSAGE[Unoconv Path]" description="CONF_MESSAGE[Full path on the server to the 'unoconv' binary]" default="" mandatory="false"/>
<global_param name="PDFTOTEXT" type="string" label="CONF_MESSAGE[PdftoText Path]" description="CONF_MESSAGE[Full path on the server to the 'pdftotext' binary]" default="" mandatory="false"/>
<global_param name="QUERY_ANALYSER" type="select" choices="utf8num_insensitive|UTF8 Text/Num (case insensitive),utf8num_sensitive|UTF8 Text/Num (case sensitive),utf8_insensitive|UTF8 Text (case insensitive),utf8_sensitive|UTF8 Text (case sensitive),textnum_insensitive|Text/Num (case insensitive),textnum_sensitive|Text/Num (case sensitive),text_insensitive|Text (case insensitive),text_sensitive|Text (case sensitive)" label="CONF_MESSAGE[Query Analyzer]" description="CONF_MESSAGE[Analyzer used by Zend to parse the queries. Warning, the UTF8 analyzers require the php mbstring extension.]" default="textnum_insensitive" mandatory="false"/>
<global_param name="AUTO_WILDCARD" type="boolean" label="CONF_MESSAGE[Auto-Wildcard]" description="CONF_MESSAGE[Automatically append a * after the user query to make the search broader]" default="false" mandatory="false"/>
<global_param name="WILDCARD_LIMITATION" type="integer" label="CONF_MESSAGE[Wildcard limitation]" description="CONF_MESSAGE[For the sake of performances, it is not recommanded to use wildcard as a very first character of a query string. Lucene recommends asking the user minimum 3 characters before wildcard. Still, you can set it to 0 if necessary for your usecases.]" default="3" mandatory="false"/>
</server_settings>
<registry_contributions>
Expand Down

0 comments on commit b790df3

Please sign in to comment.