Skip to content

Commit

Permalink
[DS-2462] query.filter.spiderIp is redundant, incomplete, scales poorly
Browse files Browse the repository at this point in the history
Remove IP-only, agent-only usage grooming.  Use SpiderDetector to make
all decisions based on the full array of detectors.
  • Loading branch information
mwoodiupui committed Jul 3, 2020
1 parent cc7a6c3 commit 6b1b1a5
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 89 deletions.
Expand Up @@ -701,83 +701,40 @@ public void process(SolrInputDocument doc) throws IOException, SolrServerExcepti
}
}


@Override
public void markRobotsByIP() {
    // Handle one known spider address at a time; a failure for one address
    // is logged and does not stop the remaining addresses.
    for (String spiderAddress : SpiderDetector.getSpiderIpAddresses()) {
        // Prefix match on the address, skipping records already flagged as bots.
        final String unflaggedQuery = "ip:" + spiderAddress + "* AND -isBot:true";

        try {
            // Rewrites a record so that its isBot field is true, then re-adds it to Solr.
            ResultProcessor botFlagger = new ResultProcessor() {
                @Override
                public void process(SolrInputDocument solrDoc) throws IOException, SolrServerException {
                    solrDoc.removeField("isBot");
                    solrDoc.addField("isBot", true);
                    solr.add(solrDoc);
                    log.info("Marked " + solrDoc.getFieldValue("ip") + " as bot");
                }
            };

            botFlagger.execute(unflaggedQuery);

            // Commit once per address, after its records have been processed.
            solr.commit();
        } catch (Exception e) {
            log.error(e.getMessage(), e);
        }
    }
}

// NOTE(review): this span interleaves the removed markRobotByUserAgent(String)
// with the added markRobots(); as shown it is not a single compilable method.
@Override
public void markRobotByUserAgent(String agent) {
try {

/* Result Process to alter record to be identified as a bot */
ResultProcessor processor = new ResultProcessor() {
@Override
public void process(SolrInputDocument doc) throws IOException, SolrServerException {
public void markRobots() {
// Processor handed to execute() below; presumably process() is invoked once
// per document matching the query -- confirm against ResultProcessor.
ResultProcessor processor = new ResultProcessor() {
@Override
public void process(SolrInputDocument doc)
throws IOException, SolrServerException {
// Read the "ip", "dns" and "userAgent" field values from the document.
// NOTE(review): if getField(...) returns null for an absent field, the
// chained getValue() would throw NullPointerException, which the catch
// in markRobots() below does not cover -- confirm.
String clientIP = (String) doc.getField("ip").getValue();
String hostname = (String) doc.getField("dns").getValue();
String agent = (String) doc.getField("userAgent").getValue();
// Ask SpiderDetector for a verdict; the second argument is passed as null.
if (SpiderDetector.isSpider(clientIP, null, hostname, agent)) {
// Replace any existing isBot value with true and re-add the document to Solr.
doc.removeField("isBot");
doc.addField("isBot", true);
solr.add(doc);
log.info("Marked {} / {} / {} as a robot in record {}.",
clientIP, hostname, agent,
doc.getField("uid").getValue());
}
};

/* query for ip, exclude results previously set as bots. */
processor.execute("userAgent:" + agent + " AND -isBot:true");
}
};

try {
// Run the processor over every record not already flagged isBot:true, then commit.
processor.execute("-isBot:true");
solr.commit();
} catch (Exception e) {
log.error(e.getMessage(), e);
} catch (SolrServerException | IOException ex) {
// Failures are logged and not rethrown.
log.error("Failed while marking robot accesses.", ex);
}
}

// NOTE(review): two signatures appear here because the removed
// deleteRobotsByIsBotFlag() and the added deleteRobots() are interleaved.
@Override
public void deleteRobotsByIsBotFlag() {
public void deleteRobots() {
try {
// Delete every record matching the query isBot:true.
// NOTE(review): no solr.commit() follows within these lines -- confirm
// whether the deletion is committed elsewhere.
solr.deleteByQuery("isBot:true");
} catch (Exception e) {
// Failures are logged and not rethrown.
log.error(e.getMessage(), e);
}
}

@Override
public void deleteIP(String ip) {
    // Trailing wildcard: the query matches every record whose ip field
    // starts with the given value.
    final String prefixQuery = "ip:" + ip + "*";

    try {
        solr.deleteByQuery(prefixQuery);
    } catch (Exception e) {
        // Failures are logged and not rethrown.
        log.error(e.getMessage(), e);
    }
}

// NOTE(review): the removed deleteRobotsByIP() loop is interleaved here with a
// catch clause whose matching try is not in this span; as shown this is not
// a single compilable method.
@Override
public void deleteRobotsByIP() {
// Call deleteIP once for each address returned by SpiderDetector.
for (String ip : SpiderDetector.getSpiderIpAddresses()) {
deleteIP(ip);
} catch (IOException | SolrServerException e) {
// Failures are logged and not rethrown.
log.error("Failed while deleting robot accesses.", e);
}
}

Expand Down Expand Up @@ -1033,11 +990,6 @@ public QueryResponse query(String query, String filterQuery,
// performance and ensure the search result ordering will
// not be influenced

// Choose to filter by the legacy spider IP list (which may grow too long to filter all IPs properly)
if (configurationService.getBooleanProperty("solr-statistics.query.filter.spiderIp", false)) {
solrQuery.addFilterQuery(getIgnoreSpiderIPs());
}

// Choose to filter by isBot field, may be overridden in future
// to allow views on stats based on bots.
if (configurationService.getBooleanProperty("solr-statistics.query.filter.isBot", true)) {
Expand All @@ -1052,7 +1004,7 @@ public QueryResponse query(String query, String filterQuery,
if (bundles != null && bundles.length > 0) {

/**
* The code below creates a query that will allow only records which do not have a bundlename
* The code below creates a query that will allow only records which do not have a bundle name
* (items, collections, ...) or bitstreams that have a configured bundle name
*/
StringBuffer bundleQuery = new StringBuffer();
Expand Down
Expand Up @@ -34,13 +34,14 @@
public interface SolrLoggerService {

/**
* Old post method, use the new postview method instead !
* Old post method, use the new postView method instead!
*
* @param dspaceObject the object used.
* @param request the current request context.
* @param currentUser the current session's user.
* @deprecated
*/
@Deprecated
public void post(DSpaceObject dspaceObject, HttpServletRequest request,
EPerson currentUser);

Expand Down Expand Up @@ -95,15 +96,18 @@ public Map<String, List<String>> queryField(String query,
List oldFieldVals, String field)
throws IOException;

public void markRobotsByIP();

public void markRobotByUserAgent(String agent);

public void deleteRobotsByIsBotFlag();

public void deleteIP(String ip);
/**
* Scan the entire 'statistics' collection for documents that should be
* marked 'isBot:true' according to
* {@link org.dspace.statistics.util.SpiderDetector#isSpider(java.lang.String,
* java.lang.String, java.lang.String, java.lang.String)}.
*/
public void markRobots();

public void deleteRobotsByIP();
/**
* Delete all 'statistics' documents having 'isBot:true'.
*/
public void deleteRobots();

/*
* update(String query, boolean addField, String fieldName, Object
Expand Down
Expand Up @@ -65,7 +65,6 @@ public static void main(String[] args) throws Exception {

options.addOption("m", "mark-spiders", false, "Update isBot Flag in Solr");
options.addOption("f", "delete-spiders-by-flag", false, "Delete Spiders in Solr By isBot Flag");
options.addOption("i", "delete-spiders-by-ip", false, "Delete Spiders in Solr By IP Address");
options.addOption("o", "optimize", false, "Run maintenance on the SOLR index");
options.addOption("b", "reindex-bitstreams", false, "Reindex the bitstreams to ensure we have the bundle name");
options.addOption("e", "export", false,
Expand All @@ -87,11 +86,9 @@ public static void main(String[] args) throws Exception {
if (line.hasOption("u")) {
StatisticsClient.updateSpiderFiles();
} else if (line.hasOption('m')) {
solrLoggerService.markRobotsByIP();
solrLoggerService.markRobots();
} else if (line.hasOption('f')) {
solrLoggerService.deleteRobotsByIsBotFlag();
} else if (line.hasOption('i')) {
solrLoggerService.deleteRobotsByIP();
solrLoggerService.deleteRobots();
} else if (line.hasOption('o')) {
solrLoggerService.optimizeSOLR();
} else if (line.hasOption('b')) {
Expand All @@ -106,7 +103,7 @@ public static void main(String[] args) throws Exception {
}

/**
* Method to update Spiders in config directory.
* Method to update Spiders in configuration directory.
*/
private static void updateSpiderFiles() {
try {
Expand Down
4 changes: 0 additions & 4 deletions dspace/config/modules/solr-statistics.cfg
Expand Up @@ -19,10 +19,6 @@ solr-statistics.query.filter.bundles=ORIGINAL
# create new Solr cores when sharding the statistics data.
solr-statistics.configset = statistics

# control solr statistics querying to filter out spider IPs
# false by default
#solr-statistics.query.filter.spiderIp = false

# control solr statistics querying to look at "isBot" field to determine
# if record is a bot. true by default.
#solr-statistics.query.filter.isBot = true
Expand Down

0 comments on commit 6b1b1a5

Please sign in to comment.