Skip to content

Commit

Permalink
Merge pull request #290 from ArcBees/mi_crawlerEnhance
Browse files Browse the repository at this point in the history
Enhancement on GWTP Crawler
  • Loading branch information
imrabti committed Jul 17, 2013
2 parents f19df0a + 581eb27 commit 306000f
Show file tree
Hide file tree
Showing 6 changed files with 73 additions and 30 deletions.
2 changes: 2 additions & 0 deletions gwtp-carstore/pom.xml
Expand Up @@ -86,6 +86,8 @@
<server>com.google.appengine.tools.development.gwt.AppEngineLauncher</server>
<appEngineVersion>${gae.version}</appEngineVersion>
<appEngineHome>${gae.home}</appEngineHome>
<extraJvmArgs>-Xss2048k -Xmx1024M -XX:MaxPermSize=512m</extraJvmArgs>
<localWorkers>2</localWorkers>

<runTarget>CarStore.html</runTarget>
<modules>
Expand Down
Expand Up @@ -51,9 +51,10 @@ public class LazyActionHandlerValidatorRegistryImpl implements
@Inject
LazyActionHandlerValidatorRegistryImpl(Injector injector) {
this.injector = injector;
actionHandlerValidatorClasses = new ConcurrentHashMap<Class<? extends Action<?>>, ActionHandlerValidatorClass<? extends
Action<?>, ? extends Result>>();
actionHandlerValidatorInstances = new ConcurrentHashMap<Class<? extends Action<?>>, ActionHandlerValidatorInstance>();
actionHandlerValidatorClasses = new ConcurrentHashMap<Class<? extends Action<?>>,
ActionHandlerValidatorClass<? extends Action<?>, ? extends Result>>();
actionHandlerValidatorInstances = new ConcurrentHashMap<Class<? extends Action<?>>,
ActionHandlerValidatorInstance>();
validators = new ConcurrentHashMap<Class<? extends ActionValidator>, ActionValidator>();
}

Expand Down Expand Up @@ -105,8 +106,8 @@ public <A extends Action<R>, R extends Result> void removeActionHandlerValidator
Class<A> actionClass,
ActionHandlerValidatorClass<A, R> actionHandlerValidatorClass) {

ActionHandlerValidatorClass<?, ?> oldActionHandlerValidatorClass = actionHandlerValidatorClasses.get
(actionClass);
ActionHandlerValidatorClass<?, ?> oldActionHandlerValidatorClass = actionHandlerValidatorClasses.get(
actionClass);

if (oldActionHandlerValidatorClass == actionHandlerValidatorClass) {
actionHandlerValidatorClasses.remove(actionClass);
Expand Down
Expand Up @@ -46,9 +46,10 @@ public class LazyActionHandlerValidatorRegistryImpl implements LazyActionHandler
private final Map<Class<? extends ActionValidator>, ActionValidator> validators;

public LazyActionHandlerValidatorRegistryImpl() {
actionHandlerValidatorClasses = new ConcurrentHashMap<Class<? extends Action<?>>, ActionHandlerValidatorClass<? extends
Action<?>, ? extends Result>>();
actionHandlerValidatorInstances = new ConcurrentHashMap<Class<? extends Action<?>>, ActionHandlerValidatorInstance>();
actionHandlerValidatorClasses = new ConcurrentHashMap<Class<? extends Action<?>>,
ActionHandlerValidatorClass<? extends Action<?>, ? extends Result>>();
actionHandlerValidatorInstances = new ConcurrentHashMap<Class<? extends Action<?>>,
ActionHandlerValidatorInstance>();
validators = new ConcurrentHashMap<Class<? extends ActionValidator>, ActionValidator>();
}

Expand Down Expand Up @@ -96,8 +97,8 @@ public ActionValidator findActionValidator(Class<? extends ActionValidator> acti
public <A extends Action<R>, R extends Result> void removeActionHandlerValidatorClass(Class<A> actionClass,
ActionHandlerValidatorClass<A, R> actionHandlerValidatorClass) {

ActionHandlerValidatorClass<?, ?> oldActionHandlerValidatorClass = actionHandlerValidatorClasses.get
(actionClass);
ActionHandlerValidatorClass<?, ?> oldActionHandlerValidatorClass = actionHandlerValidatorClasses.get(
actionClass);

if (oldActionHandlerValidatorClass == actionHandlerValidatorClass) {
actionHandlerValidatorClasses.remove(actionClass);
Expand Down
Expand Up @@ -18,19 +18,23 @@

import java.io.IOException;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URLDecoder;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.inject.Provider;
import javax.inject.Singleton;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.SilentCssErrorHandler;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebRequest;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.google.inject.Inject;
import com.googlecode.objectify.Key;
Expand All @@ -44,36 +48,50 @@
@Singleton
public class CrawlServiceServlet extends HttpServlet {

private class SyncAllAjaxController extends NicelyResynchronizingAjaxController {
private static final long serialVersionUID = 1L;

@Override
public boolean processSynchron(HtmlPage page, WebRequest request, boolean async) {
return true;
}
}

private static final String CHAR_ENCODING = "UTF-8";

private static final long serialVersionUID = -6129110224710383122L;

@Inject(optional = true)
@HtmlUnitTimeoutMillis
private long timeoutMillis = 12000;
private long jsTimeoutMillis = 1000;
private long pageWaitMillis = 200;
private int maxLoopChecks = 2;

@Inject(optional = true)
@CachedPageTimeoutSec
private long cachedPageTimeoutSec = 15 * 60;

private final Logger log;
private final Provider<WebClient> webClientProvider;

private final String key;

private final CachedPageDao cachedPageDao;

@Inject
CrawlServiceServlet(final Provider<WebClient> webClientProvider,
@ServiceKey String key,
CachedPageDao cachedPageDao) {
CrawlServiceServlet(Provider<WebClient> webClientProvider,
Logger log,
CachedPageDao cachedPageDao,
@ServiceKey String key) {
this.webClientProvider = webClientProvider;
this.log = log;
this.key = key;
this.cachedPageDao = cachedPageDao;
}

@Override
protected void doGet(HttpServletRequest req, HttpServletResponse resp) {

PrintWriter out = null;
try {
resp.setCharacterEncoding(CHAR_ENCODING);
Expand Down Expand Up @@ -128,9 +146,7 @@ private void storeFetchedPage(CachedPage cachedPage,
* @param out The {@link PrintWriter} to write to, if needed.
* @return {@code true} if the page needs to be fetched, {@code false} otherwise.
*/
private boolean needToFetchPage(CachedPage matchingPage,
Date currDate, PrintWriter out) {

private boolean needToFetchPage(CachedPage matchingPage, Date currDate, PrintWriter out) {
if (matchingPage == null) {
return true;
}
Expand Down Expand Up @@ -175,19 +191,42 @@ private CachedPage createPlaceholderPage(String url, Date currDate) {
* @throws IOException If the page cannot be fetched or read.
* @throws MalformedURLException If {@code url} is not a well-formed URL.
*/
private StringBuilder renderPage(String url) throws IOException,
MalformedURLException {
private StringBuilder renderPage(String url) throws IOException {
WebClient webClient = webClientProvider.get();

webClient.setCssEnabled(false);
webClient.setJavaScriptTimeout(0);
webClient.setJavaScriptTimeout(0);
webClient.setThrowExceptionOnScriptError(false);
webClient.setThrowExceptionOnFailingStatusCode(false);
webClient.setJavaScriptEnabled(true);
webClient.getCache().clear();
webClient.getOptions().setCssEnabled(false);
webClient.getOptions().setJavaScriptEnabled(true);
webClient.getOptions().setThrowExceptionOnScriptError(false);
webClient.getOptions().setRedirectEnabled(false);
webClient.setAjaxController(new SyncAllAjaxController());
webClient.setCssErrorHandler(new SilentCssErrorHandler());

HtmlPage page = webClient.getPage(url);
webClient.getJavaScriptEngine().pumpEventLoop(timeoutMillis);

int waitForBackgroundJavaScript = webClient.waitForBackgroundJavaScript(jsTimeoutMillis);
int loopCount = 0;

while (waitForBackgroundJavaScript > 0 && loopCount < maxLoopChecks) {
++loopCount;
waitForBackgroundJavaScript = webClient.waitForBackgroundJavaScript(jsTimeoutMillis);

if (waitForBackgroundJavaScript == 0) {
log.fine("HtmlUnit exits background javascript at loop counter " + loopCount);
break;
}

synchronized (page) {
log.fine("HtmlUnit waits for background javascript at loop counter " + loopCount);
try {
page.wait(pageWaitMillis);
} catch (InterruptedException e) {
log.log(Level.SEVERE, "HtmlUnit ERROR on page.wait at loop counter " + loopCount, e);
}
}
}

StringBuilder stringBuilder = new StringBuilder();
stringBuilder.append("<hr />\n");
stringBuilder.append("<center><h3>You are viewing a non-interactive page that is intended for the crawler. ");
Expand All @@ -197,6 +236,7 @@ private StringBuilder renderPage(String url) throws IOException,

stringBuilder.append(page.asXml());
webClient.closeAllWindows();

return stringBuilder;
}

Expand All @@ -209,8 +249,7 @@ private StringBuilder renderPage(String url) throws IOException,
* @param currDate The current date, to check for expiration.
* @return The non-expired matching page if found, {@code null} otherwise.
*/
private CachedPage extractMatchingPage(Map<Key<CachedPage>, CachedPage> deprecatedPages,
Date currDate) {
private CachedPage extractMatchingPage(Map<Key<CachedPage>, CachedPage> deprecatedPages, Date currDate) {
CachedPage matchingPage = findMostRecentPage(deprecatedPages);

// Keep the matching page only if it has not expired
Expand Down
Expand Up @@ -37,6 +37,6 @@ public void configureServlets() {
@Singleton
@Provides
WebClient getWebClient() {
return new WebClient(BrowserVersion.FIREFOX_3_6);
return new WebClient(BrowserVersion.FIREFOX_17);
}
}
2 changes: 1 addition & 1 deletion pom.xml
Expand Up @@ -255,7 +255,7 @@
<jukito.version>1.1.2</jukito.version>
<junit.version>4.11</junit.version>
<mockito.version>1.9.5</mockito.version>
<htmlunit.version>2.9</htmlunit.version>
<htmlunit.version>2.12</htmlunit.version>
<selenium.version>2.32.0</selenium.version>
<cucumber.version>1.1.3</cucumber.version>
<httpcore.version>4.2.3</httpcore.version>
Expand Down

0 comments on commit 306000f

Please sign in to comment.