Skip to content

Commit

Permalink
use urlcanon instead of crawler4j because we only used it for url canonization
Browse files Browse the repository at this point in the history
  • Loading branch information
Athou committed Apr 29, 2023
1 parent 00f6c04 commit 0a99dac
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 11 deletions.
12 changes: 3 additions & 9 deletions commafeed-server/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -426,15 +426,9 @@
<version>0.9.30</version>
</dependency>
<dependency>
<groupId>edu.uci.ics</groupId>
<artifactId>crawler4j</artifactId>
<version>3.5</version>
<exclusions>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
</exclusions>
<groupId>org.netpreserve</groupId>
<artifactId>urlcanon</artifactId>
<version>0.4.0</version>
</dependency>
<dependency>
<groupId>com.google.gwt</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
import org.jsoup.safety.Cleaner;
import org.jsoup.safety.Safelist;
import org.jsoup.select.Elements;
import org.netpreserve.urlcanon.Canonicalizer;
import org.netpreserve.urlcanon.ParsedUrl;
import org.w3c.css.sac.InputSource;
import org.w3c.dom.css.CSSStyleDeclaration;

Expand All @@ -41,7 +43,6 @@
import com.ibm.icu.text.CharsetMatch;
import com.steadystate.css.parser.CSSOMParser;

import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import lombok.extern.slf4j.Slf4j;

/**
Expand Down Expand Up @@ -179,7 +180,10 @@ public static String normalizeURL(String url) {
if (url == null) {
return null;
}
String normalized = URLCanonicalizer.getCanonicalURL(url);

ParsedUrl parsedUrl = ParsedUrl.parseUrl(url);
Canonicalizer.AGGRESSIVE.canonicalize(parsedUrl);
String normalized = parsedUrl.toString();
if (normalized == null) {
normalized = url;
}
Expand Down

0 comments on commit 0a99dac

Please sign in to comment.