Skip to content

Commit

Permalink
Enriched default seedlist to include https and optional www
Browse files (browse the repository at this point in the history)
  • Loading branch information
csrster committed Aug 25, 2022
1 parent de90f73 commit ee7a55c
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,9 @@ protected void map(LongWritable lineNumber, Text filePath, Context context) thro
path = HadoopFileUtils.replaceWithCachedPathIfEnabled(context, path);

//TEST this but this fs should work for both local and hdfs files
try (FileSystem fs = path.getFileSystem(context.getConfiguration())) {
//try (FileSystem fs = path.getFileSystem(context.getConfiguration())) {
try {
FileSystem fs = path.getFileSystem(context.getConfiguration());
log.info("Opened FileSystem {}", fs);

log.info("Mapper processing {}", path);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -171,9 +171,13 @@ public static Domain getDefaultDomain(String domainName) {
SeedList seedlist;
if (Constants.IP_KEY_REGEXP.matcher(domainName).matches()) {
// IP domains should not get www
seedlist = new SeedList(defaultSeedListName, "http://" + domainName);
seedlist = new SeedList(defaultSeedListName, "http://" + domainName + "\nhttps://" + domainName);
} else {
seedlist = new SeedList(defaultSeedListName, "http://www." + domainName);
seedlist = new SeedList(defaultSeedListName,
"http://www." + domainName +
"\nhttps://www." + domainName +
"\nhttp://" + domainName +
"\nhttps://" + domainName);
}
myDomain.addSeedList(seedlist);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -774,13 +774,15 @@ public void testGetDefaultDomain() throws Exception {
assertTrue("Configuration should have a seedlist", conf.getSeedLists().hasNext());
SeedList seedlist = d.getSeedList((conf.getSeedLists().next()).getName());
assertNotNull("Default seedlist should exist", seedlist);
assertEquals("Default seedlist should contain the domain", "http://www.foo.dk", seedlist.getSeedsAsString()
String expectedSeedlistAsString = "http://www.foo.dk\nhttps://www.foo.dk\nhttp://foo.dk\nhttps://foo.dk";
assertEquals("Default seedlist should contain the domain", expectedSeedlistAsString, seedlist.getSeedsAsString()
.trim());
Domain d1 = Domain.getDefaultDomain("1.2.3.4");
assertNotNull("Default domain for IP should be obtainable", d1);
seedlist = d1.getSeedList((conf.getSeedLists().next()).getName());
assertNotNull("Default seedlist should exist", seedlist);
assertEquals("Default seedlist should contain the domain", "http://1.2.3.4", seedlist.getSeedsAsString().trim());
expectedSeedlistAsString = "http://1.2.3.4\nhttps://1.2.3.4";
assertEquals("Default seedlist should contain the domain", expectedSeedlistAsString, seedlist.getSeedsAsString().trim());
}

/**
Expand Down Expand Up @@ -1046,7 +1048,7 @@ public void testGetSortedSeedlistAndDomainConfigurationsAndPasswords() {
SeedList s7 = new SeedList("Åse liste", "http://plidder");
SeedList s8 = new SeedList("ø liste", "http://plidder");
SeedList s9 = new SeedList("Æble liste", "http://plidder");
SeedList defaultSeeds = new SeedList("defaultseeds", "http://www.unknowndomain.dk");
SeedList defaultSeeds = new SeedList("defaultseeds", "http://www.unknowndomain.dk\nhttps://www.unknowndomain.dk\nhttp://unknowndomain.dk\nhttps://unknowndomain.dk");
d.addSeedList(s1);
d.addSeedList(s2);
d.addSeedList(s3);
Expand Down

0 comments on commit ee7a55c

Please sign in to comment.