Skip to content

Commit

Permalink
Breaking: Caching
Browse files Browse the repository at this point in the history
  • Loading branch information
FullLifeGames committed Jan 11, 2023
1 parent 5dbcb69 commit 57365b5
Show file tree
Hide file tree
Showing 16 changed files with 210 additions and 157 deletions.
78 changes: 29 additions & 49 deletions SmogonTeamCrawler.Cmd/Program.cs
Original file line number Diff line number Diff line change
@@ -1,46 +1,22 @@
var teamCrawler = new SmogonTeamCrawler.Core.Crawler.SmogonTeamCrawler();
using NeoSmart.Caching.Sqlite;

// Persistent (on-disk) cache stored in "SmogonDump.db".
// It is bound with a `using` declaration so the cache is disposed when the
// top-level program finishes instead of being abandoned at process exit.
// NOTE(review): relies on SqliteCache implementing IDisposable — confirm for the referenced package version.
using var smogonCache = new SqliteCache(
    new SqliteCacheOptions()
    {
        MemoryOnly = false,
        CachePath = "SmogonDump.db",
    }
);

// The crawler only borrows the cache; its lifetime is owned by this program.
var teamCrawler = new SmogonTeamCrawler.Core.Crawler.SmogonTeamCrawler(smogonCache);

var crawlRequest = new SmogonTeamCrawler.Core.Data.CrawlRequest()
{
MainForum = true,
RMTForum = true,
};
var crawlResult = await teamCrawler.CrawlAsync(crawlRequest).ConfigureAwait(false);

if (crawlRequest.MainForum)
{
await File.WriteAllTextAsync(
"outputJson.txt",
Newtonsoft.Json.JsonConvert.SerializeObject(crawlResult.SmogonTeams)
).ConfigureAwait(false);

await File.WriteAllTextAsync(
"output.json",
Newtonsoft.Json.JsonConvert.SerializeObject(crawlResult.SmogonTeams)
).ConfigureAwait(false);

await File.WriteAllTextAsync(
"output.txt",
crawlResult.SmogonOutput
).ConfigureAwait(false);
}

if (crawlRequest.RMTForum)
{
await File.WriteAllTextAsync(
"outputRMTJson.txt",
Newtonsoft.Json.JsonConvert.SerializeObject(crawlResult.Rmts)
).ConfigureAwait(false);

await File.WriteAllTextAsync(
"outputRMT.json",
Newtonsoft.Json.JsonConvert.SerializeObject(crawlResult.Rmts)
).ConfigureAwait(false);

await File.WriteAllTextAsync(
"outputRMT.txt",
crawlResult.RmtsOutput
).ConfigureAwait(false);
}
var crawlResult = await teamCrawler.CrawlAsync(crawlRequest).ConfigureAwait(false);

foreach (var outputs in crawlResult.TeamsByTier)
{
Expand All @@ -63,17 +39,21 @@ await File.WriteAllTextAsync(
).ConfigureAwait(false);
}

await File.WriteAllTextAsync(
"finalJson.txt",
Newtonsoft.Json.JsonConvert.SerializeObject(crawlResult.TeamsByTier)
).ConfigureAwait(false);
var dumpEverything = false;
if (dumpEverything)
{
await File.WriteAllTextAsync(
"finalJson.txt",
Newtonsoft.Json.JsonConvert.SerializeObject(crawlResult.TeamsByTier)
).ConfigureAwait(false);

await File.WriteAllTextAsync(
"final.json",
Newtonsoft.Json.JsonConvert.SerializeObject(crawlResult.TeamsByTier)
).ConfigureAwait(false);
await File.WriteAllTextAsync(
"final.json",
Newtonsoft.Json.JsonConvert.SerializeObject(crawlResult.TeamsByTier)
).ConfigureAwait(false);

await File.WriteAllTextAsync(
"finalCollected.json",
Newtonsoft.Json.JsonConvert.SerializeObject(crawlResult.CreatedTeamsByTiers)
).ConfigureAwait(false);
await File.WriteAllTextAsync(
"finalCollected.json",
Newtonsoft.Json.JsonConvert.SerializeObject(crawlResult.CreatedTeamsByTiers)
).ConfigureAwait(false);
}
4 changes: 4 additions & 0 deletions SmogonTeamCrawler.Cmd/SmogonTeamCrawler.Cmd.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
<Nullable>enable</Nullable>
</PropertyGroup>

<ItemGroup>
<PackageReference Include="NeoSmart.Caching.Sqlite" Version="6.0.1" />
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\SmogonTeamCrawler.Core\SmogonTeamCrawler.Core.csproj" />
</ItemGroup>
Expand Down
2 changes: 1 addition & 1 deletion SmogonTeamCrawler.Core/Collector/ICollector.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@ public interface ICollector
{
public Task<IDictionary<string, ICollection<Team>>> Collect(IDictionary<string, string> tierToLinks, bool prefixUsage);
public Task CollectFromForum(IDictionary<string, ICollection<Team>> collectedTeams, string tier, string url, bool prefixUsage);
public Task AnalyzeThread(IDictionary<string, ICollection<Team>> collectedTeams, string tier, string url, string prefix);
public Task<ThreadAnalyzeResult> AnalyzeThread(string url, string? prefix);
}
}
148 changes: 109 additions & 39 deletions SmogonTeamCrawler.Core/Collector/TeamCollector.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
using SmogonTeamCrawler.Core.Data;
using Microsoft.Extensions.Caching.Distributed;
using Newtonsoft.Json;
using SmogonTeamCrawler.Core.Data;
using SmogonTeamCrawler.Core.Util;
using System;
using System.Collections.Concurrent;
Expand All @@ -12,6 +14,16 @@ namespace SmogonTeamCrawler.Core.Collector
{
public class TeamCollector : ICollector
{
/// <summary>
/// Optional cache of thread analysis results; <c>null</c> when the collector
/// was created without one.
/// </summary>
private readonly IDistributedCache? _cache;

/// <summary>
/// Creates a collector that does not use a cache.
/// </summary>
public TeamCollector()
    : this(cache: null)
{
}

/// <summary>
/// Creates a collector that stores the given cache for later use.
/// </summary>
/// <param name="cache">Cache to use, or <c>null</c> for none.</param>
public TeamCollector(IDistributedCache? cache) => _cache = cache;

public async Task<IDictionary<string, ICollection<Team>>> Collect(IDictionary<string, string> tierToLinks, bool prefixUsage)
{
var collectedTeams = new ConcurrentDictionary<string, ICollection<Team>>();
Expand Down Expand Up @@ -59,26 +71,94 @@ public async Task CollectFromForum(IDictionary<string, ICollection<Team>> collec
{
collectedTeams.Add(identifier, new List<Team>());
}
var beforeCount = collectedTeams[identifier].Count;
await AnalyzeThread(collectedTeams, tier: identifier, fullUrl, prefix).ConfigureAwait(false);
var afterCount = collectedTeams[identifier].Count;
Console.WriteLine("Added " + (afterCount - beforeCount) + " Teams");
var collectionList = new Dictionary<string, ICollection<Team>>();
ThreadAnalyzeResult analyzeResult;
if (await LastIdIsCurrent(fullUrl))
{
analyzeResult = JsonConvert.DeserializeObject<ThreadAnalyzeResult>(_cache!.GetString(fullUrl)!)!;
foreach (var entry in analyzeResult.CollectedTeams)
{
collectedTeams[identifier].Add(entry);
}
}
else
{
analyzeResult = await AnalyzeThread(fullUrl, prefix).ConfigureAwait(false);
foreach (var entry in analyzeResult.CollectedTeams)
{
collectedTeams[identifier].Add(entry);
}
_cache?.SetString(fullUrl, JsonConvert.SerializeObject(analyzeResult));
}
Console.WriteLine("Added " + (analyzeResult.CollectedTeams.Count) + " Teams");
Console.WriteLine();
prefix = "";
}
}
}
}

public async Task AnalyzeThread(IDictionary<string, ICollection<Team>> collectedTeams, string tier, string url, string prefix)
/// <summary>
/// Decides whether the cached analysis of a thread can be reused instead of
/// crawling the whole thread again.
/// </summary>
/// <param name="fullUrl">
/// Thread URL. It is used both as the cache key and as the prefix of the
/// "page-N" request that fetches the last known page.
/// </param>
/// <returns>
/// <c>true</c> when a cached <see cref="ThreadAnalyzeResult"/> exists and is
/// considered up to date; <c>false</c> when there is no cache, no usable
/// entry, or the thread appears to have changed.
/// </returns>
private async Task<bool> LastIdIsCurrent(string fullUrl)
{
    if (_cache == null)
    {
        return false;
    }

    var cachedResults = _cache.GetString(fullUrl);
    if (cachedResults == null)
    {
        return false;
    }

    var analyzeResult = JsonConvert.DeserializeObject<ThreadAnalyzeResult>(cachedResults);
    if (analyzeResult == null)
    {
        return false;
    }

    // AnalyzeThread starts with LastPost = DateTime.UnixEpoch and keeps that value
    // when the very first page request fails. Such a result holds no real data;
    // without this guard the "older than two years" shortcut below would treat
    // the failed crawl as permanently up to date.
    if (analyzeResult.LastPost <= DateTime.UnixEpoch)
    {
        return false;
    }

    // If the last post is older than two years, the assumption is that no new
    // activity with relevant teams will be posted.
    if (analyzeResult.LastPost < DateTime.Now.AddYears(-2))
    {
        return true;
    }

    string site;
    try
    {
        site = await Common.HttpClient.GetStringAsync(fullUrl + "page-" + analyzeResult.NumberOfPages).ConfigureAwait(false);
    }
    catch (HttpRequestException e)
    {
        // Same reporting as AnalyzeThread. The thread cannot be checked right now,
        // so keep using the cached data: a re-crawl would most likely fail as well
        // and replace the good cache entry with an empty result.
        Console.WriteLine("HttpRequestException at: " + fullUrl);
        Console.WriteLine(e.Message);
        return true;
    }

    var numberOfPages = GetNumberOfPages(site);
    if (numberOfPages != analyzeResult.NumberOfPages)
    {
        return false;
    }

    // Walk the page and remember the date of the last post on it.
    DateTime? lastPostOnPage = null;
    var insideAttributionHeader = false;
    foreach (var line in site.Split('\n'))
    {
        if (line.Contains("<header class=\"message-attribution message-attribution--split\">"))
        {
            insideAttributionHeader = true;
        }
        else if (insideAttributionHeader && line.Contains("data-date-string=\""))
        {
            // An unreadable date counts as "unknown", which never equals the cached value.
            lastPostOnPage = TryReadPostDate(line, out var postDate) ? postDate : (DateTime?)null;
            insideAttributionHeader = false;
        }
    }

    return lastPostOnPage == analyzeResult.LastPost;
}

/// <summary>
/// Extracts the post date from a line containing <c>data-date-string="</c>:
/// takes the quoted value following the next <c>title</c> token, removes
/// "at " and parses it as "MMM d, yyyy h:mm tt" (en-US).
/// </summary>
/// <returns><c>false</c> when the expected markers are missing or the text is not a valid date.</returns>
private static bool TryReadPostDate(string line, out DateTime postDate)
{
    postDate = default;

    const string dateMarker = "data-date-string=\"";
    var markerIndex = line.IndexOf(dateMarker, StringComparison.Ordinal);
    if (markerIndex < 0)
    {
        return false;
    }
    var rest = line[(markerIndex + dateMarker.Length)..];

    const string titleMarker = "title";
    var titleIndex = rest.IndexOf(titleMarker, StringComparison.Ordinal);
    if (titleIndex < 0)
    {
        return false;
    }
    rest = rest[(titleIndex + titleMarker.Length)..];

    var openingQuote = rest.IndexOf('"');
    if (openingQuote < 0)
    {
        return false;
    }
    rest = rest[(openingQuote + 1)..];

    var closingQuote = rest.IndexOf('"');
    if (closingQuote < 0)
    {
        return false;
    }

    var dateText = rest[..closingQuote].Replace("at ", "");
    return DateTime.TryParseExact(
        dateText,
        "MMM d, yyyy h:mm tt",
        CultureInfo.GetCultureInfo("en-US"),
        DateTimeStyles.None,
        out postDate);
}

public async Task<ThreadAnalyzeResult> AnalyzeThread(string url, string? prefix)
{
var pages = 1;
var latestPost = DateTime.UnixEpoch;
var collectedTeams = new List<Team>();
try
{
var pages = 1;
for (var pageCount = 1; pageCount <= pages; pageCount++)
{
var site = await Common.HttpClient.GetStringAsync(url + "page-" + pageCount).ConfigureAwait(false);
Expand All @@ -87,38 +167,29 @@ public async Task AnalyzeThread(IDictionary<string, ICollection<Team>> collected
pages = GetNumberOfPages(site);
}

var lineDataHandler = new LineDataHandler()
{
BlockStarted = false,
BlockText = "",

PostStarted = false,
PostLink = "",
PostLikes = 0,
PostDate = DateTime.Now,

PostedBy = "",

LikeStarted = false,

TimerHeader = false,

LastLine = "",
};
var lineDataHandler = new LineDataHandler();

var currentTeams = new List<string>();

foreach (var line in site.Split('\n'))
{
await HandleLine(url, tier, collectedTeams, pageCount, currentTeams, line, prefix, lineDataHandler).ConfigureAwait(false);
await HandleLine(url, collectedTeams, pageCount, currentTeams, line, prefix, lineDataHandler).ConfigureAwait(false);
}
latestPost = lineDataHandler.PostDate;
}
}
catch (HttpRequestException e)
{
Console.WriteLine("HttpRequestException at: " + url);
Console.WriteLine(e.Message);
}

return new ThreadAnalyzeResult()
{
CollectedTeams = collectedTeams,
LastPost = latestPost,
NumberOfPages = pages,
};
}

private static int GetNumberOfPages(string site)
Expand All @@ -143,18 +214,18 @@ private static int GetNumberOfPages(string site)
public class LineDataHandler
{
public bool BlockStarted { get; set; }
public string BlockText { get; set; }
public string BlockText { get; set; } = "";
public bool PostStarted { get; set; }
public int PostLikes { get; set; }
public DateTime PostDate { get; set; }
public string PostedBy { get; set; }
public string LastLine { get; set; }
public string PostLink { get; set; }
public DateTime PostDate { get; set; } = DateTime.Now;
public string PostedBy { get; set; } = "";
public string LastLine { get; set; } = "";
public string PostLink { get; set; } = "";
public bool LikeStarted { get; set; }
public bool TimerHeader { get; set; }
}

private async Task HandleLine(string url, string tier, IDictionary<string, ICollection<Team>> collectedTeams, int pageCount, ICollection<string> currentTeams, string line, string prefix, LineDataHandler lineDataHandler)
private async Task HandleLine(string url, ICollection<Team> collectedTeams, int pageCount, ICollection<string> currentTeams, string line, string? prefix, LineDataHandler lineDataHandler)
{
if (!lineDataHandler.PostStarted)
{
Expand Down Expand Up @@ -192,7 +263,7 @@ private async Task HandleLine(string url, string tier, IDictionary<string, IColl
}
teamLine = teamLine[..teamLine.IndexOf("\n")];

string teamTier = null;
string? teamTier = null;
if (teamLine.Contains('[') && teamLine.Contains(']'))
{
teamTier = teamLine.Substring(teamLine.IndexOf("[") + 1, teamLine.IndexOf("]") - teamLine.IndexOf("[") - 1);
Expand All @@ -219,7 +290,7 @@ private async Task HandleLine(string url, string tier, IDictionary<string, IColl
TeamTier = teamTier,
TeamTitle = teamTitle
};
collectedTeams[tier].Add(teamObject);
collectedTeams.Add(teamObject);

tmpTeam = tmpTeam[(tmpTeam.IndexOf("===") + "===".Length)..];
tmpTeam = tmpTeam[(tmpTeam.IndexOf("\n") + 1)..];
Expand All @@ -232,14 +303,13 @@ private async Task HandleLine(string url, string tier, IDictionary<string, IColl
if (!moreTeams)
{
var teamObject = new Team(team, lineDataHandler.PostLikes, lineDataHandler.PostDate, url + "page-" + pageCount + "#" + lineDataHandler.PostLink, lineDataHandler.PostedBy, prefix);
collectedTeams[tier].Add(teamObject);
collectedTeams.Add(teamObject);
}
}
currentTeams.Clear();
lineDataHandler.PostLikes = 0;
lineDataHandler.BlockText = "";
lineDataHandler.PostedBy = "";
lineDataHandler.PostDate = DateTime.Now;
}
else if (line.Contains("<header class=\"message-attribution message-attribution--split\">"))
{
Expand Down Expand Up @@ -359,7 +429,7 @@ private async Task HandleLine(string url, string tier, IDictionary<string, IColl
lineDataHandler.LastLine = line;
}

private async Task<string> GetTeamFromBinner(string pasteUrl, string urlRoot)
private async Task<string?> GetTeamFromBinner(string pasteUrl, string urlRoot)
{
if (pasteUrl.Contains(' ') || pasteUrl.Contains('"') || pasteUrl.Contains('<'))
{
Expand Down
13 changes: 11 additions & 2 deletions SmogonTeamCrawler.Core/Crawler/SmogonTeamCrawler.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using SmogonTeamCrawler.Core.Collector;
using Microsoft.Extensions.Caching.Distributed;
using SmogonTeamCrawler.Core.Collector;
using SmogonTeamCrawler.Core.Formatter;
using SmogonTeamCrawler.Core.Scanner;
using SmogonTeamCrawler.Core.Transformer;
Expand All @@ -7,7 +8,15 @@ namespace SmogonTeamCrawler.Core.Crawler
{
public class SmogonTeamCrawler : TeamCrawler
{
public override ICollector Collector => new TeamCollector();
/// <summary>
/// Optional cache handed to the collector; <c>null</c> when none was supplied.
/// </summary>
private readonly IDistributedCache? _cache;

/// <summary>
/// Creates a crawler without a cache.
/// </summary>
public SmogonTeamCrawler()
    : this(cache: null)
{
}

/// <summary>
/// Creates a crawler that passes <paramref name="cache"/> on to its collector.
/// </summary>
/// <param name="cache">Cache to forward, or <c>null</c> for none.</param>
public SmogonTeamCrawler(IDistributedCache? cache) => _cache = cache;

/// <summary>
/// Returns a new <see cref="TeamCollector"/> on every access, constructed
/// with the cache given to this crawler.
/// </summary>
public override ICollector Collector
{
    get { return new TeamCollector(_cache); }
}

public override IFormatter Formatter => new TeamFormatter();

Expand Down
Loading

0 comments on commit 57365b5

Please sign in to comment.