From 264eaca2d73f926dd9b8daf01db1aec1d913cd4f Mon Sep 17 00:00:00 2001 From: hritishj <122864837+hritishj@users.noreply.github.com> Date: Wed, 28 Jun 2023 20:42:29 +0530 Subject: [PATCH 1/2] periodic downloader with error channel added --- cogs/discord_data_scraper.py | 121 +++++++++++++++++------------------ config.json | 6 ++ utils/db.py | 4 ++ 3 files changed, 69 insertions(+), 62 deletions(-) create mode 100644 config.json diff --git a/cogs/discord_data_scraper.py b/cogs/discord_data_scraper.py index 2d1f22a..1083cd5 100644 --- a/cogs/discord_data_scraper.py +++ b/cogs/discord_data_scraper.py @@ -1,6 +1,7 @@ from discord.ext import commands, tasks from discord.channel import TextChannel from discord import Member +import discord import os, dateutil, json, sys from datetime import datetime @@ -8,28 +9,19 @@ from utils.api import GithubAPI import csv -#CONSTANTS -RUNTIME_DATA_DIRECTORY = 'scraping-runtime-data' -RUNTIME_DATA_FILE = 'discordScraperRuntimeData.json' -CONTRIBUTOR_ROLE_ID = 973852365188907048 -INTRODUCTIONS_CHANNEL_ID =1107343423167541328 - -#check id directory exists for scraping runtime data and create one if it doesn't -def createRuntimeDataDirectory(): - cwd = os.getcwd() - path = f'{cwd}/{RUNTIME_DATA_DIRECTORY}' - if not os.path.isdir(path): - os.mkdir(path) - - return path - - +with open('config.json') as config_file: + config_data = json.load(config_file) +#CONSTANTS +CONTRIBUTOR_ROLE_ID = config_data['CONTRIBUTOR_ROLE_ID'] +INTRODUCTIONS_CHANNEL_ID =config_data['INTRODUCTIONS_CHANNEL_ID'] +ERROR_CHANNEL_ID = config_data['ERROR_CHANNEL_ID'] +TIME_DURATION = config_data['TIME_DURATION'] class DiscordDataScaper(commands.Cog): def __init__(self, bot) -> None: self.bot = bot - self.runtimeDataDirectory = createRuntimeDataDirectory() + self.collect_all_messages.start() # @commands.command() # async def introductions(self, ctx): @@ -151,58 +143,63 @@ def addEngagmentData(data): # writer.writerows(data) #Store all messages on Text Channels in the Discord Server to SupaBase - @commands.command() - async def add_messages(self,ctx): + + @tasks.loop(hours=TIME_DURATION) + async def collect_all_messages(self): + print(f"Collecting all messages as of {datetime.now()}") + await self.add_messages() + + async def add_messages(self): def addMessageData(data): client = SupabaseInterface("unstructured discord data") client.insert(data) return - - def recordLastRunTime(data, directory): - with open(f'{directory}/{RUNTIME_DATA_FILE}', 'w+') as file: - json.dump(data, file) - - def getLastRunTime(channelId): - with open(f'{self.runtimeDataDirectory}/{RUNTIME_DATA_FILE}', 'r') as file: - data = json.load(file) - lastRuntime = data.get(str(channelId)) - if lastRuntime is None: - #all messages will be read - return None - else: - return dateutil.parser.parse(lastRuntime) - - - guild = await self.bot.fetch_guild(os.getenv("SERVER_ID")) #SERVER_ID Should be C4GT Server ID - channels = await guild.fetch_channels() - runtimeData = {} + def getLastMessageObject(channelId): + last_message = SupabaseInterface("unstructured discord data").read_by_order_limit(query_key="channel",query_value=channelId,order_column="id.desc") # fetching the record for the lastest message downloaded from a particular channel, the most recent message has the largest message_id + if len(last_message)>0: + print(f"Last message details for {channelId} is {last_message[0]}") + return discord.Object(id=last_message[0]['id'] ) + else: + print(f"No previous messages obtained for {channelId}") + return None + + try: + guild = await self.bot.fetch_guild(os.getenv("SERVER_ID")) #SERVER_ID Should be C4GT Server ID + channels = await guild.fetch_channels() + + for channel in channels: + print(f"Downloading messages for '{channel.name}' channel") + if isinstance(channel, TextChannel): #See Channel Types for info on text channels https://discordpy.readthedocs.io/en/stable/api.html?highlight=guild#discord.ChannelType + messages = [] + last_message_object = getLastMessageObject(channel.id) + # fetching only the messages after the last message id, if None, then all the messages are fetched + async for message in channel.history(limit=None, after=last_message_object): + if message.content=='': + continue + msg_data = { + "channel": channel.id, + "channel_name": channel.name, + "text": message.content, + "author": message.author.id, + "author_name": message.author.name, + "author_roles": message.author.roles if isinstance(message.author, Member) else [], + "sent_at":str(message.created_at), + "id": message.id + } + messages.append(msg_data) + print(f"{len(messages)} new messages found ") + addMessageData(messages) + else: + print(f"{channel.name} not a text channel") + print(f"Downloaded all messages as of {datetime.now()}") + except Exception as e: + error_channel = await guild.fetch_channel(ERROR_CHANNEL_ID) + error_message = f'Error occurred while downloading messages: {e}' + await error_channel.send(error_message) + print(error_message) - for channel in channels: - print(channel.name) - if isinstance(channel, TextChannel): #See Channel Types for info on text channels https://discordpy.readthedocs.io/en/stable/api.html?highlight=guild#discord.ChannelType - messages = [] - last_run = getLastRunTime(channel.id) - print(last_run) - async for message in channel.history(limit=None, after =last_run ): - if message.content=='': - continue - msg_data = { - "channel": channel.id, - "channel_name": channel.name, - "text": message.content, - "author": message.author.id, - "author_name": message.author.name, - "author_roles": message.author.roles if isinstance(message.author, Member) else [], - "sent_at":str(message.created_at) - } - messages.append(msg_data) - print(len(messages)) - addMessageData(messages) - runtimeData[channel.id] = datetime.now().isoformat() - recordLastRunTime(runtimeData, self.runtimeDataDirectory) - print("Complete!") async def setup(bot): await bot.add_cog(DiscordDataScaper(bot)) \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..66ed946 --- /dev/null +++ b/config.json @@ -0,0 +1,6 @@ +{ + "CONTRIBUTOR_ROLE_ID": 973852365188907048, + "INTRODUCTIONS_CHANNEL_ID": 1107343423167541328, + "ERROR_CHANNEL_ID": 0, + "TIME_DURATION": 10 +} \ No newline at end of file diff --git a/utils/db.py b/utils/db.py index 5c76be0..4fc8cd1 100644 --- a/utils/db.py +++ b/utils/db.py @@ -14,6 +14,10 @@ def read(self, query_key, query_value, columns="*"): data = self.client.table(self.table).select(columns).eq(query_key, query_value).execute() #data.data returns a list of dictionaries with keys being column names and values being row values return data.data + + def read_by_order_limit(self, query_key, query_value, order_column, order_by=False, limit=1, columns="*"): + data = self.client.table(self.table).select(columns).eq(query_key, query_value).order(order_column).limit(limit).execute() + return data.data def read_all(self): data = self.client.table(self.table).select("*").execute() From 4c4efbd35efc2e569103fa56f9583f6a08f01e99 Mon Sep 17 00:00:00 2001 From: starks-and-wolves Date: Thu, 29 Jun 2023 19:58:46 +0530 Subject: [PATCH 2/2] added start-stop commands for message collector --- cogs/discord_data_scraper.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/cogs/discord_data_scraper.py b/cogs/discord_data_scraper.py index 1083cd5..50ec122 100644 --- a/cogs/discord_data_scraper.py +++ b/cogs/discord_data_scraper.py @@ -21,7 +21,6 @@ class DiscordDataScaper(commands.Cog): def __init__(self, bot) -> None: self.bot = bot - self.collect_all_messages.start() # @commands.command() # async def introductions(self, ctx): @@ -65,10 +64,7 @@ async def on_reaction_add(self, reaction, user): "total_reaction_count": 1}) return print("reaction") - SupabaseInterface("discord_engagement").update({"total_reaction_count":contributor["total_reaction_count"]+1}, "contributor", message.author.id) - - - + SupabaseInterface("discord_engagement").update({"total_reaction_count":contributor["total_reaction_count"]+1}, "contributor", message.author.id) @commands.command() async def add_engagement(self, ctx): @@ -144,7 +140,28 @@ def addEngagmentData(data): #Store all messages on Text Channels in the Discord Server to SupaBase - @tasks.loop(hours=TIME_DURATION) + # command to run the message collector + @commands.command() + async def start_collecting_messages(self, ctx): + if not self.collect_all_messages.is_running(): + self.collect_all_messages.start() + print("Initiating message collection") + await ctx.send("Message collection started.") + else: + await ctx.send("Message collection already in progress.") + + # command to stop the message collector + @commands.command() + async def stop_collecting_messages(self, ctx): + if self.collect_all_messages.is_running(): + self.collect_all_messages.cancel() + print("Stopping message collection") + await ctx.send("Message collection stopped.") + else: + await ctx.send("Message collection is not running.") + + # recurring job to collect all the messages + @tasks.loop(seconds=TIME_DURATION) async def collect_all_messages(self): print(f"Collecting all messages as of {datetime.now()}") await self.add_messages()