Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 79 additions & 65 deletions cogs/discord_data_scraper.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,26 @@
from discord.ext import commands, tasks
from discord.channel import TextChannel
from discord import Member
import discord
import os, dateutil, json, sys
from datetime import datetime

from utils.db import SupabaseInterface
from utils.api import GithubAPI
import csv

#CONSTANTS
RUNTIME_DATA_DIRECTORY = 'scraping-runtime-data'
RUNTIME_DATA_FILE = 'discordScraperRuntimeData.json'
CONTRIBUTOR_ROLE_ID = 973852365188907048
INTRODUCTIONS_CHANNEL_ID =1107343423167541328

#check if the directory for scraping runtime data exists and create it if it doesn't
def createRuntimeDataDirectory():
    """Ensure the scraper's runtime-data directory exists and return its path.

    The directory is located directly under the current working directory and
    is named by the module constant ``RUNTIME_DATA_DIRECTORY``.

    Returns:
        str: the path ``<cwd>/<RUNTIME_DATA_DIRECTORY>``.

    Raises:
        FileExistsError: if the path exists but is not a directory.
    """
    path = f'{os.getcwd()}/{RUNTIME_DATA_DIRECTORY}'
    # exist_ok replaces the isdir() + mkdir() pair: a directory that appears
    # between the check and the creation is reused instead of raising.
    os.makedirs(path, exist_ok=True)
    return path


# Load deployment-specific settings once, at import time. The path is relative,
# so the process must be started from the directory that holds config.json.
# JSON is decoded as UTF-8 explicitly rather than with the locale default.
with open('config.json', encoding='utf-8') as config_file:
    config_data = json.load(config_file)

#CONSTANTS
CONTRIBUTOR_ROLE_ID = config_data['CONTRIBUTOR_ROLE_ID']
INTRODUCTIONS_CHANNEL_ID = config_data['INTRODUCTIONS_CHANNEL_ID']
# Channel that receives the scraper's error reports.
ERROR_CHANNEL_ID = config_data['ERROR_CHANNEL_ID']
# Interval, in seconds, between two runs of the message-collection task.
TIME_DURATION = config_data['TIME_DURATION']

class DiscordDataScaper(commands.Cog):
def __init__(self, bot) -> None:
    """Keep a reference to the bot and make sure the runtime-data directory exists."""
    self.bot = bot
    # createRuntimeDataDirectory() creates the directory when needed and hands back its path.
    runtime_directory = createRuntimeDataDirectory()
    self.runtimeDataDirectory = runtime_directory

# @commands.command()
# async def introductions(self, ctx):
Expand Down Expand Up @@ -73,10 +64,7 @@ async def on_reaction_add(self, reaction, user):
"total_reaction_count": 1})
return
print("reaction")
SupabaseInterface("discord_engagement").update({"total_reaction_count":contributor["total_reaction_count"]+1}, "contributor", message.author.id)



SupabaseInterface("discord_engagement").update({"total_reaction_count":contributor["total_reaction_count"]+1}, "contributor", message.author.id)

@commands.command()
async def add_engagement(self, ctx):
Expand Down Expand Up @@ -151,58 +139,84 @@ def addEngagmentData(data):
# writer.writerows(data)

#Store all messages from the text channels of the Discord server in Supabase

# command to run the message collector
@commands.command()
async def start_collecting_messages(self, ctx):
    """Start the message-collection task unless it is already running.

    The outcome is reported back through ``ctx.send`` in both cases.
    """
    if self.collect_all_messages.is_running():
        await ctx.send("Message collection already in progress.")
        return

    self.collect_all_messages.start()
    print("Initiating message collection")
    await ctx.send("Message collection started.")

# command to stop the message collector
@commands.command()
async def add_messages(self,ctx):
async def stop_collecting_messages(self, ctx):
    """Cancel the message-collection task if it is currently running.

    The outcome is reported back through ``ctx.send`` in both cases.
    """
    if not self.collect_all_messages.is_running():
        await ctx.send("Message collection is not running.")
        return

    self.collect_all_messages.cancel()
    print("Stopping message collection")
    await ctx.send("Message collection stopped.")

# recurring job to collect all the messages
@tasks.loop(seconds=TIME_DURATION)
async def collect_all_messages(self):
    """Run one collection pass: log the current time, then download new messages."""
    started_at = datetime.now()
    print(f"Collecting all messages as of {started_at}")
    await self.add_messages()

async def add_messages(self):

def addMessageData(data):
    """Insert *data* into the "unstructured discord data" table."""
    SupabaseInterface("unstructured discord data").insert(data)

def recordLastRunTime(data, directory):
    """Overwrite the runtime-data file inside *directory* with *data* as JSON."""
    runtime_file_path = f'{directory}/{RUNTIME_DATA_FILE}'
    with open(runtime_file_path, 'w+') as runtime_file:
        json.dump(data, runtime_file)

def getLastRunTime(channelId):
    """Return the recorded last-run time for *channelId*, or None if there is none.

    The value is looked up, by the string form of the channel id, in the JSON
    runtime-data file under ``self.runtimeDataDirectory`` (``self`` is taken
    from the enclosing method's scope).

    Returns:
        datetime | None: the parsed timestamp, or None when the file does not
        exist yet or holds no entry for this channel, in which case all
        messages will be read.
    """
    # Import the submodule explicitly: a bare `import dateutil` does not make
    # `dateutil.parser` available on older python-dateutil releases.
    from dateutil import parser as date_parser

    try:
        with open(f'{self.runtimeDataDirectory}/{RUNTIME_DATA_FILE}', 'r') as file:
            data = json.load(file)
    except FileNotFoundError:
        # First run: only the directory has been created so far, the file is
        # written after a scrape. Treat it as "nothing recorded yet".
        return None

    lastRuntime = data.get(str(channelId))
    if lastRuntime is None:
        #all messages will be read
        return None
    return date_parser.parse(lastRuntime)



guild = await self.bot.fetch_guild(os.getenv("SERVER_ID")) #SERVER_ID Should be C4GT Server ID
channels = await guild.fetch_channels()
runtimeData = {}
def getLastMessageObject(channelId):
    """Return a discord.Object for the latest stored message of *channelId*, or None.

    None is returned when no message of that channel has been stored yet.
    """
    # The most recent message has the largest message id, so the first row of
    # an id-descending query is the latest message downloaded for the channel.
    stored_rows = SupabaseInterface("unstructured discord data").read_by_order_limit(
        query_key="channel",
        query_value=channelId,
        order_column="id.desc",
    )
    if len(stored_rows) == 0:
        print(f"No previous messages obtained for {channelId}")
        return None

    newest_row = stored_rows[0]
    print(f"Last message details for {channelId} is {newest_row}")
    return discord.Object(id=newest_row['id'])

try:
guild = await self.bot.fetch_guild(os.getenv("SERVER_ID")) #SERVER_ID Should be C4GT Server ID
channels = await guild.fetch_channels()

for channel in channels:
print(f"Downloading messages for '{channel.name}' channel")
if isinstance(channel, TextChannel): #See Channel Types for info on text channels https://discordpy.readthedocs.io/en/stable/api.html?highlight=guild#discord.ChannelType
messages = []
last_message_object = getLastMessageObject(channel.id)
# fetching only the messages after the last message id, if None, then all the messages are fetched
async for message in channel.history(limit=None, after=last_message_object):
if message.content=='':
continue
msg_data = {
"channel": channel.id,
"channel_name": channel.name,
"text": message.content,
"author": message.author.id,
"author_name": message.author.name,
"author_roles": message.author.roles if isinstance(message.author, Member) else [],
"sent_at":str(message.created_at),
"id": message.id
}
messages.append(msg_data)
print(f"{len(messages)} new messages found ")
addMessageData(messages)
else:
print(f"{channel.name} not a text channel")
print(f"Downloaded all messages as of {datetime.now()}")
except Exception as e:
error_channel = await guild.fetch_channel(ERROR_CHANNEL_ID)
error_message = f'Error occurred while downloading messages: {e}'
await error_channel.send(error_message)
print(error_message)

# Timestamp-based download pass: for each text channel, read the messages sent
# after the recorded last-run time, store them, and record a new run time.
for channel in channels:
print(channel.name)
if isinstance(channel, TextChannel): #See Channel Types for info on text channels https://discordpy.readthedocs.io/en/stable/api.html?highlight=guild#discord.ChannelType
messages = []
# None means no run time is recorded for the channel, so all messages are read.
last_run = getLastRunTime(channel.id)
print(last_run)
async for message in channel.history(limit=None, after =last_run ):
# Messages without text content are not stored.
if message.content=='':
continue
msg_data = {
"channel": channel.id,
"channel_name": channel.name,
"text": message.content,
"author": message.author.id,
"author_name": message.author.name,
# Roles are only read from Member authors; any other author type gets an empty list.
"author_roles": message.author.roles if isinstance(message.author, Member) else [],
"sent_at":str(message.created_at)
}
messages.append(msg_data)
print(len(messages))
addMessageData(messages)
# The run time is a naive datetime.now() value serialized as an ISO-8601 string.
# NOTE(review): indentation is lost in this view, so it is unclear whether the
# statements below run per channel or once after the loop; confirm.
runtimeData[channel.id] = datetime.now().isoformat()
recordLastRunTime(runtimeData, self.runtimeDataDirectory)
print("Complete!")

async def setup(bot):
await bot.add_cog(DiscordDataScaper(bot))
6 changes: 6 additions & 0 deletions config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"CONTRIBUTOR_ROLE_ID": 973852365188907048,
"INTRODUCTIONS_CHANNEL_ID": 1107343423167541328,
"ERROR_CHANNEL_ID": 0,
"TIME_DURATION": 10
}
4 changes: 4 additions & 0 deletions utils/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ def read(self, query_key, query_value, columns="*"):
data = self.client.table(self.table).select(columns).eq(query_key, query_value).execute()
#data.data returns a list of dictionaries with keys being column names and values being row values
return data.data

def read_by_order_limit(self, query_key, query_value, order_column, order_by=False, limit=1, columns="*"):
    """Return up to *limit* rows matching an equality filter, in a given order.

    Args:
        query_key: column compared for equality.
        query_value: value that column must equal.
        order_column: ordering expression handed to the query's ``order()``.
        order_by: accepted but not used by this method.
        limit: maximum number of rows to return.
        columns: column selection handed to ``select()``.

    Returns:
        The ``data`` attribute of the executed query's response.
    """
    # NOTE(review): order_by is never read; callers pass the direction inside
    # order_column (e.g. "id.desc"). Confirm whether the parameter is needed.
    query = self.client.table(self.table).select(columns)
    query = query.eq(query_key, query_value)
    query = query.order(order_column)
    query = query.limit(limit)
    response = query.execute()
    return response.data

def read_all(self):
data = self.client.table(self.table).select("*").execute()
Expand Down