### Preprocess Discord JSON Data

This notebook preprocesses the messages extracted from the AutoGen Discord server. It filters and formats the data so that it can be stored in a vector store for retrieval-augmented generation (RAG) operations.

In [1]:
import os
import json
import glob
import pandas as pd

# Glob pattern matching the exported Discord message JSON files.
# Each file's base name is later used as the channel name for its messages.
path = './data/*.json'

In [2]:
# Function for loading one exported channel file into a DataFrame
def process_file(file_path):
    """Load one exported Discord channel JSON file into a DataFrame.

    The channel name is derived from the file name (the text before the
    first '.' in the base name).

    Args:
        file_path: Path to a JSON file holding a list of message objects.
            Each message is expected to provide ``author.username``,
            ``timestamp``, ``content`` and ``embeds``.

    Returns:
        A DataFrame with the columns ``channel``, ``author_username``,
        ``timestamp``, ``content`` and ``embeds`` (one row per message), or
        ``None`` when the file cannot be decoded or processed. In that case
        an error message is printed instead of raising.
    """
    file_name = os.path.basename(file_path).split('.')[0]
    # JSON text is UTF-8; being explicit avoids mis-decoding non-ASCII
    # message content on platforms whose default encoding is not UTF-8.
    with open(file_path, 'r', encoding='utf-8') as file:
        try:
            messages = json.load(file)
            records = [
                {
                    'channel': file_name,
                    'author_username': item['author']['username'],
                    'timestamp': item['timestamp'],
                    'content': item['content'],
                    'embeds': item['embeds'],
                }
                for item in messages
            ]
            return pd.DataFrame(records)
        except json.JSONDecodeError:
            print(f"Error decoding JSON from {file_path}")
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
    # Reached only after an error was reported above.
    return None

# Approximate token count of a string
def get_token_len(text):
    """Return how many pieces `text` yields when split on single spaces.

    Splitting is on the literal ' ' character, so consecutive spaces produce
    empty pieces that are counted too, and an empty string counts as 1.
    """
    pieces = text.split(' ')
    return len(pieces)

# Messages without an embed must have at least this many space-separated
# tokens to be kept.
MIN_TOKENS = 5

try:
    # Parse every file exactly once. Files that failed to load come back as
    # None from process_file and are skipped.
    frames = [frame for frame in map(process_file, glob.glob(path)) if frame is not None]
    df = pd.concat(frames, ignore_index=True)

    # Normalise timestamps to 'YYYY-MM-DD HH:MM:SS' strings; values that
    # cannot be parsed are coerced to missing rather than raising.
    df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce').dt.strftime('%Y-%m-%d %H:%M:%S')
    df['num_tokens'] = df.content.apply(get_token_len)
    # A message "has an embedding" when its embeds value is anything other
    # than an empty list.
    df['has_embedding'] = df.embeds.apply(lambda embeds: embeds != [])

    # Remove redundant short messages by token length, but always keep
    # messages that carry an embed (e.g. a bare link).
    df = df[df['has_embedding'] | (df['num_tokens'] >= MIN_TOKENS)]

except Exception as e:
    print(f"Error: {e}")

In [3]:
# Number of messages kept per channel after the short-message filter
df.channel.value_counts()

channel
general                 5106
ideas-and-feedback      1077
created-with-autogen     443
forum-discussion         324
dev-contributors         295
issues-and-help          255
announcements             38
Name: count, dtype: int64

In [10]:
# Display the preprocessed message DataFrame
df

Unnamed: 0,channel,author_username,timestamp,content,embeds,num_tokens,has_embedding
0,issues-and-help,Lega,2023-11-15 10:41:54,im novice w programming so i may not have expl...,[],33,False
1,issues-and-help,razahin,2023-11-14 19:49:12,"Hi @.beibinli, thank you very much for your of...",[],75,False
2,issues-and-help,sonichi,2023-11-14 19:01:34,https://microsoft.github.io/autogen/docs/Insta...,"[{'type': 'article', 'url': 'https://microsoft...",1,True
3,issues-and-help,ariel.andres,2023-11-14 18:11:25,"Hi, I am trying to run the following example c...","[{'type': 'article', 'url': 'https://github.co...",148,True
4,issues-and-help,aaronward_,2023-11-14 15:50:34,"It was my fault, i didn't format the tool conf...",[],18,False
...,...,...,...,...,...,...,...
9225,ideas-and-feedback,qingyunwu,2023-09-26 19:17:31,Thanks for the question. llama2 and most open-...,"[{'type': 'article', 'url': 'https://microsoft...",45,True
9226,ideas-and-feedback,.geoffreya,2023-09-26 17:10:34,Integration with many LLMs seems important to ...,[],19,False
9227,ideas-and-feedback,.geoffreya,2023-09-26 17:08:55,Semantic Kernel is the other LLM lib that Msft...,[],60,False
9228,ideas-and-feedback,snoq_,2023-09-26 12:39:09,Will I be able to use any model like llama2 or...,[],16,False


In [48]:
# write_cols = ['channel', 'author_username', 'timestamp', 'content', 'embeds', has_embedding]

# Write one natural-language passage per message to a plain-text file,
# with passages separated by a blank line.
os.makedirs('./docs', exist_ok=True)  # the output directory may not exist yet
with open('./docs/15112023_chat_history.txt', 'w', encoding='utf-8') as file:
    for index, row in df.iterrows():

        # Describe the first embed (if any) so the passage carries the
        # linked page's title/description alongside the raw message.
        additional_context = ""
        if row.has_embedding:
            first_embed = row.embeds[0] if row.embeds else None
            # Embeds are dicts in the exported JSON; anything else is ignored.
            if isinstance(first_embed, dict):
                details = []
                # Not every embed carries a title or description, so each
                # line is only emitted when the field is present.
                if first_embed.get('title'):
                    details.append(f"- Link title: {first_embed['title']}")
                if first_embed.get('description'):
                    details.append(f"- Link description: {first_embed['description']}")
                if details:
                    additional_context = "\n".join(
                        ["Additional information about the content linked by this user:"] + details
                    )

        # strip() drops the trailing newline when there is no extra context.
        formatted_text = (
            f"In {row.channel}, at {row.timestamp} a user named {row.author_username} "
            f"said ```{row.content}```.\n{additional_context}"
        ).strip()

        file.write(formatted_text + '\n\n')


In [None]:
# TODO(review): leftover scratch notes -- "ragprox" and "_get_context".
# Neither name is defined in the visible notebook, so evaluating them as bare
# expressions would raise NameError on Restart & Run All; kept as a comment.