### Preprocess Discord JSON Data

This notebook preprocesses the messages extracted from the AutoGen Discord. It filters and formats the data so that it can be written out as text and stored in a vector store for retrieval-augmented generation (RAG).

In [1]:
import glob
import json
import os
import sys
import time

import pandas as pd

sys.path.append("../")

from utils import api_utils

# API key is read from the environment; it is None when the variable is unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Glob pattern matching the Discord message JSON exports
path = '../data/chat_logs/*.json'

In [2]:
# Load a single channel export (one JSON file) into a DataFrame
def process_file(file_path):
    """Read one Discord channel export and return its messages as a DataFrame.

    The channel name is taken from the file name (everything before the first
    dot). Each message in the JSON array becomes one row with the columns
    ``channel``, ``author_username``, ``timestamp``, ``content`` and ``embeds``.

    Args:
        file_path: Path to a JSON file holding a list of message objects.

    Returns:
        A ``pandas.DataFrame`` with one row per message, or ``None`` when the
        file cannot be decoded or a message lacks an expected key. In that
        case the problem is printed rather than raised.
    """
    file_name = os.path.basename(file_path).split('.')[0]
    # JSON is UTF-8 by specification; relying on the platform default encoding
    # breaks on emoji / accented characters wherever the default is not UTF-8.
    with open(file_path, 'r', encoding='utf-8') as file:
        try:
            messages = json.load(file)
            return pd.DataFrame([{
                'channel': file_name,
                'author_username': item['author']['username'],
                'timestamp': item['timestamp'],
                'content': item['content'],
                'embeds': item['embeds']
            } for item in messages])
        except json.JSONDecodeError:
            print(f"Error decoding JSON from {file_path}")
        except Exception as e:
            # e.g. a message without 'author' / 'embeds', or undecodable bytes
            print(f"Error processing {file_path}: {e}")
    # Reached only after a reported failure; callers filter these out.
    return None

# Rough length of a message, measured in space-delimited pieces
def get_token_len(text):
    """Return how many pieces ``text`` splits into on single spaces.

    This is a word-style count, not a model tokenizer: the string is split on
    the literal ``' '`` character only, so consecutive spaces produce empty
    pieces that are counted, and an empty string yields 1.
    """
    pieces = text.split(' ')
    return len(pieces)

try:
    # Parse every export exactly once. process_file returns None for files it
    # could not read, and those are skipped here.
    df = pd.concat(
        (frame for frame in map(process_file, glob.glob(path)) if frame is not None),
        ignore_index=True,
    )
    # Normalise timestamps to 'YYYY-MM-DD HH:MM:SS' strings; unparseable values are coerced to missing.
    df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce').dt.strftime('%Y-%m-%d %H:%M:%S')
    df['num_tokens'] = df['content'].apply(get_token_len)
    # A message "has an embedding" when its embeds value is anything other than an empty list.
    df['has_embedding'] = df['embeds'].apply(lambda embeds: embeds != [])
    # Remove redundant short messages by token length: keep a message when it
    # carries an embed or has at least 5 space-delimited pieces.
    df = df[df['has_embedding'] | (df['num_tokens'] >= 5)]

except Exception as e:
    print(f"Error: {e}")
    # Every later cell needs `df`; continuing would either fail with a
    # NameError or silently reuse a stale `df` from a previous run.
    raise

In [3]:
# Number of retained messages per channel
df['channel'].value_counts()

channel
general                 5106
ideas-and-feedback      1077
created-with-autogen     443
forum-discussion         324
dev-contributors         295
issues-and-help          255
announcements             38
Name: count, dtype: int64

In [4]:
# Preview the retained messages ordered by channel, then chronologically
df.sort_values(by=['channel', 'timestamp'])

Unnamed: 0,channel,author_username,timestamp,content,embeds,num_tokens,has_embedding
6669,announcements,sonichi,2023-09-21 02:57:45,*v0.1.1* is released to pypi. Make RetrieveAss...,[],8,False
6668,announcements,ToYari,2023-09-23 19:29:51,Hello everyone very interesting paper,[],5,False
6666,announcements,tonykipkemboi.,2023-09-26 11:50:39,Hi y'all! Excited to try this!,[],6,False
6661,announcements,jharleydev,2023-09-27 12:02:13,"Hi, very cool paper can't wait to try this out !",[],11,False
6660,announcements,.mcalpha,2023-09-27 13:37:23,Weird that this paper hasn't seen more attenti...,[],9,False
...,...,...,...,...,...,...,...
4,issues-and-help,aaronward_,2023-11-14 15:50:34,"It was my fault, i didn't format the tool conf...",[],18,False
3,issues-and-help,ariel.andres,2023-11-14 18:11:25,"Hi, I am trying to run the following example c...","[{'type': 'article', 'url': 'https://github.co...",148,True
2,issues-and-help,sonichi,2023-11-14 19:01:34,https://microsoft.github.io/autogen/docs/Insta...,"[{'type': 'article', 'url': 'https://microsoft...",1,True
1,issues-and-help,razahin,2023-11-14 19:49:12,"Hi @.beibinli, thank you very much for your of...",[],75,False


In [5]:
# Columns read from `df` below: channel, author_username, timestamp, content, embeds, has_embedding
output_file = "../data/docs/22112023_chat_history.txt"


def _describe_first_embed(embeds):
    """Return a short text description of the first embed, or "" if there is none.

    Embeds are expected to be a list of dicts, as loaded from the JSON export.
    NOTE(review): assumes each embed dict may carry optional 'title' and
    'description' keys - confirm against the export format. Missing or empty
    fields are left out instead of raising.
    """
    if not (isinstance(embeds, list) and embeds and isinstance(embeds[0], dict)):
        return ""
    first_embed = embeds[0]
    details = []
    if first_embed.get('title'):
        details.append(f"- Link title: {first_embed['title']}")
    if first_embed.get('description'):
        details.append(f"- Link description: {first_embed['description']}")
    if not details:
        return ""
    return "\n".join(["Additional information about the content linked by this user:"] + details)


with open(output_file, 'w', encoding='utf-8') as file:
    for index, row in df.iterrows():
        # Link context is attached only to messages flagged as having an embed.
        additional_context = _describe_first_embed(row.embeds) if row.has_embedding else ""

        formatted_text = (
            f"In {row.channel}, at {row.timestamp} a user named {row.author_username} "
            f"said ```{row.content}```."
        )
        if additional_context:
            formatted_text = f"{formatted_text}\n{additional_context}"
        file.write(formatted_text.strip() + '\n')
        


---

In [7]:
# Load the formatted chat history produced by the previous step as one string
chat_history_file = "../data/docs/22112023_chat_history.txt"
# Decode explicitly as UTF-8 instead of depending on the platform default,
# since chat text routinely contains emoji and other non-ASCII characters.
with open(chat_history_file, 'r', encoding='utf-8') as file:
    chat_history = file.read()

In [None]:
# Destination for the responses generated from the chat history
output_file = "../data/docs/22112023_qa.txt"
# Number of whitespace-separated words sent to the API per request
CHUNK_SIZE = 3000
# Seconds to wait between consecutive API requests
RATE_LIMIT_DELAY = 20

def get_chunks(text, chunk_size):
    """Lazily yield ``text`` in pieces of at most ``chunk_size`` words.

    The text is split on any whitespace, and each yielded chunk is the words
    of one window re-joined with single spaces. The final chunk may be
    shorter; an empty or whitespace-only text yields nothing.
    """
    tokens = text.split()
    total = len(tokens)
    yield from (
        ' '.join(tokens[start:start + chunk_size])
        for start in range(0, total, chunk_size)
    )

# Send the chat history to the API chunk by chunk and store each response
with open(output_file, 'w', encoding='utf-8') as file:
    for chunk in get_chunks(chat_history, CHUNK_SIZE):
        try:
            # Send chunk to API and get response
            # NOTE(review): api_utils.prompt is presumably returning the reply as a str - confirm.
            prompt_response = api_utils.prompt(context=chunk)
            print(prompt_response)

            # Write the response to the file, one response per entry
            file.write(prompt_response.strip() + '\n')

        except Exception as e:
            # Best effort: a failed chunk is reported and skipped.
            print(f"Error processing chunk: {e}")

        # Wait before sending the next request. This runs after failures too,
        # so a rejected request is not immediately followed by another one.
        time.sleep(RATE_LIMIT_DELAY)

print("Processing complete.")