In [1]:
import csv
import concurrent.futures
from typing import List

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

from langchain_groq import ChatGroq
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI

from dotenv import load_dotenv
import os

load_dotenv()

# Set the LANGCHAIN_TRACING_V2 environment variable to 'true'
os.environ['LANGCHAIN_TRACING_V2'] = 'true'

# Set the LANGCHAIN_PROJECT environment variable to the desired project name
os.environ['LANGCHAIN_PROJECT'] = 'Conversation30DayProject'


class Message(BaseModel):
    """
    A model for representing a single message within a message log.
    Inlcudes meta data like the user_name, message_type, and a message_id which acts not only as a unique identifier but conversation sequence identifier.
    """
    user_name: str = Field(..., description="User name who submitted the message")
    message: str = Field(..., description="Full message that the user sent, most likely a question, answer, or comment.")
    message_type: str = Field(...,description="Category of the type of interest the message provokes. Question, Answer, Comment, Spam")
    message_id: int = Field(...,description="7 digit number, first 5 is the conversations unique identifier and the last 2 is the message sequence number")


class Conversation(BaseModel):
    """The full message log to review the message history in sequential order."""
    message_history: List[Message]


def create_conv(input_prompt: str):
    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                """You are an expert at generating synthetic coversations. Your task is to create a range of synthetic conversations that comes from the "Break Into Data" Discord Server.
                    The discord servers's purpose is to help people get data related skills and roles, including data analytics, data science, machine learning, data engineering and ai engineering.
                    The Server has a wide range of skillsets ranging from beginners who just started their data journey to proficient and experienced members. 
                    
                    The variety of backgrounds, experience, and interests helps to invoke interesting conversations throughout the discord server. The primary channels is a general, job search support, content creation, share your project, and resources.
                    - The channel is small enough that there are only 2-5 users active in a conversation at the same time before another topic get picked up on the channel. 
                    - Each conversation should be focused around a single general subject but can include tangents that it runs down before coming to a conclusion.
                    - While each conversation most likely focuses on question and answer conversations it can also include comments that people share about their expierence or even help to build/rephrase the question.
                    - Conversations can also include spam, while this does occure it is less than 15 percent of the messages.
                    - Conversations typically range between 3-15 messages
                    
                    Example of topics include:
                    resume help, interview preparation questions, hackathons, networking events, articles, news, youtube videos,
                    coding cookbooks, cheatsheets, Linear/logistical Regression, classification, clustering, neural networks, random forests, 
                """,
            ),
            ("human", "{text}"),
        ]
    )
    
    # llm = ChatGroq(model_name="llama3-70b-8192")
    llm = ChatAnthropic(api_key=os.getenv("ANTHROPIC_API_KEY") , model_name="claude-3-haiku-20240307")
    # llm = ChatGoogleGenerativeAI(model_name="gemini-1.5-pro")

    extractor = prompt | llm.with_structured_output(
        schema=Conversation,
        method="function_calling",
        include_raw=False,
    )
    
    return extractor.invoke(input_prompt)


def generate_conversation():
    return create_conv("Generate and save the created conversations.")


def write_conversation_to_csv(conversation, writer):
    for message in conversation.message_history:
        writer.writerow([message.user_name, message.message, message.message_type, message.message_id])


def main(conv_filepath, num_conversations=3):
    file_exists = os.path.exists(conv_filepath)

    with open(conv_filepath, mode='a', newline='') as file:
        writer = csv.writer(file)

        if not file_exists:
            writer.writerow(['user_name', 'message', 'message_type', 'message_id'])

        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = []
            for _ in range(num_conversations):
                future = executor.submit(generate_conversation)
                futures.append(future)

            for future in concurrent.futures.as_completed(futures):
                try:
                    conversation = future.result()
                    write_conversation_to_csv(conversation, writer)
                except Exception as e:
                    print(f"Error occurred while generating conversation: {str(e)}")
                    continue

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
conv_filepath = './data/conversations.csv'
for _ in range(40):
    main(conv_filepath, 2)
    # time.sleep(6) # Groq has a rate limit of 3k tokens per minute so need to slow it down sometimes

Error occurred while generating conversation: 1 validation error for Conversation
message_history
  value is not a valid list (type=type_error.list)
Error occurred while generating conversation: 1 validation error for Conversation
message_history
  value is not a valid list (type=type_error.list)
Error occurred while generating conversation: 1 validation error for Conversation
message_history
  field required (type=value_error.missing)


In [4]:
import pandas as pd

df = pd.read_csv(conv_filepath)

# get the value count of each message type
df['message_type'].value_counts()

message_type
Answer          605
Question        547
Comment         261
question         20
answer           19
comment           8
Response          5
Advice            3
Spam              2
Resource          1
Suggestion        1
Appreciation      1
Name: count, dtype: int64

In [6]:
df.head()

Unnamed: 0,user_name,message,message_type,message_id
0,DataNewbie,"Hey guys, I'm having trouble with my linear re...",Question,12345
1,DataWizard,What's your data look like? Are you using any ...,Comment,12346
2,DataNewbie,I'm using a dataset with 1000 samples and 10 f...,Answer,12347
3,DataWizard,Have you checked for multicollinearity? Maybe ...,Comment,12348
4,DataNewbie,"Yeah, I did check for multicollinearity and re...",Answer,12349
