In [None]:
!pip install together

In [None]:
import pandas as pd
import numpy as np
from together import Together


In [None]:
import os

# Never commit credentials to a notebook: read the Together AI key from the
# environment instead. Create a key at https://api.together.ai/settings/api-keys
# and export it as TOGETHER_API_KEY before starting the kernel.
api_key = os.environ.get("TOGETHER_API_KEY", "")

# Chat model used by the cells below.
model = "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"


## Current Working workflow

In [None]:
import json
import logging
from dataclasses import dataclass, field
from datetime import datetime
from typing import Dict, List, Optional, Tuple

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Topic hierarchy: main topic -> list of sub-topics. A topic label is written
# as "MainTopic-SubTopic", so neither part may itself contain a hyphen.
TOPIC_HIERARCHIES = {
    "Knowledge": [
        "Academic", "Research", "History", "Philosophy", "Science",
        "Mathematics", "Literature", "Languages"
    ],
    "Technology": [
        "AI/ML", "Software", "Hardware", "Cybersecurity", "Cloud",
        "Blockchain", "IoT", "Data Science", "Programming Languages", "Mobile"
    ],
    "Current Affairs": [
        "Politics", "Economics", "Social Issues", "Environment",
        "International Relations", "Law", "Education"
    ],
    "Lifestyle": [
        "Health", "Fitness", "Food", "Travel", "Fashion",
        "Personal Finance", "Self Improvement"
    ],
    "Entertainment": [
        "Movies", "TV Shows", "Music", "Gaming", "Sports",
        "Art", "Books", "Pop Culture"
    ],
    "Professional": [
        "Career", "Business", "Management", "Marketing",
        "Entrepreneurship", "Industry Specific"
    ],
    "General": [
        "Casual Chat", "Greetings", "Personal", "Questions",
        "Recommendations", "Feedback", "Statement"
    ]
}

# Few-shot examples for better query understanding.
# Every example carries "query", "expanded" and "topic": the prompt builder
# reads all three keys, so a missing "topic" would raise KeyError there.
FEW_SHOT_EXAMPLES = [
    {
        "query": "who is pm of india",
        "expanded": "Who is the pm of india?",
        "topic": "Current Affairs-Politics"
    },
    {
        "query": "what are his duties",
        "expanded": "what are duties of narendra modi?",
        "topic": "Current Affairs-Politics"
    },
    {
        "query": "Give me minute, I am coming back",
        "expanded": "Give me minute, I am coming back",
        "topic": "General-Statement"
    },
    {
        "query": "what about uk",
        "expanded": "who is pm of uk and what are his duties? ",
        "topic": "Current Affairs-Politics"
    }
]

@dataclass
class Message:
    """One turn of the conversation, optionally annotated with query metadata.

    `expanded_query`, `topic` and `confidence_score` are only filled in for
    user turns that went through query expansion; they stay None otherwise.
    """

    role: str
    content: str
    expanded_query: Optional[str] = None
    topic: Optional[str] = None
    # default_factory stamps each message when it is created. A plain
    # `datetime.now()` default is evaluated once, at class-definition time,
    # and every message would then share that single timestamp.
    timestamp: datetime = field(default_factory=datetime.now)
    confidence_score: Optional[float] = None

class QueryProcessor:
    """Expand terse user queries using conversation context and tag them with a topic.

    The processor keeps two histories: `conversation_history` (every user and
    assistant `Message`) and `query_history` (the last 20 processed queries as
    `(query, expanded_query, topic, confidence_score)` tuples). Both are fed
    back into the expansion prompt so follow-up questions can be resolved.
    """

    DEFAULT_MODEL = "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"
    FALLBACK_TOPIC = "General-Casual Chat"

    def __init__(self, api_key: str, model: str = DEFAULT_MODEL):
        """Initialize QueryProcessor with direct API key.

        Args:
            api_key: Together AI API key used to build the client.
            model: Chat model used for query expansion.
        """
        self.client = Together(api_key=api_key)
        self.model = model
        self.conversation_history: List[Message] = []
        self.query_history: List[Tuple[str, str, str, float]] = []  # (query, expanded_query, topic, confidence_score)
        self.error_count = 0
        self.max_retries = 3

    @staticmethod
    def _format_example(example: dict) -> str:
        """Render one few-shot example; the topic line is omitted when absent."""
        lines = [f"Query: {example['query']}", f"Expanded: {example['expanded']}"]
        topic = example.get("topic")
        if topic:
            lines.append(f"Topic: {topic}")
        return "\n".join(lines) + "\n"

    @staticmethod
    def _parse_json_reply(raw: str) -> dict:
        """Parse the model reply as a JSON object.

        Models often wrap the object in Markdown fences or a sentence of prose,
        so when the reply is not valid JSON as a whole, the text between the
        first "{" and the last "}" is tried instead.

        Raises:
            ValueError: if no JSON object can be extracted.
        """
        text = (raw or "").strip()
        try:
            parsed = json.loads(text)
        except ValueError:
            start, end = text.find("{"), text.rfind("}")
            if start == -1 or end <= start:
                raise
            parsed = json.loads(text[start:end + 1])
        if not isinstance(parsed, dict):
            raise ValueError("model reply is not a JSON object")
        return parsed

    def create_expansion_prompt(self, query: str) -> str:
        """Create a prompt with few-shot examples and conversation context for better query expansion."""
        # Last three conversation turns; assistant turns carry no topic.
        context_str = "\n".join(
            f"{msg.role}: {msg.content}"
            + (f" (Topic: {msg.topic})" if msg.topic else "")
            for msg in self.conversation_history[-3:]
        )

        # Last five processed queries with their expansions.
        query_history_str = "\n".join(
            f"Query: {q}\nExpanded: {eq}\nTopic: {t} (Confidence: {c:.2f})"
            for q, eq, t, c in self.query_history[-5:]
        )

        examples_str = "\n".join(
            self._format_example(ex) for ex in FEW_SHOT_EXAMPLES
        )

        return f"""Given the conversation context, recent query history, and current query, expand the query to be more complete and identify its topic.
Use these few-shot examples as a guide:

{examples_str}

Available topic hierarchies (format as MainTopic-SubTopic):
{json.dumps(TOPIC_HIERARCHIES, indent=2)}

Recent conversation context:
{context_str}

Recent query history:
{query_history_str}

Current query: "{query}"

Return ONLY a JSON object in this format:
{{
    "expanded_query": "complete expanded query",
    "topic": "MainTopic-SubTopic",
    "confidence_score": 0.0 to 1.0
}}"""

    def process_query(self, query: str) -> dict:
        """Process a query with error handling, retries, and context awareness.

        Returns:
            A dict with "expanded_query", "topic" and "confidence_score". When
            every attempt fails, the original query is returned unexpanded
            with the fallback topic and a confidence of 0.3.
        """
        for attempt in range(self.max_retries):
            try:
                prompt = self.create_expansion_prompt(query)
                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.2,
                    max_tokens=2000
                )

                result = self._parse_json_reply(response.choices[0].message.content)
                expanded_query = result["expanded_query"]
                # Coerce to float now: the value is later formatted with
                # ":.2f" in the prompt, which would fail on a string.
                confidence = float(result.get("confidence_score", 1.0))
                topic = result.get("topic")

                # Validate topic and provide fallback
                if not self.validate_topic(topic):
                    logger.warning("Invalid topic format: %s", topic)
                    if confidence < 0.75:
                        # Low confidence: derive a topic from the expanded query.
                        topic = self.get_fallback_topic(expanded_query)
                        confidence = 0.5
                    else:
                        # The model was confident but the label is unusable,
                        # so use the generic topic and cap the confidence.
                        topic = self.FALLBACK_TOPIC
                        confidence = 0.75

                result["topic"] = topic
                result["confidence_score"] = confidence

                # Add to conversation and query history
                self.conversation_history.append(Message(
                    role="user",
                    content=query,
                    expanded_query=expanded_query,
                    topic=topic,
                    confidence_score=confidence
                ))
                self.query_history.append((query, expanded_query, topic, confidence))

                # Maintaining a maximum of 20 query history items
                self.query_history = self.query_history[-20:]

                self.error_count = 0  # Reset error count on success
                return result

            except Exception as e:
                logger.error("Error processing query (attempt %d): %s", attempt + 1, e)
                self.error_count += 1

        # All attempts failed (or max_retries is 0): never return None.
        return {
            "expanded_query": query,
            "topic": self.FALLBACK_TOPIC,
            "confidence_score": 0.3
        }

    def get_fallback_topic(self, expanded_query: str) -> str:
        """Get a fallback topic based on the expanded query.

        Currently a placeholder: the query is ignored and the generic topic
        is always returned.
        """
        return self.FALLBACK_TOPIC

    def get_bot_response(self, expanded_query: str) -> str:
        """Get bot response with context awareness and error handling.

        Returns the assistant reply, or an apology string when generation
        fails; failed replies are not added to the conversation history.
        """
        try:
            # Create context from recent conversation with topics
            context_lines = []
            for msg in self.conversation_history[-3:]:
                speaker = 'User' if msg.role == 'user' else 'Assistant'
                topic_note = f' (Topic: {msg.topic})' if msg.topic else ''
                context_lines.append(f"{speaker}: {msg.content}{topic_note}")
            context = "\n".join(context_lines)

            prompt = f"""Previous conversation:
{context}

User question: {expanded_query}

Provide a helpful, accurate, and concise response while maintaining context awareness."""

            # Use the llama_run function to get the AI output
            bot_response = llama_run(prompt, context)
            if not bot_response:
                # llama_run reports its own failures by returning None; treat
                # that as an error instead of storing None as a reply.
                raise RuntimeError("llama_run returned no content")

            # Add to conversation history
            self.conversation_history.append(Message(
                role="assistant",
                content=bot_response
            ))

            return bot_response

        except Exception as e:
            logger.error("Error getting bot response: %s", e)
            self.error_count += 1
            if self.error_count > self.max_retries:
                return "I'm experiencing technical difficulties. Please try again later."
            return "I apologize, but I'm having trouble generating a response right now."

    def validate_topic(self, topic: str) -> bool:
        """Validate if the topic follows the correct hierarchy.

        A valid topic is a string "MainTopic-SubTopic" whose parts both exist
        in TOPIC_HIERARCHIES; anything else (including None) is invalid.
        """
        if not isinstance(topic, str):
            return False
        main_topic, separator, sub_topic = topic.partition('-')
        return bool(separator) and sub_topic in TOPIC_HIERARCHIES.get(main_topic, ())

def llama_run(question, context):
    """Ask the chat model `question` in the given `context` and return its reply.

    The Together AI key is read from the TOGETHER_API_KEY environment
    variable and the model name from the module-level `model`.

    Returns:
        The full reply text assembled from the streamed chunks, or None when
        the request fails (the error is printed).
    """
    import os

    formatted_prompt = f"What is {question} in this context: {context}"
    try:
        # Build the client here: this function has no client of its own in scope.
        client = Together(api_key=os.environ.get("TOGETHER_API_KEY"))
        stream = client.chat.completions.create(
            model=model,
            temperature=.2,
            max_tokens=4096,
            top_p=1,
            top_k=40,
            messages=[{"role": "user", "content": formatted_prompt}],
            stream=True,
        )
        pieces = []
        for chunk in stream:
            # The text delta lives under choices[0]; skip chunks without choices.
            if not chunk.choices:
                continue
            pieces.append(chunk.choices[0].delta.content or "")
        return "".join(pieces)
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

def main():
    """Run an interactive chat loop: expand each query, print its topic, then answer.

    The API key is read from the TOGETHER_API_KEY environment variable. Type
    "exit" or "quit" (or send EOF / Ctrl-C) to leave the loop.

    Raises:
        RuntimeError: if TOGETHER_API_KEY is not set.
    """
    import os

    api_key = os.environ.get("TOGETHER_API_KEY")
    if not api_key:
        raise RuntimeError("Set the TOGETHER_API_KEY environment variable before running.")
    processor = QueryProcessor(api_key)

    # Main loop
    while True:
        # Getting user input; EOF or Ctrl-C ends the session cleanly.
        try:
            user_input = input("User: ")
        except (EOFError, KeyboardInterrupt):
            print()
            break

        command = user_input.strip().lower()
        if command in {"exit", "quit"}:
            break
        if not command:
            # Nothing to expand; don't spend an API call on an empty line.
            continue

        # Processing the query
        result = processor.process_query(user_input)
        expanded_query = result["expanded_query"]
        topic = result["topic"]
        confidence_score = result.get("confidence_score", 1.0)

        print(f"Expanded Query: {expanded_query}")
        print(f"Topic: {topic} (Confidence: {confidence_score:.2f})")

        # Getting the bot response
        bot_response = processor.get_bot_response(expanded_query)
        print(f"Assistant: {bot_response}")
        print("---")


if __name__ == "__main__":
    main()

User: hello how are you?
Expanded Query: Hello, how are you?
Topic: General-Greetings (Confidence: 0.90)
An error occurred: name 'client' is not defined
Assistant: None
---
User: tell me about nuclear power plants
Expanded Query: Tell me about nuclear power plants and how they work
Topic: Knowledge-Science (Confidence: 0.85)
An error occurred: name 'client' is not defined
Assistant: None
---
User: what was i asking about?
Expanded Query: What was I asking about before, regarding nuclear power plants?
Topic: General-Questions (Confidence: 0.95)
An error occurred: name 'client' is not defined
Assistant: None
---
User: how do they work?
Expanded Query: How do nuclear power plants work?
Topic: Knowledge-Science (Confidence: 0.95)
An error occurred: name 'client' is not defined
Assistant: None
---


KeyboardInterrupt: Interrupted by user

# Dataset Generation Using Llama 3.1 70B for Chat Simulation




In [None]:
import json
import random
import pandas as pd
from tqdm import tqdm

# Topic hierarchy: main topic -> list of sub-topics. Message topics are
# written as "MainTopic-SubTopic".
TOPIC_HIERARCHIES = {
    "Politics": ["Government", "Elections", "International", "Local", "Policy"],
    "Technology": ["AI", "Software", "Hardware", "Internet", "Innovation"],
    "Business": ["Finance", "Companies", "Economy", "Markets", "Startups"],
    "Science": ["Physics", "Biology", "Chemistry", "Space", "Environment"],
    "Entertainment": ["Movies", "Music", "Gaming", "Television", "Arts"],
    "Sports": ["Worldcup", "Football", "Cricket", "Basketball", "Tennis", "Athletics"],
    "Health": ["Medical", "Fitness", "Nutrition", "Mental", "Wellness"],
    "Education": ["Academic", "Skills", "Training", "Research", "Learning"],
    "Lifestyle": ["Food", "Travel", "Fashion", "Home", "Relationships"],
    "General": ["Casual", "Greetings", "Small_Talk", "Personal", "Miscellaneous"]
}

# Example conversation template for the prompt.
# Every "topic" below must exist in TOPIC_HIERARCHIES above, otherwise the
# example teaches the model labels that are not part of the hierarchy.
EXAMPLE_CONVERSATION = """{
  "conversation_id": "conv_001",
  "messages": [
    {
      "role": "user",
      "content": "who is pm of india",
      "expanded_query": "who is the prime minister of india",
      "topic": "Politics-Government",
      "requires_context": false
    },
    {
      "role": "assistant",
      "content": "The Prime Minister of India is Narendra Modi."
    },
    {
      "role": "user",
      "content": "what are his main achievements",
      "expanded_query": "what are Narendra Modi's main achievements",
      "topic": "Politics-Government",
      "requires_context": true
    },
    {
      "role": "assistant",
      "content": "Some of Narendra Modi's main achievements include implementing GST, Digital India initiative, and various economic reforms."
    },
    {
      "role": "user",
      "content": "gtg, will be back in 5",
      "expanded_query": "I need to go, will be back in 5 minutes",
      "topic": "General-Casual",
      "requires_context": false
    },
    {
      "role": "assistant",
      "content": "Sure, take your time. I'll be here when you return."
    }
  ]
}"""

# Generation prompt for the Llama model
def create_generation_prompt(topic_focus: str = None) -> str:
    """Build the prompt asking the model for one new example conversation.

    Args:
        topic_focus: Main topic the conversation should centre on; when
            omitted or empty the model may pick any available topic.

    Returns:
        The prompt text, containing the topic hierarchy and the example
        conversation as the format to follow.
    """
    focus = topic_focus if topic_focus else 'any topic from the available topics'
    # The hierarchy is spelled out because the prompt refers to "available
    # topics"; the closing instruction keeps the reply parseable as JSON.
    return f"""Generate a new conversation similar to the example below. The conversation should:
1. Include natural topic transitions
2. Use context-dependent queries
3. Sometimes include general conversation elements
4. Focus on {focus}

Available topics (label each user message as MainTopic-SubTopic):
{json.dumps(TOPIC_HIERARCHIES, indent=2)}

Example format:
{EXAMPLE_CONVERSATION}

Generate a new, unique conversation following this format but with different content and topics.
Return ONLY the JSON object, with no commentary and no Markdown code fences."""

class DatasetGenerator:
    """Generate synthetic conversations with a Together AI chat model and save them.

    Conversations produced by `generate_dataset` are kept in
    `generated_conversations` and written out by `save_dataset`.
    """

    def __init__(self, api_key: str, model: str, temp: float):
        """Store the API key, model name and sampling temperature."""
        self.api_key = api_key
        self.model = model
        self.temp = temp
        self.generated_conversations = []

    @staticmethod
    def _parse_conversation(raw: str) -> dict:
        """Parse the model reply as a JSON object.

        When the whole reply is not valid JSON (for example because it is
        wrapped in Markdown fences or prose), the text between the first "{"
        and the last "}" is tried instead.

        Raises:
            ValueError: if no JSON object can be extracted.
        """
        text = (raw or "").strip()
        try:
            parsed = json.loads(text)
        except ValueError:
            start, end = text.find("{"), text.rfind("}")
            if start == -1 or end <= start:
                raise
            parsed = json.loads(text[start:end + 1])
        if not isinstance(parsed, dict):
            raise ValueError("model reply is not a JSON object")
        return parsed

    def llama_run(self, topic_focus: str = None) -> "dict | None":
        """Request one conversation from the model.

        Args:
            topic_focus: Main topic passed to the generation prompt.

        Returns:
            The parsed conversation dict, or None when the request or the
            JSON parsing fails (the error is printed).
        """
        # prompt for a specific topic focus
        formatted_prompt = create_generation_prompt(topic_focus)

        client = Together(api_key=self.api_key)
        try:
            stream = client.chat.completions.create(
                model=self.model,
                temperature=self.temp,
                max_tokens=4096,
                top_p=1,
                top_k=40,
                messages=[{"role": "user", "content": formatted_prompt}],
                stream=True,
            )

            # Skip chunks that carry no choices before reading the delta.
            response = "".join(
                chunk.choices[0].delta.content or ""
                for chunk in stream
                if chunk.choices
            )
            return self._parse_conversation(response)

        except Exception as e:
            print(f"Error generating conversation: {e}")
            return None

    def generate_dataset(self, num_conversations: int) -> list:
        """Generate up to `num_conversations` conversations on random main topics.

        Failed generations are skipped, so the returned list may be shorter
        than requested. The result is also stored on the instance.
        """
        conversations = []
        main_topics = list(TOPIC_HIERARCHIES)

        for _ in tqdm(range(num_conversations)):
            topic_focus = random.choice(main_topics)
            print(topic_focus)
            conversation = self.llama_run(topic_focus)
            print(conversation)
            if conversation:
                conversations.append(conversation)

        self.generated_conversations = conversations
        return conversations

    def save_dataset(self, filename: str = "conversation_dataset.json"):
        """Write the conversations as JSON and the user turns as a sibling CSV.

        The CSV path is `filename` with its extension replaced by ".csv"; when
        that would equal the JSON path, ".csv" is appended instead so the two
        files never overwrite each other.
        """
        from pathlib import Path

        json_path = Path(filename)
        csv_path = json_path.with_suffix('.csv')
        if csv_path == json_path:
            csv_path = json_path.with_name(json_path.name + '.csv')

        # Save as JSON
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(self.generated_conversations, f, indent=2)

        # Also saving as CSV for easier inspection; only user turns carry
        # the query metadata. Missing fields fall back to empty defaults.
        flat_data = [
            {
                'conversation_id': conv.get('conversation_id', ''),
                'query': msg.get('content', ''),
                'expanded_query': msg.get('expanded_query', ''),
                'topic': msg.get('topic', ''),
                'requires_context': msg.get('requires_context', False)
            }
            for conv in self.generated_conversations
            for msg in conv.get('messages', [])
            if msg.get('role') == 'user'
        ]

        df = pd.DataFrame(flat_data)
        df.to_csv(csv_path, index=False)

# Usage
def main():
    """Generate 100 conversations and save them as JSON and CSV.

    The API key is read from the TOGETHER_API_KEY environment variable.

    Raises:
        RuntimeError: if TOGETHER_API_KEY is not set.
    """
    import os

    api_key = os.environ.get("TOGETHER_API_KEY")
    if not api_key:
        raise RuntimeError("Set the TOGETHER_API_KEY environment variable before running.")

    generator = DatasetGenerator(
        api_key=api_key,
        model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
        temp=0.2,
    )
    generator.generate_dataset(100)  # Generate 100 conversations

    # Saving the dataset
    generator.save_dataset()


if __name__ == "__main__":
    main()


In [None]:
import json
import random
import pandas as pd
from tqdm import tqdm
from collections import deque
from typing import List, Dict, Any

# Topic hierarchy for this cell: each main topic maps to its sub-topics.
# Message topics are written as "MainTopic-SubTopic".
TOPIC_HIERARCHIES = dict(
    Politics=["India", "UK", "USA", "China", "Global"],
    Sports=["Cricket", "Football", "Tennis", "Basketball", "Olympics"],
    Technology=["AI", "Mobile", "Gaming", "Internet", "Gadgets"],
    Entertainment=["Movies", "Music", "TV", "Celebrity", "Gaming"],
    Business=["Finance", "Startups", "Markets", "Economy", "Companies"],
    Science=["Space", "Medicine", "Environment", "Research", "Innovation"],
    General=["Greeting", "Casual", "Question", "Statement"],
)

# Example shown to the model: every user turn carries the earlier turns it
# depends on in "context_messages".
EXAMPLE_CONVERSATION = """{
  "conversation_id": "conv_001",
  "messages": [
    {
      "role": "user",
      "content": "who is pm of india",
      "expanded_query": "who is the prime minister of india",
      "topic": "Politics-India",
      "requires_context": false,
      "context_messages": []
    },
    {
      "role": "assistant",
      "content": "The Prime Minister of India is Narendra Modi."
    },
    {
      "role": "user",
      "content": "what are his main achievements",
      "expanded_query": "what are Narendra Modi's main achievements",
      "topic": "Politics-India",
      "requires_context": true,
      "context_messages": [
        {"role": "user", "content": "who is pm of india"},
        {"role": "assistant", "content": "The Prime Minister of India is Narendra Modi."}
      ]
    }
  ]
}"""

def create_generation_prompt(topic_focus: str = None) -> str:
    """Build the prompt asking the model for one new context-tracked conversation.

    Args:
        topic_focus: Main topic the conversation should centre on; when
            omitted or empty the model may pick any available topic.

    Returns:
        The prompt text, containing the topic hierarchy and the example
        conversation as the format to follow.
    """
    focus = topic_focus if topic_focus else 'any topic from the available topics'
    # The hierarchy is spelled out because the prompt refers to "available
    # topics"; the closing instruction keeps the reply parseable as JSON.
    return f"""Generate a new conversation similar to the example below. The conversation should:
1. Include natural topic transitions
2. Use context-dependent queries that reference previous messages
3. Sometimes include general conversation elements
4. Have at least 20 messages (10 exchanges between user and assistant)
5. Focus on {focus}

Available topics (label each user message as MainTopic-SubTopic):
{json.dumps(TOPIC_HIERARCHIES, indent=2)}

Example format:
{EXAMPLE_CONVERSATION}

Generate a new, unique conversation following this format but with different content and topics.
Make sure to maintain proper context tracking for each user message.
Return ONLY the JSON object, with no commentary and no Markdown code fences."""

class DatasetGenerator:
    """Generate synthetic conversations with context tracking and save them.

    Each generated conversation is post-processed so that every user message
    carries the preceding messages (up to 20) in `context_messages`.
    """

    # Referring words that suggest a message depends on earlier turns.
    _CONTEXT_WORDS = frozenset(
        ['this', 'that', 'it', 'they', 'he', 'she', 'their', 'these', 'those']
    )

    def __init__(self, api_key: str, model: str, temp: float):
        """Store the API key, model name and sampling temperature."""
        self.api_key = api_key
        self.model = model
        self.temp = temp
        self.generated_conversations = []

    def get_context_messages(self, messages: List[Dict[str, Any]], current_idx: int,
                           context_window: int = 20) -> List[Dict[str, Any]]:
        """
        Get the previous messages as context for the current message.

        Args:
            messages: List of all messages in the conversation
            current_idx: Index of the current message
            context_window: Number of previous messages to include as context

        Returns:
            List of context messages
        """
        context_start = max(0, current_idx - context_window)
        return messages[context_start:current_idx]

    def process_conversation(self, conversation: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process a generated conversation to add proper context tracking.

        The conversation is modified in place and returned: each user message
        gets `context_messages` (role/content of up to 20 earlier messages)
        and `requires_context`, which is True for every message after the
        first, or when the message contains a referring word such as "it".
        """
        import re

        messages = conversation['messages']
        context_queue = deque(maxlen=20)  # Keep track of last 20 messages

        for i, message in enumerate(messages):
            if message['role'] == 'user':
                # Add context tracking for user messages
                message['context_messages'] = list(context_queue)

                # Compare whole words: a substring test would find "he" in
                # "there" or "it" in "with" and flag unrelated messages.
                words = set(re.findall(r"[a-z]+", message['content'].lower()))
                message['requires_context'] = (
                    not words.isdisjoint(self._CONTEXT_WORDS) or i > 0
                )

            # Add message to context queue
            context_queue.append({
                'role': message['role'],
                'content': message['content']
            })

        return conversation

    @staticmethod
    def _parse_conversation(raw: str) -> dict:
        """Parse the model reply as a JSON object.

        When the whole reply is not valid JSON (for example because it is
        wrapped in Markdown fences or prose), the text between the first "{"
        and the last "}" is tried instead.

        Raises:
            ValueError: if no JSON object can be extracted.
        """
        text = (raw or "").strip()
        try:
            parsed = json.loads(text)
        except ValueError:
            start, end = text.find("{"), text.rfind("}")
            if start == -1 or end <= start:
                raise
            parsed = json.loads(text[start:end + 1])
        if not isinstance(parsed, dict):
            raise ValueError("model reply is not a JSON object")
        return parsed

    def llama_run(self, topic_focus: str = None) -> "dict | None":
        """Request one conversation from the model and add context tracking.

        Returns:
            The processed conversation dict, or None when the request, the
            JSON parsing or the post-processing fails (the error is printed).
        """
        formatted_prompt = create_generation_prompt(topic_focus)

        client = Together(api_key=self.api_key)
        try:
            stream = client.chat.completions.create(
                model=self.model,
                temperature=self.temp,
                max_tokens=4096,
                top_p=1,
                top_k=40,
                messages=[{"role": "user", "content": formatted_prompt}],
                stream=True,
            )

            # Skip chunks that carry no choices before reading the delta.
            response = "".join(
                chunk.choices[0].delta.content or ""
                for chunk in stream
                if chunk.choices
            )

            conversation = self._parse_conversation(response)
            return self.process_conversation(conversation)

        except Exception as e:
            print(f"Error generating conversation: {e}")
            return None

    def generate_dataset(self, num_conversations: int, min_messages: int = 10) -> list:
        """Generate up to `num_conversations` conversations on random main topics.

        Args:
            num_conversations: Number of generation attempts.
            min_messages: Conversations with fewer messages are discarded.

        Returns:
            The kept conversations (also stored on the instance); the list may
            be shorter than requested because failures and short
            conversations are skipped.
        """
        conversations = []
        main_topics = list(TOPIC_HIERARCHIES)

        for _ in tqdm(range(num_conversations)):
            topic_focus = random.choice(main_topics)
            print(f"Generating conversation with topic focus: {topic_focus}")

            conversation = self.llama_run(topic_focus)
            if not conversation:
                continue

            # Verify conversation length
            message_count = len(conversation['messages'])
            if message_count >= min_messages:
                conversations.append(conversation)
            else:
                print(f"Skipping conversation with insufficient messages: {message_count}")

        self.generated_conversations = conversations
        return conversations

    def save_dataset(self, filename: str = "conversation_dataset.json"):
        """Write the conversations as JSON and the user turns as a sibling CSV.

        The CSV path is `filename` with its extension replaced by ".csv"; when
        that would equal the JSON path, ".csv" is appended instead so the two
        files never overwrite each other.
        """
        from pathlib import Path

        json_path = Path(filename)
        csv_path = json_path.with_suffix('.csv')
        if csv_path == json_path:
            csv_path = json_path.with_name(json_path.name + '.csv')

        # Save as JSON
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(self.generated_conversations, f, indent=2)

        # Save as CSV with context information; only user turns carry the
        # query metadata. Missing fields fall back to empty defaults.
        flat_data = []
        for conv in self.generated_conversations:
            for msg in conv.get('messages', []):
                if msg.get('role') != 'user':
                    continue
                context_messages = msg.get('context_messages', [])
                flat_data.append({
                    'conversation_id': conv.get('conversation_id', ''),
                    'query': msg.get('content', ''),
                    'expanded_query': msg.get('expanded_query', ''),
                    'topic': msg.get('topic', ''),
                    'requires_context': msg.get('requires_context', False),
                    'context_messages': json.dumps(context_messages, ensure_ascii=False),
                    'num_context_messages': len(context_messages),
                })

        df = pd.DataFrame(flat_data)
        df.to_csv(csv_path, index=False)

def main():
    """Generate 50 context-tracked conversations and save them as JSON and CSV.

    The API key is read from the TOGETHER_API_KEY environment variable.

    Raises:
        RuntimeError: if TOGETHER_API_KEY is not set.
    """
    import os

    api_key = os.environ.get("TOGETHER_API_KEY")
    if not api_key:
        raise RuntimeError("Set the TOGETHER_API_KEY environment variable before running.")

    generator = DatasetGenerator(
        api_key=api_key,
        model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
        temp=0.2
    )
    generator.generate_dataset(num_conversations=50)
    generator.save_dataset()


if __name__ == "__main__":
    main()

  0%|          | 0/50 [00:00<?, ?it/s]

Generating conversation with topic focus: General


  2%|▏         | 1/50 [00:17<14:24, 17.64s/it]

Generating conversation with topic focus: Science


  4%|▍         | 2/50 [00:33<13:24, 16.75s/it]

Generating conversation with topic focus: Entertainment


  6%|▌         | 3/50 [00:46<11:32, 14.73s/it]

Generating conversation with topic focus: Sports


  8%|▊         | 4/50 [00:55<09:40, 12.62s/it]

Generating conversation with topic focus: Entertainment


 10%|█         | 5/50 [01:10<10:01, 13.37s/it]

Generating conversation with topic focus: Science


 12%|█▏        | 6/50 [01:28<10:57, 14.94s/it]

Generating conversation with topic focus: General


 14%|█▍        | 7/50 [01:47<11:45, 16.41s/it]

Generating conversation with topic focus: General


 16%|█▌        | 8/50 [02:00<10:36, 15.16s/it]

Generating conversation with topic focus: Sports


 18%|█▊        | 9/50 [02:12<09:44, 14.26s/it]

Generating conversation with topic focus: Sports


 20%|██        | 10/50 [02:24<09:09, 13.74s/it]

Generating conversation with topic focus: Sports


 22%|██▏       | 11/50 [02:37<08:47, 13.52s/it]

Generating conversation with topic focus: Science


 24%|██▍       | 12/50 [02:51<08:33, 13.52s/it]

Error generating conversation: Expecting value: line 1 column 1 (char 0)
Generating conversation with topic focus: Sports


 26%|██▌       | 13/50 [03:03<08:05, 13.13s/it]

Generating conversation with topic focus: Entertainment


 28%|██▊       | 14/50 [03:19<08:16, 13.79s/it]

Generating conversation with topic focus: Science


 30%|███       | 15/50 [03:35<08:35, 14.74s/it]

Generating conversation with topic focus: Sports


 32%|███▏      | 16/50 [03:53<08:50, 15.61s/it]

Generating conversation with topic focus: Entertainment


 34%|███▍      | 17/50 [04:09<08:35, 15.62s/it]

Generating conversation with topic focus: Business


 36%|███▌      | 18/50 [04:27<08:44, 16.39s/it]

Generating conversation with topic focus: Technology


 38%|███▊      | 19/50 [04:38<07:34, 14.66s/it]

Generating conversation with topic focus: Business


 40%|████      | 20/50 [04:48<06:41, 13.37s/it]

Generating conversation with topic focus: Politics


 42%|████▏     | 21/50 [05:13<08:10, 16.92s/it]

Generating conversation with topic focus: Politics


 44%|████▍     | 22/50 [05:26<07:23, 15.84s/it]

Generating conversation with topic focus: General


 46%|████▌     | 23/50 [05:36<06:15, 13.90s/it]

Generating conversation with topic focus: General


 48%|████▊     | 24/50 [06:04<07:49, 18.06s/it]

Generating conversation with topic focus: Politics


 50%|█████     | 25/50 [06:17<06:57, 16.70s/it]

Generating conversation with topic focus: Entertainment


 52%|█████▏    | 26/50 [06:31<06:21, 15.92s/it]

Generating conversation with topic focus: Science


 54%|█████▍    | 27/50 [06:43<05:37, 14.68s/it]

Generating conversation with topic focus: Sports


 56%|█████▌    | 28/50 [06:55<05:06, 13.93s/it]

Generating conversation with topic focus: Science


 58%|█████▊    | 29/50 [07:12<05:10, 14.78s/it]

Generating conversation with topic focus: Science


 60%|██████    | 30/50 [07:26<04:52, 14.65s/it]

Error generating conversation: Expecting value: line 1 column 1 (char 0)
Generating conversation with topic focus: Sports


 62%|██████▏   | 31/50 [07:38<04:20, 13.70s/it]

Generating conversation with topic focus: Entertainment


 64%|██████▍   | 32/50 [07:49<03:51, 12.88s/it]

Generating conversation with topic focus: Entertainment


 66%|██████▌   | 33/50 [08:04<03:49, 13.47s/it]

Generating conversation with topic focus: Politics


 68%|██████▊   | 34/50 [08:18<03:42, 13.88s/it]

Generating conversation with topic focus: General


 70%|███████   | 35/50 [08:44<04:20, 17.39s/it]

Generating conversation with topic focus: Technology


 72%|███████▏  | 36/50 [08:55<03:37, 15.57s/it]

Generating conversation with topic focus: Entertainment


 74%|███████▍  | 37/50 [09:12<03:25, 15.83s/it]

Generating conversation with topic focus: Science


 76%|███████▌  | 38/50 [09:30<03:20, 16.69s/it]

Generating conversation with topic focus: Technology


 78%|███████▊  | 39/50 [09:45<02:57, 16.11s/it]

Generating conversation with topic focus: Entertainment


 80%|████████  | 40/50 [10:05<02:52, 17.20s/it]

Generating conversation with topic focus: Politics


 82%|████████▏ | 41/50 [10:19<02:26, 16.31s/it]

Generating conversation with topic focus: Sports


 84%|████████▍ | 42/50 [10:32<02:02, 15.37s/it]

Generating conversation with topic focus: Technology


 86%|████████▌ | 43/50 [10:44<01:40, 14.29s/it]

Generating conversation with topic focus: Entertainment


 88%|████████▊ | 44/50 [10:58<01:24, 14.03s/it]

Generating conversation with topic focus: Science


 90%|█████████ | 45/50 [11:16<01:17, 15.43s/it]

Generating conversation with topic focus: Sports


 92%|█████████▏| 46/50 [11:29<00:58, 14.61s/it]

Generating conversation with topic focus: Entertainment


 94%|█████████▍| 47/50 [11:39<00:39, 13.27s/it]

Generating conversation with topic focus: Business


 96%|█████████▌| 48/50 [11:49<00:24, 12.27s/it]

Generating conversation with topic focus: Technology


 98%|█████████▊| 49/50 [12:04<00:13, 13.16s/it]

Generating conversation with topic focus: Science


100%|██████████| 50/50 [12:21<00:00, 14.83s/it]


# Dataset Generation Using Llama 3.1 70B for Chat Simulation



In [None]:
# generation prompt for the Llama model
import json
import random
import pandas as pd
from tqdm import tqdm
# Topic hierarchy: main topic -> list of sub-topics.
TOPIC_HIERARCHIES = {
    "Politics": ["India", "UK", "USA", "China", "Global"],
    "Sports": ["Cricket", "Football", "Tennis", "Basketball", "Olympics"],
    "Technology": ["AI", "Mobile", "Gaming", "Internet", "Gadgets"],
    "Entertainment": ["Movies", "Music", "TV", "Celebrity", "Gaming"],
    "Business": ["Finance", "Startups", "Markets", "Economy", "Companies"],
    "Science": ["Space", "Medicine", "Environment", "Research", "Innovation"],
    "General": ["Greeting", "Casual", "Question", "Statement"]
}

# User-only example shown to the model. It must itself be valid JSON
# (lowercase true/false, every object closed), because the model's reply is
# parsed with json.loads and the model imitates this format.
EXAMPLE_CONVERSATION = """{
  "conversation_id": "conv_001",
  "messages": [
    {
      "role": "user",
      "content": "who is pm of india",
      "expanded_query": "who is the prime minister of india?",
      "requires_context": false
    },
    {
      "role": "user",
      "content": "what are his main achievements?",
      "expanded_query": "what are the main achievements of the prime minister of india?",
      "requires_context": true
    },
    {
      "role": "user",
      "content": "Okay bye, will talk later",
      "expanded_query": "Okay bye, will talk later",
      "requires_context": false
    },
    {
      "role": "user",
      "content": "What about Pakistan?",
      "expanded_query": "Who is the pm of Pakistan and what are his main achievement?",
      "requires_context": true
    },
    {
      "role": "user",
      "content": "What is 2+2?",
      "expanded_query": "What is 2+2?",
      "requires_context": false
    }
  ]
}
"""
def create_generation_prompt(topic_focus: str = None) -> str:
    """Build the prompt asking the model for one new user-only conversation.

    Args:
        topic_focus: Topic the conversation should centre on; when omitted or
            empty the model may pick any available topic.

    Returns:
        The prompt text, with the example conversation as the format to follow.
    """
    focus = topic_focus if topic_focus else 'any topic from the available topics'
    # The closing instruction keeps the reply parseable with json.loads.
    return f"""Generate a new conversation similar to the example below. The conversation should:
1. Include only user messages.
2. Expand queries based on context from previous user queries when necessary.
3. Sometimes include general conversation elements
4. Have at least 20 messages
5. Avoid assistant responses entirely.
6. Focus on {focus}

Example format:
{EXAMPLE_CONVERSATION}

Generate a new, unique conversation following this format but with different content and topics.
Return ONLY the JSON object, with no commentary and no Markdown code fences."""
class DatasetGenerator:
    """Generate synthetic user-only conversations through the Together chat API.

    Relies on names defined elsewhere in this file: ``Together``, ``json``,
    ``pd``, ``TOPIC_HIERARCHIES`` and ``create_generation_prompt``.

    Attributes are plain values: ``api_key``, ``model`` and ``temp`` are passed
    to the API on every request; ``generated_conversations`` holds the result
    of the most recent ``generate_dataset`` call.
    """

    def __init__(self, api_key: str, model: str, temp: float):
        self.api_key = api_key
        self.model = model
        self.temp = temp
        self.generated_conversations = []

    @staticmethod
    def _parse_conversation(response: str) -> dict:
        """Extract the conversation JSON from a raw model reply.

        Everything outside the outermost ``{ ... }`` is discarded first, so
        replies wrapped in markdown fences or surrounded by prose still parse.
        Only messages whose role is ``"user"`` are kept.

        Raises:
            ValueError: no JSON object is present or it cannot be decoded
                (``json.JSONDecodeError`` is a ``ValueError`` subclass).
            KeyError: the decoded object or a user message lacks a required key.
        """
        start = response.find("{")
        end = response.rfind("}")
        if start == -1 or end < start:
            raise ValueError("no JSON object found in model response")

        conversation = json.loads(response[start:end + 1])
        return {
            "conversation_id": conversation["conversation_id"],
            "messages": [
                {
                    "role": msg["role"],
                    "content": msg["content"],
                    "expanded_query": msg["expanded_query"],
                    "requires_context": msg["requires_context"],
                }
                for msg in conversation["messages"]
                if msg["role"] == "user"
            ],
        }

    def llama_run(self, topic_focus: Optional[str] = None) -> Optional[dict]:
        """Request one conversation from the model and return it parsed.

        Args:
            topic_focus: Optional topic label forwarded to
                ``create_generation_prompt``.

        Returns:
            A dict with ``conversation_id`` and the user-only ``messages``, or
            ``None`` when the request or the parsing fails (the error is
            printed; this is deliberately best-effort so one bad reply does
            not abort a whole generation run).
        """
        formatted_prompt = create_generation_prompt(topic_focus)

        # This is a real, streamed Together API call.
        client = Together(api_key=self.api_key)
        try:
            stream = client.chat.completions.create(
                model=self.model,
                temperature=self.temp,
                max_tokens=4096,
                top_p=1,
                top_k=40,
                messages=[{"role": "user", "content": formatted_prompt}],
                stream=True,
            )

            pieces = []
            for chunk in stream:
                # Skip chunks that carry no choices instead of raising IndexError.
                if not chunk.choices:
                    continue
                pieces.append(chunk.choices[0].delta.content or "")

            return self._parse_conversation("".join(pieces))

        except Exception as e:
            print(f"Error generating conversation: {e}")
            return None

    def generate_dataset(self, num_conversations: int) -> list:
        """Generate conversations spread over every topic/subtopic pair.

        Each pair in ``TOPIC_HIERARCHIES`` gets
        ``max(1, num_conversations // number_of_pairs)`` attempts, so at least
        one attempt is made per pair even when ``num_conversations`` is smaller
        than the number of pairs (the result can then exceed the request).
        Any shortfall is filled with attempts on randomly chosen pairs; failed
        attempts are not retried, so the result can also be shorter.

        Returns:
            The list of generated conversations, also stored on
            ``self.generated_conversations``.
        """
        # Imported here so the method does not depend on a module-level
        # ``import random`` being present elsewhere in the file.
        import random

        conversations = []
        topic_subtopics = [
            (topic, subtopic)
            for topic, subtopics in TOPIC_HIERARCHIES.items()
            for subtopic in subtopics
        ]

        # Without any topic pair there is nothing to generate (and the
        # division / random.choice below would fail).
        if not topic_subtopics:
            self.generated_conversations = conversations
            return conversations

        # Determine how many conversations per topic-subtopic pair
        num_per_combination = max(1, num_conversations // len(topic_subtopics))

        for topic, subtopic in topic_subtopics:
            print(f"Generating conversations for topic: {topic}, subtopic: {subtopic}")
            topic_focus = f"{topic} - {subtopic}"
            for _ in range(num_per_combination):
                conversation = self.llama_run(topic_focus)
                if conversation:
                    conversations.append(conversation)

        # Generate remaining conversations if num_conversations is not perfectly divisible
        remaining_conversations = num_conversations - len(conversations)
        if remaining_conversations > 0:
            print("Generating remaining conversations to match total count.")
            for _ in range(remaining_conversations):
                topic, subtopic = random.choice(topic_subtopics)
                conversation = self.llama_run(f"{topic} - {subtopic}")
                if conversation:
                    conversations.append(conversation)

        self.generated_conversations = conversations
        return conversations

    def save_dataset(self, filename: str = "conversation_dataset.json"):
        """Write the generated conversations as JSON plus a flat CSV copy.

        The CSV path is ``filename`` with its extension replaced by ``.csv``.
        If that would be the JSON path itself, ``.csv`` is appended instead,
        so the CSV can never overwrite the JSON file.
        """
        import os

        # Save as JSON
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.generated_conversations, f, indent=2)

        # Also saving as CSV for easier inspection: one row per user message.
        flat_data = [
            {
                'conversation_id': conv['conversation_id'],
                'query': msg['content'],
                'expanded_query': msg.get('expanded_query', ''),
                'topic': msg.get('topic', ''),
                'requires_context': msg.get('requires_context', False),
            }
            for conv in self.generated_conversations
            for msg in conv['messages']
            if msg['role'] == 'user'
        ]

        root, _ = os.path.splitext(filename)
        csv_filename = root + '.csv'
        if csv_filename == filename:
            csv_filename = filename + '.csv'

        # Explicit columns keep the header row even when there are no rows.
        df = pd.DataFrame(
            flat_data,
            columns=['conversation_id', 'query', 'expanded_query', 'topic', 'requires_context'],
        )
        df.to_csv(csv_filename, index=False)

# Usage
def main():
    """Generate a conversation dataset and save it as JSON and CSV.

    The Together API key is read from the TOGETHER_API_KEY environment
    variable so that no secret is stored in the source file.

    Raises:
        RuntimeError: TOGETHER_API_KEY is not set or is empty.
    """
    import os

    api_key = os.environ.get("TOGETHER_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Set the TOGETHER_API_KEY environment variable to your Together API key."
        )

    generator = DatasetGenerator(
        api_key=api_key,
        model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
        temp=0.2,
    )
    # Requesting 1 still yields one attempt per topic/subtopic pair, because
    # generate_dataset makes at least one attempt for every pair.
    generator.generate_dataset(1)

    # Saving the dataset
    generator.save_dataset()

if __name__ == "__main__":
    main()


Generating conversations for topic: Politics, subtopic: India
Generating conversations for topic: Politics, subtopic: UK
Generating conversations for topic: Politics, subtopic: USA
Generating conversations for topic: Politics, subtopic: China
Generating conversations for topic: Politics, subtopic: Global
Generating conversations for topic: Sports, subtopic: Cricket
Generating conversations for topic: Sports, subtopic: Football
Generating conversations for topic: Sports, subtopic: Tennis
Generating conversations for topic: Sports, subtopic: Basketball
Generating conversations for topic: Sports, subtopic: Olympics
Generating conversations for topic: Technology, subtopic: AI
Generating conversations for topic: Technology, subtopic: Mobile
Generating conversations for topic: Technology, subtopic: Gaming
Generating conversations for topic: Technology, subtopic: Internet
Generating conversations for topic: Technology, subtopic: Gadgets
Generating conversations for topic: Entertainment, subto

In [None]:
!pip install datasets

In [None]:
import json
from typing import List, Dict
from dataclasses import dataclass
import datasets
from datasets import Dataset

@dataclass
class Message:
    """One conversation turn, with the same fields as a message in the JSON data."""

    # Speaker of the turn (the processing code in this file keeps 'user' turns).
    role: str
    # The query text as written.
    content: str
    # The stand-alone rewrite of the query.
    expanded_query: str
    # Flag stored with each message; presumably marks queries that need
    # earlier turns to be understood -- confirm against the data.
    requires_context: bool

@dataclass
class Conversation:
    """A conversation: an identifier plus its ordered list of messages."""

    # Identifier of the conversation (e.g. 'conv_002' in the sample data).
    conversation_id: str
    # Turns of the conversation, in order.
    messages: List[Message]

def process_conversations(raw_data: List[Dict]) -> List[Dict]:
    """
    Turn raw conversations into one training example per user message.

    Every example carries the message itself, its expanded form, the
    requires_context flag and the earlier user messages of the same
    conversation joined with " [SEP] " (an empty string for the first one).
    Messages whose role is not 'user' are skipped and never enter the history.
    """
    examples = []

    for conversation in raw_data:
        earlier_turns = []
        user_messages = (m for m in conversation['messages'] if m['role'] == 'user')

        for message in user_messages:
            examples.append({
                'conversation_id': conversation['conversation_id'],
                'input_query': message['content'],
                # Joining an empty list gives "", i.e. no history yet.
                'conversation_history': " [SEP] ".join(earlier_turns),
                'expanded_query': message['expanded_query'],
                'requires_context': message['requires_context'],
            })
            # The current query becomes history only for the turns after it.
            earlier_turns.append(f"User: {message['content']}")

    return examples

def create_huggingface_dataset(training_examples: List[Dict]) -> Dataset:
    """
    Build a Hugging Face Dataset from the processed training examples.

    The list of per-example dicts is transposed into one list per column,
    which is the layout Dataset.from_dict takes.
    """
    column_names = (
        'conversation_id',
        'input_query',
        'conversation_history',
        'expanded_query',
        'requires_context',
    )
    columns = {name: [] for name in column_names}

    for record in training_examples:
        for name, values in columns.items():
            values.append(record[name])

    return Dataset.from_dict(columns)

def format_for_training(dataset: Dataset) -> Dataset:
    """
    Add 'input_text' and 'target_text' fields to every example of the dataset.

    'input_text' is "Query: <input_query>\\n", preceded by
    "Context: <conversation_history>\\n" when the history is not empty.
    'target_text' is the expanded query.
    """
    def build_text_fields(row):
        query_line = f"Query: {row['input_query']}\n"
        history = row['conversation_history']
        # The context line is only emitted when there is some history.
        context_line = f"Context: {history}\n" if history else ""

        return {
            'input_text': context_line + query_line,
            'target_text': row['expanded_query'],
        }

    return dataset.map(build_text_fields)

# Example usage:
# Load your JSON data. The file is decoded as UTF-8 explicitly so the result
# does not depend on the platform's default text encoding.
with open('/content/conversation_dataset (2).json', 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

# Process the conversations into one training example per user message
training_examples = process_conversations(raw_data)

# Create Hugging Face dataset
dataset = create_huggingface_dataset(training_examples)

# Format for training (adds the input_text / target_text fields)
training_dataset = format_for_training(dataset)

# Save the dataset to the 'query_expansion_dataset' directory
training_dataset.save_to_disk('query_expansion_dataset')


Map:   0%|          | 0/653 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/653 [00:00<?, ? examples/s]

In [None]:

# Reload what was saved to 'query_expansion_dataset'; this rebinds `dataset`
# to the loaded copy.
dataset = datasets.load_from_disk('query_expansion_dataset')

In [None]:
# Bare expression: the notebook displays the dataset summary as cell output.
dataset

Dataset({
    features: ['conversation_id', 'input_query', 'conversation_history', 'expanded_query', 'requires_context', 'input_text', 'target_text'],
    num_rows: 653
})

In [None]:
# prompt: print two examples from the dataset above
# (only the row at index 10 is displayed here)
dataset[10]

{'conversation_id': 'conv_002',
 'input_query': 'can he be removed',
 'conversation_history': 'User: who is the president of india [SEP] User: what are his powers [SEP] User: can he dissolve parliament [SEP] User: what about the lok sabha [SEP] User: who is the speaker of lok sabha [SEP] User: what are his responsibilities [SEP] User: how is he elected [SEP] User: what about the rajya sabha [SEP] User: who is the chairman of rajya sabha [SEP] User: what are his powers',
 'expanded_query': 'can the chairman of the rajya sabha be removed from office?',
 'requires_context': True,
 'input_text': 'Context: User: who is the president of india [SEP] User: what are his powers [SEP] User: can he dissolve parliament [SEP] User: what about the lok sabha [SEP] User: who is the speaker of lok sabha [SEP] User: what are his responsibilities [SEP] User: how is he elected [SEP] User: what about the rajya sabha [SEP] User: who is the chairman of rajya sabha [SEP] User: what are his powers\nQuery: can h

In [None]:
import json
from datasets import Dataset
from typing import Dict, List

# Alpaca-style prompt template with a fixed instruction.
# It is filled with str.format: {input} takes the context/query text and
# {output} takes the expanded query.
ALPACA_PROMPT = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Expand the given query based on the conversation context if available.

### Input:
{input}

### Response:
{output}"""

def prepare_dataset_for_alpaca(examples: Dict) -> Dict:
    """
    Render a batch of examples with the Alpaca prompt template.

    `examples` is a batch in column form: 'input_text' and 'target_text' are
    parallel sequences. Each pair is stripped of surrounding whitespace and
    substituted into ALPACA_PROMPT; the rendered prompts are returned under
    the single key "text".
    """
    pairs = zip(examples['input_text'], examples['target_text'])
    rendered = [
        ALPACA_PROMPT.format(input=source.strip(), output=target.strip())
        for source, target in pairs
    ]
    return {"text": rendered}

def format_dataset(dataset: Dataset, tokenizer) -> Dataset:
    """
    Convert the dataset to Alpaca-formatted text terminated by the EOS token.

    The first pass renders every example with prepare_dataset_for_alpaca and
    drops the original columns; the second pass appends tokenizer.eos_token to
    each rendered 'text'.
    """
    def _append_eos(row):
        row['text'] += tokenizer.eos_token
        return row

    # Batched pass: Alpaca rendering, original columns removed.
    alpaca_rows = dataset.map(
        prepare_dataset_for_alpaca,
        batched=True,
        remove_columns=dataset.column_names,
    )

    # Per-example pass: terminate every prompt with the EOS token.
    return alpaca_rows.map(_append_eos)

# Example usage:
"""
from transformers import AutoTokenizer

# Load your dataset
dataset = Dataset.from_dict({
    'conversation_id': [...],
    'input_text': [...],
    'target_text': [...],
    'requires_context': [...]
})

# Load your tokenizer
tokenizer = AutoTokenizer.from_pretrained("your-model-name")

# Format the dataset
formatted_dataset = format_dataset(dataset, tokenizer)

# Now your dataset is ready for fine-tuning!
"""

# Demonstration helper: renders one example the same way the dataset is rendered.
def format_single_example(example: Dict, tokenizer) -> str:
    """
    Render one example with the Alpaca template and append the EOS token.

    `example` must provide 'input_text' and 'target_text'; both are stripped
    of surrounding whitespace before being substituted into ALPACA_PROMPT.
    """
    source = example['input_text'].strip()
    target = example['target_text'].strip()
    rendered = ALPACA_PROMPT.format(input=source, output=target)
    return rendered + tokenizer.eos_token

# Example of how the formatting looks: a hand-written example in the same
# 'input_text' / 'target_text' shape the helper above reads.
example = {
    'input_text': 'Context: User: who is the president of india [SEP] User: what are his powers\nQuery: can he dissolve parliament\n',
    'target_text': 'can the president of india dissolve the parliament?'
}

print("Example of formatted output:")
print("-" * 50)
# NOTE: `tokenizer` is not defined in this cell; it must already exist in the
# session (see the usage sketch above), otherwise this line raises NameError.
print(format_single_example(example, tokenizer))  # Note: This won't run without a tokenizer