In [1]:
import argparse
import json
import logging
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List
from itertools import islice
from dotenv import load_dotenv
import traceback2 as traceback

In [2]:
# Run from the project root so the relative "research_case/..." paths used
# later resolve. Set RESEARCH_CASE_ROOT to point at a different checkout;
# without it the original location is used.
os.chdir(os.environ.get("RESEARCH_CASE_ROOT", "/home/s2mudemi"))

In [3]:
import gc 
# Force a full garbage-collection pass. As the last expression of the cell,
# the value returned by gc.collect() is what the notebook displays.
gc.collect()

0

In [4]:
from research_case.analyzers.llm_client import LLMClient
from research_case.generators.post_generator import PostGenerator, StimulusGenerator, GenerationPrompt

In [5]:
# Configure root logging at INFO level, then create this notebook's logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [6]:
def generate_posts_for_user(
    user_id: str,
    persona: Dict,
    original_posts: List[Dict],
    post_generator: PostGenerator,
    stimulus_generator: StimulusGenerator,
    posts_per_persona: int
) -> List[Dict]:
    """
    Produce generated posts for one user/persona pair as flat records.

    At most ``posts_per_persona`` original posts are used, taken from the
    front of ``original_posts``; if fewer are available, fewer records are
    returned. For each one a stimulus is derived from the original text and
    a new post is generated from the persona and that stimulus.

    Args:
        user_id: User identifier
        persona: User's persona data; each key is copied into the record
            as ``persona_<key>``
        original_posts: List of original posts (dicts read via
            ``full_text``, ``tweet_id`` and ``created_at``)
        post_generator: PostGenerator instance
        stimulus_generator: StimulusGenerator instance
        posts_per_persona: Maximum number of posts to generate

    Returns:
        List of generated post records, one per original post used
    """
    # A single timestamp is taken up front and shared by every record.
    run_timestamp = datetime.now(timezone.utc).isoformat()

    # A non-positive request selects nothing, exactly like an empty range.
    selected_posts = original_posts[:max(posts_per_persona, 0)]

    records = []
    for index, source_post in enumerate(selected_posts):
        # Pull the fields of interest, defaulting to empty strings
        source_text = source_post.get('full_text', '')
        source_id = source_post.get('tweet_id', '')
        source_created = source_post.get('created_at', '')

        # Derive the stimulus, then generate the new post from it
        stimulus = stimulus_generator.create_post_stimulus(source_text)
        generated_text = post_generator.generate_post(
            GenerationPrompt(persona=persona, stimulus=stimulus)
        )

        # Flat record: fixed columns first, persona columns last
        records.append({
            "user_id": user_id,
            "generation_id": f"{user_id}_gen_{index}",
            "original_post_id": source_id,
            "original_text": source_text,
            "original_timestamp": source_created,
            "stimulus": stimulus,
            "generated_text": generated_text,
            "generation_timestamp": run_timestamp,
            **{f"persona_{key}": value for key, value in persona.items()},
        })

    return records

In [7]:
def main():
    """
    Generate posts for every persona variation of every user and save them.

    Loads the personas file and the original-posts file, calls
    generate_posts_for_user once per persona variation for each user that
    has original posts, and writes the flat list of records together with
    summary metadata to the output JSON file.

    Raises:
        SystemExit: If one of the input files does not exist.
        Exception: Any error raised while loading, generating or saving is
            logged with its traceback and re-raised.
    """
    args = {
        "personas": "research_case/data/my_experiment/personas.json",
        "posts": "research_case/data/preprocessed/processed_users.json",
        "output": "research_case/data/my_experiment/generated_post.json",
        "posts_per_persona": 3
    }

    # Resolve file paths; the output falls back to a file next to the personas
    personas_path = args["personas"]
    posts_path = args["posts"]
    output_path = args["output"] or os.path.join(
        os.path.dirname(personas_path), "generated_posts.json"
    )

    # Validate input files
    for path in [personas_path, posts_path]:
        if not os.path.exists(path):
            logger.error(f"Input file not found: {path}")
            # sys.exit always raises SystemExit; the bare exit() helper is
            # meant for interactive shells and should not be used in programs.
            sys.exit(1)

    load_dotenv()
    llm_client = LLMClient(model_name="llama3:70b")
    post_generator = PostGenerator(llm_client)
    stimulus_generator = StimulusGenerator(llm_client)

    try:
        logger.info("Starting post generation...")

        # Make sure the destination exists before the expensive generation
        # starts, so results are not lost at the final write.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Load personas and posts (explicit UTF-8: post text is not ASCII-only
        # and the platform default encoding may differ)
        with open(personas_path, 'r', encoding='utf-8') as f:
            personas = json.load(f)
        with open(posts_path, 'r', encoding='utf-8') as f:
            original_posts = json.load(f)

        # Initialize results list for flat structure
        all_generated_records = []

        # Generate posts for each persona
        for user_id, persona_variations in personas.items():
            logger.info(f"Generating posts for user {user_id}")

            # Get user's original posts
            user_posts = original_posts.get(user_id, [])
            if not user_posts:
                logger.warning(f"No original posts found for user {user_id}")
                continue

            for persona_index, persona in enumerate(persona_variations):
                variation_number = persona_index + 1
                logger.info(f"  Generating posts for persona variation {variation_number} for user {user_id}")

                user_records = generate_posts_for_user(
                    user_id=user_id,
                    persona=persona,
                    original_posts=user_posts,
                    post_generator=post_generator,
                    stimulus_generator=stimulus_generator,
                    posts_per_persona=args["posts_per_persona"]
                )

                for post_index, record in enumerate(user_records):
                    # generate_posts_for_user numbers posts from 0 on every
                    # call, so its ids repeat across variations of the same
                    # user; include the variation to keep ids unique.
                    record['generation_id'] = f"{user_id}_v{variation_number}_gen_{post_index}"
                    record['persona_variation_index'] = variation_number
                    record['persona_fields_used'] = list(persona.keys())

                all_generated_records.extend(user_records)

        output_data = {
            "metadata": {
                "num_users": len(personas),
                "posts_per_persona": args["posts_per_persona"],
                "total_posts_generated": len(all_generated_records)
            },
            "generated_posts": all_generated_records
        }

        # Save results
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=4)

        logger.info(f"Post generation completed. Generated {len(all_generated_records)} posts.")
        logger.info(f"Results saved to {output_path}")

    except Exception:
        # logger.exception records the message at ERROR level together with
        # the active traceback; the error is then propagated to the caller.
        logger.exception("Failed to generate posts:")
        raise

In [8]:
# Entry point: run the generation pipeline when this code executes as __main__.
if __name__ == "__main__":
    main()

INFO:__main__:Starting post generation...
INFO:__main__:Generating posts for user 254185636
INFO:__main__:  Generating posts for persona variation 1 for user 254185636
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:__main__:  Generating posts for persona variation 2 for user 254185636
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
