In [37]:
# Tweet Realism Evaluation Experiments (using text, long text, and video/image prompts)

import inspect
import json
import os
import re
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
import openai
import pandas as pd
import seaborn as sns
from dotenv import dotenv_values
from tqdm import tqdm

# Read the key from the local .env file first and fall back to the process
# environment. Using .get() means a missing entry yields None instead of a
# KeyError, so ExperimentConfig can raise its descriptive ValueError.
OPENAI_KEY = dotenv_values(".env").get("OPENAI_API_KEY") or os.environ.get("OPENAI_API_KEY")

class ExperimentConfig:
    """Shared configuration for the tweet realism experiments.

    Attributes:
        evaluation_criteria: List of rubric names used for scoring.
        base_prompt: Instruction text appended to every evaluation request.
        character_profiles: Mapping of character id to a dict with the keys
            ``bio``, ``interests`` and ``tone``.
        openai_api_key: API key taken from the module-level ``OPENAI_KEY``.
        llm_model: Name of the chat model used for generation and evaluation.

    Raises:
        ValueError: From ``__init__`` when ``OPENAI_KEY`` is empty or None.
    """

    # Rubric names. TweetEvaluator searches the model's reply for these exact
    # strings, so the prompt is built from the same tuple to keep both in sync.
    EVALUATION_CRITERIA = (
        "Consistency with character profile",
        "Natural language use",
        "Contextual relevance",
        "Temporal coherence",
    )

    @staticmethod
    def _build_base_prompt(criteria) -> str:
        """Build the evaluation instructions for the given rubric names.

        The reply is parsed line by line as ``<criterion>: <score>``, so the
        prompt asks for exactly that layout; without it the model is free to
        answer with a single overall rating and no scores can be extracted.
        """
        bullet_lines = "".join(f"- {name}\n" for name in criteria)
        format_lines = "".join(f"{name}: <score>\n" for name in criteria)
        return (
            "\nYou are evaluating the realism of tweets. Rate each tweet on a scale of 1-10 \n"
            "based on the following criteria:\n"
            f"{bullet_lines}"
            "Begin your answer with one line per criterion in exactly this form, "
            "replacing <score> with a plain number:\n"
            f"{format_lines}"
            "Provide a brief explanation for your rating after the scores.\n"
        )

    def __init__(self):
        self.evaluation_criteria = list(self.EVALUATION_CRITERIA)
        self.base_prompt = self._build_base_prompt(self.evaluation_criteria)
        self.character_profiles = {
            "tech_enthusiast": {
                "bio": "25-year-old software engineer, passionate about AI and startups",
                "interests": ["technology", "programming", "startups", "AI"],
                "tone": "technical but accessible"
            },
            "fashion_influencer": {
                "bio": "22-year-old fashion influencer, loves the latest trends and styling tips",
                "interests": ["fashion", "design", "travel", "photography"],
                "tone": "trendy and engaging"
            },
            "foodie": {
                "bio": "30-year-old chef and food blogger, exploring global cuisines",
                "interests": ["cooking", "recipes", "travel", "food photography"],
                "tone": "descriptive and passionate"
            }
        }
        # OpenAI API configuration
        self.openai_api_key = OPENAI_KEY
        if not self.openai_api_key:
            raise ValueError("OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.")
        # Also set the module-level default key, as the original configuration did.
        openai.api_key = self.openai_api_key
        self.llm_model = "gpt-4o-mini"  # You can choose other models like "gpt-3.5-turbo"


In [46]:
class TweetGenerator:
    """Generates in-character tweets with an OpenAI chat model."""

    def __init__(self, config: "ExperimentConfig"):
        """Store the configuration; the API client is created lazily on first use."""
        self.config = config
        # Use the key validated by the config instead of re-reading the module global.
        self.openai_api_key = config.openai_api_key
        self._client = None

    def _get_client(self):
        """Return the OpenAI client, creating it once and reusing it afterwards."""
        if self._client is None:
            self._client = openai.OpenAI(api_key=self.openai_api_key)
        return self._client

    def _build_prompt(self,
                      profile: Dict[str, Any],
                      context: Optional[List[str]],
                      media_context: Optional[Dict[str, Any]]) -> str:
        """Assemble the user prompt from the profile and the optional context blocks."""
        parts = [
            f"Character Profile:\nBio: {profile['bio']}\n"
            f"Interests: {', '.join(profile['interests'])}\nTone: {profile['tone']}\n\n"
        ]
        if context:
            parts.append("Historical Tweets:\n")
            parts.extend(f"{idx}. {ctx}\n" for idx, ctx in enumerate(context, 1))
        if media_context:
            parts.append(
                f"\nMedia Context: {media_context.get('type', 'text')}, "
                f"Description: {media_context.get('description', '')}\n"
            )
        parts.append("\nGenerate a new tweet for this character based on the above information.")
        return "".join(parts)

    def generate_tweet(self,
                       character: str,
                       context: Optional[List[str]] = None,
                       media_context: Optional[Dict[str, Any]] = None) -> str:
        """
        Generate a tweet based on character profile and available context.

        Args:
            character: Key into ``config.character_profiles``.
            context: Optional earlier tweets, listed in the prompt as numbered
                "Historical Tweets".
            media_context: Optional dict; its ``type`` and ``description``
                entries are added to the prompt.

        Returns:
            The generated tweet with surrounding whitespace removed, or ``""``
            when the request fails or the model returns no text.

        Raises:
            ValueError: If ``character`` is not a known profile.
        """
        profile = self.config.character_profiles.get(character)
        if not profile:
            raise ValueError(f"Character '{character}' not found in profiles.")

        prompt = self._build_prompt(profile, context, media_context)

        # Best effort by design: failures are reported and signalled with an
        # empty string, which the experiment loops skip.
        try:
            response = self._get_client().chat.completions.create(
                model=self.config.llm_model,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that generates realistic tweets."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=100,
                temperature=0.7,
                n=1,
                stop=None
            )
            # The message content can be None; treat that as "no tweet".
            content = response.choices[0].message.content
            return (content or "").strip()
        except Exception as e:
            print(f"Error generating tweet: {e}")
            return ""

In [53]:
class TweetEvaluator:
    """Scores tweets for realism by asking an OpenAI chat model to apply the rubric."""

    def __init__(self, config: "ExperimentConfig"):
        """Store the configuration; the API client is created lazily on first use."""
        self.config = config
        # Use the key validated by the config instead of re-reading the module global.
        self.openai_api_key = config.openai_api_key
        self._client = None

    def _get_client(self):
        """Return the OpenAI client, creating it once and reusing it afterwards."""
        if self._client is None:
            self._client = openai.OpenAI(api_key=self.openai_api_key)
        return self._client

    def _build_prompt(self,
                      tweet: str,
                      profile: Dict[str, Any],
                      context: Optional[List[str]],
                      media_context: Optional[Dict[str, Any]]) -> str:
        """Assemble the evaluation prompt: profile, optional context, tweet, rubric."""
        parts = [
            f"Character Profile:\nBio: {profile['bio']}\n"
            f"Interests: {', '.join(profile['interests'])}\nTone: {profile['tone']}\n\n"
        ]
        if context:
            parts.append("Historical Tweets:\n")
            parts.extend(f"{idx}. {ctx}\n" for idx, ctx in enumerate(context, 1))
        if media_context:
            # Same wording the generator uses, so the judge sees what the tweet was written for.
            parts.append(
                f"\nMedia Context: {media_context.get('type', 'text')}, "
                f"Description: {media_context.get('description', '')}\n"
            )
        parts.append(f"\nTweet to Evaluate:\n\"{tweet}\"\n\n{self.config.base_prompt}")
        return "".join(parts)

    def _parse_evaluation(self, evaluation_text: str) -> Tuple[Dict[str, float], str]:
        """Split the model's reply into per-criterion scores and free-text explanation.

        A line counts as a score line when a rubric name is followed, after
        punctuation/markup only, by a number (e.g. ``**Natural language use**: 7/10``).
        Scores are keyed by the rubric name from the config. Every other
        non-empty line is kept as explanation instead of being discarded.
        """
        scores: Dict[str, float] = {}
        explanation_parts: List[str] = []
        for raw_line in evaluation_text.splitlines():
            line = raw_line.strip()
            # Skip blanks and an echoed copy of the instruction sentence.
            if not line or "Provide a brief explanation" in line:
                continue
            for criterion in self.config.evaluation_criteria:
                pattern = re.escape(criterion) + r"[\W_]*(\d+(?:\.\d+)?)(?:\s*/\s*10)?"
                match = re.search(pattern, line, flags=re.IGNORECASE)
                if match:
                    scores[criterion] = float(match.group(1))
                    # Keep any justification that follows the score on the same line.
                    remainder = line[match.end():].lstrip(" \t-–—:;,.*)").rstrip(" *")
                    if remainder:
                        explanation_parts.append(remainder)
                    break
            else:
                explanation_parts.append(line)
        return scores, " ".join(explanation_parts)

    def evaluate_tweet(self,
                       tweet: str,
                       character: str,
                       context: Optional[List[str]] = None,
                       media_context: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """
        Evaluate tweet realism using an LLM.

        Args:
            tweet: Text of the tweet to score.
            character: Key into ``config.character_profiles``.
            context: Optional earlier tweets shown to the model as history.
            media_context: Optional dict; its ``type`` and ``description``
                entries are added to the prompt.

        Returns:
            Dict with keys ``tweet``, ``character``, ``scores`` (rubric name ->
            float) and ``explanation``. If the request or parsing fails,
            ``scores`` is empty and ``explanation`` carries the error message.

        Raises:
            ValueError: If ``character`` is not a known profile.
        """
        profile = self.config.character_profiles.get(character)
        if not profile:
            raise ValueError(f"Character '{character}' not found in profiles.")

        prompt = self._build_prompt(tweet, profile, context, media_context)

        try:
            response = self._get_client().chat.completions.create(
                model=self.config.llm_model,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that evaluates tweet realism."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=150,
                temperature=0.0,  # Lower temperature for more deterministic outputs
                n=1,
                stop=None
            )
            # The message content can be None; treat that as an empty reply.
            evaluation_text = (response.choices[0].message.content or "").strip()
            scores, explanation = self._parse_evaluation(evaluation_text)
            return {
                "tweet": tweet,
                "character": character,
                "scores": scores,
                "explanation": explanation
            }
        except Exception as e:
            # Best effort: report the failure in the record rather than aborting the run.
            print(f"Error evaluating tweet: {e}")
            return {
                "tweet": tweet,
                "character": character,
                "scores": {},
                "explanation": f"Error evaluating tweet: {e}"
            }


# Experiment 1: Baseline Generation

In [54]:
def run_baseline_experiment(characters: List[str], num_tweets: int = 10):
    """
    Baseline run: tweets are generated from the character profile alone.

    For each character, ``num_tweets`` generations are attempted; every
    non-empty tweet is evaluated and its evaluation record collected.

    Returns:
        A DataFrame with one row per evaluated tweet.
    """
    cfg = ExperimentConfig()
    tweet_source = TweetGenerator(cfg)
    judge = TweetEvaluator(cfg)

    records = []
    for persona in tqdm(characters, desc="Generating tweets"):
        # Lazy generator: each tweet is produced right before it is evaluated.
        drafts = (tweet_source.generate_tweet(character=persona) for _ in range(num_tweets))
        for draft in drafts:
            if not draft:
                continue  # generation failed; nothing to evaluate
            records.append(judge.evaluate_tweet(tweet=draft, character=persona))

    return pd.DataFrame(records)


# Experiment 2: Context-Enhanced Generation

In [55]:
def run_context_experiment(characters: List[str], context_levels: List[int], num_tweets: int = 10):
    """
    Context run: tweets are generated with varying amounts of tweet history.

    For each character a pool of up to ``max(context_levels)`` historical
    tweets is generated first (failed generations are left out). Each level
    then uses the first ``level`` tweets of that pool as context for both
    generation and evaluation.

    Returns:
        A DataFrame with one row per evaluated tweet, including a
        ``context_level`` column.
    """
    cfg = ExperimentConfig()
    tweet_source = TweetGenerator(cfg)
    judge = TweetEvaluator(cfg)

    records = []
    for persona in tqdm(characters, desc="Generating context-enhanced tweets"):
        # Build the history pool once per character.
        candidates = (
            tweet_source.generate_tweet(character=persona)
            for _ in range(max(context_levels))
        )
        history = [candidate for candidate in candidates if candidate]

        for depth in context_levels:
            window = history[:depth]
            for _ in range(num_tweets):
                draft = tweet_source.generate_tweet(character=persona, context=window)
                if not draft:
                    continue  # generation failed; nothing to evaluate
                verdict = judge.evaluate_tweet(tweet=draft, character=persona, context=window)
                records.append({**verdict, "context_level": depth})

    return pd.DataFrame(records)

# Experiment 3: Multimedia Context Integration

In [56]:
def run_multimedia_experiment(characters: List[str], media_types: List[str], num_tweets: int = 10):
    """
    Generate and evaluate tweets with added video/image context.

    For every character and media type a small media descriptor is built and
    passed to the generator. The descriptor is forwarded to the evaluator only
    when its ``evaluate_tweet`` declares a ``media_context`` parameter;
    passing it unconditionally raises ``TypeError`` for evaluators that only
    take ``tweet``, ``character`` and ``context``.

    Args:
        characters: Character ids to generate tweets for.
        media_types: Media labels (e.g. "image", "video") to iterate over.
        num_tweets: Generation attempts per character/media-type pair.

    Returns:
        A DataFrame with one row per evaluated tweet, including a
        ``media_type`` column.
    """
    results = []
    config = ExperimentConfig()
    generator = TweetGenerator(config)
    evaluator = TweetEvaluator(config)

    # Feature-detect once instead of failing on the first evaluation.
    evaluator_accepts_media = (
        "media_context" in inspect.signature(evaluator.evaluate_tweet).parameters
    )

    for character in tqdm(characters, desc="Generating multimedia-enhanced tweets"):
        for media_type in media_types:
            media_context = {
                "type": media_type,
                "description": f"A {media_type} related to {character}"
            }
            for _ in range(num_tweets):
                tweet = generator.generate_tweet(character=character, media_context=media_context)
                if not tweet:
                    continue  # generation failed; nothing to evaluate

                eval_kwargs = {"tweet": tweet, "character": character}
                if evaluator_accepts_media:
                    eval_kwargs["media_context"] = media_context
                evaluation = evaluator.evaluate_tweet(**eval_kwargs)
                evaluation["media_type"] = media_type
                results.append(evaluation)

    return pd.DataFrame(results)

# Visualization Functions

In [57]:
import openai
def plot_realism_scores(df: pd.DataFrame, experiment_name: str):
    """
    Draw a box plot of the realism scores, one box per evaluation criterion.

    Args:
        df: Experiment results with a ``scores`` column holding one dict
            (criterion -> score) per row.
        experiment_name: Label appended to the plot title.

    Returns:
        None. When there is nothing to plot (no rows, no ``scores`` column, or
        only empty score dicts) a message is printed and no figure is created.
    """
    if df.empty or "scores" not in df.columns:
        print(f"No scores available to plot for {experiment_name}.")
        return

    # Expand the per-row score dicts into one column per criterion. Rows whose
    # evaluation failed carry an empty dict and become all-NaN rows.
    score_rows = [row if isinstance(row, dict) else {} for row in df["scores"]]
    scores_df = pd.DataFrame(score_rows, index=df.index).dropna(axis=1, how="all")
    if scores_df.empty:
        print(f"No scores available to plot for {experiment_name}.")
        return

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(data=scores_df, ax=ax)
    ax.set_title(f"Tweet Realism Scores - {experiment_name}")
    ax.set_ylabel("Scores")
    ax.set_xlabel("Evaluation Criteria")
    ax.set_ylim(0, 10)
    plt.show()

def analyze_context_impact(df: pd.DataFrame, experiment_type: str):
    """
    Analyze how different types of context affect realism.

    Draws a box plot of one criterion's score grouped by the experiment's
    context variable:

    * ``"Context Levels"``: "Consistency with character profile" by ``context_level``
    * ``"Multimedia Context"``: "Contextual relevance" by ``media_type``

    The score is read from the ``scores`` dict column of ``df``. A pre-flattened
    ``scores.<criterion>`` column is used instead when present.

    Args:
        df: Experiment results.
        experiment_type: One of the two labels above; anything else prints a
            message and returns.

    Returns:
        None. When the grouping column or the scores are missing a message is
        printed and no figure is created.
    """
    # experiment type -> (grouping column, criterion, plot title)
    plot_specs = {
        "Context Levels": (
            "context_level",
            "Consistency with character profile",
            "Consistency with Character Profile vs. Context Level",
        ),
        "Multimedia Context": (
            "media_type",
            "Contextual relevance",
            "Contextual Relevance vs. Media Type",
        ),
    }
    spec = plot_specs.get(experiment_type)
    if spec is None:
        print("Unknown experiment type for analysis.")
        return

    group_col, criterion, title = spec
    score_col = f"scores.{criterion}"

    if group_col not in df.columns:
        print(f"Column '{group_col}' not found; nothing to analyze for {experiment_type}.")
        return

    if score_col in df.columns:
        raw_scores = df[score_col]
    elif "scores" in df.columns:
        # The results store scores as one dict per row; pull out the single criterion.
        raw_scores = df["scores"].map(
            lambda row: row.get(criterion) if isinstance(row, dict) else None
        )
    else:
        print(f"No '{criterion}' scores available for {experiment_type}.")
        return

    plot_df = pd.DataFrame({
        group_col: df[group_col],
        score_col: pd.to_numeric(raw_scores, errors="coerce"),
    }).dropna(subset=[score_col])
    if plot_df.empty:
        print(f"No '{criterion}' scores available for {experiment_type}.")
        return

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(x=group_col, y=score_col, data=plot_df, ax=ax)
    ax.set_title(title)
    plt.show()

# Start the experiment

In [58]:
# Characters included in every experiment
characters = ["tech_enthusiast", "fashion_influencer", "foodie"]

# Experiment 1: baseline (profile only)
print("Running baseline experiment...")
baseline_results = run_baseline_experiment(characters, num_tweets=10)
plot_realism_scores(baseline_results, experiment_name="Baseline")

# Experiment 2: historical-tweet context
print("Running context experiment...")
# Reduced to a single level here; the larger sweep was [1, 5, 10, 20].
context_levels = [1]
context_results = run_context_experiment(
    characters,
    context_levels,
    num_tweets=5,
)


Running baseline experiment...


Generating tweets:   0%|          | 0/3 [00:00<?, ?it/s]

generated respone ChatCompletion(id='chatcmpl-AUJVH33pRk3V7FMzjFvpN3PZSuzWx', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="🚀 Just wrapped up a deep dive into the latest advancements in AI. It's amazing to see how quickly tech is evolving! Can't wait to integrate some of these concepts into my next startup project. The future is bright for those willing to innovate! 💡 #AI #Startups #TechTrends", role='assistant', function_call=None, tool_calls=None, refusal=None))], created=1731788175, model='gpt-4o-mini-2024-07-18', object='chat.completion', system_fingerprint='fp_0ba0d124f1', usage=CompletionUsage(completion_tokens=60, prompt_tokens=69, total_tokens=129, prompt_tokens_details={'cached_tokens': 0, 'audio_tokens': 0}, completion_tokens_details={'reasoning_tokens': 0, 'audio_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}))
evaluation response ChatCompletion(id='chatcmpl-AUJVIpXQB8BdbjlJoe0kzTn5GxMhS', 

Generating tweets:   0%|          | 0/3 [00:08<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Visualize the context experiment
plot_realism_scores(context_results, experiment_name="Context Levels")
analyze_context_impact(context_results, experiment_type="Context Levels")

# Experiment 3: multimedia context
print("Running multimedia experiment...")
media_types = ["image", "video"]
multimedia_results = run_multimedia_experiment(
    characters,
    media_types,
    num_tweets=5,
)
plot_realism_scores(multimedia_results, experiment_name="Multimedia Context")
analyze_context_impact(multimedia_results, experiment_type="Multimedia Context")

# Collect all result frames under their export names
frames_by_name = {
    "baseline": baseline_results,
    "context": context_results,
    "multimedia": multimedia_results,
}
results = {name: frame.to_dict(orient='records') for name, frame in frames_by_name.items()}

# One CSV per experiment plus a combined JSON file, all sharing one timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
for key, df in frames_by_name.items():
    df.to_csv(f"{key}_results_{timestamp}.csv", index=False)

json_path = f"results_{timestamp}.json"
with open(json_path, "w") as f:
    json.dump(results, f, indent=4)

print("Experiments completed and results saved.")