# Comparison between ML & LLM methods

In [1]:
import json
import pandas as pd
import requests
import time
from typing import List, Dict, Any
import openai
import re
from datetime import datetime
import logging
import tiktoken  # For token counting
import time

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class EurovisionTweetAnalyzer:
    def __init__(self, openai_api_key: str = None):
        """
        Initialize the analyzer
        
        Args:
            openai_api_key: OpenAI API key
        """
        if openai_api_key:
            openai.api_key = openai_api_key
            self.openai_client = openai.OpenAI(api_key=openai_api_key)
        
        # Initialize tiktoken encoder for token counting
        self.encoder = tiktoken.encoding_for_model("gpt-4")
    
    def load_tweets(self, json_file_path: str) -> List[Dict]:
        """Load Twitter JSON data"""
        try:
            with open(json_file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            
            # If it's a single object, convert to list
            if isinstance(data, dict):
                data = [data]
            
            logger.info(f"Successfully loaded {len(data)} tweets")
            return data
        except Exception as e:
            logger.error(f"Failed to load JSON file: {e}")
            return []
    
    def count_tokens(self, text: str) -> int:
        """Count tokens in a text string"""
        tokens = self.encoder.encode(text)
        return len(tokens)
    
    def analyze_with_chatgpt(self, tweet_text: str, tweet_metadata: Dict = None) -> Dict:
        """Analyze tweet with ChatGPT and track time and token usage"""
        try:
            # Create the prompt
            prompt = f"""
            Please analyze if the following tweet is related to the Eurovision Song Contest, and provide detailed analysis:

            Tweet content: "{tweet_text}"

            Please return results in this JSON format:
            {{
                "is_eurovision_related": true/false,
                "confidence_score": 0.0-1.0,
                "detected_language": "language code",
                "english_translation": "English translation (if needed)",
                "eurovision_keywords": ["relevant keyword list"],
                "location_mentions": ["mentioned locations"],
                "sentiment": "positive/negative/neutral",
                "sentiment_score": 0.0-1.0,
                "reasoning": "reasoning for judgment"
            }}
            """
            
            # Count prompt tokens
            system_message = "You are an expert in analyzing Eurovision-related content. Please accurately determine if tweets are related to the Eurovision Song Contest."
            prompt_tokens = self.count_tokens(system_message) + self.count_tokens(prompt)
            
            # Track start time
            start_time = time.time()
            
            # Make API call
            response = self.openai_client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.1
            )
            
            # Calculate processing time
            processing_time = time.time() - start_time
            
            # Extract token usage from response
            completion_tokens = response.usage.completion_tokens
            prompt_tokens_reported = response.usage.prompt_tokens
            total_tokens = response.usage.total_tokens
            
            # Parse result
            result = json.loads(response.choices[0].message.content)
            
            # Add metadata about the API call
            result['api_used'] = 'chatgpt'
            result['processing_time_seconds'] = processing_time
            result['prompt_tokens'] = prompt_tokens_reported
            result['completion_tokens'] = completion_tokens
            result['total_tokens'] = total_tokens
            result['estimated_prompt_tokens'] = prompt_tokens  # Our pre-call estimate
            
            return result
            
        except Exception as e:
            logger.error(f"ChatGPT analysis failed: {e}")
            return {
                "error": str(e), 
                "api_used": "chatgpt",
                "processing_time_seconds": time.time() - start_time if 'start_time' in locals() else None
            }
    
    def traditional_keyword_filter(self, tweet_text: str) -> Dict:
        """Traditional keyword filtering method with time tracking"""
        # Track start time
        start_time = time.time()
        
        eurovision_keywords = [
            'eurovision'
        ]
        
        # Participating country keywords
        countries = [
            'sweden', 'ukraine', 'italy', 'netherlands', 'spain',
            'germany', 'france', 'united kingdom', 'australia',
            'israel', 'norway', 'finland', 'denmark', 'iceland'
        ]
        
        text_lower = tweet_text.lower()
        found_keywords = []
        found_countries = []
        
        for keyword in eurovision_keywords:
            if keyword in text_lower:
                found_keywords.append(keyword)
        
        for country in countries:
            if country in text_lower:
                found_countries.append(country)
        
        is_related = len(found_keywords) > 0 or len(found_countries) > 1
        confidence = min(1.0, (len(found_keywords) + len(found_countries) * 0.5) / 3)
        
        # Calculate processing time
        processing_time = time.time() - start_time
        
        return {
            "is_eurovision_related": is_related,
            "confidence_score": confidence,
            "eurovision_keywords": found_keywords,
            "location_mentions": found_countries,
            "method": "traditional_keywords",
            "processing_time_seconds": processing_time
        }
    
    def batch_analyze(self, tweets: List[Dict], method: str = "both", 
                     save_results: bool = True, output_file: str = None) -> pd.DataFrame:
        """
        Batch analyze tweets
        
        Args:
            tweets: List of tweet data
            method: Analysis method ("chatgpt", "both", "traditional")
            save_results: Whether to save results
            output_file: Output file path
        """
        results = []
        
        # Track overall processing time
        batch_start_time = time.time()
        
        # Track token and time usage for summary
        api_tokens_used = 0
        api_time_total = 0
        traditional_time_total = 0
        tweets_analyzed = 0
        
        for i, tweet in enumerate(tweets):
            try:
                # Extract tweet text
                tweet_text = tweet.get('text', tweet.get('full_text', ''))
                if not tweet_text:
                    continue
                
                tweets_analyzed += 1
                logger.info(f"Processing tweet {i+1}/{len(tweets)}")
                
                result = {
                    'tweet_id': tweet.get('id_str', tweet.get('id', i)),
                    'tweet_text': tweet_text,
                    'user_screen_name': tweet.get('user', {}).get('screen_name', ''),
                    'created_at': tweet.get('created_at', ''),
                    'original_tweet': tweet
                }
                
                # Analyze based on selected method
                if method in ["chatgpt", "both"]:
                    try:
                        chatgpt_result = self.analyze_with_chatgpt(tweet_text, tweet)
                        result['chatgpt_analysis'] = chatgpt_result
                        
                        # Track API usage statistics
                        if 'total_tokens' in chatgpt_result:
                            api_tokens_used += chatgpt_result['total_tokens']
                        if 'processing_time_seconds' in chatgpt_result:
                            api_time_total += chatgpt_result['processing_time_seconds']
                            
                        time.sleep(1)  # API rate limiting
                    except Exception as e:
                        logger.error(f"ChatGPT analysis failed: {e}")
                
                if method in ["traditional", "both"]:
                    traditional_result = self.traditional_keyword_filter(tweet_text)
                    result['traditional_analysis'] = traditional_result
                    
                    # Track traditional method time
                    if 'processing_time_seconds' in traditional_result:
                        traditional_time_total += traditional_result['processing_time_seconds']
                
                results.append(result)
                
            except Exception as e:
                logger.error(f"Error processing tweet {i}: {e}")
                continue
        
        # Calculate overall metrics
        batch_processing_time = time.time() - batch_start_time
        
        # Create summary statistics
        analysis_summary = {
            'total_tweets_analyzed': tweets_analyzed,
            'batch_processing_time_seconds': batch_processing_time,
            'api_total_tokens_used': api_tokens_used,
            'api_average_tokens_per_tweet': api_tokens_used / tweets_analyzed if tweets_analyzed > 0 and api_tokens_used > 0 else 0,
            'api_total_processing_time_seconds': api_time_total,
            'api_average_time_per_tweet_seconds': api_time_total / tweets_analyzed if tweets_analyzed > 0 and api_time_total > 0 else 0,
            'traditional_total_processing_time_seconds': traditional_time_total,
            'traditional_average_time_per_tweet_seconds': traditional_time_total / tweets_analyzed if tweets_analyzed > 0 and traditional_time_total > 0 else 0,
        }
        
        # Convert to DataFrame
        df = pd.json_normalize(results)
        
        # Add summary as metadata
        df.attrs['analysis_summary'] = analysis_summary
        
        # Print summary
        logger.info("\n===== Analysis Summary =====")
        for key, value in analysis_summary.items():
            if isinstance(value, float):
                logger.info(f"{key}: {value:.4f}")
            else:
                logger.info(f"{key}: {value}")
        
        # Save results
        if save_results:
            if not output_file:
                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                output_file = f"eurovision_analysis_{timestamp}.csv"
            
            df.to_csv(output_file, index=False, encoding='utf-8-sig')
            
            # Save summary to separate file
            summary_file = output_file.replace('.csv', '_summary.json')
            with open(summary_file, 'w') as f:
                json.dump(analysis_summary, f, indent=2)
                
            logger.info(f"Results saved to: {output_file}")
            logger.info(f"Summary saved to: {summary_file}")
        
        return df
    
    def compare_methods(self, results_df: pd.DataFrame) -> Dict:
        """Compare different methods' results"""
        comparison = {}
        
        if 'chatgpt_analysis.is_eurovision_related' in results_df.columns:
            chatgpt_positive = results_df['chatgpt_analysis.is_eurovision_related'].sum()
            chatgpt_tokens = results_df['chatgpt_analysis.total_tokens'].sum()
            chatgpt_time = results_df['chatgpt_analysis.processing_time_seconds'].sum()
            
            comparison['chatgpt'] = {
                'total_analyzed': len(results_df),
                'eurovision_related': chatgpt_positive,
                'percentage': chatgpt_positive / len(results_df) * 100,
                'total_tokens_used': chatgpt_tokens,
                'average_tokens_per_tweet': chatgpt_tokens / len(results_df),
                'total_processing_time': chatgpt_time,
                'average_time_per_tweet': chatgpt_time / len(results_df)
            }
        
        if 'traditional_analysis.is_eurovision_related' in results_df.columns:
            traditional_positive = results_df['traditional_analysis.is_eurovision_related'].sum()
            traditional_time = results_df['traditional_analysis.processing_time_seconds'].sum()
            
            comparison['traditional'] = {
                'total_analyzed': len(results_df),
                'eurovision_related': traditional_positive,
                'percentage': traditional_positive / len(results_df) * 100,
                'total_processing_time': traditional_time,
                'average_time_per_tweet': traditional_time / len(results_df)
            }
        
        # Add agreement statistics if both methods were used
        if 'chatgpt_analysis.is_eurovision_related' in results_df.columns and 'traditional_analysis.is_eurovision_related' in results_df.columns:
            agreement = (results_df['chatgpt_analysis.is_eurovision_related'] == results_df['traditional_analysis.is_eurovision_related']).sum()
            
            comparison['comparison'] = {
                'agreement_count': agreement,
                'agreement_percentage': agreement / len(results_df) * 100,
                'time_difference_factor': comparison['chatgpt']['average_time_per_tweet'] / comparison['traditional']['average_time_per_tweet'] if comparison['traditional']['average_time_per_tweet'] > 0 else float('inf')
            }
        
        return comparison


In [5]:

# 配置API密钥
OPENAI_API_KEY = ""  # 替换为你的OpenAI API密钥
    
    # 初始化分析器
analyzer = EurovisionTweetAnalyzer(
        openai_api_key=OPENAI_API_KEY
    )
    
tweets = analyzer.load_tweets("/media/ys_tum/T7 Shield/25SS/SDI_data/tweets_europe_west_2016_05_10.json")  # json path
if not tweets:
    print("没有加载到推文数据")
    raise SystemExit("程序终止：没有加载到推文数据")
    
# 选择分析方法和数量（用于测试）
sample_tweets = tweets[:100]  # 先测试前10条
    
print("开始分析推文...")
results_df = analyzer.batch_analyze(
        sample_tweets, 
        method="both",  # 使用两种LLM方法
        save_results=True
    )
    
    # 对比结果
comparison = analyzer.compare_methods(results_df)

print("\n=== 分析结果对比 ===")
for method, stats in comparison.items():
    if method == "comparison":
        continue  # 先跳过比较部分，最后单独显示
    
    print(f"{method.upper()}:")
    print(f"  总分析数量: {stats['total_analyzed']}")
    print(f"  Eurovision相关: {stats['eurovision_related']}")
    print(f"  相关比例: {stats['percentage']:.2f}%")
    
    # 添加处理时间信息
    if method == "chatgpt":
        print(f"  总Token消耗: {stats['total_tokens_used']}")
        print(f"  平均每条Token: {stats['average_tokens_per_tweet']:.2f}")
        print(f"  总处理时间: {stats['total_processing_time']:.2f}秒")
        print(f"  平均每条处理时间: {stats['average_time_per_tweet']:.4f}秒")
    elif method == "traditional":
        print(f"  总处理时间: {stats['total_processing_time']:.2f}秒")
        print(f"  平均每条处理时间: {stats['average_time_per_tweet']:.4f}秒")
    print()

# 添加方法比较部分
if "comparison" in comparison:
    comp = comparison["comparison"]
    print("两种方法比较:")
    print(f"  一致性: {comp['agreement_count']}/{stats['total_analyzed']} ({comp['agreement_percentage']:.2f}%)")
    print(f"  API方法比传统方法慢: {comp['time_difference_factor']:.2f}倍")
    print()

# 显示一些样例结果
print("=== 样例分析结果 ===")
for i in range(min(3, len(results_df))):
    row = results_df.iloc[i]
    print(f"\n推文 {i+1}: {row['tweet_text'][:100]}...")
    
    if 'chatgpt_analysis.is_eurovision_related' in row:
        print(f"ChatGPT判断: {row['chatgpt_analysis.is_eurovision_related']}")
        print(f"处理时间: {row['chatgpt_analysis.processing_time_seconds']:.4f}秒")
        print(f"Token消耗: {row['chatgpt_analysis.total_tokens']}")
    
    if 'traditional_analysis.is_eurovision_related' in row:
        print(f"传统方法判断: {row['traditional_analysis.is_eurovision_related']}")
        print(f"处理时间: {row['traditional_analysis.processing_time_seconds']:.4f}秒")


INFO:__main__:Successfully loaded 179983 tweets
INFO:__main__:Processing tweet 1/100


开始分析推文...


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:__main__:Processing tweet 2/100
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:__main__:Processing tweet 3/100
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:__main__:Processing tweet 4/100
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:__main__:Processing tweet 5/100
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:__main__:Processing tweet 6/100
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:__main__:Processing tweet 7/100
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:__main__:Processing tweet 8/100
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:__main__


=== 分析结果对比 ===
CHATGPT:
  总分析数量: 100
  Eurovision相关: 6
  相关比例: 6.00%
  总Token消耗: 32763.0
  平均每条Token: 327.63
  总处理时间: 675.36秒
  平均每条处理时间: 6.7536秒

TRADITIONAL:
  总分析数量: 100
  Eurovision相关: 6
  相关比例: 6.00%
  总处理时间: 0.00秒
  平均每条处理时间: 0.0000秒

两种方法比较:


KeyError: 'total_analyzed'