# In [None]:
import requests
import json
import pandas as pd
from datetime import datetime, timedelta
import time
import os
import re
from typing import Dict, List, Optional

class RealMadridPerformanceAPI:
    """
    Specialized API client for Real Madrid team and player performance data
    Focus: Match results, player stats, team performance, injuries, ratings
    """
    
    def __init__(self, api_key: str):
        self.api_key = api_key
        self.base_url = "https://content.guardianapis.com"
        self.rate_limit_delay = 0.2  # 200ms between requests for safety
        
        # Real Madrid players (current squad 2024-25)
        self.current_squad = [
            # Goalkeepers
            "Courtois", "Lunin", "Fran González",
            # Defenders  
            "Carvajal", "Militão", "Alaba", "Nacho", "Mendy", "Rüdiger", "Vallejo", "Fran García",
            # Midfielders
            "Modrić", "Kroos", "Valverde", "Camavinga", "Tchouaméni", "Bellingham", "Ceballos", "Arda Güler",
            # Forwards
            "Vinícius", "Benzema", "Rodrygo", "Asensio", "Mariano", "Joselu", "Brahim", "Mbappé"
        ]
        
    def get_match_performance(self, days_back: int = 30, max_articles_per_query: int = 50) -> List[Dict]:
        """
        Get Real Madrid match performance and results with pagination

        Runs a fixed list of match-related queries against the ``/search``
        endpoint (football section, newest first), pages through each query
        and keeps only the articles accepted by
        ``_filter_real_madrid_content``.

        Args:
            days_back: Only articles published within this many days are requested.
            max_articles_per_query: Upper bound on filtered articles kept per query.

        Returns:
            The collected article dicts, de-duplicated by ``webUrl``.
        """

        from_date = (datetime.now() - timedelta(days=days_back)).strftime('%Y-%m-%d')

        # Specific search for match reports and performance
        match_queries = [
            'Real Madrid AND (match report OR player ratings OR performance)',
            'Real Madrid AND (goals OR assists OR "man of the match")',
            'Real Madrid AND (La Liga OR Champions League OR Copa del Rey)',
            'Real Madrid AND (result OR score OR "Real Madrid" AND won)',
            'Real Madrid AND (Bernabéu OR Santiago Bernabéu OR home OR away)'
        ]

        all_articles = []
        total_requests = 0

        for query_idx, query in enumerate(match_queries, 1):
            print(f"🔍 Query {query_idx}/{len(match_queries)}: {query[:50]}...")

            # Paginate through results for this query
            query_articles = []
            page = 1
            page_size = 10  # Smaller page size to control API usage

            while len(query_articles) < max_articles_per_query:
                params = {
                    'api-key': self.api_key,
                    'q': query,
                    'section': 'football',
                    'from-date': from_date,
                    'page-size': page_size,
                    'page': page,
                    'show-fields': 'headline,byline,body,thumbnail,publication',
                    'show-tags': 'sport',
                    'order-by': 'newest'
                }

                try:
                    print(f"   📄 Page {page} (Request #{total_requests + 1})")
                    # A timeout keeps one stalled connection from hanging the whole run
                    response = requests.get(f"{self.base_url}/search", params=params, timeout=10)
                    # Count every request that got an HTTP answer (even an error
                    # status) and record it on the shared session tracker so its
                    # limit checks reflect real usage
                    total_requests += 1
                    usage_tracker.log_request()
                    response.raise_for_status()

                    payload = response.json()['response']
                    page_articles = payload['results']
                except (requests.RequestException, KeyError, TypeError, ValueError) as e:
                    print(f"   ❌ Error on page {page}: {e}")
                    break

                if not page_articles:
                    print(f"   ✅ No more articles on page {page}")
                    break

                # Filter for Real Madrid specific content
                filtered_articles = self._filter_real_madrid_content(page_articles)
                query_articles.extend(filtered_articles)

                print(f"   📰 Found {len(filtered_articles)} relevant articles")

                # Rate limiting - respect API limits
                time.sleep(self.rate_limit_delay)

                # Check if we have enough articles or hit API limits
                if len(query_articles) >= max_articles_per_query:
                    break
                if total_requests >= 30:  # Conservative limit per method
                    print(f"   ⚠️  Reached request limit for this query")
                    break

                # Stop at the last page the API reports instead of asking for a
                # page that does not exist and waiting for the error response
                total_pages = payload.get('pages')
                if total_pages is not None and page >= total_pages:
                    print(f"   ✅ No more pages after page {page}")
                    break

                page += 1

            all_articles.extend(query_articles[:max_articles_per_query])
            print(f"   🎯 Collected {len(query_articles)} articles for this query")

            # Stop if we're approaching API limits
            if total_requests >= 100:  # Daily limit protection
                print(f"⚠️  Approaching API limits, stopping early")
                break

        # Remove duplicates by URL (the same article can match several queries)
        unique_articles = {article['webUrl']: article for article in all_articles}

        print(f"⚽ Total: {len(unique_articles)} unique match performance articles")
        print(f"📊 API requests used: {total_requests}")
        return list(unique_articles.values())
    
    def get_player_performance(self, player_name: Optional[str] = None, days_back: int = 60, max_articles_per_query: int = 30) -> List[Dict]:
        """
        Get individual player performance data with pagination

        When ``player_name`` is given, three queries are built around that
        name; otherwise three fixed queries covering groups of key players are
        used. Each query is paged through the ``/search`` endpoint (football
        section, newest first) and filtered with
        ``_filter_real_madrid_content``.

        Args:
            player_name: Player to search for, or ``None`` for the key-player queries.
            days_back: Only articles published within this many days are requested.
            max_articles_per_query: Upper bound on filtered articles kept per query.

        Returns:
            The collected article dicts, de-duplicated by ``webUrl``.
        """

        from_date = (datetime.now() - timedelta(days=days_back)).strftime('%Y-%m-%d')

        if player_name:
            # Search for specific player
            player_queries = [
                f'Real Madrid AND "{player_name}" AND (performance OR goal OR assist)',
                f'Real Madrid AND "{player_name}" AND (rating OR stats OR minutes)',
                f'"{player_name}" AND Real Madrid AND (injury OR fitness OR return)'
            ]
        else:
            # Search for key players performance
            player_queries = [
                'Real Madrid AND (Vinícius OR Bellingham OR Modrić OR Mbappé) AND performance',
                'Real Madrid AND (Courtois OR Militão OR Carvajal OR Rüdiger) AND performance', 
                'Real Madrid AND (Valverde OR Camavinga OR Tchouaméni) AND performance'
            ]

        all_articles = []
        total_requests = 0

        for query_idx, query in enumerate(player_queries, 1):
            print(f"👤 Player Query {query_idx}/{len(player_queries)}: {query[:50]}...")

            # Paginate through results
            query_articles = []
            page = 1
            page_size = 8  # Smaller page size for player searches

            while len(query_articles) < max_articles_per_query:
                params = {
                    'api-key': self.api_key,
                    'q': query,
                    'section': 'football',
                    'from-date': from_date,
                    'page-size': page_size,
                    'page': page,
                    'show-fields': 'headline,byline,body,thumbnail,publication',
                    'order-by': 'newest'
                }

                try:
                    print(f"   📄 Page {page} (Request #{total_requests + 1})")
                    # A timeout keeps one stalled connection from hanging the whole run
                    response = requests.get(f"{self.base_url}/search", params=params, timeout=10)
                    # Count every request that got an HTTP answer (even an error
                    # status) and record it on the shared session tracker so its
                    # limit checks reflect real usage
                    total_requests += 1
                    usage_tracker.log_request()
                    response.raise_for_status()

                    payload = response.json()['response']
                    page_articles = payload['results']
                except (requests.RequestException, KeyError, TypeError, ValueError) as e:
                    print(f"   ❌ Error on page {page}: {e}")
                    break

                if not page_articles:
                    print(f"   ✅ No more articles on page {page}")
                    break

                filtered_articles = self._filter_real_madrid_content(page_articles)
                query_articles.extend(filtered_articles)

                print(f"   🌟 Found {len(filtered_articles)} relevant player articles")

                time.sleep(self.rate_limit_delay)

                if len(query_articles) >= max_articles_per_query:
                    break
                if total_requests >= 20:  # Conservative limit for player searches
                    print(f"   ⚠️  Reached request limit for this query")
                    break

                # Stop at the last page the API reports instead of asking for a
                # page that does not exist and waiting for the error response
                total_pages = payload.get('pages')
                if total_pages is not None and page >= total_pages:
                    print(f"   ✅ No more pages after page {page}")
                    break

                page += 1

            all_articles.extend(query_articles[:max_articles_per_query])
            print(f"   🎯 Collected {len(query_articles)} player articles")

            if total_requests >= 50:
                print(f"⚠️  Approaching API limits, stopping player search")
                break

        # Remove duplicates (the same article can match several queries)
        unique_articles = {article['webUrl']: article for article in all_articles}

        print(f"🌟 Total: {len(unique_articles)} unique player performance articles")
        print(f"📊 API requests used: {total_requests}")
        return list(unique_articles.values())
    
    def get_team_statistics(self, days_back: int = 90, max_articles_per_query: int = 20) -> List[Dict]:
        """
        Get Real Madrid team statistics and analysis with pagination

        Runs a fixed list of statistics/tactics queries against the
        ``/search`` endpoint (football section, ordered by relevance), pages
        through each query and filters the results with
        ``_filter_real_madrid_content``.

        Args:
            days_back: Only articles published within this many days are requested.
            max_articles_per_query: Upper bound on filtered articles kept per query.

        Returns:
            The collected article dicts, de-duplicated by ``webUrl``.
        """

        from_date = (datetime.now() - timedelta(days=days_back)).strftime('%Y-%m-%d')

        stats_queries = [
            'Real Madrid AND (statistics OR stats OR analysis OR tactics)',
            'Real Madrid AND (formation OR strategy OR "playing style")',
            'Real Madrid AND (league table OR standings OR position OR points)'
        ]

        all_articles = []
        total_requests = 0

        for query_idx, query in enumerate(stats_queries, 1):
            print(f"📊 Stats Query {query_idx}/{len(stats_queries)}: {query[:50]}...")

            query_articles = []
            page = 1
            page_size = 6  # Small page size for statistics

            while len(query_articles) < max_articles_per_query:
                params = {
                    'api-key': self.api_key,
                    'q': query,
                    'section': 'football',
                    'from-date': from_date,
                    'page-size': page_size,
                    'page': page,
                    'show-fields': 'headline,byline,body,thumbnail,publication',
                    'order-by': 'relevance'
                }

                try:
                    print(f"   📄 Page {page} (Request #{total_requests + 1})")
                    # A timeout keeps one stalled connection from hanging the whole run
                    response = requests.get(f"{self.base_url}/search", params=params, timeout=10)
                    # Count every request that got an HTTP answer (even an error
                    # status) and record it on the shared session tracker so its
                    # limit checks reflect real usage
                    total_requests += 1
                    usage_tracker.log_request()
                    response.raise_for_status()

                    payload = response.json()['response']
                    page_articles = payload['results']
                except (requests.RequestException, KeyError, TypeError, ValueError) as e:
                    print(f"   ❌ Error on page {page}: {e}")
                    break

                if not page_articles:
                    break

                filtered_articles = self._filter_real_madrid_content(page_articles)
                query_articles.extend(filtered_articles)

                print(f"   📈 Found {len(filtered_articles)} stats articles")

                time.sleep(self.rate_limit_delay)

                if len(query_articles) >= max_articles_per_query:
                    break
                if total_requests >= 15:  # Limit for team stats
                    break

                # Stop at the last page the API reports instead of asking for a
                # page that does not exist and waiting for the error response
                total_pages = payload.get('pages')
                if total_pages is not None and page >= total_pages:
                    break

                page += 1

            all_articles.extend(query_articles[:max_articles_per_query])

            if total_requests >= 30:
                break

        # Remove duplicates (the same article can match several queries)
        unique_articles = {article['webUrl']: article for article in all_articles}

        print(f"📈 Total: {len(unique_articles)} team statistics articles")
        print(f"📊 API requests used: {total_requests}")
        return list(unique_articles.values())
    
    def get_injury_updates(self, days_back: int = 30, max_articles_per_query: int = 15) -> List[Dict]:
        """
        Get player injury and fitness updates with pagination

        Runs two fixed injury/recovery queries against the ``/search``
        endpoint (football section, newest first), pages through each query
        and filters the results with ``_filter_real_madrid_content``.

        Args:
            days_back: Only articles published within this many days are requested.
            max_articles_per_query: Upper bound on filtered articles kept per query.

        Returns:
            The collected article dicts, de-duplicated by ``webUrl``.
        """

        from_date = (datetime.now() - timedelta(days=days_back)).strftime('%Y-%m-%d')

        injury_queries = [
            'Real Madrid AND (injury OR injured OR fitness OR "injury update")',
            'Real Madrid AND (return OR recovered OR "back to training")'
        ]

        all_articles = []
        total_requests = 0

        for query_idx, query in enumerate(injury_queries, 1):
            print(f"🏥 Injury Query {query_idx}/{len(injury_queries)}: {query[:50]}...")

            query_articles = []
            page = 1
            page_size = 5  # Small page size for injury news

            while len(query_articles) < max_articles_per_query:
                params = {
                    'api-key': self.api_key,
                    'q': query,
                    'section': 'football',
                    'from-date': from_date,
                    'page-size': page_size,
                    'page': page,
                    'show-fields': 'headline,byline,body,thumbnail,publication',
                    'order-by': 'newest'
                }

                try:
                    print(f"   📄 Page {page} (Request #{total_requests + 1})")
                    # A timeout keeps one stalled connection from hanging the whole run
                    response = requests.get(f"{self.base_url}/search", params=params, timeout=10)
                    # Count every request that got an HTTP answer (even an error
                    # status) and record it on the shared session tracker so its
                    # limit checks reflect real usage
                    total_requests += 1
                    usage_tracker.log_request()
                    response.raise_for_status()

                    payload = response.json()['response']
                    page_articles = payload['results']
                except (requests.RequestException, KeyError, TypeError, ValueError) as e:
                    print(f"   ❌ Error on page {page}: {e}")
                    break

                if not page_articles:
                    break

                filtered_articles = self._filter_real_madrid_content(page_articles)
                query_articles.extend(filtered_articles)

                print(f"   🩹 Found {len(filtered_articles)} injury articles")

                time.sleep(self.rate_limit_delay)

                if len(query_articles) >= max_articles_per_query:
                    break
                if total_requests >= 10:  # Conservative limit for injury news
                    break

                # Stop at the last page the API reports instead of asking for a
                # page that does not exist and waiting for the error response
                total_pages = payload.get('pages')
                if total_pages is not None and page >= total_pages:
                    break

                page += 1

            all_articles.extend(query_articles[:max_articles_per_query])

            if total_requests >= 15:
                break

        # Remove duplicates (the same article can match several queries)
        unique_articles = {article['webUrl']: article for article in all_articles}

        print(f"🩹 Total: {len(unique_articles)} injury update articles")
        print(f"📊 API requests used: {total_requests}")
        return list(unique_articles.values())
    
    def _filter_real_madrid_content(self, articles: List[Dict]) -> List[Dict]:
        """
        Filter articles to ensure they're actually about Real Madrid
        """
        
        filtered = []
        real_madrid_keywords = [
            'real madrid', 'bernabéu', 'santiago bernabeu', 'los blancos', 
            'madrid cf', 'real madrid cf', 'galácticos'
        ]
        
        for article in articles:
            title = article.get('webTitle', '').lower()
            body = article.get('fields', {}).get('body', '').lower()
            
            # Check if it's actually about Real Madrid (not other Madrid teams)
            if any(keyword in title or keyword in body[:500] for keyword in real_madrid_keywords):
                # Exclude other Madrid teams
                if not any(exclude in title.lower() for exclude in ['atlético', 'atletico', 'getafe', 'rayo']):
                    filtered.append(article)
        
        return filtered
    
    def extract_performance_data(self, articles: List[Dict]) -> pd.DataFrame:
        """
        Extract structured performance data from articles
        """
        
        performance_data = []
        
        for article in articles:
            fields = article.get('fields', {})
            title = fields.get('headline', article.get('webTitle', ''))
            body = fields.get('body', '')
            
            # Extract performance metrics from text
            performance_info = {
                'id': article.get('id'),
                'title': title,
                'url': article.get('webUrl'),
                'date': article.get('webPublicationDate')[:10],
                'author': fields.get('byline', 'Unknown'),
                
                # Performance indicators
                'contains_player_ratings': 'rating' in body.lower() or 'out of 10' in body.lower(),
                'contains_stats': any(word in body.lower() for word in ['goals', 'assists', 'passes', 'shots']),
                'contains_match_result': any(word in title.lower() for word in ['win', 'lose', 'draw', 'defeat', 'victory']),
                'contains_injury_news': any(word in body.lower() for word in ['injury', 'injured', 'fitness', 'doubt']),
                
                # Extract mentioned players
                'mentioned_players': [player for player in self.current_squad 
                                   if player.lower() in body.lower()],
                
                # Competition detection
                'competition': self._detect_competition(title + ' ' + body),
                
                # Performance type
                'content_type': self._classify_content_type(title, body)
            }
            
            performance_data.append(performance_info)
        
        return pd.DataFrame(performance_data)
    
    def _detect_competition(self, text: str) -> str:
        """Detect which competition the article is about"""
        
        text_lower = text.lower()
        
        if any(comp in text_lower for comp in ['champions league', 'ucl', 'european cup']):
            return 'Champions League'
        elif any(comp in text_lower for comp in ['la liga', 'league', 'primera división']):
            return 'La Liga'
        elif any(comp in text_lower for comp in ['copa del rey', 'cup']):
            return 'Copa del Rey'
        elif any(comp in text_lower for comp in ['super cup', 'supercopa']):
            return 'Super Cup'
        elif any(comp in text_lower for comp in ['club world cup', 'fifa']):
            return 'Club World Cup'
        else:
            return 'General'
    
    def _classify_content_type(self, title: str, body: str) -> str:
        """Classify the type of performance content"""
        
        text = (title + ' ' + body).lower()
        
        if any(word in text for word in ['match report', 'player ratings', 'performance']):
            return 'Match Analysis'
        elif any(word in text for word in ['injury', 'fitness', 'return', 'doubt']):
            return 'Injury News'
        elif any(word in text for word in ['statistics', 'stats', 'analysis']):
            return 'Statistics'
        elif any(word in text for word in ['goal', 'assist', 'scored']):
            return 'Goal/Assist News'
        elif any(word in text for word in ['tactics', 'formation', 'strategy']):
            return 'Tactical Analysis'
        else:
            return 'General Performance'
    
    def generate_performance_report(self, days_back: int = 14) -> Dict:
        """
        Generate a comprehensive performance report with API usage tracking

        Collects match, player, team-statistics and injury articles in that
        order, consulting the module-level ``usage_tracker`` between steps.

        Args:
            days_back: Look-back window, in days, passed to every collector.

        Returns:
            ``{}`` when the tracker's limit is already reached before
            collection starts; the result of ``_create_partial_report`` when
            the limit is reached between steps; otherwise the full report
            dict. All values in the full report are plain Python objects so
            it can be written with ``json.dump``.
        """

        print(f"📋 Generating Real Madrid Performance Report ({days_back} days)")
        print("=" * 60)

        # Check if we can proceed with the analysis
        if not usage_tracker.check_limits():
            print("❌ Cannot proceed - API limits reached")
            return {}

        # Collect performance data with controlled pagination
        print("\n🔄 Collecting performance data...")

        # Get match performance (limited to prevent API overuse)
        matches = self.get_match_performance(days_back, max_articles_per_query=30)
        usage_tracker.print_usage()

        if not usage_tracker.check_limits():
            print("⚠️  Stopping early due to API limits")
            return self._create_partial_report(matches, [], [], [], days_back)

        # Get player performance
        players = self.get_player_performance(days_back=days_back, max_articles_per_query=20)
        usage_tracker.print_usage()

        if not usage_tracker.check_limits():
            return self._create_partial_report(matches, players, [], [], days_back)

        # Get team statistics
        team_stats = self.get_team_statistics(days_back, max_articles_per_query=15)
        usage_tracker.print_usage()

        if not usage_tracker.check_limits():
            return self._create_partial_report(matches, players, team_stats, [], days_back)

        # Get injury updates
        injuries = self.get_injury_updates(days_back, max_articles_per_query=10)
        usage_tracker.print_usage()

        # Combine all articles and remove duplicates across the four collectors
        all_articles = matches + players + team_stats + injuries
        unique_articles = {article['webUrl']: article for article in all_articles}
        final_articles = list(unique_articles.values())

        # Extract structured data
        df = self.extract_performance_data(final_articles)

        # An empty DataFrame has no columns, so every column access is guarded
        has_rows = len(df) > 0

        def flag_count(column: str) -> int:
            # Series.sum() yields a NumPy integer, which json.dump cannot
            # serialize; convert to a built-in int
            return int(df[column].sum()) if has_rows else 0

        def value_counts(column: str) -> Dict:
            return df[column].value_counts().to_dict() if has_rows else {}

        def first_of_type(content_type: str) -> List[Dict]:
            if not has_rows:
                return []
            return df[df['content_type'] == content_type].head(3).to_dict('records')

        # Generate report
        report = {
            'report_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'period_days': days_back,
            'total_articles': len(df),
            'api_usage': usage_tracker.get_usage_report(),

            # Content breakdown
            'content_types': value_counts('content_type'),
            'competitions': value_counts('competition'),

            # Player mentions
            'most_mentioned_players': self._get_top_mentioned_players(df) if has_rows else {},

            # Performance indicators
            'articles_with_ratings': flag_count('contains_player_ratings'),
            'articles_with_stats': flag_count('contains_stats'),
            'articles_with_results': flag_count('contains_match_result'),
            'injury_articles': flag_count('contains_injury_news'),

            # Recent highlights
            'latest_match_reports': first_of_type('Match Analysis'),
            'latest_injury_news': first_of_type('Injury News'),
            'latest_performance_news': df.head(5).to_dict('records') if has_rows else [],

            # Full dataset
            'all_articles': df.to_dict('records') if has_rows else []
        }

        return report
    
    def _create_partial_report(self, matches: List, players: List, team_stats: List, injuries: List, days_back: int) -> Dict:
        """Create a partial report when API limits are reached

        Merges whatever article lists were collected, de-duplicates them by
        ``webUrl`` and summarises them. The result carries a ``status`` key
        marking it as partial, plus the breakdown and indicator keys that
        ``save_performance_report`` reads when writing its summary file.

        Args:
            matches: Articles from the match-performance step.
            players: Articles from the player-performance step.
            team_stats: Articles from the team-statistics step.
            injuries: Articles from the injury-updates step.
            days_back: Look-back window the report covers.

        Returns:
            The partial report dict, holding only plain Python values.
        """

        all_articles = matches + players + team_stats + injuries
        unique_articles = {article['webUrl']: article for article in all_articles}
        final_articles = list(unique_articles.values())

        df = self.extract_performance_data(final_articles)

        # An empty DataFrame has no columns, so every column access is guarded
        has_rows = len(df) > 0

        def flag_count(column: str) -> int:
            # Convert the NumPy integer from Series.sum() to a built-in int so
            # the report can be dumped as JSON
            return int(df[column].sum()) if has_rows else 0

        return {
            'report_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'period_days': days_back,
            'total_articles': len(df),
            'api_usage': usage_tracker.get_usage_report(),
            'status': 'PARTIAL - API limits reached',
            'content_types': df['content_type'].value_counts().to_dict() if has_rows else {},
            'competitions': df['competition'].value_counts().to_dict() if has_rows else {},
            'most_mentioned_players': self._get_top_mentioned_players(df) if has_rows else {},
            'articles_with_ratings': flag_count('contains_player_ratings'),
            'articles_with_stats': flag_count('contains_stats'),
            'articles_with_results': flag_count('contains_match_result'),
            'injury_articles': flag_count('contains_injury_news'),
            'all_articles': df.to_dict('records') if has_rows else []
        }
    
    def _get_top_mentioned_players(self, df: pd.DataFrame) -> Dict:
        """Get most frequently mentioned players"""
        
        player_mentions = {}
        
        for _, row in df.iterrows():
            for player in row['mentioned_players']:
                player_mentions[player] = player_mentions.get(player, 0) + 1
        
        # Sort by mentions
        sorted_players = sorted(player_mentions.items(), key=lambda x: x[1], reverse=True)
        
        return dict(sorted_players[:10])  # Top 10 most mentioned
    
    def save_performance_report(self, report: Dict, filename: str = None):
        """Save performance report to files"""
        
        if filename is None:
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            filename = f'real_madrid_performance_{timestamp}'
        
        # Save full report as JSON
        with open(f'{filename}.json', 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2, ensure_ascii=False)
        
        # Save articles as CSV
        df = pd.DataFrame(report['all_articles'])
        df.to_csv(f'{filename}.csv', index=False, encoding='utf-8')
        
        # Save summary as text
        with open(f'{filename}_summary.txt', 'w', encoding='utf-8') as f:
            f.write(f"REAL MADRID PERFORMANCE REPORT\n")
            f.write(f"Generated: {report['report_date']}\n")
            f.write(f"Period: Last {report['period_days']} days\n")
            f.write(f"Total Articles: {report['total_articles']}\n\n")
            
            f.write("CONTENT BREAKDOWN:\n")
            for content_type, count in report['content_types'].items():
                f.write(f"  • {content_type}: {count}\n")
            
            f.write(f"\nCOMPETITIONS:\n")
            for comp, count in report['competitions'].items():
                f.write(f"  • {comp}: {count}\n")
            
            f.write(f"\nTOP MENTIONED PLAYERS:\n")
            for player, mentions in report['most_mentioned_players'].items():
                f.write(f"  • {player}: {mentions} mentions\n")
            
            f.write(f"\nPERFORMANCE INDICATORS:\n")
            f.write(f"  • Articles with player ratings: {report['articles_with_ratings']}\n")
            f.write(f"  • Articles with statistics: {report['articles_with_stats']}\n")
            f.write(f"  • Articles with match results: {report['articles_with_results']}\n")
            f.write(f"  • Injury-related articles: {report['injury_articles']}\n")
        
        print(f"💾 Performance report saved:")
        print(f"   📊 {filename}.json (full data)")
        print(f"   📋 {filename}.csv (articles)")  
        print(f"   📝 {filename}_summary.txt (summary)")

class APIUsageTracker:
    """
    Session-level counter for Guardian API requests

    Holds a running request count, the moment tracking began, and two
    budgets: the documented daily allowance (12,000 requests; 84,000 per
    week) and a much smaller per-session cap that ``check_limits`` enforces.
    """

    def __init__(self):
        self.start_time = datetime.now()
        self.requests_made = 0
        self.session_limit = 200  # Conservative limit per session
        self.daily_limit = 12000

    def log_request(self):
        """Record one more API request"""
        self.requests_made = self.requests_made + 1

    def check_limits(self) -> bool:
        """Return True while the session cap has not been reached"""
        within_budget = self.requests_made < self.session_limit
        if not within_budget:
            print(f"⚠️  Session limit reached ({self.session_limit} requests)")
        return within_budget

    def get_usage_report(self) -> Dict:
        """Build a snapshot of the usage counters and derived rates"""
        elapsed = datetime.now() - self.start_time
        made = self.requests_made

        # Floor the divisor at 0.1 minutes so a fresh tracker cannot divide by zero
        minutes = max(elapsed.total_seconds() / 60, 0.1)

        # Drop the fractional seconds from the H:MM:SS.ffffff rendering
        elapsed_label = str(elapsed).partition('.')[0]

        snapshot = {}
        snapshot['requests_made'] = made
        snapshot['session_limit'] = self.session_limit
        snapshot['remaining_in_session'] = self.session_limit - made
        snapshot['elapsed_time'] = elapsed_label
        snapshot['requests_per_minute'] = round(made / minutes, 2)
        snapshot['estimated_daily_usage'] = min(made * 24, self.daily_limit)
        return snapshot

    def print_usage(self):
        """Print the current usage snapshot"""
        snapshot = self.get_usage_report()
        lines = [
            f"\n📊 API USAGE STATISTICS:",
            f"   • Requests made: {snapshot['requests_made']}",
            f"   • Session remaining: {snapshot['remaining_in_session']}",
            f"   • Time elapsed: {snapshot['elapsed_time']}",
            f"   • Rate: {snapshot['requests_per_minute']} requests/minute",
            f"   • Estimated daily usage: {snapshot['estimated_daily_usage']}",
        ]
        for line in lines:
            print(line)

# Module-level usage tracker, created once at import time. The report methods
# of RealMadridPerformanceAPI read it by this global name for limit checks and
# usage statistics.
usage_tracker = APIUsageTracker()
def validate_api_key(api_key: str) -> bool:
    """Test if the API key is valid

    Sends one minimal search request and interprets the HTTP status.

    Args:
        api_key: Key to send as the ``api-key`` query parameter.

    Returns:
        True only when the API answers 200. A rejected key, any other
        status, or a network failure prints a message and returns False.
    """
    test_url = "https://content.guardianapis.com/search"
    test_params = {'api-key': api_key, 'q': 'test', 'page-size': 1}

    try:
        # A timeout keeps the check from hanging on a stalled connection
        response = requests.get(test_url, params=test_params, timeout=10)
    except requests.RequestException as e:
        print(f"❌ Error testing API key: {e}")
        return False

    if response.status_code == 200:
        print("✅ API key is valid!")
        return True

    # Treat both "unauthorized" and "forbidden" as a rejected key
    if response.status_code in (401, 403):
        print("❌ Invalid API key - get your free key at: https://open-platform.theguardian.com/access/")
        return False

    print(f"⚠️  API returned status: {response.status_code}")
    return False

# Main execution
def main():
    """
    Real Madrid Performance Analysis - Main Function

    Validates the Guardian API key and then runs six example analyses in
    order: a 7-day comprehensive report, a Bellingham-specific search, recent
    match reports, a direct La Liga search, a per-competition comparison and
    a key-player mention count. Examples 2-6 are each gated on
    usage_tracker.check_limits(). Ends with a usage summary and a
    data-quality assessment. All results are printed; nothing is returned.

    The API key is taken from the GUARDIAN_API_KEY environment variable and
    falls back to the placeholder "test" when the variable is unset or empty.
    """

    print("⚽ REAL MADRID PERFORMANCE ANALYZER")
    print("=" * 50)

    # Prefer the environment so a real key never has to be written into the
    # source; "test" is only the placeholder fallback
    API_KEY = os.environ.get("GUARDIAN_API_KEY") or "test"

    # Seconds to wait on the direct Guardian request in Example 4; requests
    # applies no timeout by default and would block on a stalled connection
    request_timeout = 10

    # Validate API key
    if not validate_api_key(API_KEY):
        print("\n🔑 Get your FREE Guardian API key:")
        print("   1. Go to: https://open-platform.theguardian.com/access/")
        print("   2. Register (takes 2 minutes)")
        print("   3. Replace 'test' with your real API key")
        print("      (or set the GUARDIAN_API_KEY environment variable)")
        return

    # Initialize the performance analyzer
    rm_performance = RealMadridPerformanceAPI(API_KEY)

    # Example 1: Comprehensive Performance Report
    print("\n🏆 Generating Comprehensive Performance Report...")
    report = rm_performance.generate_performance_report(days_back=7)  # Shorter period to save API calls

    if report:
        # Save the report
        rm_performance.save_performance_report(report)

        # Display key findings
        print("\n📈 KEY FINDINGS:")
        print(f"   • Total articles analyzed: {report.get('total_articles', 0)}")
        print(f"   • API requests used: {report.get('api_usage', {}).get('requests_made', 'N/A')}")

        if report.get('most_mentioned_players'):
            # The first entry of the mapping is reported as the top player
            top_player, top_mentions = next(iter(report['most_mentioned_players'].items()))
            print(f"   • Most mentioned player: {top_player} ({top_mentions} mentions)")

        if report.get('content_types'):
            top_content = max(report['content_types'], key=report['content_types'].get)
            print(f"   • Primary content type: {top_content}")

        print(f"   • Match analysis articles: {report.get('articles_with_results', 0)}")
        print(f"   • Performance statistics: {report.get('articles_with_stats', 0)}")
        print(f"   • Injury updates: {report.get('injury_articles', 0)}")

    # Example 2: Specific Player Analysis (if API usage allows)
    if usage_tracker.check_limits():
        print("\n👤 Analyzing Specific Player Performance...")
        player_articles = rm_performance.get_player_performance("Bellingham", days_back=14, max_articles_per_query=10)

        if player_articles:
            player_df = rm_performance.extract_performance_data(player_articles)
            print(f"   • Found {len(player_df)} articles about Bellingham")

            # Show recent headlines
            for _, row in player_df.head(3).iterrows():
                print(f"   📰 {row['title'][:70]}...")

    # Example 3: Quick Match Results (minimal API usage)
    if usage_tracker.check_limits():
        print("\n⚽ Recent Match Performance...")
        match_articles = rm_performance.get_match_performance(days_back=7, max_articles_per_query=15)

        if match_articles:
            match_df = rm_performance.extract_performance_data(match_articles)
            match_reports = match_df[match_df['content_type'] == 'Match Analysis']

            print(f"   • Found {len(match_reports)} match analysis articles")
            for _, row in match_reports.head(2).iterrows():
                print(f"   🏆 {row['title']}")

    # Example 4: La Liga Weekly Focus (NEW FEATURE)
    if usage_tracker.check_limits():
        print("\n🇪🇸 La Liga Weekly Performance Analysis...")

        # Specialized La Liga queries for weekly reliability
        la_liga_params = {
            'api-key': API_KEY,
            'q': 'Real Madrid AND "La Liga" AND (performance OR result OR standings)',
            'section': 'football',
            'from-date': (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d'),
            'page-size': 10,
            'show-fields': 'headline,byline,body,thumbnail,publication',
            'order-by': 'newest'
        }

        # Best-effort step: any failure (network, payload shape, helper
        # error) is reported and the remaining examples still run
        try:
            print("   🔍 Searching La Liga specific coverage...")
            la_liga_response = requests.get(
                "https://content.guardianapis.com/search",
                params=la_liga_params,
                timeout=request_timeout,
            )
            la_liga_response.raise_for_status()
            # This request bypasses the client class, so count it by hand
            usage_tracker.log_request()

            la_liga_articles = la_liga_response.json()['response']['results']
            filtered_la_liga = rm_performance._filter_real_madrid_content(la_liga_articles)

            if filtered_la_liga:
                la_liga_df = rm_performance.extract_performance_data(filtered_la_liga)
                print(f"   • Found {len(la_liga_df)} La Liga articles")

                # La Liga specific analysis
                la_liga_stats = la_liga_df[la_liga_df['competition'] == 'La Liga']
                print(f"   📊 La Liga tagged articles: {len(la_liga_stats)}")
                print(f"   📈 With performance stats: {la_liga_stats['contains_stats'].sum()}")
                print(f"   ⭐ With player ratings: {la_liga_stats['contains_player_ratings'].sum()}")

                # Recent La Liga headlines
                for _, row in la_liga_df.head(2).iterrows():
                    print(f"   📰 {row['title'][:60]}...")

            time.sleep(rm_performance.rate_limit_delay)

        except Exception as e:
            print(f"   ❌ Error getting La Liga data: {e}")

    # Example 5: Competition Comparison Analysis
    if usage_tracker.check_limits() and report and 'all_articles' in report:
        print("\n🏆 Competition Performance Comparison...")

        if len(report['all_articles']) > 0:
            df = pd.DataFrame(report['all_articles'])

            # Competition breakdown analysis
            competitions = ['La Liga', 'Champions League', 'Copa del Rey', 'General']
            comp_analysis = {}

            for comp in competitions:
                comp_articles = df[df['competition'] == comp]
                if len(comp_articles) > 0:
                    with_stats = comp_articles['contains_stats'].sum()
                    with_ratings = comp_articles['contains_player_ratings'].sum()
                    comp_analysis[comp] = {
                        'total_articles': len(comp_articles),
                        'with_stats': with_stats,
                        'with_ratings': with_ratings,
                        'match_results': comp_articles['contains_match_result'].sum(),
                        # Weighted mix: article volume 0.3, stats 0.4, ratings 0.3
                        'reliability_score': len(comp_articles) * 0.3 + with_stats * 0.4 + with_ratings * 0.3
                    }

            print("   📊 COMPETITION ANALYSIS (7 days):")
            for comp, stats in sorted(comp_analysis.items(), key=lambda x: x[1]['reliability_score'], reverse=True):
                reliability = "HIGH" if stats['reliability_score'] > 5 else "MEDIUM" if stats['reliability_score'] > 2 else "LOW"
                print(f"   🏆 {comp}: {stats['total_articles']} articles | Reliability: {reliability}")
                print(f"      📈 Stats: {stats['with_stats']} | Ratings: {stats['with_ratings']} | Results: {stats['match_results']}")

            # Weekly data reliability insight
            la_liga_count = comp_analysis.get('La Liga', {}).get('total_articles', 0)
            ucl_count = comp_analysis.get('Champions League', {}).get('total_articles', 0)

            if la_liga_count > ucl_count:
                print(f"   ✅ La Liga shows better weekly coverage ({la_liga_count} vs {ucl_count} articles)")
            elif ucl_count > 0:
                print(f"   ⚠️  Champions League active this week ({ucl_count} articles)")
            else:
                print("   📅 Off-season for European competitions - La Liga primary source")

    # Example 6: Player Performance Trends
    if usage_tracker.check_limits():
        print("\n🌟 Key Player Performance Trends...")

        # Focus on current key players
        key_players = ["Bellingham", "Vinícius", "Mbappé", "Modrić", "Courtois"]
        player_mentions = {}

        if report and 'all_articles' in report and len(report['all_articles']) > 0:
            df = pd.DataFrame(report['all_articles'])

            for player in key_players:
                # Number of articles whose mentioned_players contains this player
                mentions = sum(
                    1
                    for _, row in df.iterrows()
                    if player in row.get('mentioned_players', [])
                )
                if mentions > 0:
                    player_mentions[player] = mentions

            if player_mentions:
                sorted_players = sorted(player_mentions.items(), key=lambda x: x[1], reverse=True)
                print("   👑 MOST COVERED PLAYERS (this week):")
                for player, count in sorted_players[:3]:
                    print(f"      🌟 {player}: {count} article mentions")

    # Final comprehensive usage report
    print("\n" + "=" * 60)
    print("📊 FINAL ANALYSIS SUMMARY")
    usage_tracker.print_usage()

    remaining = usage_tracker.session_limit - usage_tracker.requests_made
    print("\n💡 SESSION SUMMARY:")
    print(f"   • Requests remaining: {remaining}")
    print("   • Daily Guardian limit: 12,000 requests")
    print("   • Weekly Guardian limit: 84,000 requests")

    # Performance quality assessment
    if report and 'total_articles' in report:
        total_articles = report['total_articles']
        # max(..., 1) avoids dividing by zero when no request was logged
        efficiency = total_articles / max(usage_tracker.requests_made, 1)

        if total_articles > 25:
            print(f"   ✅ EXCELLENT data collection: {total_articles} articles")
        elif total_articles > 15:
            print(f"   👍 GOOD data collection: {total_articles} articles")
        elif total_articles > 5:
            print(f"   ⚠️  MODERATE data collection: {total_articles} articles")
        else:
            print(f"   ❌ LIMITED data collection: {total_articles} articles")

        print(f"   📈 API efficiency: {efficiency:.1f} articles per request")

        # Recommendations
        if efficiency < 0.5:
            print("   💡 TIP: Try longer time periods or broader search terms")
        elif efficiency > 1.0:
            print("   🎯 OPTIMAL: Great API efficiency achieved!")

    print("\n🏁 Real Madrid Performance Analysis Complete!")
    # save_performance_report() is only called when a report was produced,
    # so only claim saved reports in that case
    if report:
        print("📋 Reports saved with timestamp for future reference")

# Additional utility functions
def quick_la_liga_check(api_key: str, days: int = 3, timeout: float = 10.0):
    """Quick La Liga reliability check.

    Validates the key, runs one Guardian search for
    'Real Madrid AND "La Liga"' in the football section over the last
    ``days`` days, and prints the article count, the per-day average and a
    coverage rating. Nothing is returned.

    Args:
        api_key: Guardian API key; checked with validate_api_key() first.
        days: Size of the look-back window in days. Values below 1 are
            treated as 1 for the per-day average and the rating thresholds.
        timeout: Seconds to wait for the search request; requests applies
            no timeout by default.
    """
    print(f"🇪🇸 QUICK LA LIGA RELIABILITY CHECK ({days} days)")
    print("-" * 45)

    if not validate_api_key(api_key):
        return

    # Keeps the average well-defined for days <= 0 (the original divided by
    # days directly and surfaced "division by zero" as a generic error)
    window = max(days, 1)

    params = {
        'api-key': api_key,
        'q': 'Real Madrid AND "La Liga"',
        'section': 'football',
        'from-date': (datetime.now() - timedelta(days=days)).strftime('%Y-%m-%d'),
        'page-size': 15,
        'show-fields': 'headline,publication',
        'order-by': 'newest'
    }

    try:
        response = requests.get("https://content.guardianapis.com/search", params=params, timeout=timeout)
        response.raise_for_status()
        payload = response.json()['response']
        articles = payload['results']
    except (requests.RequestException, KeyError, TypeError, ValueError) as e:
        # Network failure, HTTP error status, or a payload without the
        # expected 'response' / 'results' structure
        print(f"❌ Error: {e}")
        return

    # 'results' holds a single page of at most 15 items, so counting it caps
    # the figure at 15 and makes the top rating unreachable for longer
    # windows. Use the response's 'total' match count when it is present.
    found = payload.get('total', len(articles))

    print(f"✅ Found {found} La Liga articles in {days} days")
    print(f"📊 Average: {found / window:.1f} articles per day")

    if found > window * 2:
        print("🎯 EXCELLENT weekly La Liga coverage reliability")
    elif found > window:
        print("👍 GOOD weekly La Liga coverage")
    else:
        print("⚠️  LIMITED La Liga coverage this period")

def competition_schedule_info(now: Optional[datetime] = None):
    """Show current competition schedule context.

    Prints the date, whether La Liga and the Champions League are treated
    as active for that month, and a fixed recommendation line. Nothing is
    returned.

    Args:
        now: Moment to describe. Defaults to the current local time; passing
            a value makes the output reproducible.
    """
    # Read the clock once so the printed date and the month used for the
    # branches below always refer to the same moment
    moment = datetime.now() if now is None else now
    month = moment.month

    # Month numbers this script treats as in-season for each competition
    la_liga_months = (8, 9, 10, 11, 12, 1, 2, 3, 4, 5)
    champions_league_months = (9, 10, 11, 12, 2, 3, 4, 5)

    print("\n📅 CURRENT FOOTBALL SEASON CONTEXT:")
    print(f"   📍 Date: {moment.strftime('%B %d, %Y')}")

    if month in la_liga_months:
        print("   🇪🇸 La Liga: ACTIVE SEASON (Weekly matches)")
        if month in champions_league_months:
            print("   🏆 Champions League: ACTIVE (Tournament phase)")
        else:
            print("   🏆 Champions League: Off-season")
    else:
        print("   🏖️  Off-season: Pre-season friendlies only")

    print("   💡 RECOMMENDATION: Use La Liga data for weekly analysis")

if __name__ == "__main__":
    # Season context first, then the full analysis run
    competition_schedule_info()
    main()

    # Closing hints for helpers that can be called by hand.
    # NOTE(review): API_KEY is a local of main(), and the two demo_* helpers
    # are not defined in this part of the file - confirm they exist.
    print("\n" + "=" * 60)
    for hint in (
        "🔧 OPTIONAL QUICK CHECKS:",
        "1. quick_la_liga_check(API_KEY, days=3)",
        "2. demo_pagination_control()",
        "3. demo_weekly_analysis()",
    ):
        print(hint)