# Soccer Intelligence System - Data Collection Demo

This notebook demonstrates the data collection capabilities of the Soccer Performance Intelligence System.

## Features Demonstrated:
- API-Football data collection
- Social media data collection
- Wikipedia data extraction
- Data caching and management

In [None]:
# Import required libraries
import os
import sys
from datetime import datetime, timedelta

# Make the project's src/ directory importable before loading our own modules
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import our modules
from soccer_intelligence.data_collection import (
    APIFootballClient,
    CacheManager,
    SocialMediaCollector,
    WikipediaCollector,
)
from soccer_intelligence.utils import Config, setup_logger

# Create the demo's logger, then print the title banner with its underline
logger = setup_logger('data_collection_demo')
print("Soccer Intelligence System - Data Collection Demo", "=" * 50, sep="\n")

## 1. API-Football Data Collection

Collect comprehensive soccer data from API-Football including teams, matches, and player statistics.

In [None]:
# Initialize API-Football client
# Note: You need to set up your API key in config/api_keys.yaml
api_client = APIFootballClient()

# Report whether a key was picked up so later request failures are easier to diagnose
if api_client.api_key:
    print("✅ API-Football client initialized successfully")
else:
    print("⚠️  API-Football key not configured. Please set up config/api_keys.yaml")
    print("Demo will use cached data if available.")

In [None]:
# Collect La Liga data for the 2023 season
LA_LIGA_ID = 140
SEASON = 2023

# Start from empty results so later cells can still reference these names
# (and keep whatever was already fetched) if a request fails part-way through.
teams, matches, standings = [], [], None

print(f"Collecting La Liga data for {SEASON} season...")

try:
    # Get teams
    teams = api_client.get_teams(LA_LIGA_ID, SEASON)
    print(f"✅ Collected {len(teams)} teams")

    # Get matches for the league and season
    matches = api_client.get_matches(LA_LIGA_ID, SEASON)
    print(f"✅ Collected {len(matches)} matches")

    # Get standings
    standings = api_client.get_standings(LA_LIGA_ID, SEASON)
    print("✅ Collected standings data")

    # Display the first few teams as a sanity check
    if teams:
        print("\nSample teams:")
        for position, team in enumerate(teams[:5], start=1):
            team_info = team.get('team', {})
            print(f"  {position}. {team_info.get('name', 'Unknown')} (ID: {team_info.get('id', 'N/A')})")

except Exception as e:
    # Deliberately broad: the demo should keep running when the API is unavailable
    print(f"❌ Error collecting API data: {e}")
    print("This might be due to API key configuration or rate limits.")

## 2. Social Media Data Collection

Collect social media mentions and sentiment data for teams and players.

In [None]:
# Initialize social media collector
social_collector = SocialMediaCollector()

# Defined before the branch so later cells can read it even when Twitter is not configured
social_data = {}

if not social_collector.twitter_client:
    print("⚠️  Twitter API not configured. Social media collection disabled.")
    print("To enable: Set up Twitter bearer token in config/api_keys.yaml")
else:
    print("✅ Social media collector initialized")

    # Collect mentions for popular teams
    teams_to_analyze = ['Real Madrid', 'Barcelona', 'Manchester United']

    for team in teams_to_analyze:
        print(f"\nCollecting social media data for {team}...")
        try:
            mentions = social_collector.get_team_mentions(team, max_results=20)
            social_data[team] = mentions
            print(f"✅ Collected {len(mentions)} mentions for {team}")
        except Exception as e:
            # Keep going: one failing team should not stop the others
            print(f"❌ Error collecting data for {team}: {e}")

    # Display the first mention collected for each team
    if social_data:
        print("\nSample social media mentions:")
        for team, mentions in social_data.items():
            if not mentions:
                continue
            first_mention = mentions[0]
            # `or` also covers a key that is present but None, which cannot be sliced
            mention_text = first_mention.get('text') or 'N/A'
            print(f"\n{team} - Recent mention:")
            print(f"  Text: {mention_text[:100]}...")
            print(f"  Created: {first_mention.get('created_at', 'N/A')}")

## 3. Wikipedia Data Collection

Collect historical and contextual information from Wikipedia.

In [None]:
# Initialize Wikipedia collector
wiki_collector = WikipediaCollector()
print("✅ Wikipedia collector initialized")

# Collect information for famous players and teams
players_to_analyze = ['Lionel Messi', 'Cristiano Ronaldo', 'Kylian Mbappe']
teams_to_analyze = ['Real Madrid', 'Barcelona', 'Manchester United']

wiki_data = {'players': {}, 'teams': {}}

# One entry per category: (key in wiki_data, names to look up, collector method to call).
# Players and teams follow the same collect/report steps, so a single loop handles both.
wiki_targets = [
    ('players', players_to_analyze, wiki_collector.get_player_info),
    ('teams', teams_to_analyze, wiki_collector.get_team_info),
]

for category, names, fetch_info in wiki_targets:
    print(f"\nCollecting Wikipedia data for {category}...")
    for name in names:
        try:
            info = fetch_info(name)
        except Exception as e:
            # Keep going: one failing lookup should not stop the others
            print(f"❌ Error collecting data for {name}: {e}")
            continue

        if info:
            wiki_data[category][name] = info
            print(f"✅ Collected Wikipedia data for {name}")
        else:
            print(f"⚠️  No Wikipedia data found for {name}")

In [None]:
# Display sample Wikipedia data
# One entry per category: (key in wiki_data, noun used in the heading, (label, key) pairs to print)
wiki_sections = [
    ('players', 'player', [('Birth Date', 'birth_date'), ('Position', 'position'), ('Nationality', 'nationality')]),
    ('teams', 'team', [('Founded', 'founded'), ('Stadium', 'stadium'), ('League', 'league')]),
]

for category, noun, fields in wiki_sections:
    entries = wiki_data[category]
    if not entries:
        continue

    print(f"\nSample Wikipedia {noun} data:")
    # Show at most two entries per category to keep the output short
    for name, info in list(entries.items())[:2]:
        print(f"\n{name}:")
        for label, key in fields:
            print(f"  {label}: {info.get(key, 'N/A')}")
        # `or` also covers a summary that is present but None, which cannot be sliced
        summary = info.get('summary') or 'N/A'
        print(f"  Summary: {summary[:150]}...")

## 4. Data Caching and Management

Demonstrate the caching system that prevents repeated API calls.

In [None]:
# Check cache information (CacheManager is imported with the other modules in the first cell)
cache_manager = CacheManager()
cache_info = cache_manager.get_cache_info()

print("Cache Information:")
print(f"  Cache Directory: {cache_info['cache_directory']}")
print(f"  Total Files: {cache_info['total_files']}")
print(f"  Total Size: {cache_info['total_size_mb']} MB")
print(f"  Cache Duration: {cache_info['cache_duration_hours']} hours")

# List up to five cache entries with their validity flag
if cache_info['files']:
    print("\nRecent cache files:")
    for file_info in cache_info['files'][:5]:
        status = "✅ Valid" if file_info['is_valid'] else "❌ Expired"
        print(f"  {file_info['filename']}: {file_info['size_bytes']} bytes - {status}")

## 5. Data Collection Summary

Summary of collected data and next steps.

In [None]:
def safe_len(value):
    """Return len(value), or 0 when value is None or has no length."""
    try:
        return len(value)
    except TypeError:
        return 0


print("Data Collection Summary")
print("=" * 30)

# Look up earlier cells' results by name so this cell still runs (reporting zeros)
# when a collection cell was skipped or failed, instead of hiding errors behind
# a bare `except`.
_namespace = globals()
_social = _namespace.get('social_data') or {}
_wiki = _namespace.get('wiki_data') or {}
_cache = _namespace.get('cache_info') or {}

# Count collected data; each source is counted independently, so a partial
# collection (e.g. teams fetched but matches failed) is still reported.
api_data_count = safe_len(_namespace.get('teams')) + safe_len(_namespace.get('matches'))
social_data_count = sum(safe_len(mentions) for mentions in _social.values())
wiki_data_count = safe_len(_wiki.get('players')) + safe_len(_wiki.get('teams'))

print(f"📊 API-Football Records: {api_data_count}")
print(f"🐦 Social Media Mentions: {social_data_count}")
print(f"📖 Wikipedia Articles: {wiki_data_count}")
print(f"💾 Cache Files: {_cache.get('total_files', 0)}")

print("\nNext Steps:")
print("1. Run 02_data_processing.ipynb to clean and process the collected data")
print("2. Run 03_shapley_analysis.ipynb for player contribution analysis")
print("3. Run 04_rag_system.ipynb to set up the query system")
print("4. Run 05_tactical_insights.ipynb for comprehensive analysis")

print("\n✅ Data collection demo completed!")