In [9]:
import requests
from bs4 import BeautifulSoup
import time
import json
import pandas as pd
from datetime import datetime, timedelta
import schedule
import threading
from typing import Dict, List, Optional
import re
import logging
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service

# Logging beállítása
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [16]:
class SofaScoreScraper:
    def __init__(self, headless: bool = True):
        self.headless = headless
        self.driver = None
        self.session = requests.Session()
        self.base_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }
        self.session.headers.update(self.base_headers)
        
    def setup_selenium(self):
        """Selenium WebDriver beállítása"""
        try:
            chrome_options = Options()
            if self.headless:
                chrome_options.add_argument("--headless")
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--window-size=1920,1080")
            chrome_options.add_argument(f"--user-agent={self.base_headers['User-Agent']}")
            
            # Felhő környezethez optimalizált beállítások
            chrome_options.add_argument("--disable-extensions")
            chrome_options.add_argument("--disable-plugins")
            chrome_options.add_argument("--disable-images")
            
            service = Service(ChromeDriverManager().install())
            self.driver = webdriver.Chrome(service=service, options=chrome_options)
            logger.info("Selenium WebDriver successfully initialized")
            return True
        except Exception as e:
            logger.error(f"Failed to setup Selenium: {e}")
            return False
    
    def extract_match_id_from_url(self, url: str) -> Optional[str]:
        """Match ID kinyerése az URL-ből"""
        try:
            # SofaScore URL pattern: /match/team1-team2/abc#id:12345,tab:statistics
            if '#id:' in url:
                match_id = url.split('#id:')[1].split(',')[0]
                return match_id
            return None
        except Exception as e:
            logger.error(f"Error extracting match ID: {e}")
            return None
    
    def scrape_with_requests(self, url: str) -> Optional[Dict]:
        """Próbálkozás requests-tel (gyorsabb, de nem mindig működik)"""
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.content, 'lxml')
            return self.parse_statistics(soup)
        except Exception as e:
            logger.warning(f"Requests method failed: {e}")
            return None
    
    def scrape_with_selenium(self, url: str) -> Optional[Dict]:
        """Selenium használata dinamikus tartalom betöltéséhez"""
        try:
            if not self.driver and not self.setup_selenium():
                return None
            
            self.driver.get(url)
            
            # Várunk, hogy betöltődjön az oldal
            wait = WebDriverWait(self.driver, 20)
            
            # Több lehetséges selector próbálása
            selectors_to_try = [
                ".pt_sm.bdr-b_lg.ov_hidden",
                "[data-testid='statistics']",
                ".statistics",
                "[class*='statistics']",
                "[class*='stat']"
            ]
            
            stats_container = None
            for selector in selectors_to_try:
                try:
                    stats_container = wait.until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, selector))
                    )
                    logger.info(f"Found stats container with selector: {selector}")
                    break
                except:
                    continue
            
            if not stats_container:
                logger.warning("No statistics container found with any selector")
                # Próbáljuk meg a teljes oldal parsing-ot
                time.sleep(5)  # További várakozás
            else:
                time.sleep(3)  # Statisztikák betöltődésére várakozás
            
            # HTML tartalom lekérése és debug info
            page_source = self.driver.page_source
            
            # Debug: megnézzük mi van az oldalon
            soup = BeautifulSoup(page_source, 'lxml')
            logger.info(f"Page title: {soup.title.string if soup.title else 'No title'}")
            
            # Keresünk bármilyen div-et ami statisztikára utalhat
            potential_stats = soup.find_all('div', string=re.compile(r'xG|Expected|Goals|Shots|Possession', re.I))
            logger.info(f"Found {len(potential_stats)} potential stat elements")
            
            return self.parse_statistics(soup)
            
        except Exception as e:
            logger.error(f"Selenium scraping failed: {e}")
            return None
    
    def parse_statistics(self, soup: BeautifulSoup) -> Dict:
        """Statisztikák kinyerése a HTML-ből - javított verzió több selector-ral"""
        stats = {
            'timestamp': datetime.now().isoformat(),
            'team_stats': {},
            'xg_data': {},
            'raw_stats': []
        }
        
        try:
            # Debug: nézzük meg mi van az oldalon
            logger.info("Starting to parse statistics...")
            
            # Több lehetséges statisztika konténer keresése
            container_selectors = [
                'div.pt_sm.bdr-b_lg.ov_hidden',
                '[data-testid*="stat"]',
                '.statistics-container',
                '[class*="statistics"]',
                '[class*="stat-row"]',
                'div[class*="flex"]'  # SofaScore gyakran flexbox-ot használ
            ]
            
            stats_containers = []
            for selector in container_selectors:
                containers = soup.select(selector)
                if containers:
                    stats_containers.extend(containers)
                    logger.info(f"Found {len(containers)} containers with selector: {selector}")
            
            # Ha nincs specifikus konténer, keresünk az egész oldalon
            if not stats_containers:
                logger.info("No specific containers found, searching entire page...")
                stats_containers = [soup]
            
            # Statisztikák keresése különböző módszerekkel
            for container in stats_containers:
                # 1. módszer: Flexbox alapú keresés
                self._parse_flexbox_stats(container, stats)
                
                # 2. módszer: Szöveg alapú keresés
                self._parse_text_based_stats(container, stats)
                
                # 3. módszer: Table-szerű struktúra
                self._parse_table_like_stats(container, stats)
            
            # xG specifikus keresés
            self._extract_xg_data(soup, stats)
            
            # Csapat nevek kinyerése
            team_names = self.extract_team_names(soup)
            if team_names:
                stats['teams'] = team_names
            
            logger.info(f"Successfully parsed {len(stats['raw_stats'])} statistics")
            
            # Debug: logoljuk az első pár statisztikát
            for i, stat in enumerate(stats['raw_stats'][:3]):
                logger.info(f"Stat {i+1}: {stat}")
            
            return stats
            
        except Exception as e:
            logger.error(f"Error parsing statistics: {e}")
            return stats
    
    def _parse_flexbox_stats(self, container, stats):
        """Flexbox alapú statisztikák parsing"""
        try:
            # Keresünk sorokra osztott statisztikákat
            rows = container.find_all('div', class_=re.compile(r'.*flex.*|.*row.*'))
            
            for row in rows:
                text_elements = row.find_all(string=True)
                text_content = [t.strip() for t in text_elements if t.strip()]
                
                # Ha 3 elemet találunk: érték1, név, érték2
                if len(text_content) >= 3:
                    # Keresünk számokat a szövegben
                    numbers = [t for t in text_content if re.match(r'^[\d.,]+%?$', t)]
                    stat_names = [t for t in text_content if not re.match(r'^[\d.,]+%?$', t) and len(t) > 1]
                    
                    if len(numbers) >= 2 and stat_names:
                        stat_name = stat_names[0]
                        stats['raw_stats'].append({
                            'stat_name': stat_name,
                            'home_value': numbers[0],
                            'away_value': numbers[-1]
                        })
        except Exception as e:
            logger.debug(f"Error in flexbox parsing: {e}")
    
    def _parse_text_based_stats(self, container, stats):
        """Szöveg alapú statisztika keresés"""
        try:
            # Keressük az összes szöveget ami statisztikára utalhat
            stat_patterns = [
                r'(\d+[\.,]?\d*)\s*(xG|Expected goals?|Goals?|Shots?|Possession|Pass|Corner|Yellow|Red)',
                r'(xG|Expected goals?|Goals?|Shots?|Possession|Pass|Corner|Yellow|Red)\s*(\d+[\.,]?\d*)',
                r'(\d+[\.,]?\d*%?)\s*([A-Za-z\s]+)\s*(\d+[\.,]?\d*%?)'
            ]
            
            page_text = container.get_text()
            
            for pattern in stat_patterns:
                matches = re.finditer(pattern, page_text, re.IGNORECASE)
                for match in matches:
                    groups = match.groups()
                    if len(groups) >= 2:
                        # Próbáljuk meghatározni melyik a név és melyik az érték
                        if groups[1].replace(' ', '').replace('.', '').replace(',', '').isdigit():
                            stat_name = groups[0]
                            value = groups[1]
                        else:
                            stat_name = groups[1]
                            value = groups[0]
                        
                        if stat_name and value:
                            stats['raw_stats'].append({
                                'stat_name': stat_name,
                                'home_value': value,
                                'away_value': 'N/A'
                            })
        except Exception as e:
            logger.debug(f"Error in text-based parsing: {e}")
    
    def _parse_table_like_stats(self, container, stats):
        """Table-szerű struktúrák parsing"""
        try:
            # Keresünk tr, td elemeket
            rows = container.find_all(['tr', 'div'])
            
            for row in rows:
                cells = row.find_all(['td', 'div', 'span'])
                if len(cells) >= 3:
                    cell_texts = [cell.get_text(strip=True) for cell in cells]
                    
                    # Szűrjük ki az üres cellákat
                    cell_texts = [text for text in cell_texts if text]
                    
                    if len(cell_texts) >= 3:
                        # Általában: érték1, statisztika_név, érték2
                        potential_left = cell_texts[0]
                        potential_name = cell_texts[1]
                        potential_right = cell_texts[-1]
                        
                        # Ellenőrizzük hogy van-e számérték
                        if (re.match(r'^[\d.,]+%?$', potential_left) and 
                            re.match(r'^[\d.,]+%?$', potential_right) and
                            not re.match(r'^[\d.,]+%?$', potential_name)):
                            
                            stats['raw_stats'].append({
                                'stat_name': potential_name,
                                'home_value': potential_left,
                                'away_value': potential_right
                            })
        except Exception as e:
            logger.debug(f"Error in table-like parsing: {e}")
    
    def _extract_xg_data(self, soup, stats):
        """xG adatok specifikus kinyerése"""
        try:
            # xG keresése különböző módokon
            xg_patterns = [
                r'xG[:\s]*(\d+[\.,]?\d*)',
                r'Expected\s+goals?[:\s]*(\d+[\.,]?\d*)',
                r'(\d+[\.,]?\d*)\s*xG'
            ]
            
            page_text = soup.get_text()
            
            xg_values = []
            for pattern in xg_patterns:
                matches = re.findall(pattern, page_text, re.IGNORECASE)
                xg_values.extend(matches)
            
            if len(xg_values) >= 2:
                stats['xg_data'] = {
                    'home_xg': xg_values[0],
                    'away_xg': xg_values[1],
                    'stat_name': 'xG'
                }
                logger.info(f"Found xG data: {stats['xg_data']}")
            elif len(xg_values) == 1:
                stats['xg_data'] = {
                    'home_xg': xg_values[0],
                    'away_xg': 'N/A',
                    'stat_name': 'xG'
                }
            
        except Exception as e:
            logger.debug(f"Error extracting xG data: {e}")


    def extract_team_names(self, soup: BeautifulSoup) -> Optional[Dict]:
        """Csapat nevek kinyerése"""
        try:
            # Különböző selectorok próbálása
            selectors = [
                'h1[data-testid="match-header-team-name"]',
                '.team-name',
                '[class*="team"][class*="name"]',
                'h1, h2, h3'  # Fallback
            ]
            
            for selector in selectors:
                elements = soup.select(selector)
                if len(elements) >= 2:
                    return {
                        'home_team': elements[0].get_text(strip=True),
                        'away_team': elements[1].get_text(strip=True)
                    }
            
            return None
        except Exception as e:
            logger.error(f"Error extracting team names: {e}")
            return None
    
    def scrape_match_stats(self, url: str) -> Optional[Dict]:
        """Fő scraping függvény - először requests, majd Selenium"""
        logger.info(f"Starting to scrape: {url}")
        
        # Először próbálkozás requests-tel
        stats = self.scrape_with_requests(url)
        
        # Ha nem sikerült, Selenium használata
        if not stats or len(stats.get('raw_stats', [])) == 0:
            logger.info("Trying with Selenium...")
            stats = self.scrape_with_selenium(url)
        
        return stats
    
    def cleanup(self):
        """Erőforrások felszabadítása"""
        if self.driver:
            self.driver.quit()
            self.driver = None

In [17]:
class MatchScheduler:
    def __init__(self, scraper: SofaScoreScraper):
        self.scraper = scraper
        self.results = []
        self.running = False
        self.thread = None
        
    def add_match(self, url: str, interval_minutes: int = 5):
        """Mérkőzés hozzáadása az ütemezéshez"""
        def job():
            try:
                stats = self.scraper.scrape_match_stats(url)
                if stats:
                    stats['url'] = url
                    self.results.append(stats)
                    logger.info(f"Successfully scraped match data. Total results: {len(self.results)}")
                else:
                    logger.warning("No stats retrieved")
            except Exception as e:
                logger.error(f"Error in scheduled job: {e}")
        
        schedule.every(interval_minutes).minutes.do(job)
        logger.info(f"Scheduled match scraping every {interval_minutes} minutes for: {url}")
        
        # Első futtatás azonnal
        job()
    
    def start_scheduler(self):
        """Ütemező indítása külön szálon"""
        def run_scheduler():
            self.running = True
            while self.running:
                schedule.run_pending()
                time.sleep(1)
        
        self.thread = threading.Thread(target=run_scheduler, daemon=True)
        self.thread.start()
        logger.info("Scheduler started")
    
    def stop_scheduler(self):
        """Ütemező leállítása"""
        self.running = False
        schedule.clear()
        logger.info("Scheduler stopped")
    
    def get_latest_stats(self) -> Optional[Dict]:
        """Legfrissebb statisztikák lekérése"""
        if self.results:
            return self.results[-1]
        return None
    
    def save_results_to_csv(self, filename: str = None):
        """Eredmények mentése CSV-be"""
        if not self.results:
            logger.warning("No results to save")
            return
        
        if not filename:
            filename = f"sofascore_stats_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        
        # DataFrame készítése
        rows = []
        for result in self.results:
            base_row = {
                'timestamp': result.get('timestamp'),
                'url': result.get('url'),
                'home_team': result.get('teams', {}).get('home_team', 'Unknown'),
                'away_team': result.get('teams', {}).get('away_team', 'Unknown'),
                'home_xg': result.get('xg_data', {}).get('home_xg', 'N/A'),
                'away_xg': result.get('xg_data', {}).get('away_xg', 'N/A')
            }
            
            # Egyéb statisztikák hozzáadása
            for stat in result.get('raw_stats', []):
                base_row[f"home_{stat['stat_name']}"] = stat['home_value']
                base_row[f"away_{stat['stat_name']}"] = stat['away_value']
            
            rows.append(base_row)
        
        df = pd.DataFrame(rows)
        df.to_csv(filename, index=False)
        logger.info(f"Results saved to {filename}")
        return filename

In [18]:
# Használati példa és tesztelés

# Scraper inicializálás
scraper = SofaScoreScraper(headless=True)  # headless=False a teszteléshez

# Tesztelés egyedi URL-lel
test_url = "https://www.sofascore.com/football/match/1-fc-koln-leicester-city/Gswdb#id:14250691,tab:statistics"

print("🏈 Testing single match scraping...")
stats = scraper.scrape_match_stats(test_url)

if stats:
    print(f"✅ Successfully scraped statistics!")
    print(f"📊 Found {len(stats.get('raw_stats', []))} statistics")
    
    if stats.get('xg_data'):
        print(f"⚽ xG found: {stats['xg_data']}")
    else:
        print("❌ No xG data found")
    
    xg_stats = [s for s in stats['raw_stats'] if 'xG' in s['stat_name'] or 'Expected' in s['stat_name']]
    print(f"🎯 xG related stats: {xg_stats}")

    if stats.get('teams'):
        print(f"🏟️ Teams: {stats['teams']}")
    
    # Első néhány statisztika megjelenítése
    print("\n📈 Sample statistics:")
    for stat in stats.get('raw_stats', [])[:5]:
        print(f"  {stat['stat_name']}: {stat['home_value']} - {stat['away_value']}")
else:
    print("❌ Failed to scrape statistics")

2025-07-25 18:18:42,563 - INFO - Starting to scrape: https://www.sofascore.com/football/match/1-fc-koln-leicester-city/Gswdb#id:14250691,tab:statistics
2025-07-25 18:18:42,651 - INFO - Trying with Selenium...


🏈 Testing single match scraping...


2025-07-25 18:18:43,610 - INFO - Get LATEST chromedriver version for google-chrome
2025-07-25 18:18:43,700 - INFO - Get LATEST chromedriver version for google-chrome
2025-07-25 18:18:43,765 - INFO - Driver [C:\Users\Adam\.wdm\drivers\chromedriver\win64\138.0.7204.168\chromedriver-win32/chromedriver.exe] found in cache
2025-07-25 18:18:44,824 - INFO - Selenium WebDriver successfully initialized
2025-07-25 18:18:46,963 - INFO - Found stats container with selector: .pt_sm.bdr-b_lg.ov_hidden
2025-07-25 18:18:50,248 - INFO - Page title: 1. FC Köln vs Leicester City live score, H2H and lineups | Sofascore
2025-07-25 18:18:50,256 - INFO - Found 6 potential stat elements
2025-07-25 18:18:50,257 - INFO - Starting to parse statistics...
2025-07-25 18:18:50,283 - INFO - Found 1 containers with selector: div.pt_sm.bdr-b_lg.ov_hidden
2025-07-25 18:18:50,329 - INFO - Found 1 containers with selector: [data-testid*="stat"]
2025-07-25 18:18:50,509 - INFO - Found 253 containers with selector: div[class

✅ Successfully scraped statistics!
📊 Found 354 statistics
🏟️ Teams: {'home_team': '1. FC KölnvsLeicester Citylive score, H2H results, standings and prediction', 'away_team': 'Lineups'}

📈 Sample statistics:
  Match overview: 48% - 0
  Ball possession: 48% - 52%
  Corner kicks: 2 - 3
  Fouls: 9 - 6
  Free kicks: 6 - 9


In [None]:
# Ütemezett scraping példa

print("\n🕐 Setting up scheduled scraping...")
scheduler = MatchScheduler(scraper)

# Mérkőzés hozzáadása 2 perces intervallumal (teszteléshez)
scheduler.add_match(test_url, interval_minutes=1)

# Ütemező indítása
scheduler.start_scheduler()

print("⏰ Scheduler started! It will scrape every 1 minutes.")
print("💡 Run the next cell to check results after a few minutes...")

2025-07-25 18:23:48,875 - INFO - Scheduled match scraping every 1 minutes for: https://www.sofascore.com/football/match/1-fc-koln-leicester-city/Gswdb#id:14250691,tab:statistics
2025-07-25 18:23:48,876 - INFO - Starting to scrape: https://www.sofascore.com/football/match/1-fc-koln-leicester-city/Gswdb#id:14250691,tab:statistics
2025-07-25 18:23:48,964 - INFO - Trying with Selenium...
2025-07-25 18:23:48,989 - INFO - Found stats container with selector: .pt_sm.bdr-b_lg.ov_hidden



🕐 Setting up scheduled scraping...


2025-07-25 18:23:52,272 - INFO - Page title: 1. FC Köln vs Leicester City live score, H2H and lineups | Sofascore
2025-07-25 18:23:52,280 - INFO - Found 6 potential stat elements
2025-07-25 18:23:52,283 - INFO - Starting to parse statistics...
2025-07-25 18:23:52,320 - INFO - Found 1 containers with selector: div.pt_sm.bdr-b_lg.ov_hidden
2025-07-25 18:23:52,366 - INFO - Found 1 containers with selector: [data-testid*="stat"]
2025-07-25 18:23:52,542 - INFO - Found 253 containers with selector: div[class*="flex"]
2025-07-25 18:23:52,855 - INFO - Successfully parsed 354 statistics
2025-07-25 18:23:52,856 - INFO - Stat 1: {'stat_name': 'Match overview', 'home_value': '48%', 'away_value': '0'}
2025-07-25 18:23:52,857 - INFO - Stat 2: {'stat_name': 'Ball possession', 'home_value': '48%', 'away_value': '52%'}
2025-07-25 18:23:52,857 - INFO - Stat 3: {'stat_name': 'Corner kicks', 'home_value': '2', 'away_value': '3'}
2025-07-25 18:23:52,858 - INFO - Successfully scraped match data. Total resul

⏰ Scheduler started! It will scrape every 1 minutes.
💡 Run the next cell to check results after a few minutes...


2025-07-25 18:24:48,888 - INFO - Starting to scrape: https://www.sofascore.com/football/match/1-fc-koln-leicester-city/Gswdb#id:14250691,tab:statistics
2025-07-25 18:24:48,985 - INFO - Trying with Selenium...
2025-07-25 18:24:49,020 - INFO - Found stats container with selector: .pt_sm.bdr-b_lg.ov_hidden
2025-07-25 18:24:52,687 - INFO - Page title: 1. FC Köln vs Leicester City live score, H2H and lineups | Sofascore
2025-07-25 18:24:52,696 - INFO - Found 6 potential stat elements
2025-07-25 18:24:52,697 - INFO - Starting to parse statistics...
2025-07-25 18:24:52,726 - INFO - Found 1 containers with selector: div.pt_sm.bdr-b_lg.ov_hidden
2025-07-25 18:24:52,778 - INFO - Found 1 containers with selector: [data-testid*="stat"]
2025-07-25 18:24:52,971 - INFO - Found 253 containers with selector: div[class*="flex"]
2025-07-25 18:24:53,296 - INFO - Successfully parsed 354 statistics
2025-07-25 18:24:53,297 - INFO - Stat 1: {'stat_name': 'Match overview', 'home_value': '49%', 'away_value': '0

In [21]:
# Eredmények ellenőrzése és mentése

print("📊 Current results:")
print(f"Total scraping results: {len(scheduler.results)}")

if scheduler.results:
    latest = scheduler.get_latest_stats()
    if latest:
        print(f"\n🕐 Latest timestamp: {latest.get('timestamp')}")
        if latest.get('xg_data'):
            print(f"⚽ Latest xG: {latest['xg_data']}")
    
    # CSV mentés
    csv_file = scheduler.save_results_to_csv()
    print(f"💾 Results saved to: {csv_file}")
    
    # Egyszerű DataFrame megjelenítés
    if len(scheduler.results) > 0:
        sample_data = []
        for result in scheduler.results[-3:]:  # Utolsó 3 eredmény
            sample_data.append({
                'Time': result.get('timestamp', '')[:19],  # Dátum/idő rövidítve
                'Home xG': result.get('xg_data', {}).get('home_xg', 'N/A'),
                'Away xG': result.get('xg_data', {}).get('away_xg', 'N/A'),
                'Stats Count': len(result.get('raw_stats', []))
            })
        
        df_sample = pd.DataFrame(sample_data)
        print("\n📋 Sample results:")
        print(df_sample.to_string(index=False))

2025-07-25 18:25:10,616 - INFO - Results saved to sofascore_stats_20250725_182510.csv


📊 Current results:
Total scraping results: 3

🕐 Latest timestamp: 2025-07-25T18:24:53.347017
💾 Results saved to: sofascore_stats_20250725_182510.csv

📋 Sample results:
               Time Home xG Away xG  Stats Count
2025-07-25T18:23:52     N/A     N/A          354
2025-07-25T18:24:52     N/A     N/A          354
2025-07-25T18:24:53     N/A     N/A          354


In [6]:
# Felhő deployment segédkód

"""
🌐 FELHŐ DEPLOYMENT OPCIÓK:

1. GOOGLE COLAB:
   - Ingyenes GPU/TPU access
   - Jupyter notebook környezet
   - Korlátozott futási idő (12-24 óra)

2. KAGGLE KERNELS:
   - Heti 30 óra GPU idő
   - Internet access korlátozott
   - Notebook környezet

3. GITHUB ACTIONS (Ingyenes tier):
   - Cron job alapú ütemezés
   - 2000 perc/hó limit
   - Eredmények GitHub-ra mentése

4. RAILWAY/RENDER (Ingyenes tier):
   - 24/7 futás
   - Kis resource limit
   - Web app formában

GitHub Actions workflow példa (.github/workflows/scraper.yml):

name: SofaScore Scraper
on:
  schedule:
    - cron: '*/30 * * * *'  # 30 percenként
  workflow_dispatch:

jobs:
  scrape:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v2
    - name: Set up Python
      uses: actions/setup-python@v2
      with:
        python-version: 3.9
    - name: Install dependencies
      run: |
        pip install requests beautifulsoup4 selenium pandas
        sudo apt-get update
        sudo apt-get install -y chromium-browser
    - name: Run scraper
      run: python scraper.py
    - name: Commit results
      run: |
        git config --local user.email "action@github.com"
        git config --local user.name "GitHub Action"
        git add *.csv
        git commit -m "Update scraping results" || exit 0
        git push
"""

print("🚀 Deployment options printed above!")

🚀 Deployment options printed above!


In [22]:
# Cleanup és leállítás

print("🧹 Cleaning up...")
scheduler.stop_scheduler()
scraper.cleanup()
print("✅ Cleanup completed!")

2025-07-25 18:27:13,484 - INFO - Scheduler stopped


🧹 Cleaning up...
✅ Cleanup completed!


# ## 📝 Használati útmutató
# 
# ### Alapvető használat:
# 1. Futtasd le a csomagtelepítést és importokat
# 2. Állítsd be a `test_url` változót a kívánt SofaScore mérkőzés URL-jével
# 3. Futtasd le a tesztelő cellát
# 4. Indítsd el az ütemezett scrapinget
# 
# ### Testreszabási lehetőségek:
# - `interval_minutes`: Scraping gyakoriság módosítása
# - `headless`: False értékkel látható böngésző ablak
# - `save_results_to_csv()`: Automatikus mentés beállítása
# 
# ### Hibaelhárítás:
# - Ha nem működik a requests módszer, a Selenium automatikusan átveszi
# - Chrome driver automatikusan települ a webdriver-manager segítségével
# - Részletes logok a hibák nyomon követéséhez
# 
# ### Teljesítmény optimalizálás:
# - Headless mód gyorsabb futáshoz
# - Requests először próbálkozik (gyorsabb)
# - Selenium csak szükség esetén (megbízhatóbb)