# ⚡ Sofascore Future Matches Scraper v1

**Extract upcoming match data with:**
- 🔮 Scheduled matches (next 7 days)
- 🌍 10+ major world leagues
- 💰 Pre-match odds (1X2, BTTS, O/U)
- 📊 Team form & current streaks
- 🆚 H2H historical records

In [None]:
!pip install tls_client pandas numpy -q
print("Dependencies installed!")

In [None]:
#@title Configuration { run: "auto" }
#@markdown ### Leagues:
Premier_League = True #@param {type:"boolean"}
La_Liga = True #@param {type:"boolean"}
Bundesliga = True #@param {type:"boolean"}
Serie_A = True #@param {type:"boolean"}
Ligue_1 = True #@param {type:"boolean"}
Liga_MX = True #@param {type:"boolean"}
Eredivisie = False #@param {type:"boolean"}
Primeira_Liga = False #@param {type:"boolean"}
MLS = False #@param {type:"boolean"}
Brazilian_Serie_A = False #@param {type:"boolean"}

#@markdown ### Options:
Days_Ahead = 7 #@param {type:"slider", min:1, max:14, step:1}
Parallel_Workers = 5 #@param {type:"slider", min:2, max:10, step:1}
Include_Odds = True #@param {type:"boolean"}
Include_Form = True #@param {type:"boolean"}
Include_H2H = True #@param {type:"boolean"}

# Display name -> tournament id used in the API paths, plus the on/off flag
# taken from the form fields above.
# NOTE(review): the numeric ids are presumably Sofascore "unique tournament"
# ids — confirm against the API before adding new leagues.
LEAGUES = {
    'Premier League': {'id': 17, 'enabled': Premier_League},
    'La Liga': {'id': 8, 'enabled': La_Liga},
    'Bundesliga': {'id': 35, 'enabled': Bundesliga},
    'Serie A': {'id': 23, 'enabled': Serie_A},
    'Ligue 1': {'id': 34, 'enabled': Ligue_1},
    'Liga MX': {'id': 11621, 'enabled': Liga_MX},
    'Eredivisie': {'id': 37, 'enabled': Eredivisie},
    'Primeira Liga': {'id': 238, 'enabled': Primeira_Liga},
    'MLS': {'id': 242, 'enabled': MLS},
    'Brazilian Serie A': {'id': 325, 'enabled': Brazilian_Serie_A},
}

# Only the leagues ticked in the form are scraped by the run cell.
selected = {k: v for k, v in LEAGUES.items() if v['enabled']}
print(f"Selected: {len(selected)} leagues | {Days_Ahead} days ahead | {Parallel_Workers} workers")

In [None]:
import time
import random
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
from tls_client import Session
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Round-robin pool of TLS sessions, safe to share between worker threads
class SessionPool:
    """Hand out pre-built sessions in rotating order.

    A lock guards the rotation counter so that concurrent callers each
    receive the next session in sequence.
    """

    def __init__(self, size=5):
        """Create `size` sessions, all using the "firefox_120" client identifier."""
        self.sessions = []
        for _ in range(size):
            self.sessions.append(Session(client_identifier="firefox_120"))
        self.index = 0
        self.lock = Lock()

    def get(self):
        """Return the next session in the rotation and advance the counter."""
        with self.lock:
            slot = self.index % len(self.sessions)
            self.index += 1
            return self.sessions[slot]

# One session per configured worker; fetch_json draws from this pool.
pool = SessionPool(Parallel_Workers)
# Prefix that fetch_json prepends to any path starting with '/'.
BASE_URL = "https://www.sofascore.com/api/v1"
# Running total of HTTP requests issued; fetch_json increments it while
# holding count_lock because it is called from several threads.
request_count = 0
count_lock = Lock()

def fetch_json(url, retries=2):
    """Fetch a URL through the session pool and return its decoded JSON body.

    Args:
        url: Absolute URL, or an API path starting with '/' which is
            appended to BASE_URL.
        retries: Number of additional attempts made after the first one.

    Returns:
        The parsed JSON payload of the first HTTP 200 response, or None when
        every attempt fails (non-200 status, transport error or a body that
        cannot be decoded as JSON).
    """
    global request_count
    full_url = f"{BASE_URL}{url}" if url.startswith('/') else url
    session = pool.get()

    for attempt in range(retries + 1):
        # No point in backing off when there is no further attempt to wait for.
        is_last_attempt = attempt >= retries
        try:
            # Small random delay before every request to spread the load.
            time.sleep(random.uniform(0.2, 0.5))
            response = session.get(full_url)

            with count_lock:
                request_count += 1

            if response.status_code == 200:
                return response.json()
            if response.status_code == 403 and not is_last_attempt:
                # Longer pause after a 403 before trying again.
                time.sleep(3)
        except Exception:
            # Best-effort fetch: any request/decoding error is retried, but
            # KeyboardInterrupt and SystemExit are allowed to propagate so the
            # run can still be interrupted.
            if not is_last_attempt:
                time.sleep(1)
    return None

def convert_fractional(frac_str):
    """Convert fractional odds such as "5/2" to decimal odds.

    Args:
        frac_str: Either a fraction written as "numerator/denominator" with
            integer parts, or any value accepted by ``float()``.

    Returns:
        ``1 + numerator / denominator`` rounded to 3 decimals for fractions,
        ``float(frac_str)`` for plain numbers, or None when the value cannot
        be converted (empty string, None, malformed text, zero denominator).
    """
    text = str(frac_str)
    try:
        if '/' in text:
            num, den = map(int, text.split('/'))
            return round(1 + (num / den), 3)
        return float(frac_str)
    except (ValueError, TypeError, ArithmeticError):
        # ValueError: malformed text or wrong number of parts;
        # TypeError: non-numeric object such as None;
        # ArithmeticError: zero denominator or a result too large for a float.
        return None

def slugify(text):
    """Return a lowercase slug of `text` with words joined by underscores.

    Every run of characters other than a-z and 0-9 (after lowercasing) is
    replaced by a single underscore; leading and trailing underscores are
    removed. Example: "Premier League" -> "premier_league".
    """
    # re.sub takes (pattern, replacement, subject): the text being cleaned
    # must be the third argument, not the replacement.
    return re.sub(r'[^a-z0-9]+', '_', text.lower()).strip('_')

print(f"Session pool ready with {Parallel_Workers} workers")

In [None]:
# Data fetching functions for FUTURE matches

def get_current_season(tournament_id):
    """Return the first season listed for a tournament, or None.

    The first entry of the 'seasons' list is taken to be the current one.
    None is returned when the request fails, the payload has no 'seasons'
    key, or the list is empty.
    """
    payload = fetch_json(f"/unique-tournament/{tournament_id}/seasons")
    if not payload or 'seasons' not in payload:
        return None

    seasons = payload['seasons']
    if not seasons:
        return None
    # First season is typically current
    return seasons[0]

def get_scheduled_matches(tournament_id, season_id, days_ahead=7):
    """Collect not-yet-started matches of a season that kick off soon.

    Reads page 0 of the season's "next events" endpoint and keeps events
    whose status type is 'notstarted' and whose start timestamp is not later
    than now + `days_ahead` days. Returns a list of flat dicts (match id,
    local date/time strings, raw timestamp, team names and ids, tournament id
    and round); the list is empty when the request fails.
    """
    cutoff = (datetime.now() + timedelta(days=days_ahead)).timestamp()

    # Try next events endpoint
    payload = fetch_json(f"/unique-tournament/{tournament_id}/season/{season_id}/events/next/0")
    if not payload or 'events' not in payload:
        return []

    upcoming = []
    for event in payload.get('events', []):
        kickoff_ts = event.get('startTimestamp', 0)
        state = event.get('status', {}).get('type', '')

        # Keep only scheduled fixtures inside the requested window
        if state != 'notstarted' or kickoff_ts > cutoff:
            continue

        kickoff = datetime.fromtimestamp(kickoff_ts)
        home = event.get('homeTeam', {})
        away = event.get('awayTeam', {})
        row = {
            'match_id': event.get('id'),
            'date': kickoff.strftime('%Y-%m-%d'),
            'time': kickoff.strftime('%H:%M'),
            'timestamp': kickoff_ts,
            'home_team': home.get('name'),
            'home_team_id': home.get('id'),
            'away_team': away.get('name'),
            'away_team_id': away.get('id'),
            'tournament_id': tournament_id,
            'round': event.get('roundInfo', {}).get('round'),
        }
        upcoming.append(row)

    return upcoming

def get_team_form(team_id, tournament_id, num_matches=5):
    """Summarise a team's most recent finished matches in one tournament.

    Reads page 0 of the team's "last events" endpoint, keeps finished events
    belonging to `tournament_id` and evaluates at most `num_matches` of them.
    The events are explicitly ordered newest-first by 'startTimestamp' so the
    result does not depend on the order in which the endpoint lists them.

    Args:
        team_id: Id of the team whose form is computed.
        tournament_id: Only events of this unique tournament are counted.
        num_matches: Maximum number of matches to include.

    Returns:
        A dict with 'form' (string of W/D/L letters, most recent match
        first), 'wins', 'draws', 'losses', 'goals_scored' and
        'goals_conceded'. All values are empty/zero when the request fails.
    """
    data = fetch_json(f"/team/{team_id}/events/last/0")
    if not data or 'events' not in data:
        return {'form': '', 'wins': 0, 'draws': 0, 'losses': 0, 'goals_scored': 0, 'goals_conceded': 0}

    # Newest first; events without a timestamp sort last. `or` guards against
    # explicit nulls in the payload.
    events = sorted(
        data.get('events') or [],
        key=lambda e: e.get('startTimestamp') or 0,
        reverse=True,
    )

    form = []
    wins = draws = losses = gs = gc = 0

    for e in events:
        if len(form) >= num_matches:
            break
        if e.get('status', {}).get('type') != 'finished':
            continue
        if e.get('tournament', {}).get('uniqueTournament', {}).get('id') != tournament_id:
            continue

        # Missing or null scores are counted as 0, as a missing key was before.
        home_score = (e.get('homeScore') or {}).get('current') or 0
        away_score = (e.get('awayScore') or {}).get('current') or 0

        # Look at the result from the requested team's point of view.
        if e.get('homeTeam', {}).get('id') == team_id:
            scored, conceded = home_score, away_score
        else:
            scored, conceded = away_score, home_score

        gs += scored
        gc += conceded
        if scored > conceded:
            form.append('W')
            wins += 1
        elif scored < conceded:
            form.append('L')
            losses += 1
        else:
            form.append('D')
            draws += 1

    return {
        'form': ''.join(form),
        'wins': wins,
        'draws': draws,
        'losses': losses,
        'goals_scored': gs,
        'goals_conceded': gc
    }

def _odds_column(market_id, label):
    """Map a (market id, choice name) pair to its output column, or None."""
    if market_id == 1:  # 1X2
        if label == '1':
            return 'odds_1x2_home'
        if label == 'X':
            return 'odds_1x2_draw'
        if label == '2':
            return 'odds_1x2_away'
    elif market_id == 5:  # BTTS
        if label.lower() == 'yes':
            return 'odds_btts_yes'
        if label.lower() == 'no':
            return 'odds_btts_no'
    elif market_id == 2:  # Over/Under 2.5
        # NOTE(review): confirm that market id 2 is the Over/Under 2.5 market.
        if 'over' in label.lower():
            return 'odds_over_2_5'
        if 'under' in label.lower():
            return 'odds_under_2_5'
    return None


def get_match_odds(match_id):
    """Fetch pre-match odds for a future match as decimal values.

    Returns a dict that may contain 'odds_1x2_home/draw/away',
    'odds_btts_yes/no' and 'odds_over_2_5/under_2_5'. Choices whose
    fractional value cannot be converted are skipped; the dict is empty when
    the request fails or no known market is present.
    """
    collected = {}
    payload = fetch_json(f"/event/{match_id}/odds/1/all")
    if not payload or 'markets' not in payload:
        return collected

    for market in payload.get('markets', []):
        market_id = market.get('marketId')
        for choice in market.get('choices', []):
            label = choice.get('name', '')
            price = convert_fractional(choice.get('fractionalValue', ''))
            if not price:
                continue
            column = _odds_column(market_id, label)
            if column is not None:
                collected[column] = price
    return collected

def get_h2h_record(match_id, home_team_id, away_team_id):
    """Tally past results between the two teams of a match.

    Only finished events from the match's H2H endpoint are counted. A win is
    credited to the upcoming home team when it won the past event either as
    home side (winnerCode 1) or as away side (winnerCode 2); winnerCode 3
    counts as a draw. Returns zero counts when the request fails.
    """
    payload = fetch_json(f"/event/{match_id}/h2h/events")
    if not payload or 'events' not in payload:
        return {'h2h_home_wins': 0, 'h2h_away_wins': 0, 'h2h_draws': 0, 'h2h_total': 0}

    tally = {'home': 0, 'away': 0, 'draw': 0}
    for past in payload.get('events', []):
        if past.get('status', {}).get('type') != 'finished':
            continue

        code = past.get('winnerCode')
        # True when the upcoming home team also hosted the past meeting
        same_venue = past.get('homeTeam', {}).get('id') == home_team_id

        if code == 1:
            tally['home' if same_venue else 'away'] += 1
        elif code == 2:
            tally['away' if same_venue else 'home'] += 1
        elif code == 3:
            tally['draw'] += 1

    return {
        'h2h_home_wins': tally['home'],
        'h2h_away_wins': tally['away'],
        'h2h_draws': tally['draw'],
        'h2h_total': tally['home'] + tally['away'] + tally['draw']
    }

def enrich_future_match(match, include_odds, include_form, include_h2h, tournament_id):
    """Add odds, recent form and H2H columns to a match dict in place.

    Each group of columns is added only when its flag is truthy. The same
    `match` dict is returned after being updated.
    """
    match_id = match['match_id']
    home_id = match['home_team_id']
    away_id = match['away_team_id']

    # Odds
    if include_odds:
        match.update(get_match_odds(match_id))

    # Team form: fetch both sides first, then flatten into prefixed columns
    if include_form:
        home_form = get_team_form(home_id, tournament_id)
        away_form = get_team_form(away_id, tournament_id)
        suffix_to_field = (
            ('form', 'form'),
            ('form_wins', 'wins'),
            ('form_draws', 'draws'),
            ('form_losses', 'losses'),
            ('form_gs', 'goals_scored'),
            ('form_gc', 'goals_conceded'),
        )
        for side, summary in (('home', home_form), ('away', away_form)):
            for suffix, field in suffix_to_field:
                match[f'{side}_{suffix}'] = summary[field]

    # H2H
    if include_h2h:
        match.update(get_h2h_record(match_id, home_id, away_id))

    return match

print("Data functions ready")

In [None]:
def scrape_future_matches(league_name, tournament_id, days_ahead, workers,
                          include_odds, include_form, include_h2h):
    """Scrape upcoming matches for a single league.

    Looks up the current season, collects the scheduled matches within
    `days_ahead` days, tags each with `league_name` and enriches them in a
    thread pool of `workers` threads.

    Args:
        league_name: Display name stored in the 'league' column.
        tournament_id: Tournament id passed to the fetch helpers.
        days_ahead: Size of the look-ahead window in days.
        workers: Maximum number of enrichment threads.
        include_odds, include_form, include_h2h: Flags forwarded to
            enrich_future_match.

    Returns:
        A DataFrame with one row per match (rows are in completion order),
        or an empty DataFrame when no season or no matches are found.
    """
    print(f"\n{'='*50}")
    print(f"SCRAPING FUTURE: {league_name}")
    print(f"{'='*50}")

    # Get current season
    season = get_current_season(tournament_id)
    if not season:
        print("No current season found")
        return pd.DataFrame()

    print(f"Season: {season.get('name', 'Unknown')}")

    # Get scheduled matches
    matches = get_scheduled_matches(tournament_id, season['id'], days_ahead)
    print(f"Found {len(matches)} scheduled matches in next {days_ahead} days")

    if not matches:
        return pd.DataFrame()

    # Add league name to each match
    for m in matches:
        m['league'] = league_name

    # Enrich matches in parallel
    print("Enriching with odds/form/h2h (parallel)...")
    enriched = []
    with ThreadPoolExecutor(max_workers=workers) as executor:
        # Remember which match each future belongs to so a failed enrichment
        # can fall back to the un-enriched row.
        futures = {
            executor.submit(enrich_future_match, m, include_odds, include_form,
                            include_h2h, tournament_id): m
            for m in matches
        }
        done = 0
        for future in as_completed(futures):
            try:
                enriched.append(future.result())
            except Exception as exc:
                # future.result() re-raises whatever the worker raised; one
                # bad match must not discard the whole league.
                base = futures[future]
                print(f"\n  Warning: enrichment failed for match {base.get('match_id')}: {exc}")
                enriched.append(base)
            done += 1
            print(f"\r  Enriched: {done}/{len(matches)}", end='', flush=True)

    print(f"\n  Completed: {len(enriched)} matches")
    return pd.DataFrame(enriched)

print("Future match scraper ready")

In [None]:
#@title Run Future Matches Scraping { display-mode: "form" }

# Scrape every selected league, then merge the per-league frames into df_future.
start_time = time.time()
all_data = []

print("="*60)
print("FUTURE MATCHES SCRAPER")
print("="*60)
print(f"Leagues: {len(selected)}")
print(f"Days ahead: {Days_Ahead}")
print(f"Workers: {Parallel_Workers}")

for name, info in selected.items():
    # A failure in one league is reported and the loop moves on to the next.
    try:
        df = scrape_future_matches(
            name,
            info['id'],
            Days_Ahead,
            Parallel_Workers,
            Include_Odds,
            Include_Form,
            Include_H2H,
        )
        if len(df):
            all_data.append(df)
    except Exception as e:
        print(f"\nError in {name}: {e}")

# Combine
if not all_data:
    df_future = pd.DataFrame()
    print("No upcoming matches found")
else:
    # One row per match id, ordered by kickoff date and time
    df_future = (
        pd.concat(all_data, ignore_index=True)
        .drop_duplicates(subset=['match_id'], keep='first')
        .sort_values(['date', 'time'])
        .reset_index(drop=True)
    )

    elapsed = time.time() - start_time
    print("\n" + "="*60)
    print("COMPLETE!")
    print("="*60)
    print(f"Total matches: {len(df_future)}")
    print(f"Columns: {len(df_future.columns)}")
    print(f"Time: {elapsed/60:.1f} minutes")
    print(f"API requests: {request_count}")
    print(f"Leagues: {df_future['league'].nunique()}")

    if 'odds_1x2_home' in df_future.columns:
        odds_share = df_future['odds_1x2_home'].notna().mean()
        print(f"Odds coverage: {odds_share*100:.1f}%")

In [None]:
# Preview the first 20 upcoming matches and simple per-league / per-date counts
if len(df_future) > 0:
    print("Upcoming Matches:")
    # The odds columns exist only when odds were requested and at least one
    # match returned them, so show just the columns that are present instead
    # of raising a KeyError.
    preview_columns = [
        column
        for column in ['date', 'time', 'league', 'home_team', 'away_team',
                       'odds_1x2_home', 'odds_1x2_draw', 'odds_1x2_away']
        if column in df_future.columns
    ]
    display(df_future[preview_columns].head(20))

    print("\nBy League:")
    print(df_future['league'].value_counts())

    print("\nBy Date:")
    print(df_future['date'].value_counts().sort_index())

In [None]:
# List every column of the combined frame, followed by its dimensions
if len(df_future):
    print("All columns:", df_future.columns.tolist(), sep="\n")
    print()
    print(f"Shape: {df_future.shape}")

In [None]:
from google.colab import files

# Write the combined frame to a CSV and hand it to the Colab download helper.
# Nothing is written when no matches were collected.
if len(df_future) > 0:
    # File name records the number of selected leagues, the number of matches
    # and today's date (YYYYMMDD).
    today = datetime.now().strftime('%Y%m%d')
    filename = f"sofascore_future_{len(selected)}lg_{len(df_future)}matches_{today}.csv"
    df_future.to_csv(filename, index=False)
    print(f"Saved: {filename}")
    files.download(filename)

---
## Data Dictionary

### Match Info
| Column | Description |
|--------|-------------|
| `match_id` | Unique Sofascore match ID |
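| `date` | Match date (YYYY-MM-DD), in the local timezone of the machine running the notebook |
| `time` | Kickoff time (HH:MM), in the local timezone of the machine running the notebook |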
| `league` | League name |
| `home_team` / `away_team` | Team names |
| `round` | Match round/gameweek |

### Odds
| Column | Description |
|--------|-------------|
| `odds_1x2_home/draw/away` | Decimal 1X2 odds |
| `odds_btts_yes/no` | Both Teams to Score |
| `odds_over_2_5/under_2_5` | Over/Under 2.5 goals |

### Team Form (Last 5 Matches)
| Column | Description |
|--------|-------------|
| `home_form/away_form` | Recent results (e.g. WWDLW) |
| `*_form_wins/draws/losses` | W/D/L counts |
| `*_form_gs/gc` | Goals scored/conceded |

### Head-to-Head
| Column | Description |
|--------|-------------|
| `h2h_home_wins` | Historical wins for home team |
| `h2h_away_wins` | Historical wins for away team |
| `h2h_draws` | Historical draws |
| `h2h_total` | Total H2H matches |

---

### Tips
- Run daily to get updated odds
- Increase `Days_Ahead` for more matches
- If you get 403 errors, reduce workers or wait a bit