In [1]:
"""
Before pulling any data, we map out what LanusStats supports.
This determines which source to use for Chilean football stats.
"""

from LanusStats import (
    get_available_pages,
    get_available_leagues,
    get_available_season_for_leagues
)

# List every scraping backend LanusStats exposes before choosing one.
print("Available data sources:")
available_pages = get_available_pages()
print(available_pages)

Available data sources:
dict_keys(['Fbref', 'Sofascore', '365Scores', 'Fotmob', 'Transfermarkt', 'DataFactory'])


In [2]:
# Print the league catalogue of each candidate source, one section per source.
for source in ("Fotmob", "Sofascore", "Fbref"):
    print(f"\n=== {source} ===")
    leagues = get_available_leagues(source)
    print(leagues)


=== Fotmob ===
['Premier League', 'Bundesliga', 'La Liga', 'La Liga 2', 'Serie A', 'Ligue 1', 'Argentina Copa de la Liga', 'Argentina Primera Division', 'Primera Division Colombia', 'Primera Division Chile', 'Brasileirao', 'Primera Division Peru', 'Copa America', 'Euros', 'Champions League', 'Europa League']

=== Sofascore ===
['Argentina Liga Profesional', 'Argentina Copa de la Liga Profesional', 'Argentina Primera Nacional', 'Brasileirão Série A', 'Bolivia Division Profesional', 'Chile Primera Division', 'Colombia Primera A Apertura', 'Colombia Primera A Clausura', 'Ecuador LigaPro', 'Mexico LigaMX Apertura', 'Mexico LigaMX Clausura', 'Peru Liga 1', 'Uruguay Primera Division', 'Venezuela Primera Division', 'World Cup', 'Euros', 'Copa America', 'Premier League', 'La Liga', 'Bundesliga', 'Serie A', 'Ligue 1', 'Copa Libertadores', 'Copa Sudamericana', 'MLS', 'Saudi Pro League', 'J1 League', 'NSWL', 'USL Championship', 'La Liga 2', 'Primera RFEF', 'Champions League', 'Europa League']

=

In [3]:
"""
League name strings differ per source — each is queried with its exact name.
This confirms whether 2025 and 2026 seasons are already indexed.
"""
# Each source spells the Chilean top flight differently, so the exact
# per-source league string is stored next to the source name.
chile_leagues = dict(
    Fotmob="Primera Division Chile",
    Sofascore="Chile Primera Division",
    Fbref="Primera Division Chile",
)

for source, league in chile_leagues.items():
    print(f"\n=== {source} — {league} ===")
    try:
        seasons = get_available_season_for_leagues(source, league)
        print(seasons)
    except Exception as exc:
        # Best-effort probe: report the failure and continue with the next source.
        print(f"Not available: {exc}")



=== Fotmob — Primera Division Chile ===
{'id': 273, 'seasons': {'2025': 24934, '2024': 22749, '2023': 18600, '2022': 17370, '2021': 16185}}

=== Sofascore — Chile Primera Division ===
{'id': 11653, 'seasons': {'2023': 48017, '2024': 57883, '2025': 71131, '2026': 88493}}

=== Fbref — Primera Division Chile ===
{'id': 35, 'slug': 'Primera-Division', 'seasons': {'2023', '2025', '2024', '2021', '2022'}}


In [6]:
"""
SofaScore.scrape_league_stats() pulls all player stats indexed in
ss.league_stats_fields, which are hardcoded in the class constructor.
Accumulation options: 'total' | 'per90' | 'perMatch'
"""
from LanusStats import SofaScore

# Query parameters shared by the SofaScore calls in this notebook.
LEAGUE = "Chile Primera Division"
SEASON_2025, SEASON_2026 = "2025", "2026"

ss = SofaScore()

# Inspect the stat columns the scraper requests before any download starts.
print("Stat fields to be fetched:")
stat_fields = ss.league_stats_fields
print(stat_fields)


Stat fields to be fetched:
['goals', 'yellowCards', 'redCards', 'groundDuelsWon', 'groundDuelsWonPercentage', 'aerialDuelsWon', 'aerialDuelsWonPercentage', 'successfulDribbles', 'successfulDribblesPercentagetackles', 'assists', 'accuratePassesPercentage', 'totalDuelsWon', 'totalDuelsWonPercentage', 'minutesPlayed', 'wasFouled', 'fouls', 'dispossessed', 'possesionLost', 'appearances', 'started', 'saves', 'cleanSheets', 'savedShotsFromInsideTheBox', 'savedShotsFromOutsideTheBox', 'goalsConcededInsideTheBox', 'goalsConcededOutsideTheBox', 'highClaims', 'successfulRunsOut', 'punches', 'runsOut', 'accurateFinalThirdPasses', 'bigChancesCreated', 'accuratePasses', 'keyPasses', 'accurateCrosses', 'accurateCrossesPercentage', 'accurateLongBalls', 'accurateLongBallsPercentage', 'interceptions', 'clearances', 'dribbledPast', 'bigChancesMissed', 'totalShots', 'shotsOnTarget', 'blockedShots', 'goalConversionPercentage', 'hitWoodwork', 'offsides', 'expectedGoals', 'errorLeadToGoal', 'errorLeadToShot

In [8]:
"""
The library's sofascore_request() has three issues on Chrome 133+:
  1. Relies on get_proxy() — dead proxies cause Chrome to close immediately.
  2. Passes 'option=' instead of 'options=' (typo) — Chrome args never apply.
  3. use_subprocess=False is unstable with newer Chrome builds.
This patch replaces the method at runtime without modifying library files.
"""
import json
import time
from bs4 import BeautifulSoup
from faker import Faker
import undetected_chromedriver as uc

def _patched_sofascore_request(self, path):
    """Fetch a SofaScore API path through headless Chrome and return the parsed JSON.

    Drop-in replacement for ``SofaScore.sofascore_request`` built on
    undetected-chromedriver. A new browser is started for every call and is
    always closed again, even when loading or parsing fails.

    NOTE: a later cell rebinds ``SofaScore.sofascore_request`` to a
    pydoll-based function with the same name, which supersedes this one when
    the notebook is run top to bottom.

    Args:
        path: API path appended to ``self.base_url``.

    Returns:
        Whatever ``json.loads`` produces from the text of the rendered page.
    """
    url = f"{self.base_url}{path}"

    chrome_options = uc.ChromeOptions()
    browser_args = (
        f'user-agent={Faker().chrome()}',
        'accept-language=en-US,en;q=0.9',
        '--no-sandbox',
        '--disable-dev-shm-usage',
    )
    for arg in browser_args:
        chrome_options.add_argument(arg)

    driver = uc.Chrome(
        options=chrome_options,
        headless=True,
        use_subprocess=True,
        # Pinned major version; presumably has to match the locally installed
        # Chrome — TODO confirm before running on another machine.
        version_main=145,
    )

    try:
        driver.get(url)
        time.sleep(3)  # fixed pause before the page source is read
        page_text = BeautifulSoup(driver.page_source, 'html.parser').text
        data = json.loads(page_text)
    finally:
        driver.quit()

    time.sleep(3)  # fixed pause after every request
    return data


# Rebind on the class so instances that already exist use the patched method.
SofaScore.sofascore_request = _patched_sofascore_request
print("Patch applied successfully.")


Patch applied successfully.


In [13]:
"""
Replaces undetected-chromedriver with pydoll, which drives Chrome via CDP
directly — no ChromeDriver binary required, no version compatibility issues.
Strategy: open sofascore.com to establish a real browser session, then call
the statistics API using browser-context HTTP, which inherits cookies and
session state automatically.
"""
import asyncio
import json
import time
import nest_asyncio
from pydoll.browser.chromium import Chrome
from pydoll.browser.options import ChromiumOptions

nest_asyncio.apply()  # allows asyncio.run() inside Jupyter's running event loop

async def _fetch_sofascore_json(url: str) -> dict:
    """Request ``url`` from inside a headless Chrome tab and return its JSON body.

    A fresh browser is started for every call. The tab first opens the
    SofaScore home page and waits two seconds, then issues a GET for ``url``
    through the tab's own request interface.

    Args:
        url: Absolute URL of the SofaScore API endpoint to fetch.

    Returns:
        The decoded JSON payload of the response.
    """
    chrome_opts = ChromiumOptions()
    for flag in ('--no-sandbox', '--disable-dev-shm-usage'):
        chrome_opts.add_argument(flag)

    async with Chrome(options=chrome_opts) as browser:
        tab = await browser.start(headless=True)

        # Visit the public site first so the API call below is made from a
        # tab that already holds a SofaScore session.
        await tab.go_to("https://www.sofascore.com")
        await asyncio.sleep(2)

        response = await tab.request.get(url)
        payload = response.json()
        # json() may hand back the value itself or a coroutine; await only the latter.
        return (await payload) if asyncio.iscoroutine(payload) else payload

def _patched_sofascore_request(self, path: str) -> dict:
    """Synchronous wrapper that fetches ``self.base_url + path`` via pydoll.

    Args:
        path: API path appended to ``self.base_url``.

    Returns:
        The JSON payload returned by ``_fetch_sofascore_json``.
    """
    payload = asyncio.run(_fetch_sofascore_json(f"{self.base_url}{path}"))
    time.sleep(3)  # fixed 3-second pause after every request
    return payload


# Rebind on the class so instances that already exist use the patched method.
SofaScore.sofascore_request = _patched_sofascore_request
print("Patch applied — using pydoll (WebDriver-free CDP automation).")


Patch applied — using pydoll (WebDriver-free CDP automation).


In [14]:
"""
Scrapes all player statistics for the 2026 Chilean Primera División season.
Chrome runs headless in the background — this is expected behavior.
The scraper paginates automatically and stops when all pages are consumed.
"""
# One unfiltered pull of the whole league, accumulated as season totals.
scrape_args = {"league": LEAGUE, "season": SEASON_2026, "accumulation": "total"}
df_2026 = ss.scrape_league_stats(**scrape_args)

print(f"Shape: {df_2026.shape}")
df_2026.head()

End of the pages
Shape: (328, 50)


Unnamed: 0,goals,yellowCards,redCards,groundDuelsWon,groundDuelsWonPercentage,aerialDuelsWon,aerialDuelsWonPercentage,successfulDribbles,assists,accuratePassesPercentage,...,blockedShots,goalConversionPercentage,hitWoodwork,offsides,expectedGoals,errorLeadToGoal,errorLeadToShot,passToAssist,player,team
0,3,0,0,12,46.15,0,0.0,4,0,80.77,...,1,37.5,0,1,,0,0,0,Maximiliano Gutiérrez,Huachipato
1,6,3,0,10,58.82,5,38.46,3,1,76.92,...,1,37.5,0,2,,0,0,0,Fernando Zampedri,Universidad Católica
2,1,0,0,21,55.26,2,50.0,4,3,67.11,...,2,25.0,0,0,,0,0,0,Kevin Méndez,Unión La Calera
3,0,0,0,0,0.0,3,100.0,0,0,73.03,...,0,0.0,0,0,,0,0,0,Gabriel Castellón,Universidad de Chile
4,3,0,0,19,46.34,1,33.33,5,2,66.13,...,0,75.0,0,2,,0,0,0,Jean Meneses,Deportes Limache


In [15]:
# Complete column listing as a plain Python list.
print(list(df_2026.columns))

['goals', 'yellowCards', 'redCards', 'groundDuelsWon', 'groundDuelsWonPercentage', 'aerialDuelsWon', 'aerialDuelsWonPercentage', 'successfulDribbles', 'assists', 'accuratePassesPercentage', 'totalDuelsWon', 'totalDuelsWonPercentage', 'minutesPlayed', 'wasFouled', 'fouls', 'dispossessed', 'appearances', 'saves', 'savedShotsFromInsideTheBox', 'savedShotsFromOutsideTheBox', 'goalsConcededInsideTheBox', 'goalsConcededOutsideTheBox', 'highClaims', 'successfulRunsOut', 'punches', 'runsOut', 'accurateFinalThirdPasses', 'bigChancesCreated', 'accuratePasses', 'keyPasses', 'accurateCrosses', 'accurateCrossesPercentage', 'accurateLongBalls', 'accurateLongBallsPercentage', 'interceptions', 'clearances', 'dribbledPast', 'bigChancesMissed', 'totalShots', 'shotsOnTarget', 'blockedShots', 'goalConversionPercentage', 'hitWoodwork', 'offsides', 'expectedGoals', 'errorLeadToGoal', 'errorLeadToShot', 'passToAssist', 'player', 'team']


In [16]:
# Column dtypes, followed by the 15 columns with the most missing values.
print(df_2026.dtypes)

null_counts = df_2026.isna().sum()
worst_nulls = null_counts.sort_values(ascending=False).head(15)
print(f"\nNulls per column:\n{worst_nulls}")

goals                            int64
yellowCards                      int64
redCards                         int64
groundDuelsWon                   int64
groundDuelsWonPercentage       float64
aerialDuelsWon                   int64
aerialDuelsWonPercentage       float64
successfulDribbles               int64
assists                          int64
accuratePassesPercentage       float64
totalDuelsWon                    int64
totalDuelsWonPercentage        float64
minutesPlayed                    int64
wasFouled                        int64
fouls                            int64
dispossessed                     int64
appearances                      int64
saves                            int64
savedShotsFromInsideTheBox       int64
savedShotsFromOutsideTheBox      int64
goalsConcededInsideTheBox        int64
goalsConcededOutsideTheBox       int64
highClaims                       int64
successfulRunsOut                int64
punches                          int64
runsOut                  

In [17]:
"""
Downloading each position group separately allows us to tag every player
with their position before merging — the API does not return this field.
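Four scrape rounds in total, one per group; the patched request helper launches and closes Chrome for every API request, so expect at least one launch per round.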
Expected runtime: 5–15 minutes depending on network speed.
"""
import pandas as pd

position_groups = ['Goalkeepers', 'Defenders', 'Midfielders', 'Forwards']


def scrape_position_group(season, position, accumulation="total"):
    """Scrape one position group for one season and tag every row with it.

    Taking the season as a parameter lets the same code serve any season
    instead of being copied with a different constant.

    Args:
        season: Season label forwarded to ``ss.scrape_league_stats``
            (for example ``SEASON_2026``).
        position: Position group name; sent as the only selected position and
            written into the ``position`` column of the result.
        accumulation: Accumulation mode forwarded to the scraper.

    Returns:
        The DataFrame returned by the scraper, with a ``position`` column added.
    """
    print(f"Fetching {position}...")
    df_pos = ss.scrape_league_stats(
        league=LEAGUE,
        season=season,
        accumulation=accumulation,
        selected_positions=[position],
    )
    # The position is recorded here because each frame comes from a
    # single-position query.
    df_pos["position"] = position
    print(f"  → {len(df_pos)} players found\n")
    return df_pos


# One tagged frame per position group, in the order listed in position_groups.
frames = [scrape_position_group(SEASON_2026, position) for position in position_groups]
df_2026 = pd.concat(frames, ignore_index=True)

print(f"Total shape: {df_2026.shape}")
print(df_2026["position"].value_counts())


Fetching Goalkeepers...
End of the pages
  → 17 players found

Fetching Defenders...
End of the pages
  → 114 players found

Fetching Midfielders...
End of the pages
  → 134 players found

Fetching Forwards...
End of the pages
  → 63 players found

Total shape: (328, 51)
position
Midfielders    134
Defenders      114
Forwards        63
Goalkeepers     17
Name: count, dtype: int64


In [18]:
"""
Saving the raw dataset before any transformation — this is the source of
truth. Never overwrite files in data/raw/ during processing.
"""
from pathlib import Path

# Directory for untouched scraper output; created if it does not exist yet.
raw_path = Path("../data/raw")
raw_path.mkdir(parents=True, exist_ok=True)

raw_2026_file = raw_path / "sofascore_primera_division_2026.csv"
df_2026.to_csv(raw_2026_file, index=False)
print(f"Saved: {raw_2026_file}")
print(f"Shape: {df_2026.shape}")


Saved: ..\data\raw\sofascore_primera_division_2026.csv
Shape: (328, 51)


In [19]:
"""
2025 season data for year-over-year player comparisons.
Same pipeline as 2026 — four scrape rounds, with Chrome launched and closed for every API request.
Expected runtime: 5–15 minutes.
"""
frames_2025 = []

# Per-position pull for the 2025 season: one tagged frame per position group.
for position in position_groups:
    print(f"Fetching {position}...")
    request_args = dict(
        league=LEAGUE,
        season=SEASON_2025,
        accumulation="total",
        selected_positions=[position],
    )
    df_pos = ss.scrape_league_stats(**request_args)
    df_pos["position"] = position
    frames_2025.append(df_pos)
    print(f"  → {len(df_pos)} players found\n")

df_2025 = pd.concat(frames_2025, ignore_index=True)

print(f"Total shape: {df_2025.shape}")
position_counts = df_2025["position"].value_counts()
print(position_counts)


Fetching Goalkeepers...
End of the pages
  → 30 players found

Fetching Defenders...
End of the pages
  → 155 players found

Fetching Midfielders...
End of the pages
  → 176 players found

Fetching Forwards...
End of the pages
  → 102 players found

Total shape: (463, 51)
position
Midfielders    176
Defenders      155
Forwards       102
Goalkeepers     30
Name: count, dtype: int64


In [20]:
# Persist the untouched 2025 scrape in the raw-data directory.
raw_2025_file = raw_path / "sofascore_primera_division_2025.csv"
df_2025.to_csv(raw_2025_file, index=False)
print(f"Saved: {raw_2025_file}")
print(f"Shape: {df_2025.shape}")


Saved: ..\data\raw\sofascore_primera_division_2025.csv
Shape: (463, 51)


In [21]:
# Display settings: show every column and format floats with two decimals.
pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", "{:.2f}".format)

# Top scorers so far in 2026
print("=== Top 10 scorers 2026 ===")
scorer_columns = ["player", "team", "position", "goals", "assists",
                  "minutesPlayed", "appearances"]
top_scorers = (
    df_2026[scorer_columns]
    .sort_values("goals", ascending=False)
    .head(10)
)
display(top_scorers)


=== Top 10 scorers 2026 ===


Unnamed: 0,player,team,position,goals,assists,minutesPlayed,appearances
265,Fernando Zampedri,Universidad Católica,Forwards,6,1,360,4
149,Justo Giani,Universidad Católica,Midfielders,3,0,352,4
132,Jean Meneses,Deportes Limache,Midfielders,3,2,354,4
133,Daniel Castro,Deportes Limache,Midfielders,3,2,358,4
270,Steffan Pino,Cobresal,Forwards,3,0,301,4
17,Maximiliano Gutiérrez,Huachipato,Defenders,3,0,259,3
147,Felipe Chamorro,Deportes La Serena,Midfielders,2,0,242,4
142,Agustin Nadruz,Cobresal,Midfielders,2,0,360,4
163,Alejandro Camargo,Coquimbo Unido,Midfielders,2,0,291,4
286,Ignacio Jeraldino,Ñublense,Forwards,2,0,280,4


In [None]:
# Ten players with the most assists, with their chance-creation columns.
print("=== Top 10 assisters 2026 ===")
assist_columns = ["player", "team", "position", "assists", "keyPasses",
                  "bigChancesCreated", "minutesPlayed"]
top_assisters = (
    df_2026[assist_columns]
    .sort_values("assists", ascending=False)
    .head(10)
)
display(top_assisters)

# Number of scraped players per team.
print("\n=== Players per team ===")
players_per_team = df_2026["team"].value_counts()
print(players_per_team)
