In [1]:
"""
Processing pipeline for raw SofaScore data:
  1. Drop fully null columns
  2. Filter low-minute players
  3. Normalize stats to per-90 basis
  4. Fetch specific positions per player (exploratory — attempted after the export and not yet working)
  5. Export cleaned datasets to data/processed/
"""
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

# Input and output folders, relative to the notebook's working directory.
RAW_PATH = Path("../data/raw")
PROC_PATH = Path("../data/processed")
PROC_PATH.mkdir(parents=True, exist_ok=True)  # no-op when the folder already exists

# One raw league-stats export per season.
df_2026, df_2025 = (
    pd.read_csv(RAW_PATH / "sofascore_primera_division_2026.csv"),
    pd.read_csv(RAW_PATH / "sofascore_primera_division_2025.csv"),
)

# Report (rows, columns) for each season.
print(f"2026 → {df_2026.shape}", f"2025 → {df_2025.shape}", sep="\n")


2026 → (328, 51)
2025 → (463, 51)


In [None]:
from LanusStats import SofaScore

# League name and season labels kept as strings
# (presumably the identifiers the LanusStats scraper expects — confirm; they are
# not referenced by the other cells visible in this notebook).
LEAGUE      = "Chile Primera Division"
SEASON_2026 = "2026"
SEASON_2025 = "2025"

# SofaScore client from LanusStats.
ss = SofaScore()

# Re-apply the pydoll patch so scraping works from this notebook too
import asyncio
import time
import nest_asyncio
from pydoll.browser.chromium import Chrome
from pydoll.browser.options import ChromiumOptions

# Patch asyncio so asyncio.run() can be called while the notebook's own event
# loop is already running.
nest_asyncio.apply()

async def _fetch_sofascore_json(url: str) -> dict:
    """Fetch ``url`` from inside a headless Chrome tab and return its JSON body.

    The tab first opens the SofaScore home page and waits two seconds, then
    issues the request from that same tab (presumably so the request carries
    the site's session state — confirm).

    Args:
        url: Full URL of the endpoint to request.

    Returns:
        Whatever ``response.json()`` yields for the request.
    """
    chrome_opts = ChromiumOptions()
    for flag in ("--no-sandbox", "--disable-dev-shm-usage"):
        chrome_opts.add_argument(flag)

    async with Chrome(options=chrome_opts) as browser:
        page = await browser.start(headless=True)
        await page.go_to("https://www.sofascore.com")
        await asyncio.sleep(2)
        reply = await page.request.get(url)
        payload = reply.json()
        # .json() may hand back a coroutine; await it only in that case.
        return (await payload) if asyncio.iscoroutine(payload) else payload

def _patched_sofascore_request(self, path: str) -> dict:
    """Request ``self.base_url + path`` through the browser-based fetch helper.

    Runs the async fetch to completion, then pauses three seconds before
    returning the result.

    Args:
        path: Path appended to ``self.base_url`` to form the request URL.

    Returns:
        The data produced by ``_fetch_sofascore_json`` for that URL.
    """
    payload = asyncio.run(_fetch_sofascore_json(f"{self.base_url}{path}"))
    # Fixed pause after every request (presumably to rate-limit scraping).
    time.sleep(3)
    return payload

# Monkey-patch at class level so SofaScore instances resolve sofascore_request
# to the browser-based helper defined above.
SofaScore.sofascore_request = _patched_sofascore_request
print("Constants defined and patch applied.")


Constants defined and patch applied.


In [2]:
"""
expectedGoals is fully null for the Chilean league in both seasons.
Dropping it before any further processing.
"""
# errors="ignore" keeps this cell re-runnable: on a second run the column is
# already gone and a plain drop() would raise KeyError.
df_2026 = df_2026.drop(columns=["expectedGoals"], errors="ignore")
df_2025 = df_2025.drop(columns=["expectedGoals"], errors="ignore")

# Confirm no other columns have significant nulls
# (per-column null counts are computed once per frame and then filtered).
null_counts_2026 = df_2026.isnull().sum()
print("=== 2026 remaining nulls ===")
print(null_counts_2026[null_counts_2026 > 0])

null_counts_2025 = df_2025.isnull().sum()
print("\n=== 2025 remaining nulls ===")
print(null_counts_2025[null_counts_2025 > 0])


=== 2026 remaining nulls ===
Series([], dtype: int64)

=== 2025 remaining nulls ===
Series([], dtype: int64)


In [3]:
"""
Players with very few minutes skew per-90 stats significantly.
Threshold: 90 minutes minimum (equivalent to 1 full match).
This removes players who appeared only as late substitutes.
"""
MIN_MINUTES = 90

# Row counts before filtering, so the "removed" figures below are derived from
# the data rather than from hard-coded totals that go stale when the raw CSVs change.
rows_before_2026 = len(df_2026)
rows_before_2025 = len(df_2025)

# Keep only players at or above the minutes threshold; renumber rows from 0.
df_2026 = df_2026[df_2026["minutesPlayed"] >= MIN_MINUTES].reset_index(drop=True)
df_2025 = df_2025[df_2025["minutesPlayed"] >= MIN_MINUTES].reset_index(drop=True)

print(f"2026 after filter → {df_2026.shape}")
print(f"2025 after filter → {df_2025.shape}")
print(f"\nPlayers removed from 2026: {rows_before_2026 - len(df_2026)}")
print(f"Players removed from 2025: {rows_before_2025 - len(df_2025)}")


2026 after filter → (237, 50)
2025 after filter → (416, 50)

Players removed from 2026: 91
Players removed from 2025: 47


In [4]:
"""
Raw counting stats favor players with more minutes. Normalizing to per-90
allows fair comparison regardless of appearances or minutes played.
Percentage-based and ratio columns are excluded — they are already normalized.
Rate stats (per-90) are suffixed with '_p90' to distinguish from raw values.
"""
# Columns that must not be converted to a per-90 rate: identifiers, the minutes
# and appearances totals, and percentage columns.
EXCLUDE_FROM_P90 = [
    "player", "team", "position", "minutesPlayed", "appearances",
    "groundDuelsWonPercentage", "aerialDuelsWonPercentage",
    "accuratePassesPercentage", "accurateCrossesPercentage",
    "accurateLongBallsPercentage", "totalDuelsWonPercentage",
    "goalConversionPercentage"
]

def normalize_p90(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a ``<col>_p90`` rate column per counting stat.

    Each rate is ``col / minutesPlayed * 90`` rounded to 3 decimals. A column is
    converted only if it is numeric, is not listed in ``EXCLUDE_FROM_P90`` and
    is not itself a ``_p90`` column, so calling the function again on its own
    output recomputes the same columns instead of adding ``*_p90_p90`` ones.

    Args:
        df: Frame with a ``minutesPlayed`` column. It is not modified.

    Returns:
        A new frame: the original columns followed by the ``_p90`` columns.
        Rows whose ``minutesPlayed`` is zero or negative get NaN rates rather
        than ``inf``.

    Raises:
        KeyError: If ``df`` has no ``minutesPlayed`` column.
    """
    # Non-positive minutes become NaN so the division cannot produce +/-inf.
    minutes = df["minutesPlayed"].where(df["minutesPlayed"] > 0)

    cols_to_normalize = [
        c for c in df.columns
        if c not in EXCLUDE_FROM_P90
        and not str(c).endswith("_p90")
        and pd.api.types.is_numeric_dtype(df[c])
    ]

    rates = (
        df[cols_to_normalize]
        .div(minutes, axis=0)
        .mul(90)
        .round(3)
        .add_suffix("_p90")
    )

    # Drop rate columns left by an earlier run, then append the fresh ones in a
    # single concat instead of inserting columns one at a time.
    base = df.drop(columns=rates.columns, errors="ignore")
    return pd.concat([base, rates], axis=1)

# Add the per-90 columns to both seasons.
df_2026, df_2025 = normalize_p90(df_2026), normalize_p90(df_2025)

# Names of the derived rate columns, taken from the 2026 frame.
p90_cols = list(filter(lambda name: name.endswith("_p90"), df_2026.columns))

print(
    f"Per-90 columns created: {len(p90_cols)}",
    f"2026 final shape: {df_2026.shape}",
    f"2025 final shape: {df_2025.shape}",
    sep="\n",
)


Per-90 columns created: 38
2026 final shape: (237, 88)
2025 final shape: (416, 88)


In [5]:
"""
Processed files retain both raw and per-90 columns.
Raw values are preserved for context (e.g. appearances, minutes played).
"""
# Write both seasons without the index column, then report what was written.
for frame, target in (
    (df_2026, PROC_PATH / "primera_division_2026_processed.csv"),
    (df_2025, PROC_PATH / "primera_division_2025_processed.csv"),
):
    frame.to_csv(target, index=False)

print(
    f"Saved: primera_division_2026_processed.csv → {df_2026.shape}",
    f"Saved: primera_division_2025_processed.csv → {df_2025.shape}",
    sep="\n",
)


Saved: primera_division_2026_processed.csv → (237, 88)
Saved: primera_division_2025_processed.csv → (416, 88)


In [6]:
"""
scrape_league_stats() returns only broad position groups (G/D/M/F).
SofaScore's player endpoint provides the specific position for each player
(e.g. Centre Back, Right Back, Attacking Midfielder, Centre Forward).
This enriches the dataset for position-specific analysis.
"""
from LanusStats import SofaScore
import time

# Client instance used by get_specific_position() below.
ss = SofaScore()

def get_specific_position(player_name: str) -> str:
    """Look up a player's specific position, falling back to ``"Unknown"``.

    The lookup stays best-effort: any failure still yields ``"Unknown"``.
    The failure is reported through a ``RuntimeWarning`` rather than being
    discarded, so a systematic problem (for example the client having no
    ``get_player_stats`` method) is not mistaken for missing data.

    Args:
        player_name: Player name passed to ``ss.get_player_stats``.

    Returns:
        The ``"position"`` value of the returned data, or ``"Unknown"`` when
        the key is absent or the lookup raised.
    """
    try:
        data = ss.get_player_stats(player_name)
        return data.get("position", "Unknown")
    except Exception as exc:
        # Surface the reason; keep the fallback so a bulk loop is not aborted.
        warnings.warn(
            f"Position lookup failed for {player_name!r}: "
            f"{type(exc).__name__}: {exc}",
            RuntimeWarning,
            stacklevel=2,
        )
        return "Unknown"

# Smoke-test the lookup on the first 2026 player before looping over everyone.
test_player = df_2026["player"].iat[0]
print(f"Testing with: {test_player}")
position_found = get_specific_position(test_player)
print(position_found)


Testing with: Gabriel Castellón
Unknown


In [7]:
# Public attributes of the client (names that do not start with an underscore).
methods = list(filter(lambda attr: not attr.startswith("_"), dir(ss)))
print(*methods, sep="\n")


base_url
get_lineups
get_match_data
get_match_id
get_match_momentum
get_match_shotmap
get_player_heatmap
get_player_ids
get_player_match_events
get_player_season_heatmap
get_players_average_positions
get_players_match_stats
get_positions
get_team_names
league_stats_fields
scrape_league_stats
sofascore_request


In [8]:
# Exploratory: per its help text, get_positions only builds the position filter
# for the scrape_league_stats() request, so it does not supply per-player positions.
help(ss.get_positions)


Help on method get_positions in module LanusStats.sofascore:

get_positions(selected_positions) method of LanusStats.sofascore.SofaScore instance
    Returns a string for the parameter filters of the scrape_league_stats() request.

    Args:
        selected_positions (list): List of the positions available to filter on the SofaScore UI

    Returns:
        dict: Goalies, Defenders, Midfielders and Forwards and their translation for the parameter of the request



In [11]:
# Exploratory: per its help text, get_player_ids maps player names to ids for a
# single match URL.
help(ss.get_player_ids)


Help on method get_player_ids in module LanusStats.sofascore:

get_player_ids(match_url) method of LanusStats.sofascore.SofaScore instance
    Get the player ids for a Sofascore match

    Args:
        match_url (string): Full link to a SofaScore match

    Returns:
        dict: Name and ids of every player in the match
            Key: Name
            Value: Id



In [12]:
# Final recap: row/column counts per season and the files that were exported.
print("=== Processing summary ===")
players_2026, columns_2026 = df_2026.shape
print(f"2026: {players_2026} players, {columns_2026} columns")
players_2025, columns_2025 = df_2025.shape
print(f"2025: {players_2025} players, {columns_2025} columns")
print("\nFiles saved to data/processed/:")
print(
    "  - primera_division_2026_processed.csv",
    "  - primera_division_2025_processed.csv",
    sep="\n",
)


=== Processing summary ===
2026: 237 players, 88 columns
2025: 416 players, 88 columns

Files saved to data/processed/:
  - primera_division_2026_processed.csv
  - primera_division_2025_processed.csv
