In [40]:
import pandas as pd

In [41]:
import requests
from bs4 import BeautifulSoup

In [42]:
def laliga_url(start_year):
    """Return the English-Wikipedia URL for the La Liga season starting in ``start_year``.

    Wikipedia titles these articles ``<start>–<end>_La_Liga`` with an en dash
    (percent-encoded here as ``%E2%80%93``). The end year is abbreviated to
    two digits when it lies in the same century as the start year
    (``2014–15``) and written in full when the season crosses a century
    boundary (``1999–2000``).

    Parameters
    ----------
    start_year : int
        Calendar year in which the season starts.

    Returns
    -------
    str
        Absolute URL of the season article.
    """
    end_year = start_year + 1
    if end_year // 100 == start_year // 100:
        end_label = str(end_year)[-2:]
    else:
        # Crossing a century: the two-digit form ("1999–00") is not the
        # article title, so spell the end year out in full.
        end_label = str(end_year)
    return f"https://en.wikipedia.org/wiki/{start_year}%E2%80%93{end_label}_La_Liga"

In [43]:
from io import StringIO

# Request headers sent with every page fetch. The User-Agent imitates a
# desktop Chrome browser; it is split across adjacent literals only for
# readability and concatenates to a single string.
# NOTE(review): Wikimedia may prefer a descriptive, contactable User-Agent
# for automated clients — confirm against their current policy.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}

def scrape_legue_table(start_year, timeout=30):
    """Fetch one La Liga season article and return its league-standings table.

    The page URL comes from ``laliga_url(start_year)`` and is requested with
    the module-level ``headers``. Every HTML table carrying the ``wikitable``
    class is parsed, and the first one whose column labels contain both
    "team" and "pos" (case-insensitive substring match) is returned.

    Parameters
    ----------
    start_year : int
        Calendar year in which the season starts (2014 for 2014-15).
    timeout : float or tuple, optional
        Timeout in seconds forwarded to ``requests.get``. Without one a
        stalled connection would block the notebook indefinitely.
        Defaults to 30.

    Returns
    -------
    pandas.DataFrame
        The matching table with an extra ``season`` column holding a label
        such as ``"2014-15"``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not respond within ``timeout``.
    ValueError
        If no parsed table has both a "team" and a "pos" column.
    """
    url = laliga_url(start_year)

    # Fetch the page; the timeout keeps a dead connection from hanging the run.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    # Wrap the HTML in StringIO so pandas parses it as markup, not as a path/URL.
    tables = pd.read_html(StringIO(response.text), attrs={"class": "wikitable"})

    for table in tables:
        labels = table.columns.astype(str).str.lower()
        has_team = any("team" in label for label in labels)
        has_pos = any("pos" in label for label in labels)
        if has_team and has_pos:
            table["season"] = f"{start_year}-{str(start_year+1)[-2:]}"
            return table

    raise ValueError("League table not found")
    

In [44]:
# Try the scraper on a single season and preview the first two rows
# before fetching the whole range.
df_2014_2015 = scrape_legue_table(start_year=2014)
df_2014_2015.head(n=2)

Unnamed: 0,Pos,Team,Pld,W,D,L,GF,GA,GD,Pts,Qualification or relegation,season
0,1,Barcelona (C),38,30,4,4,110,21,89,94,Qualification for the Champions League group s...,2014-15
1,2,Real Madrid,38,30,2,6,118,38,80,92,Qualification for the Champions League group s...,2014-15


In [45]:
# Scrape every season from 2014-15 through 2025-26, one request per season,
# in chronological order. Unpacking the generator keeps the per-season
# variable names that later cells reference while replacing twelve
# copy-pasted calls with a single range; a mismatch between the range and
# the number of names raises ValueError instead of going unnoticed.
FIRST_SEASON_START = 2014
LAST_SEASON_START = 2025

(
    df_2014_2015, df_2015_2016, df_2016_2017, df_2017_2018,
    df_2018_2019, df_2019_2020, df_2020_2021, df_2021_2022,
    df_2022_2023, df_2023_2024, df_2024_2025, df_2025_2026,
) = (
    scrape_legue_table(year)
    for year in range(FIRST_SEASON_START, LAST_SEASON_START + 1)
)

In [46]:
# All per-season tables, in chronological order.
dfs = [df_2014_2015, df_2015_2016, df_2016_2017, df_2017_2018, 
       df_2018_2019, df_2019_2020, df_2020_2021, df_2021_2022, 
       df_2022_2023, df_2023_2024, df_2024_2025, df_2025_2026]

# Normalized header variants that should map onto a canonical name.
# Presumably 'teamvte' is the "Team" header fused with the page's
# v/t/e navigation links — verify against the scraped pages.
column_aliases = {'teamvte': 'team'}

# Standardize column names across all dataframes (in place, so the
# per-season variables end up with the same schema as the combined table).
for df in dfs:
    normalized = (
        df.columns.str.strip()
        .str.lower()
        # regex=False: these are literal characters, and the default for
        # `regex` differs between pandas versions.
        .str.replace(' ', '_', regex=False)
        .str.replace('\n', '_', regex=False)
    )
    df.columns = [column_aliases.get(name, name) for name in normalized]

# Columns every season must provide.
expected_columns = ['pos', 'team', 'pld', 'w', 'd', 'l', 
                    'gf', 'ga', 'gd', 'pts', 'season']

# Fail loudly if a season lacks an expected column; otherwise pd.concat
# would silently fill that season's values with NaN.
for df in dfs:
    missing = [col for col in expected_columns if col not in df.columns]
    if missing:
        season = df['season'].iloc[0] if 'season' in df.columns and len(df) else 'unknown'
        raise ValueError(f"Season {season} is missing expected columns: {missing}")

# Concatenate all dataframes into a single dataframe with a fresh index.
full_data = pd.concat(dfs, ignore_index=True)



In [47]:
# Sanity check on the combined table: (rows, columns) first, then the
# first five rows as the cell's displayed value.
print(full_data.shape)
full_data.head(n=5)

(240, 12)


Unnamed: 0,pos,team,pld,w,d,l,gf,ga,gd,pts,qualification_or_relegation,season
0,1,Barcelona (C),38,30,4,4,110,21,89,94,Qualification for the Champions League group s...,2014-15
1,2,Real Madrid,38,30,2,6,118,38,80,92,Qualification for the Champions League group s...,2014-15
2,3,Atlético Madrid,38,23,9,6,67,29,38,78,Qualification for the Champions League group s...,2014-15
3,4,Valencia,38,22,11,5,70,32,38,77,Qualification for the Champions League play-of...,2014-15
4,5,Sevilla,38,23,7,8,71,45,26,76,Qualification for the Champions League group s...,2014-15


In [48]:
# Persist the combined table. index=False keeps the row index out of the
# file; the path is named once so the confirmation message cannot drift
# from the file actually written.
output_path = 'laliga_2014_2026.csv'
full_data.to_csv(output_path, index=False)
print(f"Data saved successfully to {output_path}")

Data saved to successfully to laliga_2014_2026.csv
