In [50]:
import requests
import json
import os
import urllib.parse
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
finals = [
    'https://en.wikipedia.org/wiki/List_of_European_Cup_and_UEFA_Champions_League_finals',
    'https://en.wikipedia.org/wiki/List_of_UEFA_Cup_and_Europa_League_finals',
    'https://en.wikipedia.org/wiki/List_of_UEFA_Cup_Winners%27_Cup_finals',
    'https://en.wikipedia.org/wiki/List_of_UEFA_Conference_League_finals'
]

In [6]:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'}

In [89]:
def scrape_final(url):
    
    print(f"scraping {url}")
    final_matches = []
    r = requests.get(url, timeout=20, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    
    competition = soup.find('h1').text.replace('List of ','').replace(' finals','')

    for table in soup.find(class_='sortable'):
        
        if ('venue' in table.text.lower()) and ('score' in table.text.lower()):
            for r in table.find_all('tr'):
                
                if r.find('td') and r.find('th') and ('2024–25' not in r.text) and ('2025–26' not in r.text) and ('2026–27' not in r.text):
                    final_match = {}
                    final_match['competition'] = competition
                    final_match['season'] = r.find('th').text.strip()
                    final_match['champions'] = r.find_all('td')[1].text.strip()
                    final_match['championsUrl'] = r.find_all('td')[1].find('a')['href'].strip()
                    final_match['runners-up'] = r.find_all('td')[3].text.strip()
                    final_match['runners-upUrl'] = r.find_all('td')[3].find('a')['href'].strip()
                    final_match['venues'] = [r.find_all('td')[5].text.strip()]
                    if r.find('th', rowspan="2"):
                        if r.find_next_sibling('tr').find_all('td')[1].text.strip() != '19,917':
                            if r.find_next_sibling('tr').find_all('td')[1].text.strip() != '23,325':
                                final_match['venues'].append(r.find_next_sibling('tr').find_all('td')[1].text.strip())
                    final_matches.append(final_match)
            
    return final_matches

In [90]:
scrape_final('https://en.wikipedia.org/wiki/List_of_European_Cup_and_UEFA_Champions_League_finals')

scraping https://en.wikipedia.org/wiki/List_of_European_Cup_and_UEFA_Champions_League_finals


[{'competition': 'European Cup and UEFA Champions League',
  'season': '1955–56',
  'champions': 'Real Madrid',
  'championsUrl': '/wiki/Real_Madrid_CF',
  'runners-up': 'Reims',
  'runners-upUrl': '/wiki/Stade_de_Reims',
  'venues': ['Parc des Princes, Paris, France']},
 {'competition': 'European Cup and UEFA Champions League',
  'season': '1956–57',
  'champions': 'Real Madrid',
  'championsUrl': '/wiki/Real_Madrid_CF',
  'runners-up': 'Fiorentina',
  'runners-upUrl': '/wiki/ACF_Fiorentina',
  'venues': ['Santiago Bernabéu, Madrid, Spain']},
 {'competition': 'European Cup and UEFA Champions League',
  'season': '1957–58',
  'champions': 'Real Madrid',
  'championsUrl': '/wiki/Real_Madrid_CF',
  'runners-up': 'AC Milan',
  'runners-upUrl': '/wiki/AC_Milan',
  'venues': ['Heysel Stadium, Brussels, Belgium']},
 {'competition': 'European Cup and UEFA Champions League',
  'season': '1958–59',
  'champions': 'Real Madrid',
  'championsUrl': '/wiki/Real_Madrid_CF',
  'runners-up': 'Reims',


In [91]:
cup_finals = pd.DataFrame()
for f in finals:
    cup_finals = pd.concat([cup_finals, pd.DataFrame(scrape_final(f))])

scraping https://en.wikipedia.org/wiki/List_of_European_Cup_and_UEFA_Champions_League_finals
scraping https://en.wikipedia.org/wiki/List_of_UEFA_Cup_and_Europa_League_finals
scraping https://en.wikipedia.org/wiki/List_of_UEFA_Cup_Winners%27_Cup_finals
scraping https://en.wikipedia.org/wiki/List_of_UEFA_Conference_League_finals


In [92]:
cup_finals

Unnamed: 0,competition,season,champions,championsUrl,runners-up,runners-upUrl,venues
0,European Cup and UEFA Champions League,1955–56,Real Madrid,/wiki/Real_Madrid_CF,Reims,/wiki/Stade_de_Reims,"[Parc des Princes, Paris, France]"
1,European Cup and UEFA Champions League,1956–57,Real Madrid,/wiki/Real_Madrid_CF,Fiorentina,/wiki/ACF_Fiorentina,"[Santiago Bernabéu, Madrid, Spain]"
2,European Cup and UEFA Champions League,1957–58,Real Madrid,/wiki/Real_Madrid_CF,AC Milan,/wiki/AC_Milan,"[Heysel Stadium, Brussels, Belgium]"
3,European Cup and UEFA Champions League,1958–59,Real Madrid,/wiki/Real_Madrid_CF,Reims,/wiki/Stade_de_Reims,"[Neckarstadion, Stuttgart, West Germany]"
4,European Cup and UEFA Champions League,1959–60,Real Madrid,/wiki/Real_Madrid_CF,Eintracht Frankfurt,/wiki/Eintracht_Frankfurt,"[Hampden Park, Glasgow, Scotland]"
...,...,...,...,...,...,...,...
37,UEFA Cup Winners' Cup,1997–98,Chelsea,/wiki/Chelsea_F.C.,VfB Stuttgart,/wiki/VfB_Stuttgart,"[Råsunda Stadium, Stockholm, Sweden]"
38,UEFA Cup Winners' Cup,1998–99,Lazio,/wiki/S.S._Lazio,Mallorca,/wiki/RCD_Mallorca,"[Villa Park, Birmingham, England]"
0,UEFA Conference League,2021–22,Roma,/wiki/A.S._Roma,Feyenoord,/wiki/Feyenoord,"[Arena Kombëtare, Tirana, Albania]"
1,UEFA Conference League,2022–23,West Ham United,/wiki/West_Ham_United_F.C.,Fiorentina,/wiki/ACF_Fiorentina,"[Fortuna Arena, Prague, Czech Republic]"


In [93]:
df = cup_finals.copy()
df['season'] = df['season'].apply(lambda x: x.replace('–','-'))
df = df.sort_values(by='season')
df['champions'] = df['champions'].apply(lambda x: x.split('(')[0].strip())
df['runners-up'] = df['runners-up'].apply(lambda x: x.split('(')[0].strip())
df['championsUrl'] = df['championsUrl'].apply(lambda x: x.replace('/wiki/',''))
df['runners-upUrl'] = df['runners-upUrl'].apply(lambda x: x.replace('/wiki/',''))
df = df.reset_index(drop=True)

In [94]:
df['champions'].drop_duplicates()

0              Real Madrid
5                  Benfica
6               Fiorentina
8          Atlético Madrid
9                 AC Milan
              ...         
154             Villarreal
155    Eintracht Frankfurt
157                   Roma
162               Atalanta
163             Olympiacos
Name: champions, Length: 65, dtype: object

In [95]:
df.head(50)

Unnamed: 0,competition,season,champions,championsUrl,runners-up,runners-upUrl,venues
0,European Cup and UEFA Champions League,1955-56,Real Madrid,Real_Madrid_CF,Reims,Stade_de_Reims,"[Parc des Princes, Paris, France]"
1,European Cup and UEFA Champions League,1956-57,Real Madrid,Real_Madrid_CF,Fiorentina,ACF_Fiorentina,"[Santiago Bernabéu, Madrid, Spain]"
2,European Cup and UEFA Champions League,1957-58,Real Madrid,Real_Madrid_CF,AC Milan,AC_Milan,"[Heysel Stadium, Brussels, Belgium]"
3,European Cup and UEFA Champions League,1958-59,Real Madrid,Real_Madrid_CF,Reims,Stade_de_Reims,"[Neckarstadion, Stuttgart, West Germany]"
4,European Cup and UEFA Champions League,1959-60,Real Madrid,Real_Madrid_CF,Eintracht Frankfurt,Eintracht_Frankfurt,"[Hampden Park, Glasgow, Scotland]"
5,European Cup and UEFA Champions League,1960-61,Benfica,S.L._Benfica,Barcelona,FC_Barcelona,"[Wankdorf Stadium, Bern, Switzerland]"
6,UEFA Cup Winners' Cup,1960-61[a],Fiorentina,ACF_Fiorentina,Rangers,Rangers_F.C.,"[Ibrox Stadium, Glasgow, Scotland, Stadio Comu..."
7,European Cup and UEFA Champions League,1961-62,Benfica,S.L._Benfica,Real Madrid,Real_Madrid_CF,"[Olympisch Stadion, Amsterdam, Netherlands]"
8,UEFA Cup Winners' Cup,1961-62,Atlético Madrid,Atl%C3%A9tico_Madrid,Fiorentina,ACF_Fiorentina,"[Hampden Park, Glasgow, Scotland, Neckarstadio..."
9,European Cup and UEFA Champions League,1962-63,AC Milan,AC_Milan,Benfica,S.L._Benfica,"[Wembley Stadium, London, England]"


In [96]:
df.to_json(os.path.join('data','final_matches.json'))