In [475]:
# import required modules
import requests
from bs4 import BeautifulSoup
import pandas as pd
import geopy
import os
import re
from geopy.geocoders import Nominatim
import time

In [4]:
# get URL
page = requests.get("https://en.wikipedia.org/wiki/List_of_rugby_union_competitions")
soup = BeautifulSoup(page.content, 'html.parser')

In [504]:
tournament_year = {}
tournament_year['mid-year_rugby_union_tests'] = [year for year in range(2004,2011)]
tournament_year['June_rugby_union_tests'] = [year for year in range(2013,2019)]
tournament_year['July_rugby_union_tests'] = [year for year in range(2021,2023)]

tournament_year['Rugby_World_Cup_Pool_A'] = [2007, 2011, 2015, 2019, 2023]
tournament_year['Rugby_World_Cup_Pool_B'] = [2007, 2011, 2015, 2019, 2023]
tournament_year['Rugby_World_Cup_Pool_C'] = [2007, 2011, 2015, 2019, 2023]
tournament_year['Rugby_World_Cup_Pool_D'] = [2007, 2011, 2015, 2019, 2023]
tournament_year['Rugby_World_Cup'] = [1987, 1991, 1995, 1999,2003,207,2011,2015,2019, 2023]
tournament_year['Home_Nations_Championship'] = [year for year in range(1883,1910)] + [year for year in range(1932,1940)]
tournament_year['Five_Nations_Championship'] = [year for year in range(1910,1915)] +  [year for year in range(1920,1932)]+[year for year in range(1940,2000)]

tournament_year['Six_Nations_Championship'] = [year for year in range(2000,2023)]
tournament_year['Tri_Nations_Series'] = [year for year in range(1996,2012)]
tournament_year['Rugby_Championship'] = [year for year in range(2012,2024)]
tournament_year['Rugby_Union_European_Cup'] = [1952, 1954]
tournament_year['FIRA_Nations_Cup'] = [f"{year}-{year+1}" for year in range(1965, 1973)]
tournament_year['FIRA_Trophy'] =  [f"{year}-{year+1}" for year in range(1973, 1995)]
tournament_year['FIRA_Tournament'] = ['1996-1997']
tournament_year['European_Nations_Cup_First_Division'] = [2000,2001]+[year for year in range(2010,2017)] + ['2001-2002', '2003-2004', '2004-2006', '2006-2008', '2008-2010']
tournament_year['Rugby_Europe_International_Championships'] = [f"20{number}-{number+1}" for number in range(16,22) ]
tournament_year['IRB_Pacific_5_Nations'] = [2006]
tournament_year['IRB_Pacific_Nations_Cup'] = [year for year in range(2007,2015)]
tournament_year['World_Rugby_Pacific_Nations_Cup']= [year for year in range(2015,2024)]

tournament_year['IRB_Nations_Cup']= [year for year in range(2006,2015)]
tournament_year['World_Rugby_Nations_Cup']= [year for year in range(2015,2020)]
tournament_year['Americas_Rugby_Championship']= [year for year in range(2009,2020)]
tournament_year['ARFU_Asian_Rugby_Championship'] = [year for year in range(1969,1998)]+[1998,2000,2002,2004]+['2007-2007']
tournament_year['Asian_Five_Nations']  = [year for year in range(2008,2015)]
tournament_year['Asia_Rugby_Championship']  = [year for year in range(2015,2024)]
tournament_year['Asian_Five_Nations_division_tournaments']  = [year for year in range(2008,2015)]
tournament_year['Asia_Rugby_Championship_division_tournaments']  = [year for year in range(2015,2023)]
tournament_year['South_American_Rugby_Championship'] = [year for year in range(1951,2018)]
tournament_year['South_American_Rugby_Championship_"A"'] = [year for year in range(2000,2018)]
tournament_year['South_American_Six_Nations'] = [2018]


In [523]:
#import the file from rugbycology, and add features city, venue and country,to make it coherent with the scrap data

current_dir = os.getcwd()
folder = '/data/'
file = 'all_games_RWC2023_fixed.csv'
tournaments_url = 'https://en.wikipedia.org/wiki/List_of_rugby_union_competitions'
result_folder = '/data/results_international/'

def switch_home(row):
    #tinput: a row of the rwc2023 dataset. 
    #output: a row with the inverted columns
    #function: if the team B is NZ(home team), switch teams A and B as well as scores A and B
    if row['Team B'] == 'New Zealand':
        row['Team A'], row['Team B'] = row['Team B'], row['Team A']
        row['Score A'], row['Score B'] = row['Score B'], row['Score A']
    if row['Team A'] == 'New Zealand':
        row['Neut.'] = False
    return row
    
def import_wc2023(file): 
    #input: the rwx2023 file
    #output: the imported filed with added columns
    #function: selcect only the necessary columns, and add Neut., city, Tourn, Venue and country variables
    df= pd.read_csv(file)
    df = df[['Date', 'Team A', 'Team B', 'Score A', 'Score B']]
    df['City'] = 'Auckland'
    df['Neut.'] = True
    df = df.apply(switch_home, axis=1)
    df['Tourn'] = 'WC'
    df['Venue'] = 'Na'
    df['Country'] = 'New Zealand'
    df['Date'] = df['Date'].apply(lambda date: pd.to_datetime(date, dayfirst=True, format='%d-%b-%Y'))
    return df

def find_tournament_list(link):
    #input: the wiki page with the international rugby competition
    #output: a list of the tournaments
    #function: used to obtain missing results after 2021
    page = requests.get(link)
    soup = BeautifulSoup(page.content, 'html.parser')
    tournament_links = []
    tournament_table = soup.find_all('table')[0]
    tournament_row = tournament_table.find_all('tr')
    
    for tournament in tournament_row:
        tf = tournament.find('td')
        a = tournament.find('a',href=True)
        if a:
            if 'Women' not in a['href']:
                tournament_links.append(a['href'][6:])
    return tournament_links

def result_scrapping(year, url):
    #input: a year and the championship partial url
    #output: A list with a match result: teams, score, venue, city
    #function: will be used to collect data for match since 2021. Can be Updated to collect more data if necessary
    results = []
    try:
        page = requests.get(f"https://en.wikipedia.org/wiki/{year}_{url}", allow_redirects=False, timeout=5)
        results = []
        if page.status_code == 200:
            soup = BeautifulSoup(page.content, 'html.parser')
            fixtures =  soup.find_all(class_='vevent summary')
           
            for i in range(len(fixtures)):
                try:
                    tables = fixtures[i].find_all('table')
                    teams = tables[1].find_all('td', class_='vcard')
                    team_a = teams[0].find('a').text
                    team_b = teams[1].find('a').text
                    
                    #date
                    date=tables[0].find('td')
                   
                    br_tags = date.select("br")
                    if br_tags:
                        br_tags[0].replace_with("\n")
                        date= date.text.split('\n')[0]
                    else:
                        date= date.text     
                    #remove trailing info
                    date_parts = date.split()
                    date_str = ' '.join(date_parts)
                    
                    
                    # Define a regular expression pattern to match the year
                    year_pattern = r'\b\d{4}\b'
                    
                    # Search for the year in the date string using the pattern
                    match = re.search(year_pattern, date_str)
                
                    # If a match is found, extract the matched year
                    if match:
                        year_str = match.group()
                        # Find the index of the matched year in the date string
                        year_index = date_str.index(year_str)
                        # Slice the date string up to the matched year
                        date_str = date_str[:year_index + len(year_str)]
                    else:
                        print(f"No year found in the date: {date_str}")
                        date_str = results[-1][1]
                                            
                    #find the score
                    table_result = tables[1]
                    
                    result = table_result.find_all('td')[1].text
                    
                    clean_score = re.match(r'^(\d+)\D+(\d+)', result)
                    if clean_score:
                        clean_score = f"{clean_score.group(1)}-{clean_score.group(2)}"
                    else:
                        print("Invalid score format")
                        continue
                    
                    #find the venue
                    location = tables[2]
                   
                    a = location.find_all('a') 
                    if len(a)>=2:
                        venue = a[0].text
                        city = a[1].text
                    else:
                        city = a[0].text
                        venue = None
                    #br_tags = venue.select("br")
                    
                    #if br_tags:
                     #   br_tags[0].replace_with("\n")
                      #  venue= venue.text.split('\n')[0]
                   
                    results.append([f'{team_a}_v_{team_b}', date_str, clean_score , venue, city])
                except Exception as e:
                    print(f"An error occurred: {e} {year} {url}")
                    continue
    except Exception as e:
        print(f"An error occurred while fetching page: {e}")
    
    return results


def scrappe_match_results(urls, years):
    #input: a list of championship url, and the years we want to collect data from
    #output: a dict with championship as key and list(year) of list of results
    #function: main function, collect data from wikipedia
    scapped_results = {}
    for url in urls:
        tournament_results = [result_scrapping(years[i], url) for i in range(len(years))] 
        scapped_results[url]=tournament_results
    return scapped_results
    
def find_country(city, geo):
    #inputs: a string with a city name, a Nominatim object
    #output: the country of the city
    #function: will help define if a team is playing at home
    location = geo.geocode(city, language='en')
    if location:

        country_name = location.raw['display_name'].split(', ')[-1]
        if country_name == 'United Kingdom':
            if location.raw['display_name'].split(', ')[-2] != 'England':
                country_name = location.raw['display_name'].split(', ')[1]
            else:
                country_name = 'England'
        return country_name
    return None

def create_result_df(list_results, geolocator):
    df = pd.DataFrame(columns=['Date', 'Team A', 'Team B', 'Score A', 'Score B', 'Tourn', 'Venue', 'City', 'Neut.', 'Country'])
    dates = []
    home_score = []
    away_score = []
    home_team = []
    away_team = []
    competition = []
    stadium = []
    city = []
    for entry in list_results:
        tournaments = list_results[entry]
        for i in range(len(tournaments)):
            try:
                for j in range(len(tournaments[i])):
                    game_info = tournaments[i][j]
                    if len(game_info) == 5:
                        teams, date, score, venue, cit = game_info
                        scores = re.split('–|-|–', score.strip())
                        if len(scores) != 2 :
                            print(f"Skipping game with invalid score format: {score}")
                            continue  # Skip this game
                        
                        try:
                            home_score_value = int(scores[0])
                            away_score_value = int(scores[1])
                        except ValueError as e:
                            print(f"Error converting score to integer: {e}")
                          
                            continue  # Skip this game  
                        dates.append(date)
                        teams = teams.split('_v_')
                        home_team.append(teams[0])
                        away_team.append(teams[1])
                        competition.append(entry)
                        stadium.append(venue)

                        city.append(cit)
                        home_score.append(home_score_value)
                        away_score.append(away_score_value)
                        
            except IndexError as e:
                print(f"IndexError: {e}")
                pass

    valid_data_indices = [i for i, item in enumerate(home_score) if item is not None]
    dates = [dates[i] for i in valid_data_indices]
    home_team = [home_team[i] for i in valid_data_indices]
    away_team = [away_team[i] for i in valid_data_indices]
    competition = [competition[i] for i in valid_data_indices]
    stadium = [stadium[i] for i in valid_data_indices]
    city = [city[i] for i in valid_data_indices]
    home_score = [home_score[i] for i in valid_data_indices]
    away_score = [away_score[i] for i in valid_data_indices]


    date_parsed = [pd.to_datetime(date, dayfirst=True, format='mixed') for date in dates]
    df['Score A'] = home_score
    df['Score B'] = away_score
    df['Team A'] = home_team
    df['Team B'] = away_team
    df['Date'] = date_parsed
    df['Tourn'] = competition
    df['Venue'] = stadium
    df['City'] = city
    #df['Country'] = df.apply(lambda row: find_country(row['City'], geolocator), axis=1)
    df['Neut.'] = df.apply(lambda row: row['Team A'] != row['Country'], axis=1)
    
    # Filter out rows with empty dates
    df = df[df['Date'].notna()]
    df = df[(df['Team A'] != '') & (df['Team B'] != '')]
    df.sort_values('Date',inplace=True)
    #remove wrong countries
    df['Team A'] = df['Team A'].str.replace(r'(\s?A|\s?Selects|\s?A1|\s?B)$', '', regex=True)
    df['Team B'] = df['Team B'].str.replace(r'(\s?A|\s?Selects|\s?A1|\s?B)$', '', regex=True)

    df['Team A'] = df['Team A'].str.replace(r'(Soviet Union)$', 'Russia', regex=True)
    df['Team B'] = df['Team B'].str.replace(r'(Soviet Union)$', 'Russia', regex=True)

    df['Team A'] = df['Team A'].str.replace(r'(West Germany)$', 'Germany', regex=True)
    df['Team B'] = df['Team B'].str.replace(r'(West Germany)$', 'Germany', regex=True)
    
    df = df[~df['Team A'].str.contains('Emerging| XV|Junior|U20|Barbarians|Jaguars|Māori|Saxons|Warriors|Chiefs|Crusaders|Ulster', case=False)]
    df = df[~df['Team B'].str.contains('Emerging| XV|Junior|U20|Barbarians|Jaguars|Māori|Saxons|Warriors|Chiefs|Crusaders|Ulster', case=False)]
    df = df.reset_index(drop=True)
    return df

In [505]:
def scrap_slams(url):
    #input: the url to the grand slam page. 
    #output: a list of results, for each year, containing only teams, scores, and year
    #function: use in conjunction with df_from_slam to create a df with the results, that has the feature as the other df
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')   
    grand_slam = []
    games = []
    tournament_table = soup.find_all('table')[5]
    tournament_body = tournament_table.find('tbody')
    tournament_rows = (tournament_body.find_all('tr')[2:8] +
                       tournament_body.find_all('tr')[9:-2])
    
    for row in tournament_rows:
        tds = row.find_all('td')
        years = tds[0].text.strip()
        if years == 'South Africa':
            years = '2010'
            
            away = tds[0].text.strip()
            england = tds[3].text.strip()
            ireland = tds[4].text.strip()
            scotland = tds[5].text.strip()
            wales = tds[6].text.strip()
        else:
            away = tds[1].text.strip()
            england = tds[4].text.strip()
            ireland = tds[5].text.strip()
            scotland = tds[6].text.strip()
            wales = tds[7].text.strip()
        games.append([years, away, england, ireland, scotland, wales])
    return games
    
def df_from_slams(lists):
    #input: a list of list created by scrap slams
    #output: a df with the values
    #function: collect info from slams
    df = pd.DataFrame(columns=['Date', 'Team A','Team B','Score A','Score B', 'Tourn','Venue', 'City', 'Neut.'])
    dates = []
    home_score = []
    away_score = []
    home_team = []
    away_team = []
    competition = []
    stadium = []
    city = []
    for list_ in lists:
        for i in range(4):
            dates.append(list_[0].split('–')[0])
        score1 = list_[2].split('–')
        score2 = list_[3].split('–')
        score3 = list_[4].split('–')
        score4 = list_[5].split('–')
        for i in range(4):
            away_team.append(list_[1])
        home_team += ['England', 'Ireland', 'Scotland', 'Wales']
        home_score.extend([score1[1],score2[1],score3[1], score4[1]])
        away_score.extend([score1[0],score2[0],score3[0], score4[0]])
    df['Date'] = dates
    df['Team A'] = home_team
    df['Team B'] = away_team
    df['Score A'] = home_score
    df['Score B'] = away_score
    df['Tourn'] = 'Grand Slam'
    df['Neut.'] = False
    return df

def df_slams():
    #creates a slam df with results
    grand_slam_link = "https://en.wikipedia.org/wiki/Grand_Slam_(rugby_union)#Grand_Slam_tours"
    return df_from_slams(scrap_slams(grand_slam_link))

def df_master(tournament_dict):
    #input: a dict with the tournament url as keys and the values accepted for the tourn
    #output: a df with all the data scapped+ grand slam
    #create a df from all games from all tournaments
    slam_df = df_slams()
    time_start= time.time()
    pointer_tourn=0
    print(f"The script will import macth results for {len(list(tournament_dict.keys()))} competitions")
    scrapped_results = {}
    for tourn, years in tournament_dict.items():
        print(f"{pointer_tourn+1}/{len(list(tournament_dict.keys()))}, Tourn:{list(tournament_dict.keys())[pointer_tourn]}")
        partial_result = []
        for year in years:
            partial_result.append(result_scrapping(year= year, url=tourn))
        scrapped_results[tourn] = partial_result
        pointer_tourn+=1
    end_time = time.time()
    print(f"Duration:{end_time-time_start} ")
   
    #df['Date'] = df['Date'].apply(lambda x : pd.to_datetime(f"{x}-01-01") if re.match(r'^\d{4}$', x) else x)
    return scrapped_results

In [518]:
#print(tournament_year.keys())
result = df_master(tournament_year)

The script will import macth results for 33 competitions
1/33, Tourn:mid-year_rugby_union_tests
An error occurred: 'NoneType' object has no attribute 'text' 2010 mid-year_rugby_union_tests
An error occurred: 'NoneType' object has no attribute 'text' 2010 mid-year_rugby_union_tests
2/33, Tourn:June_rugby_union_tests
An error occurred: 'NoneType' object has no attribute 'text' 2014 June_rugby_union_tests
3/33, Tourn:July_rugby_union_tests
Invalid score format
Invalid score format
Invalid score format
Invalid score format
Invalid score format
Invalid score format
Invalid score format
4/33, Tourn:Rugby_World_Cup_Pool_A
5/33, Tourn:Rugby_World_Cup_Pool_B
6/33, Tourn:Rugby_World_Cup_Pool_C
7/33, Tourn:Rugby_World_Cup_Pool_D
8/33, Tourn:Rugby_World_Cup
9/33, Tourn:Home_Nations_Championship
Invalid score format
Invalid score format
Invalid score format
Invalid score format
Invalid score format
Invalid score format
Invalid score format
Invalid score format
Invalid score format
Invalid score for

In [519]:
geolocator = Nominatim(user_agent = "AGexercice", timeout=10)
df = create_result_df(result, geolocator)

In [493]:
#result = result.loc[:,['Date', 'Team A', 'Team B', 'Score A', 'Score B', 'Tourn', 'Venue', 'City', 'Neut.', 'Country']]

In [524]:
df.sort_values(by='Date')
slam_df = df_slams()
slam_df['Date'] = slam_df['Date'].apply(lambda x : pd.to_datetime(f"{x}-01-01"))
merged_df = pd.concat([df, slam_df])
merged_df.to_excel(current_dir+result_folder+'scrap_out.xlsx')

In [521]:
merged_df.sort_values(by='Date')

Unnamed: 0,Date,Team A,Team B,Score A,Score B,Tourn,Venue,City,Neut.,Country
0,1882-12-16,Wales,England,0,2,Home_Nations_Championship,,Swansea,True,
1,1883-01-08,Scotland,Wales,3,1,Home_Nations_Championship,,Edinburgh,True,
2,1883-01-08,Scotland,Wales,3,1,Home_Nations_Championship,Raeburn Place,Edinburgh,True,
3,1883-02-05,England,Ireland,1,0,Home_Nations_Championship,,Manchester,True,
4,1883-02-05,England,Ireland,1,3,Home_Nations_Championship,Whalley Range,Manchester,True,
...,...,...,...,...,...,...,...,...,...,...
4004,2023-10-15,France,South Africa,28,29,Rugby_World_Cup,Stade de France,Saint-Denis,True,
4005,2023-10-20,Argentina,New Zealand,6,44,Rugby_World_Cup,Stade de France,Saint-Denis,True,
4006,2023-10-21,England,South Africa,15,16,Rugby_World_Cup,Stade de France,Saint-Denis,True,
4007,2023-10-27,Argentina,England,23,26,Rugby_World_Cup,Stade de France,Saint-Denis,True,
