<h1 align="center"> Obtaining the data required to train the model </h1> 

In [1]:
# Import libraries
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Season being scraped; used both to build the URL and to label every row.
SEASON = "2022-2023"

# URL of the LNR website listing the results of the corresponding season.
base_url = f"https://top14.lnr.fr/calendrier-et-resultats/{SEASON}/"

# There are 26 days in a TOP 14 regular season (without the playoffs).
N_DAYS = 26

# Seconds to wait for the server before giving up; without a timeout a stalled
# connection would block the whole scrape indefinitely.
REQUEST_TIMEOUT = 30


def fetch_soup(url):
    """Download `url` and return it parsed with BeautifulSoup.

    Returns None (after printing a warning) when the request fails or the
    server answers with an HTTP error status, so that one bad page does not
    abort the whole scrape or get parsed as if it were a result page.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as error:
        print(f"Skipping {url}: {error}")
        return None
    return BeautifulSoup(response.content, 'html.parser')


def extract_team_name(match_soup, side, default):
    """Return the name of the team shown on `side` ('left' or 'right') of the match header.

    Some match pages are "empty": when the wrapper or the title link is
    missing, `default` is returned instead.
    """
    wrapper = match_soup.find('div', class_=f'match-header-club__wrapper--{side}')
    element = wrapper.find('a', class_='match-header-club__title') if wrapper else None
    return element.text if element else default


def extract_scores(match_soup):
    """Return (score_team1, score_team2) as strings, or (None, None) if unavailable.

    The score is displayed as "<team1>-<team2>" in the 'title--large' element.
    """
    score_div = match_soup.find('div', class_='title--large')
    if score_div is None:
        return None, None
    scores = score_div.text.strip().split("-")
    if len(scores) < 2:
        return None, None
    return scores[0], scores[1]


def extract_match(match_soup, day):
    """Build the row (a dict) describing one match from its statistics page."""
    score_team1, score_team2 = extract_scores(match_soup)

    # Enter the data relating to the match on the same line.
    match_data = {
        'Saison': SEASON,
        'Journée': day,
        'Equipe 1': extract_team_name(match_soup, 'left', "Unknown Team 1"),
        'Equipe 2': extract_team_name(match_soup, 'right', "Unknown Team 2"),
        'Score Equipe 1': score_team1,
        'Score Equipe 2': score_team2,
    }

    # Each statistic is rendered as a 'stats-bar' holding a title and one value per team.
    for stat in match_soup.find_all('div', class_='stats-bar'):
        title_element = stat.find('div', class_='stats-bar__title')
        team1_element = stat.find('div', class_='stats-bar__val--left')
        team2_element = stat.find('div', class_='stats-bar__val--right')

        # Ignore incomplete bars rather than crashing on a missing element.
        if title_element is None or team1_element is None or team2_element is None:
            continue

        title = title_element.text.strip()
        match_data[f"{title} Equipe 1"] = team1_element.text.strip()
        match_data[f"{title} Equipe 2"] = team2_element.text.strip()

    return match_data


data = []

# For every day of the season, obtain the URLs of the results of every game.
# To browse the different days from the source URL, simply append "j" + the day number,
# e.g. "https://top14.lnr.fr/calendrier-et-resultats/2022-2023/j6".
for i in range(1, N_DAYS + 1):
    day = f"j{i}"
    day_soup = fetch_soup(f"{base_url}{day}")
    if day_soup is None:
        continue

    # For each match, the information is behind a link of class 'match-links__link'
    # (noticed by inspecting the web page). The links of interest look like:
    # 'https://top14.lnr.fr/feuille-de-match/2022-2023/day/team1-team2'
    for link in day_soup.find_all('a', class_='match-links__link'):
        href = link.get('href', '')

        # Only the "feuille de match" pages hold the statistics.
        if "/feuille-de-match/" not in href:
            continue

        # Add "/statistiques-du-match" to reach the statistics tab of the match.
        match_soup = fetch_soup(href + "/statistiques-du-match")
        if match_soup is None:
            continue

        # Append all the data of the match to the list "data".
        data.append(extract_match(match_soup, day))

# Create a DataFrame from the data collected.
df = pd.DataFrame(data)

print(df)

        Saison Journée              Equipe 1                   Equipe 2  \
0    2022-2023      j1             Racing 92          Castres Olympique   
1    2022-2023      j1              CA Brive                  LOU Rugby   
2    2022-2023      j1             RC Toulon           Aviron Bayonnais   
3    2022-2023      j1       Section Paloise              USA Perpignan   
4    2022-2023      j1  Stade Français Paris               ASM Clermont   
..         ...     ...                   ...                        ...   
177  2022-2023     j26             LOU Rugby           Aviron Bayonnais   
178  2022-2023     j26             RC Toulon      Union Bordeaux-Bègles   
179  2022-2023     j26       Section Paloise  Montpellier Hérault Rugby   
180  2022-2023     j26       Stade Rochelais       Stade Français Paris   
181  2022-2023     j26      Stade Toulousain                   CA Brive   

    Score Equipe 1 Score Equipe 2 Essais accordés Equipe 1  \
0              25              19    

<h3 align="center"> The web page of the 6th day of the 2022-2023 season: </h3>
    <img src="img/calendrier-et-resultats.png">

<h3 align="center"> The web page of the statistics of the match between Stade Toulousain and ASM Clermont: </h3>
    <img src="img/statistiques-match.png">

In [3]:
# Statistics that are whole numbers. Each one exists once per team, as the
# columns "<name> Equipe 1" and "<name> Equipe 2".
integer_stat_names = [
    'Score',
    'Essais accordés',
    'Mêlées obtenues',
    'Mêlées perdues',
    'Mêlées gagnées',
    'Mêlées refaites',
    'Touches obtenues',
    'Touches gagnées sur son propre lancer',
    'Touches gagnées sur lancer adverse',
    'En-avant commis',
    'Pénalités réussies',
    'Pénalités concédées',
    'Plaquages réussis',
    'Plaquages offensifs réussis',
    'Plaquages manqués',
    'Ballons joués au pied',
    'Ballons passés',
]
cols_to_convert = [f"{name} Equipe {team}" for name in integer_stat_names for team in (1, 2)]

# Convert these columns to integer type, filling in missing values with 0.
df[cols_to_convert] = df[cols_to_convert].fillna(0).astype(int)

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 182 entries, 0 to 181
Data columns (total 48 columns):
 #   Column                                          Non-Null Count  Dtype 
---  ------                                          --------------  ----- 
 0   Saison                                          182 non-null    object
 1   Journée                                         182 non-null    object
 2   Equipe 1                                        182 non-null    object
 3   Equipe 2                                        182 non-null    object
 4   Score Equipe 1                                  182 non-null    int32 
 5   Score Equipe 2                                  182 non-null    int32 
 6   Essais accordés Equipe 1                        182 non-null    int32 
 7   Essais accordés Equipe 2                        182 non-null    int32 
 8   Possession de la balle Equipe 1                 171 non-null    object
 9   Possession de la balle Equipe 2                 171 no

In [4]:
# Export the DataFrame to Excel. The destination folder is created first
# because to_excel fails when the target directory does not exist.
output_path = Path("data/Par saison/2022-2023.xlsx")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_excel(output_path, index=False)