In [1]:
import pandas as pd 
import requests 
import numpy as np 
from bs4 import BeautifulSoup
import json, os, time, pdb 

headers = {'User-Agent': 
           'Mozilla/5.0 (X11; Linux x86_64)'+\
            'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
}

## Next Task: Scrape list of leagues and list of game ids 
### May need to gather data team by team instead of league by league 

In [2]:
def get_teams_list(season, league):
    
    test_url = 'https://www.espn.co.uk/rugby/table/_/league/{}/season/{}'.format(league, season)
    response = requests.get(test_url, headers=headers).content
    soup_team = BeautifulSoup(response, 'html.parser')

    
    get_index = lambda x, char: x.find(char)
    
    tbodies = soup_team.findAll('tbody')
    
    if len(tbodies) == 0:
        team_name_link = {}
    
    else:
        
        team_name_link = {}
        for tbody in tbodies:

            row_trs = tbody.findAll('tr')

            for tr in row_trs:

                td_start = tr.findAll('td')[0]
                team_link = td_start.findAll('a')[0]['href']

                team_name_link[td_start.findAll('a')[1].find('span').text] = [
                    team_link,
                    team_link[team_link.find('id/')+3:team_link.find('id/')+3+\
                              get_index(team_link[team_link.find('id/')+3:], '/')]
                ]

    return team_name_link 


def get_schedule(teams_list, season):
    
    if len(teams_list.keys()) == 0:
        sched_dict = {}
        
    else:
        sched_dict = {}

        results_base_url = "https://www.espn.co.uk/rugby/results/_/team/"

        
        for team in teams_list.keys():

            team_id = teams_list[team][1]

            team_page_resp = requests.get(results_base_url+team_id+'/season/{}'.format(season), headers=headers).content
            team_soup = BeautifulSoup(team_page_resp, 'html.parser')

            full_sched = team_soup.find(id='sched-container')

            match_months = full_sched.findAll('tbody')

            sched_dict[team] = match_months 
    
    return sched_dict 


def parse_scheds(sched):
    
    totals = []
    for mon in sched:

        rows = mon.findAll('tr')

        for row in rows:

            first_row = row.findAll('td')
            date = first_row[0].text

            home_base, away_base = first_row[1].findAll('a')[0], \
                    first_row[2].findAll('a')[0]

            home_team, away_team = home_base.find('span').text, \
                    away_base.find('span').text

            home_team_abbr, away_team_abbr = home_base.find('abbr').text, \
                    away_base.find('abbr').text
            
            try:
                
                game_link = first_row[1].findAll('span')[-1].find('a')['href']

                game_id, league_id = game_link[game_link.find('Id/')+3:game_link.find('/league')], \
                        game_link[game_link.find('league/')+7:]
                
                score = first_row[1].findAll('span')[-1].find('a').text
            
            except TypeError:
                
                game_link, game_id, league_id = np.nan, np.nan, np.nan 
                
                score = first_row[1].findAll('span')[-1].text
        
            competition, stadium = first_row[4].text, first_row[5].text

            home_score, away_score = score.split()[0], score.split()[-1]

            totals.append([
                date, home_team, away_team, home_team_abbr,
                away_team_abbr, game_link, score, home_score,
                away_score, competition, stadium, game_id, league_id
            ])

    comb_df = pd.DataFrame(
        totals,
        columns=[
            'date', 'home_team', 'away_team', 
            'home_team_abbr', 'away_team_abbr', 'game_link', 
            'score', 'home_score', 'away_score', 
            'competition', 'stadium', 'game_id', 'league_id']
    )
    
    return comb_df 

In [3]:
team_links = get_teams_list(2022, 270557)
team_links

{'Leinster': ['/rugby/team/_/id/25924/leinster', '25924'],
 'Stormers': ['/rugby/team/_/id/25962/stormers', '25962'],
 'Ulster': ['/rugby/team/_/id/25926/ulster', '25926'],
 'Bulls': ['/rugby/team/_/id/25953/bulls', '25953'],
 'Sharks': ['/rugby/team/_/id/25961/sharks', '25961'],
 'Munster': ['/rugby/team/_/id/25925/munster', '25925'],
 'Edinburgh': ['/rugby/team/_/id/25951/edinburgh', '25951'],
 'Glasgow Warriors': ['/rugby/team/_/id/25952/glasgow-warriors', '25952'],
 'Ospreys': ['/rugby/team/_/id/25968/ospreys', '25968'],
 'Scarlets': ['/rugby/team/_/id/25966/scarlets', '25966'],
 'Connacht': ['/rugby/team/_/id/25923/connacht', '25923'],
 'Lions': ['/rugby/team/_/id/25958/lions', '25958'],
 'Benetton Treviso': ['/rugby/team/_/id/25927/benetton-treviso', '25927'],
 'Cardiff Blues': ['/rugby/team/_/id/25965/cardiff-blues', '25965'],
 'Dragons': ['/rugby/team/_/id/25967/dragons', '25967'],
 'Zebre': ['/rugby/team/_/id/167124/zebre', '167124']}

In [4]:
team_sched = get_schedule(team_links, 2022)

In [8]:
comb_df = []
for team in team_sched.keys():
    # try:
    parsed_scheds = parse_scheds(team_sched[team])
    parsed_scheds['season'] = '2022'
    comb_df.append(parsed_scheds)
    # except:
    #     print(team)
    #     break 


team_dfs = pd.concat(comb_df, axis=0, ignore_index=True).drop_duplicates()

In [9]:
team_dfs

Unnamed: 0,date,home_team,away_team,home_team_abbr,away_team_abbr,game_link,score,home_score,away_score,competition,stadium,game_id,league_id,season
0,"Fri, Dec 16",Leinster,Gloucester Rugby,LEINS,GLOUC,,57 - 0,57,0,European Rugby Champions Cup,"RDS Arena, Dublin",,,2022
1,"Sat, Dec 10",Racing 92,Leinster,RAMET,LEINS,,10 - 42,10,42,European Rugby Champions Cup,"Stade Oceane, Le Havre",,,2022
2,"Fri, Jun 10",Leinster,Bulls,LEINS,BULLS,/rugby/match/_/gameId/594483/league/270557,26 - 27,26,27,United Rugby Championship,"RDS Arena, Dublin",594483,270557,2022
3,"Sat, Jun 4",Leinster,Glasgow Warriors,LEINS,GLASG,/rugby/match/_/gameId/594479/league/270557,76 - 14,76,14,United Rugby Championship,"RDS Arena, Dublin",594479,270557,2022
4,"Sat, May 21",Munster,Leinster,MUNST,LEINS,/rugby/match/_/gameId/594395/league/270557,25 - 35,25,35,United Rugby Championship,"Aviva Stadium, Dublin",594395,270557,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,"Sat, Apr 9",Dragons,Gloucester Rugby,DRA,GLOUC,,21 - 26,21,26,European Rugby Challenge Cup,"Rodney Parade, Newport",,,2022
236,"Sun, Dec 18",Bristol Rugby,Zebre,BTL,ZEB,/rugby/match/_/gameId/598621/league/272073,35 - 19,35,19,European Rugby Challenge Cup,"Ashton Gate, Bristol",598621,272073,2022
237,"Sat, Dec 10",Zebre,Toulon,ZEB,TOUL,/rugby/match/_/gameId/598611/league/272073,21 - 24,21,24,European Rugby Challenge Cup,"Stadio Sergio Lanfranchi, Parma",598611,272073,2022
243,"Sat, Apr 9",Newcastle Falcons,Zebre,NEWC,ZEB,,25 - 22,25,22,European Rugby Challenge Cup,"Kingston Park, Newcastle",,,2022


In [10]:
team_dfs[team_dfs['game_link'].isnull()]

Unnamed: 0,date,home_team,away_team,home_team_abbr,away_team_abbr,game_link,score,home_score,away_score,competition,stadium,game_id,league_id,season
0,"Fri, Dec 16",Leinster,Gloucester Rugby,LEINS,GLOUC,,57 - 0,57,0,European Rugby Champions Cup,"RDS Arena, Dublin",,,2022
1,"Sat, Dec 10",Racing 92,Leinster,RAMET,LEINS,,10 - 42,10,42,European Rugby Champions Cup,"Stade Oceane, Le Havre",,,2022
15,"Sat, Dec 17",Stormers,London Irish,STORM,IRISH,,34 - 14,34,14,European Rugby Champions Cup,"DHL Stadium, Cape Town",,,2022
16,"Sat, Dec 10",Clermont Auvergne,Stormers,CLER,STORM,,24 - 14,24,14,European Rugby Champions Cup,"Stade Marcel Michelin, Clermont-Ferrand",,,2022
33,"Sat, Dec 17",Ulster,La Rochelle,ULST,LA RO,,29 - 36,29,36,European Rugby Champions Cup,"Aviva Stadium, Dublin",,,2022
34,"Sun, Dec 11",Sale Sharks,Ulster,SALE,ULST,,39 - 0,39,0,European Rugby Champions Cup,"Salford City Stadium, Salford",,,2022
48,"Sat, Dec 17",Exeter Chiefs,Bulls,EXET,BULLS,,44 - 14,44,14,European Rugby Champions Cup,"Sandy Park, Exeter",,,2022
49,"Sat, Dec 10",Bulls,Lyon,BULLS,LYON,,42 - 36,42,36,European Rugby Champions Cup,"Loftus Versfeld, Pretoria",,,2022
66,"Fri, Dec 16",Bordeaux Begles,Sharks,BEG,SHARK,,16 - 19,16,19,European Rugby Champions Cup,"Stade Chaban-Delmas, Bordeaux",,,2022
67,"Sat, Dec 10",Sharks,Harlequins,SHARK,HQUIN,,39 - 31,39,31,European Rugby Champions Cup,"Hollywoodbets Kings Park, Durban",,,2022


In [32]:
league_dict = {
    'URC': 270557, 
    'Prem': 267979,
    'T14': 270559,
    'RWC': 164205,
    'RChamp': 244293,
    # 'ChampCup': 271937,
    # 'ChallCup': 272073,  
    'SR': 242041, 
    'PNCup': 256449
}

seasons = [
    2018, 2019, 2020, 
    2021, 2022, 2024
]

In [7]:
sn_seasons = [
   2022, 2023, 2024
]

comb_df = [] 

for season in sn_seasons:
    
    team_links = get_teams_list(season, 180659)
    team_scheds = get_schedule(team_links, season)

    for team in team_scheds.keys():
        parsed_scheds = parse_scheds(team_scheds[team])
        parsed_scheds['season'] = season
        comb_df.append(parsed_scheds)


team_dfs = pd.concat(comb_df, axis=0, ignore_index=True).drop_duplicates()


In [8]:
team_dfs

Unnamed: 0,date,home_team,away_team,home_team_abbr,away_team_abbr,game_link,score,home_score,away_score,competition,stadium,game_id,league_id,season
0,"Sat, Nov 19",France,Japan,FRA,JPN,/rugby/match/_/gameId/595936/league/289273,35 - 17,35,17,Summer Nations Series,"Stade Toulouse, Toulouse",595936,289273,2022
1,"Sat, Nov 12",France,South Africa,FRA,SA,/rugby/match/_/gameId/595933/league/289273,30 - 26,30,26,Summer Nations Series,"Orange Velodrome, Marseille",595933,289273,2022
2,"Sat, Nov 5",France,Australia,FRA,AUS,/rugby/match/_/gameId/595929/league/289273,30 - 29,30,29,Summer Nations Series,"Stade de France, Saint-Denis",595929,289273,2022
3,"Fri, Jul 8",Japan,France,JPN,FRA,/rugby/match/_/gameId/595420/league/289234,15 - 20,15,20,International Test Match,National Kasumigaoka Stadium,595420,289234,2022
4,"Fri, Jul 1",Japan,France,JPN,FRA,/rugby/match/_/gameId/595421/league/289234,23 - 42,23,42,International Test Match,"Toyota Stadium, Bloemfontein",595421,289234,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243,"Fri, Mar 8",Italy,Scotland,ITALY,SCOT,/rugby/match/_/gameId/597481/league/289259,31 - 29,31,29,Five/Six Nations,"Stadio Olimpico, Rome",597481,289259,2024
248,"Sat, Feb 3",Wales,Scotland,WALES,SCOT,/rugby/match/_/gameId/597379/league/180659,26 - 27,26,27,Six Nations Championship,"Principality Stadium, Cardiff",597379,180659,2024
249,"Fri, Feb 2",Wales,Scotland,WALES,SCOT,/rugby/match/_/gameId/597474/league/289259,26 - 27,26,27,Five/Six Nations,"Principality Stadium, Cardiff",597474,289259,2024
250,"Sat, Mar 16",Wales,Italy,WALES,ITALY,/rugby/match/_/gameId/597389/league/180659,21 - 24,21,24,Six Nations Championship,"Principality Stadium, Cardiff",597389,180659,2024


In [34]:
comb_df = [] 

for season in seasons[-2:]:
    
for league in list(league_dict.keys())[:3]:

    print(season, league)
    team_links = get_teams_list('2023', league)
    team_scheds = get_schedule(team_links, season)

    for team in team_scheds.keys():
        comb_df.append(parse_scheds(team_scheds[team]))


team_dfs = pd.concat(comb_df, axis=0, ignore_index=True).drop_duplicates()

        
        


2021 URC


IndexError: list index out of range

In [29]:
team_links = get_teams_list('2022', '270557')
team_scheds = get_schedule(team_links, '2022')

In [8]:
dfs = []
for team in team_scheds.keys():
    dfs.append(parse_scheds(team_scheds[team]))
    
team_dfs = pd.concat(dfs, axis=0, ignore_index=True).drop_duplicates()

In [8]:
len(team_dfs)

30

In [9]:
team_dfs = team_dfs.drop_duplicates()

In [10]:
team_dfs

Unnamed: 0,date,home_team,away_team,home_team_abbr,away_team_abbr,game_link,score,home_score,away_score,competition,stadium,game_id,league_id
0,"Sat, Mar 16",Ireland,Scotland,IRE,SCOT,/rugby/match/_/gameId/597390/league/180659,17 - 13,17,13,Six Nations Championship,"Aviva Stadium, Dublin",597390,180659
1,"Sat, Mar 9",England,Ireland,ENG,IRE,/rugby/match/_/gameId/597387/league/180659,23 - 22,23,22,Six Nations Championship,"Twickenham, London",597387,180659
2,"Sat, Feb 24",Ireland,Wales,IRE,WALES,/rugby/match/_/gameId/597383/league/180659,31 - 7,31,7,Six Nations Championship,"Aviva Stadium, Dublin",597383,180659
3,"Sun, Feb 11",Ireland,Italy,IRE,ITALY,/rugby/match/_/gameId/597382/league/180659,36 - 0,36,0,Six Nations Championship,"Aviva Stadium, Dublin",597382,180659
4,"Fri, Feb 2",France,Ireland,FRA,IRE,/rugby/match/_/gameId/597377/league/180659,17 - 38,17,38,Six Nations Championship,"Orange Velodrome, Marseille",597377,180659
5,"Sat, Mar 16",France,England,FRA,ENG,/rugby/match/_/gameId/597391/league/180659,33 - 31,33,31,Six Nations Championship,"Groupama Stadium, Lyon",597391,180659
6,"Sun, Mar 10",Wales,France,WALES,FRA,/rugby/match/_/gameId/597388/league/180659,24 - 45,24,45,Six Nations Championship,"Principality Stadium, Cardiff",597388,180659
7,"Sun, Feb 25",France,Italy,FRA,ITALY,/rugby/match/_/gameId/597385/league/180659,13 - 13,13,13,Six Nations Championship,"Stade Pierre Mauroy, Lille",597385,180659
8,"Sat, Feb 10",Scotland,France,SCOT,FRA,/rugby/match/_/gameId/597380/league/180659,16 - 20,16,20,Six Nations Championship,"Scottish Gas Murrayfield, Edinburgh",597380,180659
12,"Sat, Feb 24",Scotland,England,SCOT,ENG,/rugby/match/_/gameId/597384/league/180659,30 - 21,30,21,Six Nations Championship,"Scottish Gas Murrayfield, Edinburgh",597384,180659


In [18]:
team_dfs.drop_duplicates()

Unnamed: 0,date,home_team,away_team,home_team_abbr,away_team_abbr,game_link,score,home_score,away_score,competition,stadium,game_id,league_id
0,"Sat, Mar 16",Ireland,Scotland,IRE,SCOT,/rugby/match/_/gameId/597390/league/180659,17 - 13,17,13,Six Nations Championship,"Aviva Stadium, Dublin",597390,180659
1,"Sat, Mar 9",England,Ireland,ENG,IRE,/rugby/match/_/gameId/597387/league/180659,23 - 22,23,22,Six Nations Championship,"Twickenham, London",597387,180659
2,"Sat, Feb 24",Ireland,Wales,IRE,WALES,/rugby/match/_/gameId/597383/league/180659,31 - 7,31,7,Six Nations Championship,"Aviva Stadium, Dublin",597383,180659
3,"Sun, Feb 11",Ireland,Italy,IRE,ITALY,/rugby/match/_/gameId/597382/league/180659,36 - 0,36,0,Six Nations Championship,"Aviva Stadium, Dublin",597382,180659
4,"Fri, Feb 2",France,Ireland,FRA,IRE,/rugby/match/_/gameId/597377/league/180659,17 - 38,17,38,Six Nations Championship,"Orange Velodrome, Marseille",597377,180659
5,"Sat, Mar 16",France,England,FRA,ENG,/rugby/match/_/gameId/597391/league/180659,33 - 31,33,31,Six Nations Championship,"Groupama Stadium, Lyon",597391,180659
6,"Sun, Mar 10",Wales,France,WALES,FRA,/rugby/match/_/gameId/597388/league/180659,24 - 45,24,45,Six Nations Championship,"Principality Stadium, Cardiff",597388,180659
7,"Sun, Feb 25",France,Italy,FRA,ITALY,/rugby/match/_/gameId/597385/league/180659,13 - 13,13,13,Six Nations Championship,"Stade Pierre Mauroy, Lille",597385,180659
8,"Sat, Feb 10",Scotland,France,SCOT,FRA,/rugby/match/_/gameId/597380/league/180659,16 - 20,16,20,Six Nations Championship,"Scottish Gas Murrayfield, Edinburgh",597380,180659
12,"Sat, Feb 24",Scotland,England,SCOT,ENG,/rugby/match/_/gameId/597384/league/180659,30 - 21,30,21,Six Nations Championship,"Scottish Gas Murrayfield, Edinburgh",597384,180659


In [6]:
# Unittest to get one row of data from one month of games parsed and stored 

# Most recent month in the data, stored in a tbody tag
month = match_months[0]

totals = []

for mon in match_months:

    # List of rows in the most recent month of matches 
    rows = mon.findAll('tr')

    all_vals = []
    for row in rows:
        
        # Columns in the first row 
        first_row = row.findAll('td')
        date = first_row[0].text

        home_base, away_base = first_row[1].findAll('a')[0], \
                first_row[2].findAll('a')[0]

        home_team, away_team = home_base.find('span').text, \
                away_base.find('span').text

        home_team_abbr, away_team_abbr = home_base.find('abbr').text, \
                away_base.find('abbr').text

        game_link = first_row[1].findAll('span')[-1].find('a')['href']

        game_id, league_id = game_link[game_link.find('Id/')+3:game_link.find('/league')], \
                game_link[game_link.find('league/')+7:]

        score = first_row[1].findAll('span')[-1].find('a').text
        competition, stadium = first_row[4].text, first_row[5].text

        home_score, away_score = score.split()[0], score.split()[-1]

        totals.append([
            date, home_team, away_team, home_team_abbr,
            away_team_abbr, game_link, score, home_score,
            away_score, competition, stadium, game_id, league_id
        ])

comb_df = pd.DataFrame(
    totals,
    columns=[
        'date', 'home_team', 'away_team', 
        'home_team_abbr', 'away_team_abbr', 'game_link', 
        'score', 'home_score', 'away_score', 
        'competition', 'stadium', 'game_id', 'league_id']
)

print(comb_df)

          date home_team away_team home_team_abbr away_team_abbr  \
0  Sat, Mar 16   Ireland  Scotland            IRE           SCOT   
1   Sat, Mar 9   England   Ireland            ENG            IRE   
2  Sat, Feb 24   Ireland     Wales            IRE          WALES   
3  Sun, Feb 11   Ireland     Italy            IRE          ITALY   
4   Fri, Feb 2    France   Ireland            FRA            IRE   

                                    game_link    score home_score away_score  \
0  /rugby/match/_/gameId/597390/league/180659  17 - 13         17         13   
1  /rugby/match/_/gameId/597387/league/180659  23 - 22         23         22   
2  /rugby/match/_/gameId/597383/league/180659   31 - 7         31          7   
3  /rugby/match/_/gameId/597382/league/180659   36 - 0         36          0   
4  /rugby/match/_/gameId/597377/league/180659  17 - 38         17         38   

                competition                      stadium game_id league_id  
0  Six Nations Championship      

## COMPLETED
## 8 Groups of Data to scrape 
### - Final Score and Home/Away Team designation 
### - Match Events 
### - Kick/Pass/Run 
### - Attacking
### - Possession 
### - Set Pieces 
### - Defending 
### - Discipline and Penalties 




In [None]:
game_id = '597383'
league_id = '180659'

url = 'https://www.espn.com/rugby/matchstats?gameId={}&league={}'.format(game_id, league_id)
response = requests.get(url, headers=headers).content
soup = BeautifulSoup(response, 'html.parser')

tables = soup.findAll('table')

## Group 1: Final Score and Home/Away Team designation

In [11]:
top_bar = soup.find(class_='competitors')

home_team_bar = top_bar.find(class_='team team-a')
away_team_bar = top_bar.find(class_='team team-b')

form_top_data = {
    'game_id': game_id,
    'league_id': league_id,
    'home_team': home_team_bar.find(class_='short-name').text,
    'home_team_score': int(home_team_bar.find(class_='score-container').text), 
    'away_team': away_team_bar.find(class_='short-name').text,
    'away_team_score': int(away_team_bar.find(class_='score-container').text)
}

form_top_data

{'game_id': '597383',
 'league_id': '180659',
 'home_team': 'Ireland',
 'home_team_score': 31,
 'away_team': 'Wales',
 'away_team_score': 7}

## Group 2: Match Events

In [12]:
four_tables = soup.findAll(class_='sub-module equal-height countChartList height-reset')

match_event = four_tables[0].find('tbody')
match_event_rows = match_event.findAll('td')

match_event_labels = ['Tries', 'Conversion Goals', 'Penalty Goals', 'Kick Percent Success']
match_event_step = []
for td in match_event_rows:
    val_text = td.text
    if val_text not in match_event_labels:
        match_event_step.append(val_text)

match_event_ordered = [[match_event_step[i-1], match_event_step[i]] \
                       for i in range(1, len(match_event_step), 2)]
    
formed_match_data = {
    'game_id': game_id, 
    'league_id': league_id,
    'home_tries': match_event_ordered[0][0],
    'away_tries': match_event_ordered[0][1],
    'home_conversions': match_event_ordered[1][0],
    'away_conversions': match_event_ordered[1][1],
    'home_penalty_goals': match_event_ordered[2][0],
    'away_penalty_goals': match_event_ordered[2][1],
    'home_kick_percent': match_event_ordered[3][0],
    'away_kick_percent': match_event_ordered[3][1]
}

formed_match_data

{'game_id': '597383',
 'league_id': '180659',
 'home_tries': '4',
 'away_tries': '1',
 'home_conversions': '4',
 'away_conversions': '0',
 'home_penalty_goals': '1',
 'away_penalty_goals': '0',
 'home_kick_percent': '100%',
 'away_kick_percent': '0%'}

## Group 3: Kick/Pass/Run

In [4]:
check_top = soup.findAll(
    class_="stat-graph compareLineGraph twoTeam largeLabels"
)[0]

home_away_total_meters = [
    int(i.text) for i in check_top.findAll(class_='chartValue')
]

meter_rows = four_tables[1].find('tbody').findAll('td')

meter_labels = ['Kicks From Hand', 'Passes', 'Runs']
meter_step = []
for td in meter_rows:
    val_text = td.text
    if val_text not in meter_labels:
        meter_step.append(val_text)

meter_ordered = [[meter_step[i-1], meter_step[i]] \
                       for i in range(1, len(meter_step), 2)]

formed_meter_data = {
    'game_id': game_id, 
    'league_id': league_id,
    'home_total_meters': home_away_total_meters[0],
    'away_total_meters': home_away_total_meters[1],
    'home_kfh': meter_ordered[0][0],
    'away_kfh': meter_ordered[0][1],
    'home_pass_meters': meter_ordered[1][0],
    'away_pass_meters': meter_ordered[1][1],
    'home_runs': meter_ordered[2][0],
    'away_runs': meter_ordered[2][1],
}

formed_meter_data

{'game_id': '597383',
 'league_id': '180659',
 'home_total_meters': 497,
 'away_total_meters': 289,
 'home_kfh': '26',
 'away_kfh': '22',
 'home_pass_meters': '256',
 'away_pass_meters': '141',
 'home_runs': '159',
 'away_runs': '126'}

## Group 4: Attacking

In [5]:
middle_groups_start = soup.findAll(class_='stacked-rl')
attack_rows = middle_groups_start[0].find('tbody').findAll('td')

attack_labels = [
    'Possession 1H/2H', 'Territory 1H/2H', 'Clean Breaks', 
    'Defenders Beaten', 'Offload', 'Rucks Won', 
    'Mauls Won', 'Turnovers Conceded'
]

attack_step = []
for td in attack_rows:
    val_text = td.text
    if val_text not in attack_labels:
        attack_step.append(val_text)

attack_ordered = [[attack_step[i-1], attack_step[i]] \
                       for i in range(1, len(attack_step), 2)]

formed_attack_data = {
    
    'game_id': game_id, 
    'league_id': league_id,
    
    'home_possession_1h_2h': attack_ordered[0][0],
    'home_territory_1h_2h': attack_ordered[1][0],
    'home_clean_breaks': attack_ordered[2][0],
    'home_defenders_beaten': attack_ordered[3][0],
    'home_offloads': attack_ordered[4][0],
    'home_rucks_won': attack_ordered[5][0],
    'home_mauls_won': attack_ordered[6][0],
    'home_turnovers_conceeded': attack_ordered[7][0],
    
    'away_possession_1h_2h': attack_ordered[0][1],
    'away_territory_1h_2h': attack_ordered[1][1],
    'away_clean_breaks': attack_ordered[2][1],
    'away_defenders_beaten': attack_ordered[3][1],
    'away_offloads': attack_ordered[4][1],
    'away_rucks_won': attack_ordered[5][1],
    'away_mauls_won': attack_ordered[6][1],
    'away_turnovers_conceeded': attack_ordered[7][1]
    
}

formed_attack_data

{'game_id': '597383',
 'league_id': '180659',
 'home_possession_1h_2h': '69% / 46%',
 'home_territory_1h_2h': '59% / 49%',
 'home_clean_breaks': '12',
 'home_defenders_beaten': '36',
 'home_offloads': '10',
 'home_rucks_won': '130 / 131 (99%)',
 'home_mauls_won': '5 / 6 (83%)',
 'home_turnovers_conceeded': '10',
 'away_possession_1h_2h': '31% / 54%',
 'away_territory_1h_2h': '41% / 51%',
 'away_clean_breaks': '1',
 'away_defenders_beaten': '15',
 'away_offloads': '3',
 'away_rucks_won': '109 / 115 (94%)',
 'away_mauls_won': '2 / 3 (66%)',
 'away_turnovers_conceeded': '8'}

## Group 5: Possession and Territory 

In [6]:
poss_top = soup.findAll(
    class_="stat-graph compareLineGraph twoTeam largeLabels"
)[1]

terr_top = soup.findAll(
    class_="stat-graph compareLineGraph twoTeam largeLabels large"
)[0]

poss_vals = poss_top.findAll(class_='chartValue')
terr_vals = terr_top.findAll(class_='chartValue')

form_terr_poss = {
    
    'game_id': game_id,
    'league_id': league_id,
    
    'home_total_possession': poss_vals[0].text,
    'home_total_territory': terr_vals[0].text,
    
    'away_total_possesion': poss_vals[1].text,
    'away_total_territory': terr_vals[1].text
}

form_terr_poss

{'game_id': '597383',
 'league_id': '180659',
 'home_total_possession': '54%',
 'home_total_territory': '53%',
 'away_total_possesion': '46%',
 'away_total_territory': '47%'}

In [17]:
sp_bar = soup.findAll(class_='sub-module equal-height countChartList height-reset')
len(sp_bar)

4

## Group 6: Set Pieces 

In [7]:
sp_bar = soup.findAll(class_='sub-module equal-height countChartList height-reset')[2]
sp_charts = sp_bar.findAll(class_='countChart')

scrum = sp_charts[0]
home_scrum = scrum.findAll(class_='countLabel')[0].text
away_scrum = scrum.findAll(class_='countLabel')[1].text

lineouts = sp_charts[1]
home_lineout = lineouts.findAll(class_='countLabel')[0].text
away_lineout = lineouts.findAll(class_='countLabel')[1].text

form_sp = {
    'game_id': game_id,
    'league_id': league_id,
    'home_scrum': home_scrum,
    'home_lineout': home_lineout,
    
    'away_scrum': away_scrum,
    'away_lineout': away_lineout
}

form_sp

{'game_id': '597383',
 'league_id': '180659',
 'home_scrum': '3/3 (100%)',
 'home_lineout': '12/14 (85%)',
 'away_scrum': '5/8 (62%)',
 'away_lineout': '10/12 (83%)'}

## Group 7: Defending 

In [8]:
def_bar = soup.findAll(class_='sub-module equal-height countChartList height-reset')[3]
raw_tackles = def_bar.findAll(class_='home-team')
perc_tackles = def_bar.findAll(class_='away-team')

form_def = {
    'game_id': game_id,
    'league_id': league_id,
    
    'home_tackles': raw_tackles[0].text, 
    'home_tackle_perc': perc_tackles[0].text, 
    
    'away_tackles': raw_tackles[1].text, 
    'away_tackle_perc': perc_tackles[1].text
}

form_def

{'game_id': '597383',
 'league_id': '180659',
 'home_tackles': '190/205',
 'home_tackle_perc': '93%',
 'away_tackles': '194/230',
 'away_tackle_perc': '84%'}

## Group 8: Discipline and Penalties

In [9]:
disc_tab = tables[3]
disc_rows = disc_tab.find('tbody').findAll('td')

stack_rls = soup.findAll(class_='stacked-rl')
penalty = stack_rls[1].find(class_='countChart').findAll(
    class_='countLabel'
)

disc_labels = [
    'Red Cards', 'Yellow Cards', 'Total Free Kicks Conceded'
]

disc_step = []
for td in disc_rows:
    val_text = td.text
    if val_text not in disc_labels:
        disc_step.append(val_text)

disc_ordered = [[disc_step[i-1], disc_step[i]] \
                       for i in range(1, len(disc_step), 2)]

form_def_penalty = {
    'game_id': game_id,
    'league_id': league_id,
    
    'home_red_cards': disc_ordered[0][0],
    'home_yellow_cards': disc_ordered[1][0],
    'home_free_kicks_con': disc_ordered[2][0], 
    
    
    'away_red_cards': disc_ordered[0][1],
    'away_yellow_cards': disc_ordered[1][1],
    'away_free_kicks_con': disc_ordered[2][1]
}

form_def_penalty 


{'game_id': '597383',
 'league_id': '180659',
 'home_red_cards': '0',
 'home_yellow_cards': '2',
 'home_free_kicks_con': '1',
 'away_red_cards': '0',
 'away_yellow_cards': '0',
 'away_free_kicks_con': '2'}