In [2]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

r = requests.get('https://www.bjjheroes.com/a-z-bjj-fighters-list')
# print the first 500 characters of the HTML
print(r.text[0:500])

<!doctype html>
<head dir="ltr" lang="en-US" prefix="og: https://ogp.me/ns#">

<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<link href="//www.google-analytics.com" rel="dns-prefetch">
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes" />
<link rel="pingback" href="https://www.bjjheroes.com/xmlrpc.php">
<link rel="icon" id="favicon" type="image/png" href="https://www.bjjheroes.com/wp-content/uploads/2020/03/favicon-16x


In [3]:
soup = BeautifulSoup(r.text, 'html.parser')

# Method 1 to Tag html 

In [4]:
tag = soup.title
tag

<title>Competitor and Coach Database | BJJ Heroes</title>

# Method 2 to Tag html

In [9]:
tag = soup.find_all("td")

print('First 20 entries')
tag[:10]

First 20 entries


[<td class="column-1"><a href="/?p=8141">Aarae</a> </td>,
 <td class="column-2"><a href="/?p=8141">Alexander</a></td>,
 <td class="column-3"></td>,
 <td class="column-4">Team Lloyd Irvin</td>,
 <td class="column-1"><a href="/?p=9246">Aaron</a> </td>,
 <td class="column-2"><a href="/?p=9246">Johnson</a> </td>,
 <td class="column-3"><a href="/?p=9246">Tex</a> </td>,
 <td class="column-4">Unity JJ</td>,
 <td class="column-1"><a href="/?p=8494">Abdurakhman</a> </td>,
 <td class="column-2"><a href="/?p=8494">Bilarov</a> </td>]

# Storing Athlete information in a DataFrame

In [12]:
# Find all table rows
rows = soup.find_all("tr")

# empty list to store data
data = []

# Iterate over the rows and extract
for row in rows:
    columns = row.find_all("td")
    if len(columns) == 4:  # confirm the column count in the row
        first_name_tag = columns[0].find("a")
        first_name = first_name_tag.text.strip() if first_name_tag else columns[0].text.strip()
        first_name_url = first_name_tag['href'] if first_name_tag else None

        last_name_tag = columns[1].find("a")
        last_name = last_name_tag.text.strip() if last_name_tag else columns[1].text.strip()
        last_name_url = last_name_tag['href'] if last_name_tag else None

        nick_name_tag = columns[2].find("a")
        nick_name = nick_name_tag.text.strip() if nick_name_tag else columns[2].text.strip()
        nick_name_url = nick_name_tag['href'] if nick_name_tag else None

        team_tag = columns[3].find("a")
        team = team_tag.text.strip() if team_tag else columns[3].text.strip()
        team_url = team_tag['href'] if team_tag else None
        
        # Append data to list
        data.append({
            "First Name": first_name,
            "Last Name": last_name,
            "Nick Name": nick_name,
            "Team": team,
            "Athlete URL": first_name_url,
        })

# Convert to DataFrame
df = pd.DataFrame(data)

print(df)

       First Name      Last Name  Nick Name               Team Athlete URL
0           Aarae      Alexander              Team Lloyd Irvin    /?p=8141
1           Aaron        Johnson        Tex           Unity JJ    /?p=9246
2     Abdurakhman        Bilarov                 Team Nogueira    /?p=8494
3           Abmar        Barbosa                                   /?p=390
4         Abraham  Marte Messina             Yamasaki / Basico    /?p=3083
...           ...            ...        ...                ...         ...
1379       Valdir         Canuto  Tio Chico          Zenith JJ    /?p=7505
1380      Nakapan    Phungephorn                  BETA Academy    /?p=7512
1381        Eliot          Kelly                     Yemaso JJ    /?p=7519
1382     Mauricio        Pereira   Mauricao         Behring JJ    /?p=7556
1383     Vinicius         Garcia                                  /?p=7636

[1384 rows x 5 columns]


# Attempt to store a single Athlete data in a dataframe

In [17]:
url = 'https://www.bjjheroes.com/bjj-fighters/aj-agazarm'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Find all the table rows
rows = soup.find_all("tr")
rows[:20]

[<tr>
 <th>ID</th>
 <th>Opponent</th>
 <th>W/L</th>
 <th>Method</th>
 <th>Competition</th>
 <th>Weight</th>
 <th>Stage</th>
 <th>Year</th>
 </tr>,
 <tr><td>4909</td><td class="sort"><span>Alan Finfou</span><a href="/?p=289">Alan Finfou</a></td><td style="color:#d91300;">L</td><td>Referee Decision</td><td>European NoGi</td><td>ABS</td><td>SF</td><td>2013</td></tr>,
 <tr><td>4923</td><td class="sort"><span>Claudio Mattos</span><a href="/?p=443">Claudio Mattos</a></td><td style="color:#d91300;">L</td><td>Adv</td><td>Rio Open</td><td>76KG</td><td>F</td><td>2013</td></tr>,
 <tr><td>4945</td><td class="sort"><span>Vitor Oliveira</span><a href="/?p=2035">Vitor Oliveira</a></td><td style="color:#d91300;">L</td><td><a href="/?p=5463">Armbar</a></td><td>American Nats</td><td>ABS</td><td>F</td><td>2013</td></tr>,
 <tr><td>4972</td><td class="sort"><span>Murilo Santana</span><a href="/?p=484">Murilo Santana</a></td><td style="color:#d91300;">L</td><td>Choke</td><td>NoGi Pan Am.</td><td>ABS</td><td

# Iterate through the DataFrame
### Use Stored Athlete URL to scrape data from each athletes match results

In [13]:


# Function to append athlete url and extract data
def extract_athlete_data(athlete_url):
    full_url = f'https://www.bjjheroes.com{athlete_url}'
    response = requests.get(full_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all table rows
    rows = soup.find_all("tr")

    # Stores Data
    data = []

    # Iterate extract
    for row in rows:
        columns = row.find_all("td")
        if len(columns) == 8:  # confirm table column count
            opponent_tag = columns[1].find("a")
            opponent_name = opponent_tag.text.strip() if opponent_tag else columns[1].text.strip()
            result = columns[2].text.strip()
            method_tag = columns[3].find("a")
            method = method_tag.text.strip() if method_tag else columns[3].text.strip()
            event = columns[4].text.strip()
            weight = columns[5].text.strip()
            stage = columns[6].text.strip()
            year = columns[7].text.strip()

            # Append to list
            data.append({
                "Athlete URL": full_url,
                "Opponent Name": opponent_name,
                "Result": result,
                "Method": method,
                "Event": event,
                "Weight": weight,
                "Stage": stage,
                "Year": year
            })
    return data


# Store the final data
all_data = []

# Loop through each athlete in the initial DataFrame
for index, row in df.iterrows():
    athlete_url = row['Athlete URL']
    athlete_data = extract_athlete_data(athlete_url)
    all_data.extend(athlete_data)

# Convert the list to DataFrame
df_final = pd.DataFrame(all_data)

print(df_final)


                             Athlete URL       Opponent Name Result  \
0      https://www.bjjheroes.com/?p=9246  Quentin Rosensweig      L   
1      https://www.bjjheroes.com/?p=9246       Neiman Gracie      L   
2      https://www.bjjheroes.com/?p=9246     Richie Martinez      L   
3      https://www.bjjheroes.com/?p=9246        Leo Nogueira      L   
4      https://www.bjjheroes.com/?p=9246      Romulo Azevedo      L   
...                                  ...                 ...    ...   
50828  https://www.bjjheroes.com/?p=7636         Cody Heller      W   
50829  https://www.bjjheroes.com/?p=7636      Daniel Olivier      W   
50830  https://www.bjjheroes.com/?p=7636      Joshua Murdock      W   
50831  https://www.bjjheroes.com/?p=7636       Kyle Raemisch      W   
50832  https://www.bjjheroes.com/?p=7636        Kevin Vieira      W   

                 Method             Event Weight Stage  Year  
0      Inside heel hook          Kakuto 5    ABS   SPF  2015  
1                   R

# Extract URL to new column 'ID', then match first and last name to ID and replace the URL

In [34]:
df['ID'] = df['Athlete URL'].str.extract(r'\?p=(\d+)')
df_final['ID'] = df_final['Athlete URL'].str.extract(r'\?p=(\d+)')

df_merged = df_final.merge(df[['ID', 'First Name', 'Last Name']], on='ID', how='left')

# Drop'Athlete URL' and 'ID'
df_merged.drop(columns=["Athlete URL", "ID"], inplace=True)

# Place 'First Name' and 'Last Name' at the beginning
df_merged = df_merged[["First Name", "Last Name", "Opponent Name", "Result", "Method", "Event", "Weight", "Stage", "Year"]]

# Final DataFrame
print(df_merged)

      First Name Last Name       Opponent Name Result            Method  \
0          Aaron   Johnson  Quentin Rosensweig      L  Inside heel hook   
1          Aaron   Johnson       Neiman Gracie      L               RNC   
2          Aaron   Johnson     Richie Martinez      L         Heel hook   
3          Aaron   Johnson        Leo Nogueira      L            Points   
4          Aaron   Johnson      Romulo Azevedo      L               N/A   
...          ...       ...                 ...    ...               ...   
51294   Vinicius    Garcia         Cody Heller      W               N/A   
51295   Vinicius    Garcia      Daniel Olivier      W       Canto choke   
51296   Vinicius    Garcia      Joshua Murdock      W            Points   
51297   Vinicius    Garcia       Kyle Raemisch      W   Mounted X choke   
51298   Vinicius    Garcia        Kevin Vieira      W   Hashimoto choke   

                  Event Weight Stage  Year  
0              Kakuto 5    ABS   SPF  2015  
1        

In [16]:
df_merged

Unnamed: 0,First Name,Last Name,Opponent Name,Result,Method,Event,Weight,Stage,Year
0,Aaron,Johnson,Quentin Rosensweig,L,Inside heel hook,Kakuto 5,ABS,SPF,2015
1,Aaron,Johnson,Neiman Gracie,L,RNC,NoGi Pan Ams,94KG,SF,2015
2,Aaron,Johnson,Richie Martinez,L,Heel hook,Kakuto Challenge,ABS,SF,2015
3,Aaron,Johnson,Leo Nogueira,L,Points,Atlanta W. Open,94KG,SF,2016
4,Aaron,Johnson,Romulo Azevedo,L,,UAEJJF NYC Pro,94KG,SF,2016
