In [22]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [23]:
# Fetch Ilia Topuria's fighter page from ufcstats.com.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook here on an HTTP error instead of letting
# later cells fail while parsing an error page.
r = requests.get('http://ufcstats.com/fighter-details/54f64b5e283b0ce7', timeout=10)
r.raise_for_status()

# Show the response object (e.g. <Response [200]>) as a quick sanity check
print(r)

# Parse the raw response body with the built-in HTML parser
soup = BeautifulSoup(r.content, 'html.parser')

# Preview only the start of the prettified markup; dumping the entire page
# floods the notebook output without adding information
print(soup.prettify()[:1000])


<Response [200]>
<!DOCTYPE html>
<!--[if lt IE 7]>      <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html class="no-js ie8 lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js">
 <!--<![endif]-->
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <title>
   Stats | UFC
  </title>
  <meta content="" name="description"/>
  <meta content="" name="viewport"/>
  <link href="/blocks/main.css?ver=616580" rel="stylesheet"/>
  <script src="/js/vendor/modernizr-2.6.2.min.js">
  </script>
  <script>
   (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
    (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
    m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
    })(window,document,'script','//www.google-analytics.com/analytics.js','ga');

    ga('create', 'U

In [24]:
# ## Build the DataFrame of Ilia Topuria's fight history from the fighter page

# Column names, in the same order as the first ten <td> cells of each table row
fight_columns = [
    'W/L',                # Win or Loss outcome
    'Fighter',            # Both fighter names as they appear in the cell
    'Knockdown',          # Knockdowns (both fighters)
    'Strikes',            # Strikes (both fighters)
    'Takedowns',          # Takedowns (both fighters)
    'SubmissionAttempt',  # Submission attempts (both fighters)
    'Event',              # Event name
    'WinMethod',          # Method of victory
    'Round',              # Round in which the fight ended
    'Time'                # Time at which the fight ended
]

# ## Scraping data from the HTML table

# Locate the first table body on the page and fail with a clear message if the
# page layout is not what this cell expects
fight_table_body = soup.find('tbody')
if fight_table_body is None:
    raise ValueError('No <tbody> element found on the fighter page')

# Find all the rows in the table body (tbody)
rows = fight_table_body.find_all('tr')

# Collect one dict per fight and build the DataFrame once at the end.
# Growing a DataFrame with pd.concat inside the loop copies every previous row
# on each iteration, and concatenating onto an empty frame is deprecated.
fight_records = []
for row in rows:
    cells = row.find_all('td')

    # Only process rows that have at least one cell per expected column
    if len(cells) >= len(fight_columns):
        # Clean the text of each cell: trim the ends and remove newlines
        cell_values = [
            cell.text.strip().replace('\n', '')
            for cell in cells[:len(fight_columns)]
        ]
        fight_records.append(dict(zip(fight_columns, cell_values)))

# Passing columns= keeps the column order and still yields a correctly shaped
# (empty) frame when no row qualified
illia_topuria = pd.DataFrame(fight_records, columns=fight_columns)

# ## Display the first few rows of the DataFrame to inspect the data
illia_topuria.head()


Unnamed: 0,W/L,Fighter,Knockdown,Strikes,Takedowns,SubmissionAttempt,Event,WinMethod,Round,Time
0,win,Ilia Topuria Alexander Vo...,1 0,35 47,0 0,0 0,UFC 298: Volkanovski vs. Topuria ...,KO/TKO Punch,2,3:32
1,win,Ilia Topuria Josh Emmett,1 0,152 87,3 0,0 0,UFC Fight Night: Emmett vs. Topuria ...,U-DEC,5,5:00
2,win,Ilia Topuria Bryce Mitchell,1 0,40 36,0 1,1 0,UFC 282: Blachowicz vs. Ankalaev ...,SUB Arm Triangle,2,3:10
3,win,Ilia Topuria Jai Herbert,1 1,20 22,1 0,0 0,UFC Fight Night: Volkov vs. Aspinall ...,KO/TKO Punch,2,1:07
4,win,Ilia Topuria Ryan Hall,0 0,18 10,0 0,0 0,UFC 264: Poirier vs. McGregor 3 ...,KO/TKO Punches,1,4:47


In [25]:
# Extract and clean specific columns in the DataFrame

# Each raw cell holds two values (one per fighter); the first token is kept.
# Maps new column name -> raw column it is derived from.
first_token_columns = {
    'Knockdowns': 'Knockdown',
    'Strike': 'Strikes',
    'Takedown': 'Takedowns',
    'SubmissionAttempts': 'SubmissionAttempt',
    'Method_of_Win': 'WinMethod',   # first word of the win method, e.g. "KO/TKO"
}

for new_column, raw_column in first_token_columns.items():
    illia_topuria[new_column] = illia_topuria[raw_column].str.split(pat=' ').str[0]

# Extract the opponent's name: everything after 'Topuria' in the 'Fighter' cell.
# The raw cell keeps the whitespace that separated the two names, so strip it;
# otherwise every opponent name starts with a run of spaces.
illia_topuria['Opponent'] = (
    illia_topuria['Fighter'].str.split(pat='Topuria').str[1].str.strip()
)

# Drop the raw columns that have been replaced by the cleaned ones above
illia_topuria = illia_topuria.drop(
    columns=[*first_token_columns.values(), 'Fighter']
)


FINDING STRIKE DATA OF ILIA TOPURIA

In [26]:
# Fight-detail pages to scrape, one URL per fight
data_url = [
    f'http://ufcstats.com/fight-details/{fight_id}'
    for fight_id in (
        'bec3154a11df3299',
        'c3ef3cb03edde8bb',
        '4a17876e99f6baf3',
        'a200b5dcbdd2506e',
        '07468b6347ac5e3d',
        'b1be8b41b1a4fd85',
        '22fe6779c3fa649d',
    )
]

# Column layout of the significant-strikes DataFrame
columns = [
    'Fighter',
    'Sig. Str.',
    'Sig. Str. %',
    'Head',
    'Body',
    'Leg',
    'Distance',
    'Clinch',
    'Ground',
]

# Start from an empty DataFrame that already carries those columns
illia_significant_strikes = pd.DataFrame(columns=columns)

# Function to scrape fight data for a given URL
def scrape_fight_data(fight_url, fighter='Ilia Topuria', timeout=10):
    """Scrape one fight page and return the strike-table row that mentions `fighter`.

    Parameters
    ----------
    fight_url : str
        URL of the fight-details page to download.
    fighter : str, optional
        Name to look for in the first cell of each table row.
        Defaults to 'Ilia Topuria'.
    timeout : float, optional
        Seconds to wait for the server before giving up. Defaults to 10.

    Returns
    -------
    dict or None
        Cleaned text of the first nine cells of the matching row, keyed by
        'Fighter', 'Sig. Str.', 'Sig. Str. %', 'Head', 'Body', 'Leg',
        'Distance', 'Clinch' and 'Ground'. Each value is the full cell text, so
        it contains whatever the cell holds for every fighter in that row.
        None when no row mentions `fighter`.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or returns an HTTP error status.
    ValueError
        If the page does not contain the expected table.
    """
    # Fail fast on network problems instead of parsing an error page
    response = requests.get(fight_url, timeout=timeout)
    response.raise_for_status()

    # Parse the response content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # The notebook reads its strike breakdown from the second table carrying
    # this class; report a clear error if the page layout differs
    tables = soup.find_all('table', class_='b-fight-details__table')
    if len(tables) < 2:
        raise ValueError(f'Expected at least two fight-detail tables at {fight_url}')
    table_body = tables[1].find('tbody')
    if table_body is None:
        raise ValueError(f'Fight-detail table has no <tbody> at {fight_url}')

    # Output keys, in the same order as the table cells they are read from
    field_names = ('Fighter', 'Sig. Str.', 'Sig. Str. %', 'Head', 'Body',
                   'Leg', 'Distance', 'Clinch', 'Ground')

    for row in table_body.find_all('tr'):
        cols = row.find_all('td')

        # Skip rows that cannot supply every field (avoids IndexError)
        if len(cols) < len(field_names):
            continue

        if fighter in cols[0].text.strip():
            # Trim each cell and remove embedded newlines
            return {
                name: cell.text.strip().replace('\n', '')
                for name, cell in zip(field_names, cols)
            }

    # No row mentioned the requested fighter
    return None

# Scrape every fight page and collect one record (dict) per fight.
# Building the DataFrame once from the list of records avoids creating and
# concatenating a separate one-row DataFrame per fight, and still works when
# nothing was scraped (pd.concat raises on an empty list).
fight_records = []
skipped_urls = []

for url in data_url:
    fight_data = scrape_fight_data(url)
    if fight_data:
        fight_records.append(fight_data)
    else:
        skipped_urls.append(url)

# A skipped fight shifts the row positions of every later fight, which matters
# because a later cell combines this frame with the fight history by row index
if skipped_urls:
    print(f"Warning: no data found for {len(skipped_urls)} fight(s): {skipped_urls}")

# One row per scraped fight, columns in the predefined order
illia_significant_strikes = pd.DataFrame(fight_records, columns=columns)

# Display the first few rows of the resulting DataFrame
illia_significant_strikes.head()


Unnamed: 0,Fighter,Sig. Str.,Sig. Str. %,Head,Body,Leg,Distance,Clinch,Ground
0,Alexander Volkanovski Ilia Topuria,26 of 52 15 of 38,50% 39%,8 of 29 4 of 25,7 of 12 7 of 8,11 of 11 4 of 5,24 of 49 13 of 36,2 of 3 2 of 2,0 of 0 0 of 0
1,Josh Emmett Ilia Topuria,14 of 68 21 of 56,20% 37%,8 of 58 14 of 47,4 of 8 4 of 5,2 of 2 3 of 4,14 of 68 21 of 56,0 of 0 0 of 0,0 of 0 0 of 0
2,Bryce Mitchell Ilia Topuria,21 of 52 17 of 38,40% 44%,13 of 38 12 of 27,6 of 12 4 of 10,2 of 2 1 of 1,20 of 50 16 of 37,0 of 0 1 of 1,1 of 2 0 of 0
3,Jai Herbert Ilia Topuria,19 of 47 12 of 27,40% 44%,18 of 45 11 of 25,0 of 1 1 of 2,1 of 1 0 of 0,16 of 43 6 of 21,1 of 1 2 of 2,2 of 3 4 of 4
4,Ryan Hall Ilia Topuria,10 of 17 18 of 32,58% 56%,2 of 7 11 of 24,4 of 5 2 of 2,4 of 5 5 of 6,9 of 16 3 of 8,0 of 0 0 of 0,1 of 1 15 of 24


## Before applying any transformations, it’s important to inspect the structure of the columns to understand how the data is formatted.

We will:

### Display the Raw Data
Check the contents of the columns to see if there are any irregularities (such as extra spaces or newlines).

### Split the Data
Split each column's values on spaces to see how they are parsed and to locate where the second set of strike figures sits in the resulting list.

### Identify Issues
Review the result of the split to check for empty or misplaced values.


In [27]:
# Columns whose raw cell text will be inspected
col_names = [
    'Sig. Str.',
    'Head',
    'Body',
    'Leg',
    'Distance',
    'Clinch',
    'Ground',
]

# For every column, show what a plain split on single spaces produces so that
# stray empty tokens and other formatting irregularities become visible
for column_name in col_names:
    # Header naming the column being inspected
    print(f"Logic of {column_name}")

    # The list of tokens obtained for each row of this column
    split_preview = illia_significant_strikes[column_name].str.split(pat=' ')
    print(split_preview)

    # Blank separator between columns
    print("""\n""")


Logic of Sig. Str.
0    [26, of, 52, , , , , , , , , , 15, of, 38]
1    [14, of, 68, , , , , , , , , , 21, of, 56]
2    [21, of, 52, , , , , , , , , , 17, of, 38]
3    [19, of, 47, , , , , , , , , , 12, of, 27]
4    [10, of, 17, , , , , , , , , , 18, of, 32]
5    [31, of, 63, , , , , , , , , , 16, of, 52]
6      [4, of, 21, , , , , , , , , , 6, of, 16]
Name: Sig. Str., dtype: object


Logic of Head
0      [8, of, 29, , , , , , , , , , 4, of, 25]
1     [8, of, 58, , , , , , , , , , 14, of, 47]
2    [13, of, 38, , , , , , , , , , 12, of, 27]
3    [18, of, 45, , , , , , , , , , 11, of, 25]
4      [2, of, 7, , , , , , , , , , 11, of, 24]
5    [12, of, 34, , , , , , , , , , 10, of, 40]
6       [1, of, 16, , , , , , , , , , 2, of, 9]
Name: Head, dtype: object


Logic of Body
0      [7, of, 12, , , , , , , , , , 7, of, 8]
1       [4, of, 8, , , , , , , , , , 4, of, 5]
2     [6, of, 12, , , , , , , , , , 4, of, 10]
3       [0, of, 1, , , , , , , , , , 1, of, 2]
4       [4, of, 5, , , , , , , ,

In [28]:
# Collapse every run of whitespace (spaces, newlines, tabs) into a single space
# in each of the columns listed in col_names (defined in the previous cell).
# The splitting itself is done in a later step.
illia_significant_strikes[col_names] = illia_significant_strikes[col_names].apply(
    lambda column: column.str.replace(r'\s+', ' ', regex=True)
)

# Check the results
illia_significant_strikes.head()


Unnamed: 0,Fighter,Sig. Str.,Sig. Str. %,Head,Body,Leg,Distance,Clinch,Ground
0,Alexander Volkanovski Ilia Topuria,26 of 52 15 of 38,50% 39%,8 of 29 4 of 25,7 of 12 7 of 8,11 of 11 4 of 5,24 of 49 13 of 36,2 of 3 2 of 2,0 of 0 0 of 0
1,Josh Emmett Ilia Topuria,14 of 68 21 of 56,20% 37%,8 of 58 14 of 47,4 of 8 4 of 5,2 of 2 3 of 4,14 of 68 21 of 56,0 of 0 0 of 0,0 of 0 0 of 0
2,Bryce Mitchell Ilia Topuria,21 of 52 17 of 38,40% 44%,13 of 38 12 of 27,6 of 12 4 of 10,2 of 2 1 of 1,20 of 50 16 of 37,0 of 0 1 of 1,1 of 2 0 of 0
3,Jai Herbert Ilia Topuria,19 of 47 12 of 27,40% 44%,18 of 45 11 of 25,0 of 1 1 of 2,1 of 1 0 of 0,16 of 43 6 of 21,1 of 1 2 of 2,2 of 3 4 of 4
4,Ryan Hall Ilia Topuria,10 of 17 18 of 32,58% 56%,2 of 7 11 of 24,4 of 5 2 of 2,4 of 5 5 of 6,9 of 16 3 of 8,0 of 0 0 of 0,1 of 1 15 of 24


In [29]:
# Each raw cell holds both fighters' figures back to back, e.g. "26 of 52 15 of 38".
# Keep only Ilia Topuria's "X of Y" triple, choosing it by his position in the
# 'Fighter' column rather than assuming he is always the second fighter listed.
# NOTE: this needs the 'Fighter' column, so run it before that column is dropped.
topuria_listed_first = (
    illia_significant_strikes['Fighter'].str.strip().str.startswith('Ilia Topuria', na=False)
)

for i in col_names:
    # Split once on any run of whitespace; a raw cell yields six tokens
    tokens = illia_significant_strikes[i].str.split()
    first_fighter = tokens.str[0:3].str.join(' ')
    second_fighter = tokens.str[3:6].str.join(' ')

    # Take the first triple where Topuria is listed first, the second otherwise
    topuria_stats = first_fighter.where(topuria_listed_first, second_fighter)

    # Only rewrite cells that still hold both fighters' figures, so re-running
    # this cell leaves already-reduced values untouched instead of turning them into NaN
    has_both_fighters = tokens.str.len() == 6
    illia_significant_strikes[i] = topuria_stats.where(
        has_both_fighters, illia_significant_strikes[i]
    )

# Display the first few rows of the DataFrame to verify the changes
illia_significant_strikes.head()


Unnamed: 0,Fighter,Sig. Str.,Sig. Str. %,Head,Body,Leg,Distance,Clinch,Ground
0,Alexander Volkanovski Ilia Topuria,15 of 38,50% 39%,4 of 25,7 of 8,4 of 5,13 of 36,2 of 2,0 of 0
1,Josh Emmett Ilia Topuria,21 of 56,20% 37%,14 of 47,4 of 5,3 of 4,21 of 56,0 of 0,0 of 0
2,Bryce Mitchell Ilia Topuria,17 of 38,40% 44%,12 of 27,4 of 10,1 of 1,16 of 37,1 of 1,0 of 0
3,Jai Herbert Ilia Topuria,12 of 27,40% 44%,11 of 25,1 of 2,0 of 0,6 of 21,2 of 2,4 of 4
4,Ryan Hall Ilia Topuria,18 of 32,58% 56%,11 of 24,2 of 2,5 of 6,3 of 8,0 of 0,15 of 24


In [30]:
# Names of the columns that will receive the "landed" counts
landed_col_names = [
    'Significant_Strikes_Landed',
    'Head_Landed',
    'Body_Landed',
    'Leg_Landed',
    'Distance_Landed',
    'Clinch_Landed',
    'Ground_Landed',
]

# Pair every source column with its target column; the landed count is the
# first space-separated token of the "X of Y" text
for source_column, landed_column in zip(col_names, landed_col_names):
    parts = illia_significant_strikes[source_column].str.split(' ')
    illia_significant_strikes[landed_column] = parts.str.get(0)

# Show the first rows to check the new columns
illia_significant_strikes.head()


Unnamed: 0,Fighter,Sig. Str.,Sig. Str. %,Head,Body,Leg,Distance,Clinch,Ground,Significant_Strikes_Landed,Head_Landed,Body_Landed,Leg_Landed,Distance_Landed,Clinch_Landed,Ground_Landed
0,Alexander Volkanovski Ilia Topuria,15 of 38,50% 39%,4 of 25,7 of 8,4 of 5,13 of 36,2 of 2,0 of 0,15,4,7,4,13,2,0
1,Josh Emmett Ilia Topuria,21 of 56,20% 37%,14 of 47,4 of 5,3 of 4,21 of 56,0 of 0,0 of 0,21,14,4,3,21,0,0
2,Bryce Mitchell Ilia Topuria,17 of 38,40% 44%,12 of 27,4 of 10,1 of 1,16 of 37,1 of 1,0 of 0,17,12,4,1,16,1,0
3,Jai Herbert Ilia Topuria,12 of 27,40% 44%,11 of 25,1 of 2,0 of 0,6 of 21,2 of 2,4 of 4,12,11,1,0,6,2,4
4,Ryan Hall Ilia Topuria,18 of 32,58% 56%,11 of 24,2 of 2,5 of 6,3 of 8,0 of 0,15 of 24,18,11,2,5,3,0,15


In [31]:
# Names of the columns that will receive the "attempted" counts
attempted_col_names = [
    'Significant_Strikes_Attempted',
    'Head_Attempted',
    'Body_Attempted',
    'Leg_Attempted',
    'Distance_Attempted',
    'Clinch_Attempted',
    'Ground_Attempted',
]

# Pair every source column with its target column; the attempted count is the
# third space-separated token (index 2) of the "X of Y" text
for source_column, attempted_column in zip(col_names, attempted_col_names):
    parts = illia_significant_strikes[source_column].str.split(' ')
    illia_significant_strikes[attempted_column] = parts.str.get(2)

# Show the first rows to check the new columns
illia_significant_strikes.head()


Unnamed: 0,Fighter,Sig. Str.,Sig. Str. %,Head,Body,Leg,Distance,Clinch,Ground,Significant_Strikes_Landed,...,Distance_Landed,Clinch_Landed,Ground_Landed,Significant_Strikes_Attempted,Head_Attempted,Body_Attempted,Leg_Attempted,Distance_Attempted,Clinch_Attempted,Ground_Attempted
0,Alexander Volkanovski Ilia Topuria,15 of 38,50% 39%,4 of 25,7 of 8,4 of 5,13 of 36,2 of 2,0 of 0,15,...,13,2,0,38,25,8,5,36,2,0
1,Josh Emmett Ilia Topuria,21 of 56,20% 37%,14 of 47,4 of 5,3 of 4,21 of 56,0 of 0,0 of 0,21,...,21,0,0,56,47,5,4,56,0,0
2,Bryce Mitchell Ilia Topuria,17 of 38,40% 44%,12 of 27,4 of 10,1 of 1,16 of 37,1 of 1,0 of 0,17,...,16,1,0,38,27,10,1,37,1,0
3,Jai Herbert Ilia Topuria,12 of 27,40% 44%,11 of 25,1 of 2,0 of 0,6 of 21,2 of 2,4 of 4,12,...,6,2,4,27,25,2,0,21,2,4
4,Ryan Hall Ilia Topuria,18 of 32,58% 56%,11 of 24,2 of 2,5 of 6,3 of 8,0 of 0,15 of 24,18,...,3,0,15,32,24,2,6,8,0,24


In [32]:
# 'Sig. Str. %' is handled separately from the loop above because each cell holds
# two percentages ("50% 39%") rather than two "X of Y" triples.

# Collapse runs of whitespace so the two percentages are separated by one space
illia_significant_strikes['Sig. Str. %'] = (
    illia_significant_strikes['Sig. Str. %'].str.replace(r'\s+', ' ', regex=True)
)

# Pick Ilia Topuria's percentage by his position in the 'Fighter' column rather
# than assuming he is always the second fighter listed.
# NOTE: this needs the 'Fighter' column, so run it before that column is dropped.
topuria_listed_first = (
    illia_significant_strikes['Fighter'].str.strip().str.startswith('Ilia Topuria', na=False)
)
percentage_tokens = illia_significant_strikes['Sig. Str. %'].str.split(' ')
topuria_percentage = percentage_tokens.str[0].where(
    topuria_listed_first, percentage_tokens.str[1]
)

# Remove the literal '%' sign, leaving just the numeric text
illia_significant_strikes['Significant_Strikes_Percentage'] = (
    topuria_percentage.str.replace('%', '', regex=False)
)


In [33]:
# Remove the 'Fighter' column, which is not needed in the cleaned strike data
illia_significant_strikes = illia_significant_strikes.drop(columns=['Fighter'])

In [34]:
# Preview the first five rows of the cleaned strike data
illia_significant_strikes.head(n=5)

Unnamed: 0,Sig. Str.,Sig. Str. %,Head,Body,Leg,Distance,Clinch,Ground,Significant_Strikes_Landed,Head_Landed,...,Clinch_Landed,Ground_Landed,Significant_Strikes_Attempted,Head_Attempted,Body_Attempted,Leg_Attempted,Distance_Attempted,Clinch_Attempted,Ground_Attempted,Significant_Strikes_Percentage
0,15 of 38,50% 39%,4 of 25,7 of 8,4 of 5,13 of 36,2 of 2,0 of 0,15,4,...,2,0,38,25,8,5,36,2,0,39
1,21 of 56,20% 37%,14 of 47,4 of 5,3 of 4,21 of 56,0 of 0,0 of 0,21,14,...,0,0,56,47,5,4,56,0,0,37
2,17 of 38,40% 44%,12 of 27,4 of 10,1 of 1,16 of 37,1 of 1,0 of 0,17,12,...,1,0,38,27,10,1,37,1,0,44
3,12 of 27,40% 44%,11 of 25,1 of 2,0 of 0,6 of 21,2 of 2,4 of 4,12,11,...,2,4,27,25,2,0,21,2,4,44
4,18 of 32,58% 56%,11 of 24,2 of 2,5 of 6,3 of 8,0 of 0,15 of 24,18,11,...,0,15,32,24,2,6,8,0,24,56


In [35]:
# Combine the fight-history frame with the strike frame into one master frame.
# The join matches rows on the index, keeping every row of the fight-history
# frame, so row k of one frame is paired with row k of the other.
ilia_topuria = illia_topuria.join(illia_significant_strikes, how='left')
ilia_topuria.head()

Unnamed: 0,W/L,Event,Round,Time,Knockdowns,Strike,Takedown,SubmissionAttempts,Method_of_Win,Opponent,...,Clinch_Landed,Ground_Landed,Significant_Strikes_Attempted,Head_Attempted,Body_Attempted,Leg_Attempted,Distance_Attempted,Clinch_Attempted,Ground_Attempted,Significant_Strikes_Percentage
0,win,UFC 298: Volkanovski vs. Topuria ...,2,3:32,1,35,0,0,KO/TKO,Alexander Volkanovski,...,2,0,38,25,8,5,36,2,0,39
1,win,UFC Fight Night: Emmett vs. Topuria ...,5,5:00,1,152,3,0,U-DEC,Josh Emmett,...,0,0,56,47,5,4,56,0,0,37
2,win,UFC 282: Blachowicz vs. Ankalaev ...,2,3:10,1,40,0,1,SUB,Bryce Mitchell,...,1,0,38,27,10,1,37,1,0,44
3,win,UFC Fight Night: Volkov vs. Aspinall ...,2,1:07,1,20,1,0,KO/TKO,Jai Herbert,...,2,4,27,25,2,0,21,2,4,44
4,win,UFC 264: Poirier vs. McGregor 3 ...,1,4:47,0,18,0,0,KO/TKO,Ryan Hall,...,0,15,32,24,2,6,8,0,24,56
