# Scraper for a fighters' specific data

## Imports for scraping data from a website

In [116]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

## Screenshot of table one
![Statistics table one](Images/statistics_table_one.png)

## Screenshot of table two
![Statistics table two](Images/statistics_table_two.png)

## Method for scraping fighter specific data

In [37]:
def scrape_fighter_URL(fighter_url):
    try:
        response = requests.get(fighter_url)

        soup = BeautifulSoup(response.content, 'html.parser')

        #Try finding the stats table using the BeautifulSoup, pass the name of the div of the DOM
        stats_table_one = soup.find('div', class_='b-list__info-box-left clearfix')
        
        #Since there are two tables that contain the fighter statistics I need to scrape both the divs
        stats_table_two = soup.find('div', class_='b-list__info-box-right b-list__info-box_style-margin-right')
        
        # A variable to store the data
        statistics = {}
        
        if stats_table_one:
            # Thats the div that contains the stats
            stats_list_one = stats_table_one.find('ul', class_='b-list__box-list b-list__box-list_margin-top')
            # If a the div with the stats is found we iterate through each entry and store it in the statistics datastructure
            if stats_list_one:
                for item in stats_list_one.find_all('li', class_='b-list__box-list-item b-list__box-list-item_type_block'):
                    # Extract the text from the list item
                    stat_text_one = item.get_text(strip=True)
                    # print(stat_text_one)
                    if ':' in stat_text_one:
                        key, value = [s.strip() for s in stat_text_one.split(':', 1)]
                        statistics[key] = value
        if stats_table_two:
            stats_list_two = stats_table_two.find('ul', class_='b-list__box-list b-list__box-list_margin-top')
            if stats_list_two:
                for item in stats_list_two.find_all('li', class_='b-list__box-list-item b-list__box-list-item_type_block'):
                    stat_text_two = item.get_text(strip=True)
                    # print(stat_text_two)
                    if ':' in stat_text_two:
                        key, value = [s.strip() for s in stat_text_two.split(':', 1)]
                        statistics[key] = value
                    
            return statistics if statistics else None
        else:
            print("Statistics section not found.")
            return None

    except requests.exceptions.RequestException as e:
        print(f"Error fetching the fighter's page: {e}")
        return None

# # Example usage
fighter_url = "http://ufcstats.com/fighter-details/15df64c02b6b0fde"  # Replace with the actual fighter URL
fighter_statistics = scrape_fighter_URL(fighter_url)
print(fighter_statistics)

{'SLpM': '3.29', 'Str. Acc.': '38%', 'SApM': '4.41', 'Str. Def': '57%', 'TD Avg.': '0.00', 'TD Acc.': '0%', 'TD Def.': '77%', 'Sub. Avg.': '0.0'}


# Scrapper for UFC data

### Code for scraping the UFC statistics website

In [77]:
URL = "http://ufcstats.com/statistics/fighters?char=d&page=all"
page = requests.get(URL)

soup = BeautifulSoup(page.content, "html.parser")

table = soup.find('table', class_='b-statistics__table')

data = []
data_specific_fighter = []

for row in table.find_all('tr', class_='b-statistics__table-row'):
    cells = row.find_all('td')
    cell_value = [cell.get_text(strip=True) for cell in cells]
    
    data.append(cell_value)

    fighter_link = row.select_one('td a')
    
    if fighter_link and fighter_link.has_attr('href'):
        fighter_url = fighter_link.get('href')
        
        #this already should store the specific data for the fighter
        data_specific_fighter = scrape_fighter_URL(fighter_url)
        
        data.append(data_specific_fighter)
        
        df= pd.DataFrame(data)
        
        df.to_excel('test.xlsx', index=False)
        

In [115]:
URL = "http://ufcstats.com/statistics/fighters?char=z&page=all"
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")

table = soup.find('table', class_='b-statistics__table')

# List to store combined data for all fighters
all_fighters_data = []

# Iterate through each row of the table to collect data
for row in table.find_all('tr', class_='b-statistics__table-row'):
    cells = row.find_all('td')

    # Extract the general information from the main page
    if len(cells) >= 8:
        # Split the name into first and last (assuming first and last name)
        first_name = cells[0].get_text(strip=True)
        last_name = cells[1].getText(strip=True)
        nickname = cells[2].getText(strip=True)
        height = cells[3].getText(strip=True)
        weight = cells[4].getText(strip=True)
        reach = cells[5].getText(strip=True)
        stance = cells[6].getText(strip=True)
        wins = cells[7].getText(strip=True)
        losses = cells[8].getText(strip=True)
        draws = cells[9].getText(strip=True)

        # Create a list to store all data in a single row
        fighter_data = [
            first_name,
            last_name,
            nickname,
            height,
            weight,
            reach,
            stance,
            wins,
            losses,
            draws
        ]

        # Find the fighter's profile link and scrape specific fighter data
        fighter_link = cells[0].find('a')
        if fighter_link and fighter_link.has_attr('href'):
            fighter_url = fighter_link['href']
            specific_fighter_data = scrape_fighter_URL(fighter_url)

            # Extract specific stats and append to fighter data
            specific_stats = [
                specific_fighter_data.get('SLpM', None),
                specific_fighter_data.get('Str. Acc.', None),
                specific_fighter_data.get('SApM', None),
                specific_fighter_data.get('Str. Def', None),
                specific_fighter_data.get('TD Avg.', None),
                specific_fighter_data.get('TD Def.', None),
                specific_fighter_data.get('Sub. Avg.', None)
            ]
            fighter_data.extend(specific_stats)

        # Append the complete data for the fighter as a single row
        all_fighters_data.append(fighter_data)

# Define column names
column_names = [
    "First Name", "Last Name", "Nickname", "Height", "Weight", "Reach", "Stance", "W", "L", "D",
    "SLpM", "Str. Acc.", "SApM", "Str. Def", "TD Avg.", "TD Def.", "Sub. Avg."
]

# Create a DataFrame from the combined data
df = pd.DataFrame(all_fighters_data, columns=column_names)

# Save the DataFrame to an Excel file
df.to_excel('all_fighters(Z).xlsx', index=False)

# Print the first few rows to verify
print(df.head())

  First Name Last Name            Nickname  Height    Weight  Reach    Stance  \
0       Luke  Zachrich                       6' 2"  185 lbs.  74.0"  Orthodox   
1      Anton     Zafir       The Professor  5' 11"  170 lbs.     --  Orthodox   
2    Aiemann    Zahabi                       5' 8"  135 lbs.  68.0"  Orthodox   
3       Joao    Zaiden                          --  205 lbs.     --             
4    Youssef     Zalal  The Moroccan Devil  5' 10"  145 lbs.  72.0"    Switch   

    W  L  D  SLpM Str. Acc.  SApM Str. Def TD Avg. TD Def. Sub. Avg.  
0  14  4  0  4.56       32%  4.46      62%    0.74     80%       0.0  
1   7  3  0  2.99       64%  2.31      52%    6.11      0%       0.0  
2  11  2  0  3.43       43%  3.61      71%    0.20     79%       0.0  
3  10  3  0  0.00        0%  0.00       0%    0.00      0%       0.0  
4  15  5  1  2.87       49%  1.73      66%    2.33     60%       1.4  


### Combining the files

In [206]:
dfs = []

folder = "Statistics"

path = os.path.join(os.getcwd(), folder)

files = sorted(os.listdir(path))

for file in files:
    if file.endswith(".xlsx"):
        file_path = os.path.join(path, file)
        
        df = pd.read_excel(file_path)
        
        dfs.append(df)
        
        print(f"Successfully read file: {file_path}")
        
if dfs:
    combined_df = pd.concat(dfs, ignore_index=True)
    combined_df.to_excel('Combined_UFC_Fighter_Statistics.xlsx', index=False)
    print("Files combined and saved!")

combined_df.head()

Successfully read file: /Users/danilburov/Desktop/Fontys/Semester5/MinorAI/PythonCR/Personal project/Statistics/all_fighters(A).xlsx
Successfully read file: /Users/danilburov/Desktop/Fontys/Semester5/MinorAI/PythonCR/Personal project/Statistics/all_fighters(B).xlsx
Successfully read file: /Users/danilburov/Desktop/Fontys/Semester5/MinorAI/PythonCR/Personal project/Statistics/all_fighters(C).xlsx
Successfully read file: /Users/danilburov/Desktop/Fontys/Semester5/MinorAI/PythonCR/Personal project/Statistics/all_fighters(D).xlsx
Successfully read file: /Users/danilburov/Desktop/Fontys/Semester5/MinorAI/PythonCR/Personal project/Statistics/all_fighters(E).xlsx
Successfully read file: /Users/danilburov/Desktop/Fontys/Semester5/MinorAI/PythonCR/Personal project/Statistics/all_fighters(F).xlsx
Successfully read file: /Users/danilburov/Desktop/Fontys/Semester5/MinorAI/PythonCR/Personal project/Statistics/all_fighters(G).xlsx
Successfully read file: /Users/danilburov/Desktop/Fontys/Semester5/Mi

Unnamed: 0,First Name,Last Name,Nickname,Height,Weight,Reach,Stance,W,L,D,SLpM,Str. Acc.,SApM,Str. Def,TD Avg.,TD Def.,Sub. Avg.
0,Tom,Aaron,,--,155 lbs.,--,,5,3,0,0.0,0%,0.0,0%,0.0,0%,0.0
1,Danny,Abbadi,The Assassin,"5' 11""",155 lbs.,--,Orthodox,4,6,0,3.29,38%,4.41,57%,0.0,77%,0.0
2,Nariman,Abbasov,Bayraktar,"5' 8""",155 lbs.,"66.0""",Orthodox,28,4,0,3.0,20%,5.67,46%,0.0,66%,0.0
3,David,Abbott,Tank,"6' 0""",265 lbs.,--,Switch,10,15,0,1.35,30%,3.55,38%,1.07,66%,0.0
4,Hamdy,Abdelwahab,The Hammer,"6' 2""",264 lbs.,"72.0""",Southpaw,5,0,0,3.87,52%,3.13,59%,3.0,0%,0.0


# UFC prediction model

## Imports

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

## Normalizing

### Converting 'Height' and 'Reach' to centimeteres

I have converted the 'Height' and 'Reach' features which were in inches to centimeters and rounded them to 0 for better readability

In [207]:
import numpy as np

# Convert heights from feet and inches to inches
def height_to_inches(height):
    if pd.isnull(height) or height == '--':
        return np.nan
    try:
        feet, inches = height.split("'")
        inches = int(feet) * 12 + int(inches.strip('"'))
        return inches
    except:
        return np.nan

# Apply the height conversion function
combined_df['Height'] = combined_df['Height'].apply(height_to_inches)

# Clean the Reach column by stripping the quotation marks and converting to float
combined_df['Reach'] = combined_df['Reach'].str.rstrip('"').replace('--', np.nan).astype(float)

# Fill missing height and reach values with the mean of the respective columns and round them to 0
combined_df['Height'].fillna(combined_df['Height'].mean(), inplace=True)
combined_df['Reach'].fillna(combined_df['Reach'].mean(), inplace=True)

# Round the Height and Reach columns to the nearest integer
combined_df['Height'] = combined_df['Height'].round(0)
combined_df['Reach'] = combined_df['Reach'].round(0)

# Display the updated DataFrame
combined_df.head()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df['Height'].fillna(combined_df['Height'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df['Reach'].fillna(combined_df['Reach'].mean(), inplace=True)


Unnamed: 0,First Name,Last Name,Nickname,Height,Weight,Reach,Stance,W,L,D,SLpM,Str. Acc.,SApM,Str. Def,TD Avg.,TD Def.,Sub. Avg.
0,Tom,Aaron,,70.0,155 lbs.,72.0,,5,3,0,0.0,0%,0.0,0%,0.0,0%,0.0
1,Danny,Abbadi,The Assassin,71.0,155 lbs.,72.0,Orthodox,4,6,0,3.29,38%,4.41,57%,0.0,77%,0.0
2,Nariman,Abbasov,Bayraktar,68.0,155 lbs.,66.0,Orthodox,28,4,0,3.0,20%,5.67,46%,0.0,66%,0.0
3,David,Abbott,Tank,72.0,265 lbs.,72.0,Switch,10,15,0,1.35,30%,3.55,38%,1.07,66%,0.0
4,Hamdy,Abdelwahab,The Hammer,74.0,264 lbs.,72.0,Southpaw,5,0,0,3.87,52%,3.13,59%,3.0,0%,0.0


### Stance conversion into 0, 1 or 2

In [208]:
combined_df['Stance'].fillna('Unknown', inplace=True)

stance_mapping = {'Switch' : 0, 'Orthodox': 1, 'Southpaw': 2}

combined_df['Stance'] = combined_df['Stance'].map(stance_mapping)

combined_df.to_excel('Updated_UFC_File.xlsx', index=False)

combined_df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df['Stance'].fillna('Unknown', inplace=True)


Unnamed: 0,First Name,Last Name,Nickname,Height,Weight,Reach,Stance,W,L,D,SLpM,Str. Acc.,SApM,Str. Def,TD Avg.,TD Def.,Sub. Avg.
0,Tom,Aaron,,70.0,155 lbs.,72.0,,5,3,0,0.0,0%,0.0,0%,0.0,0%,0.0
1,Danny,Abbadi,The Assassin,71.0,155 lbs.,72.0,1.0,4,6,0,3.29,38%,4.41,57%,0.0,77%,0.0
2,Nariman,Abbasov,Bayraktar,68.0,155 lbs.,66.0,1.0,28,4,0,3.0,20%,5.67,46%,0.0,66%,0.0
3,David,Abbott,Tank,72.0,265 lbs.,72.0,0.0,10,15,0,1.35,30%,3.55,38%,1.07,66%,0.0
4,Hamdy,Abdelwahab,The Hammer,74.0,264 lbs.,72.0,2.0,5,0,0,3.87,52%,3.13,59%,3.0,0%,0.0


### Creating 'Total_Fights' and 'Win_Percentage' features for each fighter

In [209]:
combined_df['Total_Fights'] = combined_df['W'] + combined_df['L'] + combined_df['D']

combined_df['Win_Percentage'] = (combined_df['W'] / combined_df['Total_Fights']) * 100

# Fill NaN or infinite values with 0 before converting to integer
combined_df['Win_Percentage'] = combined_df['Win_Percentage'].fillna(0).replace([float('inf'), -float('inf')], 0).round(0).astype(int)

combined_df['Win_Percentage'] = combined_df['Win_Percentage'].astype(str) + '%'

#Grappler will be 0 and striker will be 1
combined_df['Grappler/Striker'] = combined_df['TD Avg.'].apply(lambda x: 0 if x > 1 else 1)

combined_df.to_excel('UFC_Statistics.xlsx', index=False)

# combined_df.head()

# Calculate the number of grapplers (0) and strikers (1)
grappler_count = combined_df[combined_df['Grappler/Striker'] == 0].shape[0]
striker_count = combined_df[combined_df['Grappler/Striker'] == 1].shape[0]

# Display the counts
print(f"Number of Grapplers: {grappler_count}")
print(f"Number of Strikers: {striker_count}")

Number of Grapplers: 1690
Number of Strikers: 2524


## Correlation matrisses

In [None]:
# Convert 'W', 'L', and 'D' to win percentage
combined_df['Total_Fights'] = combined_df['W'] + combined_df['L'] + combined_df['D']
combined_df['Win_Percentage'] = combined_df['W'] / combined_df['Total_Fights']

# Drop fighters with no fights (to avoid division by zero)
combined_df = combined_df[combined_df['Total_Fights'] > 0]

# Select relevant features for correlation
features = ['SLpM', 'Str. Acc.', 'SApM', 'Str. Def', 'TD Avg.', 'TD Def.', 'Sub. Avg.','Stance','Win_Percentage']

# Ensure the columns are strings before stripping '%' and converting to float
combined_df['Str. Acc.'] = combined_df['Str. Acc.'].astype(str).str.rstrip('%').astype(float)
combined_df['Str. Def'] = combined_df['Str. Def'].astype(str).str.rstrip('%').astype(float)
combined_df['TD Def.'] = combined_df['TD Def.'].astype(str).str.rstrip('%').astype(float)

# Calculate correlation
correlation_matrix = combined_df[features].corr()

# Display the correlation matrix
print(correlation_matrix)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot the heatmap for the correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix Heatmap')
plt.show()

## Modelling the data

In [215]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Ensure the columns are strings before stripping '%' and converting to float
combined_df['Str. Acc.'] = combined_df['Str. Acc.'].astype(str).str.rstrip('%').astype(float) / 100.0
combined_df['Str. Def'] = combined_df['Str. Def'].astype(str).str.rstrip('%').astype(float) / 100.0
combined_df['Win_Percentage'] = combined_df['Win_Percentage'].astype(str).str.rstrip('%').astype(float) / 100.0
combined_df['TD Def.'] = combined_df['TD Def.'].astype(str).str.rstrip('%').astype(float) / 100.0

features = ['SLpM', 'Str. Acc.', 'SApM', 'Str. Def', 'TD Avg.', 'TD Def.', 'Sub. Avg.', 'Win_Percentage','Reach','Height','Grappler/Striker']
X = combined_df[features]

#Target variable
y = combined_df['W']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the data (since features have different scales)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy}")
print("Classification report:")
print(classification_report(y_test, y_pred))

Model accuracy: 0.17556346381969157
Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        29
           1       0.50      0.48      0.49        23
           2       0.21      0.29      0.24        21
           3       0.25      0.24      0.24        21
           4       0.14      0.18      0.16        17
           5       0.29      0.29      0.29        35
           6       0.13      0.21      0.16        52
           7       0.24      0.17      0.20        47
           8       0.14      0.21      0.17        48
           9       0.19      0.25      0.21        57
          10       0.07      0.07      0.07        42
          11       0.16      0.21      0.18        47
          12       0.07      0.11      0.09        38
          13       0.15      0.24      0.19        41
          14       0.20      0.16      0.18        50
          15       0.12      0.06      0.08        36
          16       0.0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### I realized that I do not have enough data to make the model more accurate, it is possible that I might change the approach where instead of training the model solely based on the fighters' data and when training the model I basically use only the fighter specific data to predict a fight.

### In order to make the model more accurate I will create or import a new dataset with more features. This dataset will have all fights from the beginning of the UFC, how they ended the fighters, etc.