In [58]:
# Web Scraping
from bs4 import BeautifulSoup
import requests

# Data Cleaning/Feature Engineering
import numpy as np
import pandas as pd

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt

# Statistical Analysis
import statsmodels as sm
from statsmodels.formula.api import ols
from statsmodels.graphics.gofplots import qqplot
from scipy.stats import levene
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multitest import multipletests
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.sandbox.stats.multicomp import TukeyHSDResults

# Miscellaneous
import string
import time

In [59]:
# Download the ufcstats.com fighter listing, one "page=all" request per starting
# letter. BeautifulSoup, requests and string are imported in the first cell, so
# they are not re-imported here.
alphabet = string.ascii_lowercase

# Maps each letter to the flat list of <td class="b-statistics__table-col">
# elements found on that letter's listing page.
fighter_details = {}

# One Session reuses the underlying connection across all requests.
with requests.Session() as session:
    for letter in alphabet:
        # Without a timeout, requests waits indefinitely on an unresponsive server.
        response = session.get(
            f"http://www.ufcstats.com/statistics/fighters?char={letter}&page=all",
            timeout=30,
        )
        # Stop on 4xx/5xx instead of silently parsing an error page into an
        # empty (or misaligned) list of cells.
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "lxml")

        fighter_details[letter] = soup.find_all("td", {'class': 'b-statistics__table-col'})

In [60]:
# First name: the cell at offset 0 of every group of 11 cells on each letter page.
# Stepping by 11 assumes each fighter row contributes exactly 11 cells — TODO
# confirm against the page markup.
# The wanted value is the second "\n"-separated piece of the cell text.
fighter_first_names = [
    cells[pos].text.split("\n")[1]
    for cells in fighter_details.values()
    for pos in range(0, len(cells), 11)
]

In [61]:
# Last name: the cell at offset 1 of every group of 11 cells on each letter page.
# The stop bound (len + 1) is kept exactly as before, so the visited indices are
# unchanged.
fighter_last_names = [
    cells[pos].text.split("\n")[1]
    for cells in fighter_details.values()
    for pos in range(1, len(cells) + 1, 11)
]

In [62]:
# Nickname: the cell at offset 2 of every group of 11 cells on each letter page.
# The stop bound (len + 2) is kept exactly as before, so the visited indices are
# unchanged.
fighter_nick_names = [
    cells[pos].text.split("\n")[1]
    for cells in fighter_details.values()
    for pos in range(2, len(cells) + 2, 11)
]

In [63]:
# Height: the cell at offset 3 of every group of 11 cells on each letter page.
# [10:] discards the first 10 characters of the text line — presumably leading
# indentation in the page markup; verify if the site layout changes.
fighter_heights = [
    cells[pos].text.split("\n")[1][10:]
    for cells in fighter_details.values()
    for pos in range(3, len(cells) + 3, 11)
]

In [64]:
# Weight: the cell at offset 4 of every group of 11 cells on each letter page.
# [10:13] keeps exactly three characters after the first 10, so the stored value
# is a string of at most three characters (e.g. "155").
fighter_weights = [
    cells[pos].text.split("\n")[1][10:13]
    for cells in fighter_details.values()
    for pos in range(4, len(cells) + 4, 11)
]

In [65]:
# Reach: the cell at offset 5 of every group of 11 cells on each letter page.
# [10:14] keeps four characters after the first 10 (e.g. "76.0"); a missing
# reach comes through as "--".
fighter_reaches = [
    cells[pos].text.split("\n")[1][10:14]
    for cells in fighter_details.values()
    for pos in range(5, len(cells) + 5, 11)
]

In [66]:
# Stance: the cell at offset 6 of every group of 11 cells on each letter page,
# with the first 10 characters of the text line dropped.
fighter_stances = [
    cells[pos].text.split("\n")[1][10:]
    for cells in fighter_details.values()
    for pos in range(6, len(cells) + 6, 11)
]

In [67]:
# Wins: the cell at offset 7 of every group of 11 cells on each letter page.
# Values stay as strings here; they are cast to int in a later cell.
fighter_wins = [
    cells[pos].text.split("\n")[1][10:]
    for cells in fighter_details.values()
    for pos in range(7, len(cells) + 7, 11)
]

In [68]:
# Losses: the cell at offset 8 of every group of 11 cells on each letter page.
# Values stay as strings here; they are cast to int in a later cell.
fighter_losses = [
    cells[pos].text.split("\n")[1][10:]
    for cells in fighter_details.values()
    for pos in range(8, len(cells) + 8, 11)
]

In [69]:
# Draws: the cell at offset 9 of every group of 11 cells on each letter page.
# Values stay as strings here; they are cast to int in a later cell.
fighter_draws = [
    cells[pos].text.split("\n")[1][10:]
    for cells in fighter_details.values()
    for pos in range(9, len(cells) + 9, 11)
]

In [70]:
# Assemble the scraped per-fighter lists into one frame. The frame is seeded
# from the last names (column 0 renamed to "last_name"), then the remaining
# lists are attached in the same column order as before. Every value is still
# a string at this stage.
fighter_df = (
    pd.DataFrame(fighter_last_names)
    .rename(columns={0: "last_name"})
    .assign(
        first_name=fighter_first_names,
        nick_name=fighter_nick_names,
        height=fighter_heights,
        weight=fighter_weights,
        reach=fighter_reaches,
        stance=fighter_stances,
        wins=fighter_wins,
        losses=fighter_losses,
        draws=fighter_draws,
    )
)

In [71]:
# Number of rows in the assembled frame, measured on the reach column.
len(fighter_df['reach'])

3417

In [72]:
# (rows, columns) of the freshly assembled frame.
fighter_df.shape

(3417, 10)

In [73]:
# Preview the first rows; missing height/reach values show up as "--" and a
# missing stance as an empty string.
fighter_df.head()

Unnamed: 0,last_name,first_name,nick_name,height,weight,reach,stance,wins,losses,draws
0,Aaron,Tom,,--,155,--,,5,3,0
1,Abbadi,Danny,The Assassin,"5' 11""",155,--,Orthodox,4,6,0
2,Abbott,David,Tank,"6' 0""",265,--,Switch,10,14,0
3,Abdurakhimov,Shamil,Abrek,"6' 3""",235,76.0,Orthodox,20,5,0
4,Abe,Hiroyuki,Abe Ani,"5' 6""",145,--,Orthodox,8,14,3


In [74]:
# Spot check: show the row(s) whose last name is exactly "McGregor".
fighter_df.loc[fighter_df["last_name"].eq("McGregor")]

Unnamed: 0,last_name,first_name,nick_name,height,weight,reach,stance,wins,losses,draws
1934,McGregor,Conor,The Notorious,"5' 9""",155,74.0,Southpaw,22,4,0


In [75]:
# Index labels of the rows whose reach is the "--" placeholder (no reach listed).
df_null = fighter_df.index[fighter_df["reach"].eq("--")]

In [76]:
# Remove the rows collected in df_null (fighters without a listed reach).
fighter_df = fighter_df.drop(index=df_null)

In [77]:
# Preview after dropping the rows with a "--" reach; the original index labels
# are kept, so gaps in the index are expected.
fighter_df.head()

Unnamed: 0,last_name,first_name,nick_name,height,weight,reach,stance,wins,losses,draws
3,Abdurakhimov,Shamil,Abrek,"6' 3""",235,76.0,Orthodox,20,5,0
5,Abe,Daichi,,"5' 11""",170,71.0,Orthodox,6,2,0
6,Abedi,Papy,Makambo,"5' 11""",185,74.0,Southpaw,9,3,0
7,Abreu,Ricardo,Demente,"5' 11""",185,73.0,Orthodox,5,3,0
8,Abreu,Klidson,White Bear,"6' 0""",205,74.0,Orthodox,15,4,0


In [79]:
# Order the rows by weight. The weight column still holds strings at this
# point, so the comparison is lexicographic rather than numeric.
fighter_df = fighter_df.sort_values(by="weight")

In [82]:
# Preview the first rows after sorting by weight.
fighter_df.head()

Unnamed: 0,last_name,first_name,nick_name,height,weight,reach,stance,wins,losses,draws
1592,Kondo,Syuri,,"5' 4""",115,66.0,Orthodox,6,3,0
1409,Jandiroba,Virna,Carcara,"5' 3""",115,64.0,Orthodox,15,1,0
1420,Jedrzejczyk,Joanna,,"5' 6""",115,65.0,Orthodox,16,4,0
1428,Jeon,Chanmi,Ottogi Girl,"5' 5""",115,67.0,Orthodox,5,2,0
1473,Jones-Lybarger,Jocelyn,,"5' 7""",115,64.0,Orthodox,6,4,0


In [84]:
# Distinct weight values present in the frame. They are strings (the result
# has dtype=object); only wins/losses/draws are cast to int later on.
fighter_df.weight.unique()

array(['115', '125', '135', '139', '145', '155', '168', '170', '185',
       '205', '225', '230', '231', '234', '235', '238', '240', '241',
       '242', '243', '244', '245', '246', '247', '249', '250', '251',
       '253', '255', '257', '258', '260', '262', '263', '264', '265'],
      dtype=object)

In [108]:
# Convert the scraped win/loss/draw counts from strings to integers in one pass
# so they can be used in arithmetic.
fighter_df = fighter_df.astype({"wins": int, "losses": int, "draws": int})

In [109]:
# Derive career totals:
#   total_fights = wins + losses + draws
#   win_ratio    = wins / total_fights, rounded to 2 decimals
# NOTE(review): a fighter with zero recorded fights would get a NaN win_ratio
# (0 / 0) — confirm this is acceptable before the later minimum-fights filter.
fighter_df = fighter_df.assign(
    total_fights=lambda frame: frame["wins"] + frame["losses"] + frame["draws"],
    win_ratio=lambda frame: (frame["wins"] / frame["total_fights"]).round(2),
)

In [113]:
# Number of fighters per listed weight, most common first.
# NOTE(review): value_counts() is documented to sort in descending order by
# default, so the extra sort_values looks redundant — confirm before removing,
# since the order of tied counts may differ.
fighter_df.weight.value_counts().sort_values(ascending=False)

170    292
155    284
135    229
145    213
185    206
205    150
125    132
115     64
265     27
250     14
240     13
245     10
260      9
230      8
255      7
264      6
242      5
247      5
235      3
225      3
238      2
243      2
253      2
258      2
249      2
251      1
263      1
139      1
168      1
246      1
244      1
231      1
234      1
241      1
257      1
262      1
Name: weight, dtype: int64

In [114]:
# Keep only fighters with at least 5 recorded fights (wins + losses + draws).
fighter_df = fighter_df.loc[fighter_df["total_fights"].ge(5)]

In [115]:
# (rows, columns) after keeping only fighters with at least 5 recorded fights.
fighter_df.shape

(1689, 12)

In [116]:
# Total number of NaN/None cells across the whole frame.
# NOTE(review): the scrape stores missing values as "--" or empty strings (see
# the earlier head() output and the reach filter), which isnull() does not
# count — a result of 0 here does not by itself mean every field is populated.
fighter_df.isnull().sum().sum()

0