In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Render matplotlib figures inline, directly beneath the cell that creates them.

%matplotlib inline

In [3]:
# Load the fighter statistics CSV; the path is relative to the notebook's working directory.
data = pd.read_csv('ufc-fighters-statistics.csv')

### Getting a feel for the data ###

In [4]:
data.duplicated().sum() # Counts fully duplicated rows; 0 means there aren't any

0

In [5]:
data.shape # (rows, columns): this dataset contains 4111 rows and 18 columns

(4111, 18)

In [6]:
data.info() # Per-column non-null counts and dtypes; also shows which columns have missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4111 entries, 0 to 4110
Data columns (total 18 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   name                                          4111 non-null   object 
 1   nickname                                      2257 non-null   object 
 2   wins                                          4111 non-null   int64  
 3   losses                                        4111 non-null   int64  
 4   draws                                         4111 non-null   int64  
 5   height_cm                                     3813 non-null   float64
 6   weight_in_kg                                  4024 non-null   float64
 7   reach_in_cm                                   2184 non-null   float64
 8   stance                                        3288 non-null   object 
 9   date_of_birth                                 2976 non-null   o

In [7]:
data.describe() # Count, mean, std, min, quartiles and max for each numeric column

Unnamed: 0,wins,losses,draws,height_cm,weight_in_kg,reach_in_cm,significant_strikes_landed_per_minute,significant_striking_accuracy,significant_strikes_absorbed_per_minute,significant_strike_defence,average_takedowns_landed_per_15_minutes,takedown_accuracy,takedown_defense,average_submissions_attempted_per_15_minutes
count,4111.0,4111.0,4111.0,3813.0,4024.0,2184.0,4111.0,4111.0,4111.0,4111.0,4111.0,4111.0,4111.0,4111.0
mean,12.366821,5.726344,0.264413,178.234325,77.395825,181.808874,2.437516,35.542447,3.145206,42.643639,1.250586,26.299927,38.958891,0.610095
std,9.374667,5.103768,0.822373,8.88805,17.982242,10.680804,1.990903,20.398502,2.848502,22.322427,1.935716,28.70098,34.426458,1.505924
min,0.0,0.0,0.0,152.4,47.63,147.32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7.0,2.0,0.0,172.72,65.77,175.26,0.83,27.0,1.55,36.0,0.0,0.0,0.0,0.0
50%,11.0,5.0,0.0,177.8,77.11,182.88,2.33,40.0,2.94,50.0,0.59,22.0,42.0,0.0
75%,17.0,8.0,0.0,185.42,83.91,190.5,3.6,49.0,4.23,58.0,1.94,45.0,66.0,0.7
max,253.0,83.0,11.0,226.06,349.27,213.36,17.65,100.0,52.5,100.0,32.14,100.0,100.0,21.9


In [8]:
# The maximum weight (349.27 kg) is far above the 75th percentile (83.91 kg), so it may be an outlier or a data error. Let's check a few of the heaviest fighters.

In [9]:
data.nlargest(5, 'weight_in_kg') # Displays the 5 rows with the highest weight_in_kg, heaviest first

Unnamed: 0,name,nickname,wins,losses,draws,height_cm,weight_in_kg,reach_in_cm,stance,date_of_birth,significant_strikes_landed_per_minute,significant_striking_accuracy,significant_strikes_absorbed_per_minute,significant_strike_defence,average_takedowns_landed_per_15_minutes,takedown_accuracy,takedown_defense,average_submissions_attempted_per_15_minutes
4069,Emmanuel Yarborough,,1,2,0,203.2,349.27,,Open Stance,1960-09-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2731,Teila Tuli,,0,1,0,182.88,195.04,,Orthodox,1969-06-14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3815,Thomas Ramirez,,0,1,0,185.42,185.97,,Sideways,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2819,Cory Peterson,LA Giant,2,1,0,210.82,181.44,,Orthodox,,2.17,60.0,6.51,40.0,0.0,0.0,0.0,0.0
3174,John Matua,,1,4,0,187.96,181.44,,Orthodox,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### This checks out! A quick Google search shows that Emmanuel Yarborough weighed roughly 720 pounds, which is in line with the 349.27 kg (about 770 lb) recorded here. ####

$\;\;\;\;\;\;$
## Changing some column names and units to my preference ##

In [10]:
# Converts weight values from kilograms to pounds.
# The column is selected by label rather than by position, so a change in column
# order cannot redirect the conversion, and re-running this cell after the rename
# cell raises a KeyError instead of silently converting the values a second time.
data['weight_in_kg'] *= 2.20462

In [11]:
# Converts height values from centimeters to inches.
# The column is selected by label rather than by position, so a change in column
# order cannot redirect the conversion, and re-running this cell after the rename
# cell raises a KeyError instead of silently converting the values a second time.
data['height_cm'] *= 0.393701

In [12]:
# Converts reach values from centimeters to inches.
# The column is selected by label rather than by position, so a change in column
# order cannot redirect the conversion, and re-running this cell after the rename
# cell raises a KeyError instead of silently converting the values a second time.
data['reach_in_cm'] *= 0.393701

In [13]:
# Rename the three measurement columns so their names state imperial units
# (inches / pounds). The result is assigned back instead of using inplace=True,
# which keeps the step chainable and makes the new frame explicit.
data = data.rename(columns={
    'height_cm': 'height_inches',
    'weight_in_kg': 'weight_in_pounds',
    'reach_in_cm': 'reach_in_inches',
})

$\;\;\;\;\;\;$
## Checking for unique and missing data ##

In [14]:
# Displays the number of rows with missing data per column.
# DataFrame.sum() reduces column-wise by default; calling np.sum() on a DataFrame
# instead goes through a deprecated axis=None reduction and emits a warning.
data.isnull().sum()

  return reduction(axis=axis, out=out, **passkwargs)


name                                               0
nickname                                        1854
wins                                               0
losses                                             0
draws                                              0
height_inches                                    298
weight_in_pounds                                  87
reach_in_inches                                 1927
stance                                           823
date_of_birth                                   1135
significant_strikes_landed_per_minute              0
significant_striking_accuracy                      0
significant_strikes_absorbed_per_minute            0
significant_strike_defence                         0
average_takedowns_landed_per_15_minutes            0
takedown_accuracy                                  0
takedown_defense                                   0
average_submissions_attempted_per_15_minutes       0
dtype: int64

In [15]:
# Returns the names of the columns that contain at least one missing value.
# .any() yields the per-column boolean mask directly, avoiding np.sum() on a
# DataFrame (deprecated axis=None reduction) and the extra "!= 0" comparison.
data.columns[data.isnull().any()]

Index(['nickname', 'height_inches', 'weight_in_pounds', 'reach_in_inches',
       'stance', 'date_of_birth'],
      dtype='object')

In [16]:
# Print each column name together with its number of distinct values.
# nunique() is computed once for the whole frame and then walked column by column.
for col, unique_count in data.nunique().items():
    print(f"{col} has {unique_count} unique values")

name has 4105 unique values
nickname has 1784 unique values
wins has 57 unique values
losses has 40 unique values
draws has 12 unique values
height_inches has 26 unique values
weight_in_pounds has 112 unique values
reach_in_inches has 27 unique values
stance has 5 unique values
date_of_birth has 2565 unique values
significant_strikes_landed_per_minute has 698 unique values
significant_striking_accuracy has 83 unique values
significant_strikes_absorbed_per_minute has 813 unique values
significant_strike_defence has 84 unique values
average_takedowns_landed_per_15_minutes has 560 unique values
takedown_accuracy has 83 unique values
takedown_defense has 94 unique values
average_submissions_attempted_per_15_minutes has 99 unique values


$\;\;\;\;\;\;$
## Answering Questions ##

In [17]:
# What percentage of fighters are Southpaw?
# Start with the raw number of fighters per recorded stance.
data['stance'].value_counts()

stance
Orthodox       2526
Southpaw        560
Switch          192
Open Stance       7
Sideways          3
Name: count, dtype: int64

In [18]:
# Share of each stance, formatted as a percentage string with one decimal.
# value_counts() leaves out missing stances, so these percentages are relative
# to the fighters that have a recorded stance, not to every row in the dataset.
(
    data['stance']
    .value_counts(normalize=True)
    .mul(100)
    .round(1)
    .astype(str)
) + '%'

stance
Orthodox       76.8%
Southpaw       17.0%
Switch          5.8%
Open Stance     0.2%
Sideways        0.1%
Name: proportion, dtype: object

In [61]:
# Is it an advantage to be a lefty? See if Southpaw fighters have a better record.
# Baseline first: the win rate pooled over every fighter in the dataset.

ttl_wins = data.loc[:, 'wins'].sum()
ttl_fights = data[['wins', 'losses', 'draws']].to_numpy().sum()

# Fraction (0-1, not a percentage) of all recorded fights that ended in a win.
ttl_wins / ttl_fights

0.6736630094874649

In [59]:
# Same pooled win rate, restricted to fighters whose recorded stance is Southpaw.
southpaws = data.loc[data['stance'].eq('Southpaw')]
sp_wins = southpaws.loc[:, 'wins'].sum()
sp_fights = southpaws[['wins', 'losses', 'draws']].to_numpy().sum()

# Fraction (0-1) of Southpaw fights that ended in a win.
sp_wins / sp_fights

0.6842369058558182

In [21]:
# We can see that southpaws win slightly more often: about 68.4% of their fights, versus 67.4% for all fighters!

$\;\;\;\;\;\;$

Anderson Silva is my favorite fighter. How can I see his stats?


In [22]:
# Look a fighter up by exact name with a boolean mask.
# Names are not unique in this dataset (4105 distinct names across 4111 rows),
# so a lookup like this can return more than one row.
data.loc[data['name'] == "Anderson Silva"]

Unnamed: 0,name,nickname,wins,losses,draws,height_inches,weight_in_pounds,reach_in_inches,stance,date_of_birth,significant_strikes_landed_per_minute,significant_striking_accuracy,significant_strikes_absorbed_per_minute,significant_strike_defence,average_takedowns_landed_per_15_minutes,takedown_accuracy,takedown_defense,average_submissions_attempted_per_15_minutes
888,Anderson Silva,The Spider,34,11,0,74.00004,184.989664,77.000042,Southpaw,1975-04-14,3.05,61.0,2.05,60.0,0.5,77.0,69.0,0.8
