# Data preprocessing

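De volgende code voegt de twee datasets samen, schoont het resultaat op, filtert catchweight- en vrouwelijke gewichtsklassen eruit, voegt verschilvariabelen toe en slaat het geheel op als `ufc_clean.csv`.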

In [1]:
import pandas as pd
# Preprocessing pipeline: load two UFC CSV files, join them on the fighter
# names, keep a fixed set of columns, drop incomplete rows, filter out
# catchweight and women's weight classes, add red-minus-blue difference
# features, and write the result to "ufc_clean.csv".

# Step 1: load the datasets.
df1 = pd.read_csv("data.csv")  # Rajeevw dataset (performance and physical data)
df2 = pd.read_csv("ufc-master.csv")  # Mdabbert dataset (additional info)

# Step 2: give df2's fighter columns the same names as the merge keys below.
df2 = df2.rename(columns={'RedFighter': 'R_fighter', 'BlueFighter': 'B_fighter'})

# Step 3: inner join on the two fighter names only (the fight date is ignored).
# NOTE(review): because the date is not part of the key, a red/blue pair that
# occurs more than once in either file yields one row per matching
# combination, so rematches can show up as duplicated rows.
df_merged = df1.merge(df2, how='inner', on=['R_fighter', 'B_fighter'])

# Step 4: columns kept for the cleaned dataset.
# 'Winner_x' is the left frame's (df1) 'Winner' column; pandas appends the
# '_x' suffix when both frames carry a non-key column with the same name.
# NOTE(review): the remaining columns also look like df1 columns - confirm
# whether df2 contributes anything beyond restricting the rows.
relevant_columns = [
    'R_age', 'B_age',
    'R_Height_cms', 'B_Height_cms',
    'R_Reach_cms', 'B_Reach_cms',
    'R_avg_SIG_STR_pct', 'B_avg_SIG_STR_pct',
    'R_avg_TD_pct', 'B_avg_TD_pct',
    'R_avg_SUB_ATT', 'B_avg_SUB_ATT',
    'weight_class',
    'Winner_x',
]

# Steps 4-6: take the selected columns, turn placeholder strings into missing
# values, then drop every row that has at least one missing value.
df_clean = (
    df_merged[relevant_columns]
    .copy()
    .replace(['Unknown', 'Unclear', 'unk', 'UNK', '?'], pd.NA)
    .dropna()
)

# Step 7: drop catchweight and women's weight classes. The pattern is matched
# case-insensitively anywhere in the 'weight_class' value.
excluded_class = df_clean['weight_class'].str.contains('Catch|Women', case=False, na=False)

# Step 8: rename 'Winner_x' to 'Winner' and renumber the rows from 0.
df_clean = (
    df_clean[~excluded_class]
    .rename(columns={'Winner_x': 'Winner'})
    .reset_index(drop=True)
)

# Step 9: add difference features, each computed as red corner minus blue
# corner. The tuple order fixes the order of the new columns.
for diff_name, red_col, blue_col in (
    ('age_diff', 'R_age', 'B_age'),
    ('height_diff', 'R_Height_cms', 'B_Height_cms'),
    ('reach_diff', 'R_Reach_cms', 'B_Reach_cms'),
    ('strike_acc_diff', 'R_avg_SIG_STR_pct', 'B_avg_SIG_STR_pct'),
    ('td_acc_diff', 'R_avg_TD_pct', 'B_avg_TD_pct'),
    ('sub_att_diff', 'R_avg_SUB_ATT', 'B_avg_SUB_ATT'),
):
    df_clean[diff_name] = df_clean[red_col] - df_clean[blue_col]

# Step 10: save the result as a new CSV, without the index column.
df_clean.to_csv("ufc_clean.csv", index=False)

# Step 11: show the first 5 rows.
print(df_clean.head(5))

   R_age  B_age  R_Height_cms  B_Height_cms  R_Reach_cms  B_Reach_cms  \
0   27.0   31.0        170.18        165.10       177.80       170.18   
1   28.0   32.0        182.88        187.96       187.96       193.04   
2   41.0   27.0        182.88        177.80       190.50       182.88   
3   31.0   35.0        182.88        180.34       180.34       193.04   
4   37.0   28.0        185.42        190.50       195.58       205.74   

   R_avg_SIG_STR_pct  B_avg_SIG_STR_pct  R_avg_TD_pct  B_avg_TD_pct  \
0           0.500000           0.420000      0.000000      0.330000   
1           0.576875           0.660000      0.406250      0.300000   
2           0.565156           0.515000      0.337031      0.435000   
3           0.493125           0.459277      0.000000      0.322188   
4           0.515636           0.791582      0.355824      0.269199   

   R_avg_SUB_ATT  B_avg_SUB_ATT  weight_class Winner  age_diff  height_diff  \
0       0.000000       0.500000  Bantamweight    Red   