In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw fight records; `df` is the untouched source frame for every later cell.
df = pd.read_csv('ufc_fights.csv')

# Show all column names on a single comma-separated line for a quick schema overview.
print(*df.columns, sep=', ')

In [None]:
# Analyze missing data: count and percentage of missing values per column

def missing_value_analysis(df):
    """Summarise how much data is missing in each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with a 'Missing Values' column (null count)
        and a 'Percentage' column (null count as a percentage of the row
        count). Only columns with at least one missing value are kept, and
        rows are ordered from the highest percentage to the lowest.
    """
    null_counts = df.isnull().sum()
    null_share = (null_counts / len(df)) * 100

    # Label the two series while joining them side by side.
    summary = pd.concat(
        [null_counts, null_share],
        axis=1,
        keys=['Missing Values', 'Percentage'],
    )

    has_gaps = summary['Missing Values'] > 0
    return summary.loc[has_gaps].sort_values(by='Percentage', ascending=False)


# Summarise, per column of the raw frame, how many values are missing and what
# percentage of rows that represents (columns without gaps are omitted, and the
# worst-affected columns come first).
missing_data = missing_value_analysis(df)
print(missing_data)

In [None]:
# Handle missing values
def _categorize_rank(ranks):
    """Bucket a numeric rank column into labelled tiers, using 'Unranked' for NaN."""
    tier_labels = ['Top 10', '11-25', '26-50', '51-100', '100+']
    # Work on floats so missing ranks are represented as NaN
    ranks = ranks.astype(float)
    tiers = pd.cut(ranks,
                   bins=[0, 10, 25, 50, 100, float('inf')],
                   labels=tier_labels,
                   include_lowest=True).astype(object)
    # Only genuinely missing ranks become 'Unranked'; values that fall outside
    # the bins (e.g. negatives) are left as NaN.
    tiers[ranks.isna()] = 'Unranked'
    return pd.Categorical(tiers, categories=tier_labels + ['Unranked'])


def handle_missing_values(df, extreme_threshold=95, low_threshold=21):
    """Return a copy of ``df`` with missing values flagged, categorised and imputed.

    The missing-value profile is computed from ``df`` itself, so the function
    does not depend on any notebook-level variable and can be called on any
    frame. Steps that target a specific column are skipped when that column is
    absent.

    Processing steps:
      1. Drop the 'EmptyArena' column.
      2. Add 'FinishDetails_present' (0/1) and 'FinishDetails_clean'
         (missing values replaced by "Not Recorded").
      3. Add 'BMatchWCRank_present' / 'RMatchWCRank_present' (0/1).
      4. Add 'BMatchWCRank_cat' / 'RMatchWCRank_cat' rank tiers when the
         source column is numeric.
      5. Add '<col>_present' (0/1) for every column whose missing percentage
         is above ``extreme_threshold``.
      6. Add 'blue_has_any_rank' / 'red_has_any_rank' (0/1) from the
         extreme-missing columns whose name contains 'Rank' and starts with
         'B' / 'R' respectively.
      7. Impute every column whose missing percentage is below
         ``low_threshold``: median for numeric columns, mode otherwise
         ("Unknown" if no mode exists).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    extreme_threshold : float, default 95
        Columns with a missing percentage strictly above this value only get
        a presence indicator.
    low_threshold : float, default 21
        Columns with a missing percentage strictly below this value (and at
        least one missing value) are imputed.

    Returns
    -------
    pandas.DataFrame
        The processed copy. Progress messages are printed along the way.
    """
    df_processed = df.copy()

    # Missing-value profile of the input, taken before anything is imputed:
    # percentage per column, gaps only, worst first.
    missing_counts = df.isnull().sum()
    missing_pct = (missing_counts / len(df)) * 100
    missing_pct = missing_pct[missing_counts > 0].sort_values(ascending=False)

    # 1. Drop EmptyArena column as it's not needed for meaningful analysis
    if 'EmptyArena' in df_processed.columns:
        print("Dropping EmptyArena column as it's not needed for analysis")
        df_processed = df_processed.drop(columns='EmptyArena')

    # 2. Handle FinishDetails: keep the information that it was missing
    if 'FinishDetails' in df_processed.columns:
        print("Creating indicator for FinishDetails column")
        df_processed['FinishDetails_present'] = df_processed['FinishDetails'].notnull().astype(int)
        df_processed['FinishDetails_clean'] = df_processed['FinishDetails'].fillna("Not Recorded")

    # 3. Handle Match Weight Class Rank columns (blue and red corner)
    match_rank_cols = [col for col in ('BMatchWCRank', 'RMatchWCRank')
                       if col in df_processed.columns]
    if match_rank_cols:
        print("Creating indicators and categories for BMatchWCRank and RMatchWCRank")
    for col in match_rank_cols:
        df_processed[f"{col}_present"] = df_processed[col].notnull().astype(int)

    # 4. Create rank-tier categories (only meaningful for numeric rank columns)
    for col in match_rank_cols:
        if pd.api.types.is_numeric_dtype(df_processed[col]):
            df_processed[f"{col}_cat"] = _categorize_rank(df_processed[col])

    # 5. Handle extreme missing values - mostly weight class rankings
    extreme_missing_cols = missing_pct[missing_pct > extreme_threshold].index.tolist()
    print(f"Creating binary indicators for {len(extreme_missing_cols)} columns "
          f"with >{extreme_threshold}% missing values")

    for col in extreme_missing_cols:
        # A profiled column may have been dropped in step 1
        if col in df_processed.columns:
            df_processed[f"{col}_present"] = df_processed[col].notnull().astype(int)

    # 6. Create aggregated indicators for weight class rankings, per corner
    ranking_cols = [col for col in df_processed.columns
                    if 'Rank' in col and col in extreme_missing_cols]

    if ranking_cols:
        blue_rank_cols = [col for col in ranking_cols if col.startswith('B')]
        red_rank_cols = [col for col in ranking_cols if col.startswith('R')]

        if blue_rank_cols:
            df_processed['blue_has_any_rank'] = df_processed[blue_rank_cols].notnull().any(axis=1).astype(int)
        if red_rank_cols:
            df_processed['red_has_any_rank'] = df_processed[red_rank_cols].notnull().any(axis=1).astype(int)

    # 7. Handle columns with only a small share of values missing
    low_missing_cols = missing_pct[missing_pct < low_threshold].index.tolist()

    print(f"Standard imputation for {len(low_missing_cols)} columns "
          f"with <{low_threshold}% missing values")

    for col in low_missing_cols:
        if col not in df_processed.columns:
            continue
        if pd.api.types.is_numeric_dtype(df_processed[col]):
            # Numeric columns: impute with median
            median_val = df_processed[col].median()
            df_processed[col] = df_processed[col].fillna(median_val)
            print(f"  - Imputed {col} with median: {median_val}")
        else:
            # Categorical/object columns: impute with mode
            modes = df_processed[col].mode()
            mode_val = modes.iloc[0] if not modes.empty else "Unknown"
            df_processed[col] = df_processed[col].fillna(mode_val)
            print(f"  - Imputed {col} with mode: {mode_val}")

    return df_processed

# Apply the missing-value handling to the loaded frame. The function works on a
# copy, so `df` keeps the raw data and the result is bound to `df_processed`.
df_processed = handle_missing_values(df)
          

In [None]:
# Verify results: re-run the missing-value summary on the original columns only,
# leaving out the helper columns added during processing (recognised by suffix).
print("\nVerifying results after processing")
derived_columns = [
    name for name in df_processed.columns
    if name.endswith(('_present', '_cat', '_clean'))
]
missing_after = missing_value_analysis(df_processed.drop(columns=derived_columns))

print("Original columns with missing values after processing:")
if missing_after.empty:
    print("No missing values remain in processed original columns!")
else:
    display(missing_after)

# Save processed dataset (row index is not written to the file)
df_processed.to_csv('processed_fighting_data.csv', index=False)
print("\nProcessed dataset saved!")
