In [3]:
# UFC Fight Predictor
import pandas as pd
import numpy as np
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

print("UFC FIGHT PREDICTION SYSTEM")
print("="*50)

UFC FIGHT PREDICTION SYSTEM


In [6]:
# Cell 1: Setup and Load Models
import pandas as pd
import numpy as np
import pickle
import os
from datetime import datetime

class UFCFightPredictor:
    def __init__(self):
        self.models = {}
        self.fighter_data = None
        self.load_fighter_data()
        self.load_models()
    
    def load_fighter_data(self):
        """Load current fighter profiles"""
        try:
            self.fighter_data = pd.read_csv("../build/fighter_profiles.csv")
            print(f"✓ Loaded {len(self.fighter_data)} fighter profiles")
        except FileNotFoundError:
            print("❌ fighter_profiles.csv not found!")
            return
    
    def load_models(self):
        """Load all trained models"""
        model_files = {
            'logistic': '../models/logistic_regression_model.pkl',
            'decision_tree': '../models/decision_tree_model.pkl', 
            'random_forest': '../models/random_forest_model.pkl'
        }
        
        for name, filepath in model_files.items():
            try:
                with open(filepath, 'rb') as f:
                    self.models[name] = pickle.load(f)
                print(f"✓ Loaded {name} model")
            except FileNotFoundError:
                print(f"⚠️  {name} model not found")
    
    def find_fighter(self, name):
        """Find fighter by name with fuzzy matching"""
        name_lower = name.lower().strip()
        
        # Exact match
        exact = self.fighter_data[
            self.fighter_data['fighter_name'].str.lower() == name_lower
        ]
        if not exact.empty:
            return exact.iloc[0]
        
        # Partial match
        partial = self.fighter_data[
            self.fighter_data['fighter_name'].str.lower().str.contains(name_lower, na=False)
        ]
        
        if partial.empty:
            print(f"❌ Fighter '{name}' not found!")
            return None
        elif len(partial) > 1:
            print(f"Multiple matches for '{name}':")
            for _, fighter in partial.head(3).iterrows():
                print(f"  - {fighter['fighter_name']} (ELO: {fighter['current_elo_rating']:.0f})")
        
        return partial.iloc[0]
    
    def create_fight_features(self, fighter_a, fighter_b):
        """Create feature vector for a fight"""
        features = {}
        
        # Core features that both models use
        features['current_elo_rating_A'] = fighter_a['current_elo_rating']
        features['current_elo_rating_B'] = fighter_b['current_elo_rating']
        features['current_elo_rating_diff'] = fighter_a['current_elo_rating'] - fighter_b['current_elo_rating']
        
        features['age_A'] = fighter_a.get('age', 30)  # Default age if missing
        features['age_B'] = fighter_b.get('age', 30)
        features['age_diff'] = features['age_A'] - features['age_B']
        
        features['reach_inches_A'] = fighter_a.get('reach_inches', 72)  # Default reach
        features['reach_inches_B'] = fighter_b.get('reach_inches', 72)
        features['reach_inches_diff'] = features['reach_inches_A'] - features['reach_inches_B']
        
        features['height_inches_A'] = fighter_a.get('height_inches', 70)
        features['height_inches_B'] = fighter_b.get('height_inches', 70)
        features['height_inches_diff'] = features['height_inches_A'] - features['height_inches_B']
        
        features['win_rate_A'] = fighter_a['win_rate']
        features['win_rate_B'] = fighter_b['win_rate']
        features['win_rate_diff'] = fighter_a['win_rate'] - fighter_b['win_rate']
        
        features['total_fights_A'] = fighter_a['total_fights']
        features['total_fights_B'] = fighter_b['total_fights']
        features['total_wins_A'] = fighter_a['total_wins']
        features['total_wins_B'] = fighter_b['total_wins']
        features['total_losses_A'] = fighter_a['total_losses']
        features['total_losses_B'] = fighter_b['total_losses']
        
        # Missing value indicators
        features['has_age_A'] = 1 if pd.notna(fighter_a.get('age')) else 0
        features['has_age_B'] = 1 if pd.notna(fighter_b.get('age')) else 0
        features['has_reach_A'] = 1 if pd.notna(fighter_a.get('reach_inches')) else 0
        features['has_reach_B'] = 1 if pd.notna(fighter_b.get('reach_inches')) else 0
        
        return features
    
    def predict_with_model(self, features, model_name):
        """Make prediction with specific model"""
        if model_name not in self.models:
            return None
        
        model_data = self.models[model_name]
        model = model_data['model']
        model_features = model_data['features']
        
        # Create feature vector in correct order
        feature_vector = []
        for feature in model_features:
            feature_vector.append(features.get(feature, 0))
        
        X = np.array(feature_vector).reshape(1, -1)
        
        # Apply scaling if logistic regression
        if model_name == 'logistic' and 'scaler' in model_data and model_data['scaler']:
            X = model_data['scaler'].transform(X)
        
        # Get prediction
        prediction = model.predict(X)[0]
        try:
            prob_a_wins = model.predict_proba(X)[0][1]
        except:
            # Fallback for models without predict_proba
            prob_a_wins = 0.7 if prediction == 1 else 0.3
        
        return {
            'prediction': prediction,
            'prob_a_wins': prob_a_wins,
            'prob_b_wins': 1 - prob_a_wins
        }
    
    def predict_fight(self, fighter_a_name, fighter_b_name):
        """Predict fight outcome using all available models"""
        print(f"\n{'='*60}")
        print(f"PREDICTING: {fighter_a_name} vs {fighter_b_name}")
        print(f"{'='*60}")
        
        # Find fighters
        fighter_a = self.find_fighter(fighter_a_name)
        fighter_b = self.find_fighter(fighter_b_name)
        
        if fighter_a is None or fighter_b is None:
            return None
        
        print(f"✓ {fighter_a['fighter_name']} (ELO: {fighter_a['current_elo_rating']:.0f})")
        print(f"✓ {fighter_b['fighter_name']} (ELO: {fighter_b['current_elo_rating']:.0f})")
        
        # Create features
        features = self.create_fight_features(fighter_a, fighter_b)
        
        # Get predictions from all models
        predictions = {}
        for model_name in self.models:
            result = self.predict_with_model(features, model_name)
            if result:
                predictions[model_name] = result
        
        # Display results
        print(f"\n📊 PREDICTIONS:")
        print("-" * 40)
        
        ensemble_prob = 0
        model_count = 0
        
        for model_name, pred in predictions.items():
            winner = fighter_a['fighter_name'] if pred['prediction'] == 1 else fighter_b['fighter_name']
            confidence = max(pred['prob_a_wins'], pred['prob_b_wins'])
            
            print(f"{model_name.upper():15}: {winner}")
            print(f"{'':15}  Confidence: {confidence:.1%}")
            print(f"{'':15}  {fighter_a['fighter_name']}: {pred['prob_a_wins']:.1%}")
            print(f"{'':15}  {fighter_b['fighter_name']}: {pred['prob_b_wins']:.1%}")
            print()
            
            ensemble_prob += pred['prob_a_wins']
            model_count += 1
        
        # Ensemble prediction
        if model_count > 0:
            ensemble_prob /= model_count
            ensemble_winner = fighter_a['fighter_name'] if ensemble_prob > 0.5 else fighter_b['fighter_name']
            ensemble_confidence = max(ensemble_prob, 1-ensemble_prob)
            
            print("🏆 ENSEMBLE PREDICTION:")
            print(f"   Winner: {ensemble_winner}")
            print(f"   Confidence: {ensemble_confidence:.1%}")
            print(f"   {fighter_a['fighter_name']}: {ensemble_prob:.1%}")
            print(f"   {fighter_b['fighter_name']}: {1-ensemble_prob:.1%}")
        
        # Key factors
        print(f"\n🔍 KEY FACTORS:")
        elo_diff = features['current_elo_rating_diff']
        print(f"   ELO Advantage: {elo_diff:+.0f} (favors {fighter_a['fighter_name'] if elo_diff > 0 else fighter_b['fighter_name']})")
        print(f"   Age Difference: {features['age_diff']:+.1f} years")
        print(f"   Reach Advantage: {features['reach_inches_diff']:+.1f} inches")
        print(f"   Win Rate: {fighter_a['win_rate']:.1%} vs {fighter_b['win_rate']:.1%}")
        
        return {
            'fighter_a': fighter_a['fighter_name'],
            'fighter_b': fighter_b['fighter_name'],
            'predictions': predictions,
            'ensemble_prob_a': ensemble_prob if model_count > 0 else 0.5,
            'features': features
        }

# Initialize predictor
predictor = UFCFightPredictor()

✓ Loaded 2624 fighter profiles
✓ Loaded logistic model
✓ Loaded decision_tree model
✓ Loaded random_forest model


In [8]:
# Quick ELO-based prediction for Roman Dolidze vs Anthony Hernandez
def quick_fight_prediction(fighter_a_name, fighter_b_name):
    print(f"\n{'='*60}")
    print(f"FIGHT PREDICTION: {fighter_a_name} vs {fighter_b_name}")
    print(f"{'='*60}")
    
    # Find fighters
    fighter_a = predictor.find_fighter(fighter_a_name)
    fighter_b = predictor.find_fighter(fighter_b_name)
    
    if fighter_a is None or fighter_b is None:
        return None
    
    print(f"✓ {fighter_a['fighter_name']} (ELO: {fighter_a['current_elo_rating']:.0f})")
    print(f"✓ {fighter_b['fighter_name']} (ELO: {fighter_b['current_elo_rating']:.0f})")
    
    # ELO-based win probability calculation
    elo_diff = fighter_a['current_elo_rating'] - fighter_b['current_elo_rating']
    prob_a_wins = 1 / (1 + 10**(-elo_diff/400))  # Standard ELO probability
    prob_b_wins = 1 - prob_a_wins
    
    winner = fighter_a['fighter_name'] if prob_a_wins > 0.5 else fighter_b['fighter_name']
    confidence = max(prob_a_wins, prob_b_wins)
    
    print(f"\n🥊 WIN PERCENTAGES:")
    print(f"   {fighter_a['fighter_name']}: {prob_a_wins:.1%}")
    print(f"   {fighter_b['fighter_name']}: {prob_b_wins:.1%}")
    print(f"\n🏆 PREDICTION: {winner} (Confidence: {confidence:.1%})")
    
    # Additional stats
    print(f"\n📊 FIGHTER COMPARISON:")
    print(f"   ELO Advantage: {abs(elo_diff):.0f} points (favors {fighter_a['fighter_name'] if elo_diff > 0 else fighter_b['fighter_name']})")
    print(f"   Fight Record: {fighter_a['total_wins']}-{fighter_a['total_losses']} vs {fighter_b['total_wins']}-{fighter_b['total_losses']}")
    print(f"   Win Rate: {fighter_a['win_rate']:.1%} vs {fighter_b['win_rate']:.1%}")
    
    age_a = fighter_a.get('age', 30) if pd.notna(fighter_a.get('age')) else 30
    age_b = fighter_b.get('age', 30) if pd.notna(fighter_b.get('age')) else 30
    print(f"   Age: {age_a:.0f} vs {age_b:.0f} years")
    
    reach_a = fighter_a.get('reach_inches', 72) if pd.notna(fighter_a.get('reach_inches')) else 72
    reach_b = fighter_b.get('reach_inches', 72) if pd.notna(fighter_b.get('reach_inches')) else 72
    print(f"   Reach: {reach_a:.0f}\" vs {reach_b:.0f}\"")
    
    return {
        'roman_dolidze_win_pct': prob_a_wins if fighter_a['fighter_name'].lower().find('dolidze') >= 0 else prob_b_wins,
        'anthony_hernandez_win_pct': prob_a_wins if fighter_a['fighter_name'].lower().find('hernandez') >= 0 else prob_b_wins,
        'prediction': winner,
        'confidence': confidence
    }

# Get the prediction
result = quick_fight_prediction("Roman Dolidze", "Anthony Hernandez")


FIGHT PREDICTION: Roman Dolidze vs Anthony Hernandez
✓ Roman Dolidze (ELO: 1584)
✓ Anthony Hernandez (ELO: 1587)

🥊 WIN PERCENTAGES:
   Roman Dolidze: 49.5%
   Anthony Hernandez: 50.5%

🏆 PREDICTION: Anthony Hernandez (Confidence: 50.5%)

📊 FIGHTER COMPARISON:
   ELO Advantage: 4 points (favors Anthony Hernandez)
   Fight Record: 9-3 vs 8-2
   Win Rate: 75.0% vs 80.0%
   Age: 37 vs 31 years
   Reach: 76" vs 75"


In [13]:
# Predict fights with all 3 models
# Try multiple fights to see how models compare
test_fights = [
    ("Miles Johns", "Jean Matsumoto"),
    ("Alex Pereira", "Magomed Ankalaev")
]

for fighter_a, fighter_b in test_fights:
    predictor.predict_fight(fighter_a, fighter_b)
    print("\n" + "="*40 + "\n")


PREDICTING: Miles Johns vs Jean Matsumoto
✓ Miles Johns (ELO: 1542)
✓ Jean Matsumoto (ELO: 1515)


ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [11]:
# Advanced Analysis for All Test Fights
import pandas as pd
import numpy as np

def advanced_fight_analysis(fighter_a, fighter_b):
    """Comprehensive analysis of a single fight"""
    print(f"\n{'='*80}")
    print(f"ADVANCED ANALYSIS: {fighter_a} vs {fighter_b}")
    print(f"{'='*80}")
    
    # Get prediction results
    result = predictor.predict_fight(fighter_a, fighter_b)
    
    if not result:
        print("❌ Could not analyze this fight (fighters not found)")
        return None
    
    # Extract data
    predictions = result.get('predictions', {})
    features = result.get('features', {})
    
    print(f"\n📊 MODEL CONSENSUS ANALYSIS:")
    print("-" * 50)
    
    if predictions:
        # Model predictions
        model_winners = []
        model_probs_a = []
        model_names = []
        
        for model_name, pred in predictions.items():
            winner = fighter_a if pred['prediction'] == 1 else fighter_b
            prob_a = pred['prob_a_wins']
            
            model_winners.append(winner)
            model_probs_a.append(prob_a)
            model_names.append(model_name.upper())
            
            print(f"{model_name.upper():15}: {winner:20} ({prob_a:.1%} vs {1-prob_a:.1%})")
        
        # Consensus metrics
        fighter_a_wins = sum(1 for winner in model_winners if winner == fighter_a)
        consensus_strength = max(fighter_a_wins, len(model_winners) - fighter_a_wins) / len(model_winners)
        prob_spread = max(model_probs_a) - min(model_probs_a)
        avg_prob_a = np.mean(model_probs_a)
        
        print(f"\n🎯 CONSENSUS METRICS:")
        print(f"   Models favoring {fighter_a}: {fighter_a_wins}/{len(model_winners)}")
        print(f"   Models favoring {fighter_b}: {len(model_winners) - fighter_a_wins}/{len(model_winners)}")
        print(f"   Consensus Strength: {consensus_strength:.1%}")
        print(f"   Probability Spread: {prob_spread:.1%}")
        print(f"   Average {fighter_a} win prob: {avg_prob_a:.1%}")
        
        # Consensus interpretation
        if consensus_strength >= 1.0:
            consensus_level = "UNANIMOUS"
        elif consensus_strength >= 0.67:
            consensus_level = "STRONG MAJORITY"
        else:
            consensus_level = "SPLIT DECISION"
        
        print(f"   Consensus Level: {consensus_level}")
    
    # Fighter comparison analysis
    if features:
        print(f"\n⚔️  FIGHTER ADVANTAGE BREAKDOWN:")
        print("-" * 50)
        
        # ELO Analysis
        elo_diff = features.get('current_elo_rating_diff', 0)
        elo_a = features.get('current_elo_rating_A', 1500)
        elo_b = features.get('current_elo_rating_B', 1500)
        elo_favorite = fighter_a if elo_diff > 0 else fighter_b
        
        print(f"📈 ELO RATINGS:")
        print(f"   {fighter_a}: {elo_a:.0f}")
        print(f"   {fighter_b}: {elo_b:.0f}")
        print(f"   Difference: {abs(elo_diff):.0f} points → {elo_favorite}")
        
        # Experience Analysis
        fights_a = features.get('total_fights_A', 0)
        fights_b = features.get('total_fights_B', 0)
        wins_a = features.get('total_wins_A', 0)
        wins_b = features.get('total_wins_B', 0)
        winrate_a = features.get('win_rate_A', 0)
        winrate_b = features.get('win_rate_B', 0)
        
        print(f"\n🥊 EXPERIENCE & RECORD:")
        print(f"   {fighter_a}: {wins_a}-{fights_a-wins_a} ({winrate_a:.1%})")
        print(f"   {fighter_b}: {wins_b}-{fights_b-wins_b} ({winrate_b:.1%})")
        
        exp_favorite = fighter_a if fights_a > fights_b else fighter_b
        record_favorite = fighter_a if winrate_a > winrate_b else fighter_b
        print(f"   Experience Edge: {exp_favorite} ({max(fights_a, fights_b)} vs {min(fights_a, fights_b)} fights)")
        print(f"   Win Rate Edge: {record_favorite}")
        
        # Physical Attributes
        age_diff = features.get('age_diff', 0)
        reach_diff = features.get('reach_inches_diff', 0)
        height_diff = features.get('height_inches_diff', 0)
        
        print(f"\n👤 PHYSICAL ADVANTAGES:")
        if abs(age_diff) > 1:
            age_advantage = fighter_a if age_diff < 0 else fighter_b
            print(f"   Age: {age_advantage} ({abs(age_diff):.1f} years younger)")
        else:
            print(f"   Age: Similar ages")
            
        if abs(reach_diff) > 1:
            reach_advantage = fighter_a if reach_diff > 0 else fighter_b
            print(f"   Reach: {reach_advantage} (+{abs(reach_diff):.1f} inches)")
        else:
            print(f"   Reach: Similar reach")
            
        if abs(height_diff) > 1:
            height_advantage = fighter_a if height_diff > 0 else fighter_b
            print(f"   Height: {height_advantage} (+{abs(height_diff):.1f} inches)")
        else:
            print(f"   Height: Similar height")
    
    # Risk Assessment
    print(f"\n⚠️  PREDICTION CONFIDENCE ASSESSMENT:")
    print("-" * 50)
    
    if predictions:
        if prob_spread < 0.1:
            confidence_level = "HIGH"
            risk_level = "LOW"
        elif prob_spread < 0.2:
            confidence_level = "MODERATE"
            risk_level = "MODERATE"
        else:
            confidence_level = "LOW"
            risk_level = "HIGH"
        
        print(f"   Prediction Confidence: {confidence_level}")
        print(f"   Betting Risk Level: {risk_level}")
        print(f"   Model Agreement: {prob_spread:.1%} spread")
        
        # Betting recommendation
        if consensus_strength >= 0.67 and prob_spread < 0.15:
            recommendation = "RECOMMENDED"
        elif consensus_strength >= 0.67:
            recommendation = "MODERATE CONFIDENCE"
        else:
            recommendation = "HIGH RISK - MODELS DISAGREE"
        
        print(f"   Betting Recommendation: {recommendation}")
    
    return {
        'consensus_strength': consensus_strength if predictions else 0,
        'prob_spread': prob_spread if predictions else 0,
        'avg_prob_a': avg_prob_a if predictions else 0.5,
        'elo_diff': elo_diff,
        'experience_diff': fights_a - fights_b if features else 0,
        'physical_advantages': {
            'age_diff': age_diff,
            'reach_diff': reach_diff,
            'height_diff': height_diff
        }
    }

print("🔬 COMPREHENSIVE FIGHT ANALYSIS REPORT")
print("="*80)

analysis_results = []

for fighter_a, fighter_b in test_fights:
    analysis = advanced_fight_analysis(fighter_a, fighter_b)
    if analysis:
        analysis['fight'] = f"{fighter_a} vs {fighter_b}"
        analysis_results.append(analysis)

# Summary comparison
print(f"\n\n📋 ANALYSIS SUMMARY COMPARISON")
print("="*80)

if analysis_results:
    print(f"{'FIGHT':<40} {'CONSENSUS':<12} {'SPREAD':<10} {'ELO DIFF':<10} {'CONFIDENCE'}")
    print("-" * 80)
    
    for result in analysis_results:
        fight = result['fight']
        consensus = f"{result['consensus_strength']:.1%}"
        spread = f"{result['prob_spread']:.1%}"
        elo_diff = f"{result['elo_diff']:+.0f}"
        
        if result['consensus_strength'] >= 0.67 and result['prob_spread'] < 0.15:
            confidence = "HIGH"
        elif result['consensus_strength'] >= 0.67:
            confidence = "MODERATE"
        else:
            confidence = "LOW"
        
        print(f"{fight:<40} {consensus:<12} {spread:<10} {elo_diff:<10} {confidence}")

print(f"\n💡 LEGEND:")
print(f"   CONSENSUS: % of models agreeing on winner")
print(f"   SPREAD: Difference between highest and lowest win probability")
print(f"   ELO DIFF: ELO rating difference (+ favors first fighter)")
print(f"   CONFIDENCE: Overall prediction reliability")

🔬 COMPREHENSIVE FIGHT ANALYSIS REPORT

ADVANCED ANALYSIS: Islam Makhachev vs Ilia Topuria

PREDICTING: Islam Makhachev vs Ilia Topuria
✓ Islam Makhachev (ELO: 1686)
✓ Ilia Topuria (ELO: 1619)

📊 PREDICTIONS:
----------------------------------------
LOGISTIC       : Ilia Topuria
                 Confidence: 99.2%
                 Islam Makhachev: 0.8%
                 Ilia Topuria: 99.2%

DECISION_TREE  : Islam Makhachev
                 Confidence: 71.1%
                 Islam Makhachev: 71.1%
                 Ilia Topuria: 28.9%

RANDOM_FOREST  : Ilia Topuria
                 Confidence: 50.2%
                 Islam Makhachev: 49.8%
                 Ilia Topuria: 50.2%

🏆 ENSEMBLE PREDICTION:
   Winner: Ilia Topuria
   Confidence: 59.4%
   Islam Makhachev: 40.6%
   Ilia Topuria: 59.4%

🔍 KEY FACTORS:
   ELO Advantage: +66 (favors Islam Makhachev)
   Age Difference: +5.0 years
   Reach Advantage: +1.0 inches
   Win Rate: 93.8% vs 100.0%

📊 MODEL CONSENSUS ANALYSIS:
--------------------