# IE 582 Project - Live Betting Strategy for Soccer

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# ignore warnings
# NOTE(review): filterwarnings("ignore") with no category silences every
# warning for the rest of the notebook, including pandas chained-assignment
# and library deprecation warnings.
import warnings
warnings.filterwarnings("ignore")

# increase the max column displayed
# (up to 500 columns are shown before pandas truncates a frame's repr)
pd.set_option('display.max_columns', 500)

# Part 1: Inspecting Data

In [2]:
# Load the raw match data; the path is relative to the notebook's working directory.
data = pd.read_csv('match_data.csv')

### Correlations with different features

In [5]:
# Work on a copy so the exploratory columns never reach the raw frame
copy_data = data.copy()

# Home-minus-away differentials
for diff_name, stat_name in (
    ('dangerous_attack_diff', 'Dangerous Attacks'),
    ('shots_on_target_diff', 'Shots On Target'),
):
    copy_data[diff_name] = copy_data[f'{stat_name} - home'] - copy_data[f'{stat_name} - away']

# Encode the final result as a number: home win -> 1, away win -> 2, draw -> 0
copy_data['result_numeric'] = copy_data['result'].map({'1': 1, '2': 2, 'X': 0})

# Features whose correlation with the result is reported
columns_to_check = [
    'Dangerous Attacks - home',
    'Dangerous Attacks - away',
    'Shots On Target - home',
    'Shots On Target - away',
    'shots_on_target_diff',
    'Shots Total - home',
    'Shots Total - away',
    'Goals - home',
    'Redcards - home',
    'Yellowcards - home',
    'dangerous_attack_diff',
    'Saves - away',
    'Saves - home',
]

# Coerce any non-numeric column to numbers; unparseable values become NaN
non_numeric_columns = [
    col for col in columns_to_check
    if not pd.api.types.is_numeric_dtype(copy_data[col])
]
for col in non_numeric_columns:
    copy_data[col] = pd.to_numeric(copy_data[col], errors='coerce')

# Correlation of every feature with the numeric result
correlations = {
    col: copy_data['result_numeric'].corr(copy_data[col])
    for col in columns_to_check
}

# Display results
for feature, corr_value in correlations.items():
    print(f"Correlation between result and {feature}: {corr_value:.2f}")


Correlation between result and Dangerous Attacks - home: -0.02
Correlation between result and Dangerous Attacks - away: 0.05
Correlation between result and Shots On Target - home: -0.06
Correlation between result and Shots On Target - away: 0.14
Correlation between result and shots_on_target_diff: -0.17
Correlation between result and Shots Total - home: -0.06
Correlation between result and Shots Total - away: 0.10
Correlation between result and Goals - home: -0.09
Correlation between result and Redcards - home: 0.06
Correlation between result and Yellowcards - home: 0.03
Correlation between result and dangerous_attack_diff: -0.07
Correlation between result and Saves - away: -0.03
Correlation between result and Saves - home: 0.01


In [6]:
# Split training and test sets based on temporal constraints.
# `current_time` is compared against the cut-off exactly as it was read from
# the CSV (presumably ISO-formatted timestamp strings, so lexicographic order
# matches chronological order -- TODO confirm).
# Each split is an explicit copy: the preprocessing functions add columns in
# place, and doing that on a slice of `data` is a chained assignment whose
# warning is currently hidden by the global warnings filter.
SPLIT_DATE = '2024-11-01'
train_data = data[data['current_time'] < SPLIT_DATE].copy()
test_data = data[data['current_time'] >= SPLIT_DATE].copy()

# Part 2: Preprocessing and Training Functions

In [9]:
def preprocess_data_with_states(data):
    """
    Adds integer-coded versions of `current_state` and `result`.

    The codes are '1' -> 1, '2' -> 2 and 'X' -> 0; any other value
    (including missing ones) becomes -1. The new columns
    `current_state_numeric` and `result_numeric` are written into `data`
    itself, and that same frame is returned.
    """
    outcome_codes = {"1": 1, "2": 2, "X": 0}

    for source_column in ('current_state', 'result'):
        encoded = data[source_column].map(outcome_codes)
        # Unmapped values come back as NaN from `map`; mark them with -1
        data[f'{source_column}_numeric'] = encoded.fillna(-1).astype(int)

    return data


# Feature Engineering for Match Data
def preprocess_match_data(match_data, significant_columns):
    """
    Builds the feature rows for a single match.

    Steps:
    - Computes the absolute match minute (45 is added during "2nd-half").
    - Forward-fills missing values and drops columns that are entirely NaN.
    - Keeps only the rows where at least one of `significant_columns`
      changed compared with the previous row (the first row never qualifies
      because its delta is defined as zero).
    - Derives one feature row per remaining row.

    Parameters:
    - match_data: DataFrame holding the rows of one match. It is not modified.
    - significant_columns: Column names whose change marks a row as relevant.
      Names that are missing (or not numeric) after cleaning are ignored.

    Returns:
    - DataFrame with a fresh 0..n-1 index and always the same columns in the
      same order. A feature whose source column is unavailable is NaN, as are
      the normalized probabilities of rows with missing or non-positive odds.
    """
    def _column(frame, name):
        # A statistic that is absent becomes an all-NaN column, so the output
        # schema does not depend on which columns survived the cleaning step
        if name in frame.columns:
            return frame[name]
        return pd.Series(np.nan, index=frame.index, dtype=float)

    # Copy first: the caller passes a groupby slice and must not see new columns
    match_data = match_data.copy()

    # Calculate absolute minute
    match_data['absolute_minute'] = (match_data['halftime'] == "2nd-half") * 45 + match_data['minute']

    # Forward fill missing data and drop columns with all NaN
    match_data = match_data.ffill().dropna(axis=1, how='all')

    # Calculate deltas for numeric columns (first row delta is zero)
    numeric_columns = match_data.select_dtypes(include=[np.number]).columns
    deltas = match_data[numeric_columns].diff().fillna(0)

    # Keep rows with a change in any tracked column that is still available
    tracked_columns = [col for col in significant_columns if col in deltas.columns]
    significant_change = deltas[tracked_columns].abs().sum(axis=1) > 0
    events = match_data[significant_change].reset_index(drop=True)

    minute = _column(events, 'absolute_minute')
    goal_diff = _column(events, 'Goals - home') - _column(events, 'Goals - away')
    red_card_diff = _column(events, 'Redcards - home') - _column(events, 'Redcards - away')
    yellow_card_diff = _column(events, 'Yellowcards - home') - _column(events, 'Yellowcards - away')

    # Card differences are down-weighted exponentially with the match minute
    red_card_k = 30  # Decay factor of red cards
    yellow_card_k = 10  # Decay factor of yellow cards

    # Normalized odds: invert, then rescale so the three values sum to 1.
    # Only rows where all three odds are present and positive are used, which
    # also avoids dividing by zero.
    odds = pd.DataFrame({
        "1": _column(events, "1"),
        "2": _column(events, "2"),
        "X": _column(events, "X"),
    })
    usable_odds = odds.notna().all(axis=1) & (odds > 0).all(axis=1)
    implied_probs = 1 / odds[usable_odds]
    normalized_probs = implied_probs.div(implied_probs.sum(axis=1), axis=0).reindex(events.index)

    keypass_home = _column(events, 'Key Passes - home')
    keypass_away = _column(events, 'Key Passes - away')

    return pd.DataFrame({
        'absolute_minute': minute,
        'goal_diff_cubed': goal_diff ** 3,
        'dangerous_attack_diff': (
            _column(events, 'Dangerous Attacks - home') - _column(events, 'Dangerous Attacks - away')
        ),
        'ball_possession_diff': (
            _column(events, 'Ball Possession % - home') - _column(events, 'Ball Possession % - away')
        ),
        'weighted_red_card_diff': red_card_diff * np.exp(-minute / red_card_k),
        'weighted_yellow_card_diff': yellow_card_diff * np.exp(-minute / yellow_card_k),
        # Original odds
        'odds_home': odds["1"],
        'odds_away': odds["2"],
        'odds_draw': odds["X"],
        # Normalized odds
        'prob_home': normalized_probs["1"],
        'prob_away': normalized_probs["2"],
        'prob_draw': normalized_probs["X"],
        # Key passes
        'keypass_home': keypass_home,
        'keypass_away': keypass_away,
        'keypass_diff': keypass_home - keypass_away,
        # Shots on target
        'shots_on_target_home': _column(events, 'Shots On Target - home'),
        'shots_on_target_away': _column(events, 'Shots On Target - away'),
    })


# Full Preprocessing Pipeline
def preprocess_data(data, significant_columns):
    """
    Turns the raw rows of all matches into model inputs.

    Every row kept by `preprocess_match_data` becomes one instance. Its label
    is the `result_numeric` value found on the first row of its match.

    Returns:
    - X: DataFrame with the features of all matches stacked on a fresh index.
    - y: Series named "label", one entry per row of X.
    - match_ids: list holding the `fixture_id` of each row of X.
    """
    # Numeric encodings of `current_state` and `result`
    data = preprocess_data_with_states(data)

    feature_frames, labels, match_ids = [], [], []

    for fixture_id, fixture_rows in data.groupby('fixture_id'):
        fixture_features = preprocess_match_data(fixture_rows, significant_columns)
        n_instances = len(fixture_features)

        feature_frames.append(fixture_features)
        # One label and one match id per generated feature row
        labels += [fixture_rows['result_numeric'].iloc[0]] * n_instances
        match_ids += [fixture_id] * n_instances

    X = pd.concat(feature_frames, ignore_index=True)
    y = pd.Series(labels, name="label")

    return X, y, match_ids

### Creating training data

In [10]:
# Columns whose change marks a row as a significant event:
# red cards, goals and dangerous attacks, each for home and away
significant_columns = [
    f'{stat} - {side}'
    for stat in ('Redcards', 'Goals', 'Dangerous Attacks')
    for side in ('home', 'away')
]

# Build the training instances
X_train, y_train, match_ids_train = preprocess_data(train_data, significant_columns)

# The raw odds are not model inputs, and remaining gaps are filled with 0
X_train = X_train.drop(columns=['odds_home', 'odds_away', 'odds_draw']).fillna(0)

# Match minute of every training row
train_minutes = X_train['absolute_minute']

# Fit the scaler on the training features, keeping the column names
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)

# Persist the fitted scaler so the test data can be transformed identically
with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

# Part 3: Testing Functions and Perform Training

### Weight, Dynamic/Fixed approach implementation

In [114]:
# Weighted Prediction Function
def weighted_prediction(clf, X, current_minute):
    """
    Blends the model's class probabilities with their per-row mean.

    The weight of the mean is exp(-current_minute / 60): it is 1 at minute 0
    and shrinks as the match goes on, so early predictions are pulled
    strongly towards equal probabilities and late ones are barely changed.

    Returns:
    - (weighted_probs, raw_probs): the blended probabilities, rescaled so each
      row sums to 1, and the untouched output of `clf.predict_proba(X)`.
    """
    model_probs = clf.predict_proba(X)

    # Share given to the per-row mean at this minute
    mean_weight = np.exp(-current_minute / 60)
    row_means = model_probs.mean(axis=1, keepdims=True)

    blended = model_probs * (1 - mean_weight) + row_means * mean_weight

    # Rescale so every row sums to 1
    row_totals = blended.sum(axis=1, keepdims=True)
    return blended / row_totals, model_probs

# Threshold function
def adjust_threshold_with_clamp(base_threshold, odds, alpha=0.3):
    """
    Scales a betting threshold according to the odds of an outcome.

    The scale is sqrt((sqrt(odds) + 1) / odds + alpha), limited to at most 1,
    so the returned threshold never exceeds `base_threshold`.

    Parameters:
    - base_threshold: The base threshold value.
    - odds: The odds for a specific outcome.
    - alpha: Constant added inside the square root; larger values weaken the
      adjustment.

    Returns:
    - `base_threshold` unchanged when `odds` is 1 or less, otherwise
      `base_threshold` multiplied by the clamped scale.
    """
    # Odds of 1 or below are treated as invalid: no adjustment
    if odds <= 1:
        return base_threshold

    inner = (odds ** 0.5 + 1) / odds + alpha
    root = inner ** 0.5

    # Clamp to 1 (also chosen whenever the comparison is false, e.g. for NaN)
    scale = root if root < 1 else 1

    return base_threshold * scale


# Dynamic Decision Function
def dynamic_decision(match_data, clf, threshold, weighted_prediction_fn, feature_columns, minutes):
    """
    Walks through a match row by row and bets at the first row whose highest
    weighted probability is above `threshold`.

    `minutes` is looked up with the index labels of `match_data`, so both
    frames must share the same index. The position of the highest
    probability selects the bet and its odds in the order draw, home, away.

    Returns:
    - dict with the keys decision_minute, bet, odds, real_probs,
      weighted_probs and adjusted_thresholds (always None here). When no row
      qualifies, bet is -1 and every other value is None.
    """
    for row_label, row in match_data.iterrows():
        # Model input for this row (the odds columns are not part of it)
        model_input = np.array(row[feature_columns]).reshape(1, -1)

        current_minute = int(minutes.loc[row_label, "absolute_minute"])
        weighted_probs, raw_probs = weighted_prediction_fn(clf, model_input, current_minute)

        # Not confident enough yet: move on to the next row
        if not weighted_probs.max() > threshold:
            continue

        bet = weighted_probs.argmax()
        # Real bookmaker odds, indexed like the predicted classes
        odds_by_outcome = [row["odds_draw"], row["odds_home"], row["odds_away"]]

        return {
            "decision_minute": current_minute,
            "bet": bet,
            "odds": odds_by_outcome[bet],
            "real_probs": raw_probs.tolist()[0],
            "weighted_probs": weighted_probs.tolist()[0],
            "adjusted_thresholds": None,
        }

    # No row was convincing enough: "no bet"
    no_bet = dict.fromkeys(
        ["decision_minute", "bet", "odds", "real_probs", "weighted_probs", "adjusted_thresholds"]
    )
    no_bet["bet"] = -1
    return no_bet

# Dynamic Decision Function with Clamped Threshold
def dynamic_decision_with_clamped_threshold(match_data, clf, base_threshold, weighted_prediction_fn, feature_columns, minutes, alpha=0.3):
    """
    Walks through a match row by row and bets at the first row where some
    outcome's weighted probability is above its own odds-adjusted threshold.

    Each outcome (draw, home, away, in that order) gets a threshold from
    `adjust_threshold_with_clamp(base_threshold, odds, alpha)`. Among the
    outcomes that pass, the one with the highest weighted probability is bet.
    `minutes` is looked up with the index labels of `match_data`.

    Returns:
    - dict with the keys decision_minute, bet, odds, weighted_probs,
      real_probs and adjusted_thresholds (rounded to 2 decimals). When no row
      qualifies, bet is -1 and every other value is None.
    """
    for row_label, row in match_data.iterrows():
        model_input = np.array(row[feature_columns]).reshape(1, -1)

        current_minute = int(minutes.loc[row_label, "absolute_minute"])
        weighted_probs, raw_probs = weighted_prediction_fn(clf, model_input, current_minute)

        # Bookmaker odds, indexed like the predicted classes
        odds = [row["odds_draw"], row["odds_home"], row["odds_away"]]

        # One threshold per outcome, derived from that outcome's odds
        adjusted_thresholds = [adjust_threshold_with_clamp(base_threshold, odd, alpha) for odd in odds]

        # Outcomes whose weighted probability beats their own threshold
        candidates = []
        for idx, (prob, outcome_threshold) in enumerate(zip(weighted_probs[0], adjusted_thresholds)):
            if prob > outcome_threshold:
                candidates.append((idx, prob, odds[idx]))

        if not candidates:
            continue

        # Most probable outcome among the candidates
        chosen_idx, _, chosen_odds = max(candidates, key=lambda candidate: candidate[1])
        return {
            "decision_minute": current_minute,
            "bet": chosen_idx,
            "odds": chosen_odds,
            "weighted_probs": weighted_probs.tolist()[0],
            "real_probs": raw_probs.tolist()[0],
            "adjusted_thresholds": [round(value, 2) for value in adjusted_thresholds],
        }

    # No row produced a valid bet: "no bet"
    no_bet = dict.fromkeys(
        ["decision_minute", "bet", "odds", "weighted_probs", "real_probs", "adjusted_thresholds"]
    )
    no_bet["bet"] = -1
    return no_bet


# Fixed-Time Decision Function
def fixed_decision(match_data, clf, decision_time, threshold, weighted_prediction_fn, feature_columns, minutes):
    """
    Makes a single decision for a match at a specified decision time.

    The row used is the first one whose `absolute_minute` equals the smallest
    available minute that is greater than or equal to `decision_time`.
    `minutes` and `match_data` are matched by position, so they must list the
    same rows in the same order. The probabilities are weighted with
    `decision_time` itself, not with the minute of the selected row.

    Returns:
    - dict with the keys decision_minute, bet, odds, weighted_probs,
      real_probs and adjusted_thresholds (always None here).
      * No row at or after `decision_time`: bet is -1 and all other values
        are None.
      * Highest weighted probability above `threshold`: bet is its position
        (0 draw, 1 home, 2 away) and decision_minute is the selected minute.
      * Otherwise: bet is -1, decision_minute is `decision_time` and the
        probabilities are still reported.
    """
    match_data = match_data.reset_index(drop=True)

    # Find the closest minute that's greater than or equal to decision_time
    available_minutes = minutes["absolute_minute"]
    valid_minutes = available_minutes[available_minutes >= decision_time]

    if valid_minutes.empty:
        # The match has no row at or after the decision time. Without this
        # guard `min()` would be NaN and the position lookup below would raise.
        return {
            "decision_minute": None,
            "bet": -1,  # No bet
            "odds": None,
            "weighted_probs": None,
            "real_probs": None,
            "adjusted_thresholds": None,
        }

    selected_time = valid_minutes.min()
    selected_time_idx = available_minutes.tolist().index(selected_time)

    # Select the row closest to the decision time
    row = match_data.iloc[selected_time_idx]
    features = np.array(row[feature_columns]).reshape(1, -1)

    # Apply weighted prediction
    weighted_probs, raw_probs = weighted_prediction_fn(clf, features, decision_time)

    # Decision based on threshold
    if weighted_probs.max() > threshold:
        bet = weighted_probs.argmax()  # Outcome with the highest weighted probability
        odds = [row["odds_draw"], row["odds_home"], row["odds_away"]]  # Use real odds for ROI

        return {
            "decision_minute": selected_time,
            "bet": bet,
            "odds": odds[bet],
            "weighted_probs": weighted_probs.tolist()[0],
            "real_probs": raw_probs.tolist()[0],
            "adjusted_thresholds": None,
        }

    return {
        "decision_minute": decision_time,
        "bet": -1,  # No bet
        "odds": None,
        "weighted_probs": weighted_probs.tolist()[0],
        "real_probs": raw_probs.tolist()[0],
        "adjusted_thresholds": None,
    }

# Fixed-Time Decision Function with Clamped Threshold
def fixed_decision_with_clamped_threshold(
    match_data, clf, decision_time, base_threshold, weighted_prediction_fn, feature_columns, minutes, alpha=0.3
):
    """
    Makes a single decision for a match at a specified decision time, using a
    separate odds-adjusted threshold for every outcome.

    The row used is the first one at the smallest available minute that is
    greater than or equal to `decision_time`; `minutes` and `match_data` are
    matched by position. The probabilities are weighted with `decision_time`.

    Returns:
    - dict with the keys decision_minute, bet, odds, weighted_probs,
      real_probs and adjusted_thresholds. If the match has no row at or after
      `decision_time`, bet is -1 and all other values are None. If no outcome
      beats its threshold, bet is -1 and decision_minute is `decision_time`.
    """
    match_data = match_data.reset_index(drop=True)

    minute_series = minutes["absolute_minute"]
    on_or_after = minute_series[minute_series >= decision_time]

    # Nothing to decide on when the match has no row late enough
    if on_or_after.empty:
        no_data = dict.fromkeys(
            ["decision_minute", "bet", "odds", "weighted_probs", "real_probs", "adjusted_thresholds"]
        )
        no_data["bet"] = -1
        return no_data

    selected_time = on_or_after.min()
    row_position = minute_series.tolist().index(selected_time)

    row = match_data.iloc[row_position]
    model_input = np.array(row[feature_columns]).reshape(1, -1)

    weighted_probs, raw_probs = weighted_prediction_fn(clf, model_input, decision_time)

    # Bookmaker odds, indexed like the predicted classes
    odds = [row["odds_draw"], row["odds_home"], row["odds_away"]]

    # One threshold per outcome, derived from that outcome's odds
    adjusted_thresholds = [adjust_threshold_with_clamp(base_threshold, odd, alpha) for odd in odds]

    # Start from the "no bet" answer and upgrade it if an outcome qualifies
    decision = {
        "decision_minute": decision_time,
        "bet": -1,  # No bet
        "odds": None,
        "weighted_probs": weighted_probs.tolist()[0],
        "real_probs": raw_probs.tolist()[0],
        "adjusted_thresholds": [round(value, 2) for value in adjusted_thresholds],
    }

    candidates = [
        (idx, prob, odds[idx])
        for idx, (prob, outcome_threshold) in enumerate(zip(weighted_probs[0], adjusted_thresholds))
        if prob > outcome_threshold
    ]

    if candidates:
        # Most probable outcome among those that beat their threshold
        chosen_idx, _, chosen_odds = max(candidates, key=lambda candidate: candidate[1])
        decision["decision_minute"] = selected_time
        decision["bet"] = chosen_idx
        decision["odds"] = chosen_odds

    return decision


### Testing function

In [118]:
def test_model_on_matches(X_test, y_test, match_ids_test, model, threshold, weighted_prediction_fn, feature_columns, minutes):
    """
    Simulates decision-making for test matches and evaluates model performance for both normal and clamped thresholds.

    Parameters:
    - X_test: Preprocessed feature set for test matches (must also contain the
      odds_draw / odds_home / odds_away columns read by the decision functions).
    - y_test: Actual labels for test matches.
    - match_ids_test: Match IDs corresponding to rows in X_test.
    - model: Trained machine learning model.
    - threshold: Decision threshold for betting.
    - weighted_prediction_fn: Function to compute weighted probabilities.
    - feature_columns: Selected features for the model.
    - minutes: DataFrame with `match_id` and `absolute_minute` for each row,
      indexed like X_test.

    Returns:
    - results_dict: Dictionary keyed by approach ("dynamic_normal",
      "dynamic_clamped", "fixed_normal", "fixed_clamped"). Each entry holds
      the per-match results DataFrame plus accuracy, roi, mean_roi,
      decisions_made, correct_decisions, average_decision_time and
      percentage_return. mean_roi and percentage_return are NaN when no bet
      was placed.
    """
    approaches = ["dynamic_normal", "dynamic_clamped", "fixed_normal", "fixed_clamped"]
    results = {approach: [] for approach in approaches}

    test_data = pd.concat([X_test, pd.Series(match_ids_test, name="match_id"), y_test.rename("actual_result")], axis=1)

    # Group by match_id to simulate decision-making per match
    for match_id, match_data in test_data.groupby("match_id"):
        # Computed once per match and shared by all four approaches
        match_minutes = minutes[minutes["match_id"] == match_id]
        actual_result = match_data["actual_result"].iloc[0]

        decisions = {
            "dynamic_normal": dynamic_decision(
                match_data,
                model,
                threshold,
                weighted_prediction_fn,
                feature_columns,
                match_minutes,
            ),
            "dynamic_clamped": dynamic_decision_with_clamped_threshold(
                match_data,
                model,
                threshold,
                weighted_prediction_fn,
                feature_columns,
                match_minutes,
            ),
            "fixed_normal": fixed_decision(
                match_data,
                model,
                decision_time=75,
                threshold=threshold,
                weighted_prediction_fn=weighted_prediction_fn,
                feature_columns=feature_columns,
                minutes=match_minutes,
            ),
            "fixed_clamped": fixed_decision_with_clamped_threshold(
                match_data,
                model,
                decision_time=75,
                base_threshold=threshold,
                weighted_prediction_fn=weighted_prediction_fn,
                feature_columns=feature_columns,
                minutes=match_minutes,
            ),
        }

        for key, result in decisions.items():
            bet = result["bet"]
            # "No bet" is checked first: an unmapped result is also coded -1,
            # and comparing the bet with it first would treat "no bet" as a
            # win and try to compute `None - 1`.
            if bet == -1:
                correct, roi = None, None
            elif bet == actual_result:
                correct, roi = 1, result["odds"] - 1
            else:
                correct, roi = 0, -1

            results[key].append({
                "match_id": match_id,
                "decision_minute": result["decision_minute"],
                "bet": bet,
                "odds": result["odds"],
                "real_probs": [round(i, 2) for i in result["real_probs"] or []],
                "weighted_probs": [round(i, 2) for i in result["weighted_probs"] or []],
                "actual_result": actual_result,
                "correct": correct,
                "roi": roi,
                "adjusted_thresholds": result["adjusted_thresholds"],
            })

    # Calculate metrics for each approach
    results_dict = {}
    for key in approaches:
        df = pd.DataFrame(results[key])

        decisions_made = (df["bet"] != -1).sum()
        correct_decisions = df["correct"].sum()
        accuracy = df["correct"].dropna().mean()
        roi = df["roi"].dropna().sum()
        average_decision_time = df["decision_minute"].dropna().mean()
        # Without any bet there is no per-bet return to report
        mean_roi = roi / decisions_made if decisions_made else float("nan")

        results_dict[key] = {
            "results_df": df,
            "accuracy": accuracy,
            "roi": roi,
            "mean_roi": mean_roi,
            "decisions_made": decisions_made,
            "correct_decisions": correct_decisions,
            "average_decision_time": average_decision_time,
            "percentage_return": round(mean_roi * 100, 2),
        }

    return results_dict


### Model Training

#### Feature Selection with Random Forest

In [13]:
# Candidate hyper-parameter values for the random forest
rf_param_dist = dict(
    n_estimators=[100, 200, 300, 400, 500],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2', None],
)

# Base estimator
rf = RandomForestClassifier(random_state=42)

# Randomized search: 50 sampled configurations, 3-fold CV, scored by accuracy
rf_random_search = RandomizedSearchCV(
    rf,
    rf_param_dist,
    n_iter=50,
    scoring='accuracy',
    n_jobs=-1,
    cv=3,
    verbose=2,
    random_state=42,
)
rf_random_search.fit(X_train_scaled, y_train)

# Best configuration and the model refitted with it
best_rf = rf_random_search.best_estimator_
print("Best Random Forest Parameters:", rf_random_search.best_params_)

# Feature importances of the best forest, most important first
feature_importances = best_rf.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
display(feature_importance_df)


Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best Random Forest Parameters: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 10}


Unnamed: 0,Feature,Importance
6,prob_home,0.232222
7,prob_away,0.229374
1,goal_diff_cubed,0.132305
8,prob_draw,0.122246
3,ball_possession_diff,0.045236
11,keypass_diff,0.035915
2,dangerous_attack_diff,0.034327
0,absolute_minute,0.029567
12,shots_on_target_home,0.027464
5,weighted_yellow_card_diff,0.026452


In [14]:
# Candidate hyper-parameter values for the decision tree
dt_param_dist = dict(
    max_depth=[3, 5, 7, 10, None],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 4, 8],
    max_features=['sqrt', 'log2', None],
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
)

# Base estimator
dt = DecisionTreeClassifier(random_state=42)

# Randomized search: 50 sampled configurations, 3-fold CV, scored by accuracy
dt_random_search = RandomizedSearchCV(
    dt,
    dt_param_dist,
    n_iter=50,
    scoring='accuracy',
    n_jobs=-1,
    cv=3,
    verbose=2,
    random_state=42,
)
dt_random_search.fit(X_train_scaled, y_train)

# Best configuration and the model refitted with it
best_dt = dt_random_search.best_estimator_
print("Best Decision Tree Parameters:", dt_random_search.best_params_)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best Decision Tree Parameters: {'splitter': 'random', 'min_samples_split': 2, 'min_samples_leaf': 8, 'max_features': None, 'max_depth': 5, 'criterion': 'entropy'}


In [15]:
# Candidate hyper-parameter values for XGBoost
xgb_param_dist = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    min_child_weight=[1, 3, 5],
    colsample_bytree=[0.7, 0.8, 1.0],
)

# Base estimator, evaluated with multi-class log loss
xgb = XGBClassifier(eval_metric='mlogloss', use_label_encoder=False, random_state=42)

# Randomized search: 50 sampled configurations, 3-fold CV, scored by accuracy
xgb_random_search = RandomizedSearchCV(
    xgb,
    xgb_param_dist,
    n_iter=50,
    scoring='accuracy',
    n_jobs=-1,
    cv=3,
    verbose=2,
    random_state=42,
)
xgb_random_search.fit(X_train_scaled, y_train)

# Best configuration and the model refitted with it
best_xgb = xgb_random_search.best_estimator_
print("Best XGBoost Parameters:", xgb_random_search.best_params_)


Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best XGBoost Parameters: {'n_estimators': 100, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.01, 'colsample_bytree': 0.7}


In [123]:
# Candidate hyper-parameter values for the SVM.
# `probability` is fixed to True so the fitted SVC offers predict_proba.
svm_param_dist = dict(
    C=[0.1, 1, 5, 10],
    gamma=[0.01, 0.1, 0.5, 1],
    probability=[True],
)

# Base estimator with an RBF kernel
svm = SVC(random_state=42, kernel='rbf')

# Randomized search with 3-fold CV, scored by accuracy.
# The grid holds only 4 x 4 = 16 combinations, fewer than the 20 requested.
svm_random_search = RandomizedSearchCV(
    svm,
    svm_param_dist,
    n_iter=20,
    scoring='accuracy',
    n_jobs=-1,
    cv=3,
    verbose=2,
    random_state=42,
)
svm_random_search.fit(X_train_scaled, y_train)

# Best configuration and the model refitted with it
best_svm = svm_random_search.best_estimator_
print("Best SVM Parameters:", svm_random_search.best_params_)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best SVM Parameters: {'probability': True, 'gamma': 0.01, 'C': 1}


In [None]:
# Preprocess Testing Data
X_test, y_test, match_ids_test = preprocess_data(test_data, significant_columns)

# Check for NaN values in the X_test
display(X_test.isna().sum())

# Save the odds columns: they are needed to settle bets, not as model inputs
odds_columns = ['odds_home', 'odds_away', 'odds_draw']
odds_test = X_test[odds_columns]

# Drop original odds columns (if present in test data)
X_test = X_test.drop(columns=odds_columns, errors='ignore')

# Align the test features with the training schema: same columns, same order.
# The scaler and the models were fitted on X_train's layout, so a column that
# is missing here is added (and filled with 0 below) and an unknown one is
# dropped.
X_test = X_test.reindex(columns=X_train.columns)

# Fill NaN values with 0
X_test = X_test.fillna(0)

# Minutes
test_minutes = pd.concat([pd.Series(match_ids_test, name="match_id"), X_test["absolute_minute"]], axis=1)

# Load the scaler and apply to the test data.
# NOTE: unpickling executes code from the file; this is only acceptable
# because scaler.pkl is written by this notebook itself.
with open("scaler.pkl", "rb") as f:
    loaded_scaler = pickle.load(f)

# Scale and convert back to a DataFrame
X_test_scaled = pd.DataFrame(loaded_scaler.transform(X_test), columns=X_test.columns)

# Add the odds columns back to the test data
X_test_scaled = pd.concat([X_test_scaled, odds_test], axis=1)

# Feature Columns
feature_columns = X_train.columns

# Part 4: Implementation

#### Display function

In [116]:
# Display Results
def display_results(results_dict):
    """
    Print a summary for every decision approach and preview its results table.

    Parameters:
    - results_dict: Dictionary mapping an approach key (e.g. 'dynamic_normal')
      to a dictionary with the entries 'decisions_made', 'correct_decisions',
      'average_decision_time', 'accuracy', 'roi', 'mean_roi',
      'percentage_return' and 'results_df'.
    """
    for key in results_dict:
        result = results_dict[key]

        # Compose the whole summary first, then emit it with a single print.
        # The embedded "\n" suffixes reproduce the blank lines after the
        # accuracy line and at the end of the summary.
        summary_lines = [
            f"--- {key.replace('_', ' ').title()} Results ---",
            f"Decisions Made: {result['decisions_made']} / Correct: {result['correct_decisions']}",
            f"Average Decision Time: {result['average_decision_time']:.2f}",
            f"Accuracy: {result['accuracy']:.2f}\n",
            f"ROI: {result['roi']:.2f}",
            f"Mean ROI: {result['mean_roi']:.2f}",
            f"Percentage Return: {result['percentage_return']}%\n",
        ]
        print("\n".join(summary_lines))

        # Show only the first five rows of the per-match results
        preview = result["results_df"].head(5)
        display(preview)


## Decision Tree

In [124]:
# Decision Tree (best_dt) on the test matches, threshold t = 0.85
results_dt_85 = test_model_on_matches(
    X_test_scaled, y_test, match_ids_test,
    model=best_dt,
    threshold=0.85,
    weighted_prediction_fn=weighted_prediction,
    feature_columns=feature_columns,
    minutes=test_minutes,
)

# Print the per-approach summary and preview each results table
display_results(results_dt_85)

--- Dynamic Normal Results ---
Decisions Made: 33 / Correct: 33.0
Average Decision Time: 93.09
Accuracy: 1.00

ROI: 28.01
Mean ROI: 0.85
Percentage Return: 84.88%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,,-1,,[],[],1,,,
1,19134534.0,,-1,,[],[],1,,,
2,19134535.0,,-1,,[],[],0,,,
3,19134536.0,91.0,2,1.0,"[0.0, 0.0, 1.0]","[0.08, 0.07, 0.85]",2,1.0,0.0,
4,19134537.0,93.0,1,1.0,"[0.01, 0.99, 0.0]","[0.08, 0.85, 0.07]",1,1.0,0.0,


--- Dynamic Clamped Results ---
Decisions Made: 34 / Correct: 34.0
Average Decision Time: 92.85
Accuracy: 1.00

ROI: 34.01
Mean ROI: 1.00
Percentage Return: 100.03%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,,-1,,[],[],1,,,
1,19134534.0,,-1,,[],[],1,,,
2,19134535.0,,-1,,[],[],0,,,
3,19134536.0,91.0,2,1.0,"[0.0, 0.0, 1.0]","[0.08, 0.07, 0.85]",2,1.0,0.0,"[0.58, 0.5, 0.85]"
4,19134537.0,93.0,1,1.0,"[0.01, 0.99, 0.0]","[0.08, 0.85, 0.07]",1,1.0,0.0,"[0.58, 0.85, 0.5]"


--- Fixed Normal Results ---
Decisions Made: 0 / Correct: 0
Average Decision Time: 75.00
Accuracy: nan

ROI: 0.00
Mean ROI: nan
Percentage Return: nan%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,75,-1,,"[0.29, 0.65, 0.05]","[0.3, 0.56, 0.13]",1,,,
1,19134534.0,75,-1,,"[0.19, 0.02, 0.79]","[0.23, 0.11, 0.66]",1,,,
2,19134535.0,75,-1,,"[0.7, 0.17, 0.12]","[0.6, 0.22, 0.18]",0,,,
3,19134536.0,75,-1,,"[0.16, 0.0, 0.84]","[0.21, 0.1, 0.69]",2,,,
4,19134537.0,75,-1,,"[0.18, 0.82, 0.0]","[0.23, 0.68, 0.1]",1,,,


--- Fixed Clamped Results ---
Decisions Made: 0 / Correct: 0
Average Decision Time: 75.00
Accuracy: nan

ROI: 0.00
Mean ROI: nan
Percentage Return: nan%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,75,-1,,"[0.29, 0.65, 0.05]","[0.3, 0.56, 0.13]",1,,,"[0.85, 0.85, 0.64]"
1,19134534.0,75,-1,,"[0.19, 0.02, 0.79]","[0.23, 0.11, 0.66]",1,,,"[0.83, 0.64, 0.85]"
2,19134535.0,75,-1,,"[0.7, 0.17, 0.12]","[0.6, 0.22, 0.18]",0,,,"[0.85, 0.84, 0.85]"
3,19134536.0,75,-1,,"[0.16, 0.0, 0.84]","[0.21, 0.1, 0.69]",2,,,"[0.73, 0.58, 0.85]"
4,19134537.0,75,-1,,"[0.18, 0.82, 0.0]","[0.23, 0.68, 0.1]",1,,,"[0.77, 0.85, 0.58]"


In [125]:
# Decision Tree (best_dt) on the test matches, threshold t = 0.70
results_dt_70 = test_model_on_matches(
    X_test_scaled, y_test, match_ids_test,
    model=best_dt,
    threshold=0.70,
    weighted_prediction_fn=weighted_prediction,
    feature_columns=feature_columns,
    minutes=test_minutes,
)

# Print the per-approach summary and preview each results table
display_results(results_dt_70)

--- Dynamic Normal Results ---
Decisions Made: 110 / Correct: 98.0
Average Decision Time: 79.05
Accuracy: 0.89

ROI: 5.02
Mean ROI: 0.05
Percentage Return: 4.56%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,89.0,1,1.1,"[0.18, 0.82, 0.0]","[0.22, 0.71, 0.08]",1,1.0,0.1,
1,19134534.0,82.0,1,6.0,"[0.1, 0.9, 0.0]","[0.16, 0.76, 0.08]",1,1.0,5.0,
2,19134535.0,92.0,0,1.16,"[0.88, 0.03, 0.09]","[0.76, 0.1, 0.14]",0,1.0,0.16,
3,19134536.0,80.0,2,1.07,"[0.16, 0.0, 0.84]","[0.21, 0.09, 0.71]",2,1.0,0.07,
4,19134537.0,81.0,1,1.08,"[0.07, 0.91, 0.02]","[0.14, 0.76, 0.1]",1,1.0,0.08,


--- Dynamic Clamped Results ---
Decisions Made: 110 / Correct: 98.0
Average Decision Time: 79.05
Accuracy: 0.89

ROI: 5.02
Mean ROI: 0.05
Percentage Return: 4.56%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,89.0,1,1.1,"[0.18, 0.82, 0.0]","[0.22, 0.71, 0.08]",1,1.0,0.1,"[0.63, 0.7, 0.45]"
1,19134534.0,82.0,1,6.0,"[0.1, 0.9, 0.0]","[0.16, 0.76, 0.08]",1,1.0,5.0,"[0.7, 0.65, 0.7]"
2,19134535.0,92.0,0,1.16,"[0.88, 0.03, 0.09]","[0.76, 0.1, 0.14]",0,1.0,0.16,"[0.7, 0.55, 0.63]"
3,19134536.0,80.0,2,1.07,"[0.16, 0.0, 0.84]","[0.21, 0.09, 0.71]",2,1.0,0.07,"[0.6, 0.46, 0.7]"
4,19134537.0,81.0,1,1.08,"[0.07, 0.91, 0.02]","[0.14, 0.76, 0.1]",1,1.0,0.08,"[0.62, 0.7, 0.46]"


--- Fixed Normal Results ---
Decisions Made: 38 / Correct: 37.0
Average Decision Time: 75.36
Accuracy: 0.97

ROI: 0.44
Mean ROI: 0.01
Percentage Return: 1.16%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,75,-1,,"[0.29, 0.65, 0.05]","[0.3, 0.56, 0.13]",1,,,
1,19134534.0,75,-1,,"[0.19, 0.02, 0.79]","[0.23, 0.11, 0.66]",1,,,
2,19134535.0,75,-1,,"[0.7, 0.17, 0.12]","[0.6, 0.22, 0.18]",0,,,
3,19134536.0,75,-1,,"[0.16, 0.0, 0.84]","[0.21, 0.1, 0.69]",2,,,
4,19134537.0,75,-1,,"[0.18, 0.82, 0.0]","[0.23, 0.68, 0.1]",1,,,


--- Fixed Clamped Results ---
Decisions Made: 38 / Correct: 37.0
Average Decision Time: 75.36
Accuracy: 0.97

ROI: 0.44
Mean ROI: 0.01
Percentage Return: 1.16%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,75,-1,,"[0.29, 0.65, 0.05]","[0.3, 0.56, 0.13]",1,,,"[0.7, 0.7, 0.53]"
1,19134534.0,75,-1,,"[0.19, 0.02, 0.79]","[0.23, 0.11, 0.66]",1,,,"[0.68, 0.53, 0.7]"
2,19134535.0,75,-1,,"[0.7, 0.17, 0.12]","[0.6, 0.22, 0.18]",0,,,"[0.7, 0.69, 0.7]"
3,19134536.0,75,-1,,"[0.16, 0.0, 0.84]","[0.21, 0.1, 0.69]",2,,,"[0.6, 0.47, 0.7]"
4,19134537.0,75,-1,,"[0.18, 0.82, 0.0]","[0.23, 0.68, 0.1]",1,,,"[0.63, 0.7, 0.47]"


In [126]:
# Decision Tree (best_dt) on the test matches, threshold t = 0.55
results_dt_55 = test_model_on_matches(
    X_test_scaled, y_test, match_ids_test,
    model=best_dt,
    threshold=0.55,
    weighted_prediction_fn=weighted_prediction,
    feature_columns=feature_columns,
    minutes=test_minutes,
)

# Print the per-approach summary and preview each results table
display_results(results_dt_55)

--- Dynamic Normal Results ---
Decisions Made: 111 / Correct: 87
Average Decision Time: 58.67
Accuracy: 0.78

ROI: -1.72
Mean ROI: -0.02
Percentage Return: -1.55%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,71,1,1.33,"[0.29, 0.65, 0.05]","[0.3, 0.56, 0.14]",1,1,0.33,
1,19134534.0,41,2,1.28,"[0.19, 0.02, 0.79]","[0.26, 0.18, 0.56]",1,0,-1.0,
2,19134535.0,60,2,1.36,"[0.19, 0.02, 0.79]","[0.25, 0.13, 0.62]",0,0,-1.0,
3,19134536.0,47,2,1.33,"[0.19, 0.02, 0.79]","[0.26, 0.16, 0.58]",2,1,0.33,
4,19134537.0,68,1,1.14,"[0.18, 0.82, 0.0]","[0.23, 0.66, 0.11]",1,1,0.14,


--- Dynamic Clamped Results ---
Decisions Made: 111 / Correct: 87
Average Decision Time: 58.67
Accuracy: 0.78

ROI: -1.72
Mean ROI: -0.02
Percentage Return: -1.55%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,71,1,1.33,"[0.29, 0.65, 0.05]","[0.3, 0.56, 0.14]",1,1,0.33,"[0.55, 0.55, 0.43]"
1,19134534.0,41,2,1.28,"[0.19, 0.02, 0.79]","[0.26, 0.18, 0.56]",1,0,-1.0,"[0.54, 0.45, 0.55]"
2,19134535.0,60,2,1.36,"[0.19, 0.02, 0.79]","[0.25, 0.13, 0.62]",0,0,-1.0,"[0.55, 0.44, 0.55]"
3,19134536.0,47,2,1.33,"[0.19, 0.02, 0.79]","[0.26, 0.16, 0.58]",2,1,0.33,"[0.55, 0.45, 0.55]"
4,19134537.0,68,1,1.14,"[0.18, 0.82, 0.0]","[0.23, 0.66, 0.11]",1,1,0.14,"[0.51, 0.55, 0.4]"


--- Fixed Normal Results ---
Decisions Made: 91 / Correct: 75.0
Average Decision Time: 75.71
Accuracy: 0.82

ROI: -2.32
Mean ROI: -0.03
Percentage Return: -2.55%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,75,1,1.25,"[0.29, 0.65, 0.05]","[0.3, 0.56, 0.13]",1,1.0,0.25,
1,19134534.0,75,2,1.22,"[0.19, 0.02, 0.79]","[0.23, 0.11, 0.66]",1,0.0,-1.0,
2,19134535.0,75,0,1.72,"[0.7, 0.17, 0.12]","[0.6, 0.22, 0.18]",0,1.0,0.72,
3,19134536.0,77,2,1.07,"[0.16, 0.0, 0.84]","[0.21, 0.1, 0.69]",2,1.0,0.07,
4,19134537.0,75,1,1.1,"[0.18, 0.82, 0.0]","[0.23, 0.68, 0.1]",1,1.0,0.1,


--- Fixed Clamped Results ---
Decisions Made: 91 / Correct: 75.0
Average Decision Time: 75.71
Accuracy: 0.82

ROI: -2.32
Mean ROI: -0.03
Percentage Return: -2.55%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,75,1,1.25,"[0.29, 0.65, 0.05]","[0.3, 0.56, 0.13]",1,1.0,0.25,"[0.55, 0.55, 0.41]"
1,19134534.0,75,2,1.22,"[0.19, 0.02, 0.79]","[0.23, 0.11, 0.66]",1,0.0,-1.0,"[0.54, 0.41, 0.55]"
2,19134535.0,75,0,1.72,"[0.7, 0.17, 0.12]","[0.6, 0.22, 0.18]",0,1.0,0.72,"[0.55, 0.54, 0.55]"
3,19134536.0,77,2,1.07,"[0.16, 0.0, 0.84]","[0.21, 0.1, 0.69]",2,1.0,0.07,"[0.47, 0.37, 0.55]"
4,19134537.0,75,1,1.1,"[0.18, 0.82, 0.0]","[0.23, 0.68, 0.1]",1,1.0,0.1,"[0.5, 0.55, 0.37]"


## XGBoost

In [127]:
# XGBoost (best_xgb) on the test matches, threshold t = 0.35
results_xgb_35 = test_model_on_matches(
    X_test_scaled, y_test, match_ids_test,
    model=best_xgb,
    threshold=0.35,
    weighted_prediction_fn=weighted_prediction,
    feature_columns=feature_columns,
    minutes=test_minutes,
)

# Print the per-approach summary and preview each results table
display_results(results_xgb_35)


--- Dynamic Normal Results ---
Decisions Made: 111 / Correct: 65
Average Decision Time: 20.61
Accuracy: 0.59

ROI: -5.44
Mean ROI: -0.05
Percentage Return: -4.9%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,17,2,1.5,"[0.28, 0.24, 0.48]","[0.32, 0.31, 0.37]",1,0,-1.0,
1,19134534.0,9,2,1.72,"[0.29, 0.25, 0.46]","[0.33, 0.32, 0.35]",1,0,-1.0,
2,19134535.0,32,2,1.57,"[0.31, 0.25, 0.44]","[0.32, 0.3, 0.38]",0,0,-1.0,
3,19134536.0,26,2,2.2,"[0.35, 0.27, 0.39]","[0.34, 0.31, 0.35]",2,1,1.2,
4,19134537.0,8,1,1.44,"[0.29, 0.48, 0.23]","[0.33, 0.35, 0.32]",1,1,0.44,


--- Dynamic Clamped Results ---
Decisions Made: 111 / Correct: 46
Average Decision Time: 19.23
Accuracy: 0.41

ROI: -21.32
Mean ROI: -0.19
Percentage Return: -19.21%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,17,2,1.5,"[0.28, 0.24, 0.48]","[0.32, 0.31, 0.37]",1,0,-1.0,"[0.35, 0.32, 0.35]"
1,19134534.0,9,2,1.72,"[0.29, 0.25, 0.46]","[0.33, 0.32, 0.35]",1,0,-1.0,"[0.35, 0.35, 0.35]"
2,19134535.0,32,2,1.57,"[0.31, 0.25, 0.44]","[0.32, 0.3, 0.38]",0,0,-1.0,"[0.35, 0.32, 0.35]"
3,19134536.0,26,2,2.2,"[0.35, 0.27, 0.39]","[0.34, 0.31, 0.35]",2,1,1.2,"[0.35, 0.35, 0.35]"
4,19134537.0,2,2,6.0,"[0.3, 0.46, 0.24]","[0.33, 0.34, 0.33]",1,0,-1.0,"[0.34, 0.35, 0.33]"


--- Fixed Normal Results ---
Decisions Made: 111 / Correct: 90
Average Decision Time: 75.84
Accuracy: 0.81

ROI: 2.89
Mean ROI: 0.03
Percentage Return: 2.6%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,75,1,1.25,"[0.27, 0.55, 0.18]","[0.29, 0.49, 0.23]",1,1,0.25,
1,19134534.0,75,2,1.22,"[0.26, 0.17, 0.57]","[0.28, 0.22, 0.51]",1,0,-1.0,
2,19134535.0,75,0,1.72,"[0.45, 0.28, 0.27]","[0.41, 0.3, 0.29]",0,1,0.72,
3,19134536.0,77,2,1.07,"[0.2, 0.16, 0.64]","[0.24, 0.21, 0.55]",2,1,0.07,
4,19134537.0,75,1,1.1,"[0.23, 0.6, 0.17]","[0.26, 0.52, 0.22]",1,1,0.1,


--- Fixed Clamped Results ---
Decisions Made: 111 / Correct: 90
Average Decision Time: 75.84
Accuracy: 0.81

ROI: 2.89
Mean ROI: 0.03
Percentage Return: 2.6%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,75,1,1.25,"[0.27, 0.55, 0.18]","[0.29, 0.49, 0.23]",1,1,0.25,"[0.35, 0.35, 0.26]"
1,19134534.0,75,2,1.22,"[0.26, 0.17, 0.57]","[0.28, 0.22, 0.51]",1,0,-1.0,"[0.34, 0.26, 0.35]"
2,19134535.0,75,0,1.72,"[0.45, 0.28, 0.27]","[0.41, 0.3, 0.29]",0,1,0.72,"[0.35, 0.34, 0.35]"
3,19134536.0,77,2,1.07,"[0.2, 0.16, 0.64]","[0.24, 0.21, 0.55]",2,1,0.07,"[0.3, 0.24, 0.35]"
4,19134537.0,75,1,1.1,"[0.23, 0.6, 0.17]","[0.26, 0.52, 0.22]",1,1,0.1,"[0.32, 0.35, 0.24]"


In [128]:
# XGBoost (best_xgb) on the test matches, threshold t = 0.55
results_xgb_55 = test_model_on_matches(
    X_test_scaled, y_test, match_ids_test,
    model=best_xgb,
    threshold=0.55,
    weighted_prediction_fn=weighted_prediction,
    feature_columns=feature_columns,
    minutes=test_minutes,
)

# Print the per-approach summary and preview each results table
display_results(results_xgb_55)

--- Dynamic Normal Results ---
Decisions Made: 91 / Correct: 84.0
Average Decision Time: 78.74
Accuracy: 0.92

ROI: -3.05
Mean ROI: -0.03
Percentage Return: -3.35%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,91.0,1,1.07,"[0.17, 0.68, 0.15]","[0.2, 0.6, 0.19]",1,1.0,0.07,
1,19134534.0,97.0,1,1.07,"[0.18, 0.66, 0.16]","[0.21, 0.6, 0.2]",1,1.0,0.07,
2,19134535.0,,-1,,[],[],0,,,
3,19134536.0,77.0,2,1.07,"[0.2, 0.16, 0.64]","[0.24, 0.21, 0.55]",2,1.0,0.07,
4,19134537.0,76.0,1,1.09,"[0.18, 0.67, 0.15]","[0.22, 0.57, 0.2]",1,1.0,0.09,


--- Dynamic Clamped Results ---
Decisions Made: 91 / Correct: 84.0
Average Decision Time: 78.74
Accuracy: 0.92

ROI: -3.05
Mean ROI: -0.03
Percentage Return: -3.35%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,91.0,1,1.07,"[0.17, 0.68, 0.15]","[0.2, 0.6, 0.19]",1,1.0,0.07,"[0.48, 0.55, 0.35]"
1,19134534.0,97.0,1,1.07,"[0.18, 0.66, 0.16]","[0.21, 0.6, 0.2]",1,1.0,0.07,"[0.47, 0.55, 0.34]"
2,19134535.0,,-1,,[],[],0,,,
3,19134536.0,77.0,2,1.07,"[0.2, 0.16, 0.64]","[0.24, 0.21, 0.55]",2,1.0,0.07,"[0.47, 0.37, 0.55]"
4,19134537.0,76.0,1,1.09,"[0.18, 0.67, 0.15]","[0.22, 0.57, 0.2]",1,1.0,0.09,"[0.49, 0.55, 0.37]"


--- Fixed Normal Results ---
Decisions Made: 37 / Correct: 37.0
Average Decision Time: 75.37
Accuracy: 1.00

ROI: 1.03
Mean ROI: 0.03
Percentage Return: 2.78%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,75,-1,,"[0.27, 0.55, 0.18]","[0.29, 0.49, 0.23]",1,,,
1,19134534.0,75,-1,,"[0.26, 0.17, 0.57]","[0.28, 0.22, 0.51]",1,,,
2,19134535.0,75,-1,,"[0.45, 0.28, 0.27]","[0.41, 0.3, 0.29]",0,,,
3,19134536.0,77,2,1.07,"[0.2, 0.16, 0.64]","[0.24, 0.21, 0.55]",2,1.0,0.07,
4,19134537.0,75,-1,,"[0.23, 0.6, 0.17]","[0.26, 0.52, 0.22]",1,,,


--- Fixed Clamped Results ---
Decisions Made: 37 / Correct: 37.0
Average Decision Time: 75.37
Accuracy: 1.00

ROI: 1.03
Mean ROI: 0.03
Percentage Return: 2.78%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,75,-1,,"[0.27, 0.55, 0.18]","[0.29, 0.49, 0.23]",1,,,"[0.55, 0.55, 0.41]"
1,19134534.0,75,-1,,"[0.26, 0.17, 0.57]","[0.28, 0.22, 0.51]",1,,,"[0.54, 0.41, 0.55]"
2,19134535.0,75,-1,,"[0.45, 0.28, 0.27]","[0.41, 0.3, 0.29]",0,,,"[0.55, 0.54, 0.55]"
3,19134536.0,77,2,1.07,"[0.2, 0.16, 0.64]","[0.24, 0.21, 0.55]",2,1.0,0.07,"[0.47, 0.37, 0.55]"
4,19134537.0,75,-1,,"[0.23, 0.6, 0.17]","[0.26, 0.52, 0.22]",1,,,"[0.5, 0.55, 0.37]"


## SVM

In [149]:
# SVM (best_svm) on the test matches, threshold t = 0.75
results_svm_75 = test_model_on_matches(
    X_test_scaled, y_test, match_ids_test,
    model=best_svm,
    threshold=0.75,
    weighted_prediction_fn=weighted_prediction,
    feature_columns=feature_columns,
    minutes=test_minutes,
)

# Print the per-approach summary and preview each results table
display_results(results_svm_75)

--- Dynamic Normal Results ---
Decisions Made: 13 / Correct: 12.0
Average Decision Time: 88.85
Accuracy: 0.92

ROI: -0.73
Mean ROI: -0.06
Percentage Return: -5.62%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,,-1,,[],[],1,,,
1,19134534.0,,-1,,[],[],1,,,
2,19134535.0,,-1,,[],[],0,,,
3,19134536.0,,-1,,[],[],2,,,
4,19134537.0,,-1,,[],[],1,,,


--- Dynamic Clamped Results ---
Decisions Made: 13 / Correct: 12.0
Average Decision Time: 88.85
Accuracy: 0.92

ROI: -0.73
Mean ROI: -0.06
Percentage Return: -5.62%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,,-1,,[],[],1,,,
1,19134534.0,,-1,,[],[],1,,,
2,19134535.0,,-1,,[],[],0,,,
3,19134536.0,,-1,,[],[],2,,,
4,19134537.0,,-1,,[],[],1,,,


--- Fixed Normal Results ---
Decisions Made: 0 / Correct: 0
Average Decision Time: 75.00
Accuracy: nan

ROI: 0.00
Mean ROI: nan
Percentage Return: nan%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,75,-1,,"[0.19, 0.77, 0.04]","[0.23, 0.64, 0.12]",1,,,
1,19134534.0,75,-1,,"[0.23, 0.03, 0.74]","[0.26, 0.12, 0.62]",1,,,
2,19134535.0,75,-1,,"[0.5, 0.32, 0.18]","[0.46, 0.32, 0.22]",0,,,
3,19134536.0,75,-1,,"[0.14, 0.03, 0.83]","[0.2, 0.12, 0.69]",2,,,
4,19134537.0,75,-1,,"[0.16, 0.81, 0.02]","[0.21, 0.67, 0.11]",1,,,


--- Fixed Clamped Results ---
Decisions Made: 0 / Correct: 0
Average Decision Time: 75.00
Accuracy: nan

ROI: 0.00
Mean ROI: nan
Percentage Return: nan%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,75,-1,,"[0.19, 0.77, 0.04]","[0.23, 0.64, 0.12]",1,,,"[0.75, 0.75, 0.56]"
1,19134534.0,75,-1,,"[0.23, 0.03, 0.74]","[0.26, 0.12, 0.62]",1,,,"[0.73, 0.56, 0.75]"
2,19134535.0,75,-1,,"[0.5, 0.32, 0.18]","[0.46, 0.32, 0.22]",0,,,"[0.75, 0.74, 0.75]"
3,19134536.0,75,-1,,"[0.14, 0.03, 0.83]","[0.2, 0.12, 0.69]",2,,,"[0.65, 0.51, 0.75]"
4,19134537.0,75,-1,,"[0.16, 0.81, 0.02]","[0.21, 0.67, 0.11]",1,,,"[0.68, 0.75, 0.51]"


In [130]:
# SVM (best_svm) on the test matches, threshold t = 0.55
results_svm_55 = test_model_on_matches(
    X_test_scaled, y_test, match_ids_test,
    model=best_svm,
    threshold=0.55,
    weighted_prediction_fn=weighted_prediction,
    feature_columns=feature_columns,
    minutes=test_minutes,
)

# Print the per-approach summary and preview each results table
display_results(results_svm_55)

--- Dynamic Normal Results ---
Decisions Made: 111 / Correct: 80
Average Decision Time: 52.59
Accuracy: 0.72

ROI: -7.01
Mean ROI: -0.06
Percentage Return: -6.32%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,49,2,1.44,"[0.2, 0.05, 0.75]","[0.26, 0.18, 0.57]",1,0,-1.0,
1,19134534.0,47,2,1.28,"[0.2, 0.04, 0.76]","[0.26, 0.18, 0.56]",1,0,-1.0,
2,19134535.0,60,2,1.36,"[0.13, 0.04, 0.83]","[0.2, 0.15, 0.65]",0,0,-1.0,
3,19134536.0,47,2,1.33,"[0.19, 0.03, 0.78]","[0.26, 0.17, 0.58]",2,1,0.33,
4,19134537.0,37,1,1.22,"[0.16, 0.81, 0.03]","[0.25, 0.55, 0.2]",1,1,0.22,


--- Dynamic Clamped Results ---
Decisions Made: 111 / Correct: 80
Average Decision Time: 52.59
Accuracy: 0.72

ROI: -7.01
Mean ROI: -0.06
Percentage Return: -6.32%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,49,2,1.44,"[0.2, 0.05, 0.75]","[0.26, 0.18, 0.57]",1,0,-1.0,"[0.55, 0.49, 0.55]"
1,19134534.0,47,2,1.28,"[0.2, 0.04, 0.76]","[0.26, 0.18, 0.56]",1,0,-1.0,"[0.54, 0.44, 0.55]"
2,19134535.0,60,2,1.36,"[0.13, 0.04, 0.83]","[0.2, 0.15, 0.65]",0,0,-1.0,"[0.55, 0.44, 0.55]"
3,19134536.0,47,2,1.33,"[0.19, 0.03, 0.78]","[0.26, 0.17, 0.58]",2,1,0.33,"[0.55, 0.45, 0.55]"
4,19134537.0,37,1,1.22,"[0.16, 0.81, 0.03]","[0.25, 0.55, 0.2]",1,1,0.22,"[0.52, 0.55, 0.44]"


--- Fixed Normal Results ---
Decisions Made: 98 / Correct: 80.0
Average Decision Time: 75.75
Accuracy: 0.82

ROI: -2.78
Mean ROI: -0.03
Percentage Return: -2.84%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,75,1,1.25,"[0.19, 0.77, 0.04]","[0.23, 0.64, 0.12]",1,1.0,0.25,
1,19134534.0,75,2,1.22,"[0.23, 0.03, 0.74]","[0.26, 0.12, 0.62]",1,0.0,-1.0,
2,19134535.0,75,-1,,"[0.5, 0.32, 0.18]","[0.46, 0.32, 0.22]",0,,,
3,19134536.0,77,2,1.07,"[0.14, 0.03, 0.83]","[0.2, 0.12, 0.69]",2,1.0,0.07,
4,19134537.0,75,1,1.1,"[0.16, 0.81, 0.02]","[0.21, 0.67, 0.11]",1,1.0,0.1,


--- Fixed Clamped Results ---
Decisions Made: 98 / Correct: 80.0
Average Decision Time: 75.75
Accuracy: 0.82

ROI: -2.78
Mean ROI: -0.03
Percentage Return: -2.84%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,75,1,1.25,"[0.19, 0.77, 0.04]","[0.23, 0.64, 0.12]",1,1.0,0.25,"[0.55, 0.55, 0.41]"
1,19134534.0,75,2,1.22,"[0.23, 0.03, 0.74]","[0.26, 0.12, 0.62]",1,0.0,-1.0,"[0.54, 0.41, 0.55]"
2,19134535.0,75,-1,,"[0.5, 0.32, 0.18]","[0.46, 0.32, 0.22]",0,,,"[0.55, 0.54, 0.55]"
3,19134536.0,77,2,1.07,"[0.14, 0.03, 0.83]","[0.2, 0.12, 0.69]",2,1.0,0.07,"[0.47, 0.37, 0.55]"
4,19134537.0,75,1,1.1,"[0.16, 0.81, 0.02]","[0.21, 0.67, 0.11]",1,1.0,0.1,"[0.5, 0.55, 0.37]"


In [145]:
# SVM (best_svm) on the test matches, threshold t = 0.35
results_svm_35 = test_model_on_matches(
    X_test_scaled, y_test, match_ids_test,
    model=best_svm,
    threshold=0.35,
    weighted_prediction_fn=weighted_prediction,
    feature_columns=feature_columns,
    minutes=test_minutes,
)

# Print the per-approach summary and preview each results table
display_results(results_svm_35)

--- Dynamic Normal Results ---
Decisions Made: 111 / Correct: 59
Average Decision Time: 7.11
Accuracy: 0.53

ROI: -5.77
Mean ROI: -0.05
Percentage Return: -5.2%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,17,2,1.5,"[0.19, 0.07, 0.74]","[0.3, 0.27, 0.43]",1,0,-1.0,
1,19134534.0,4,2,1.72,"[0.15, 0.1, 0.75]","[0.32, 0.32, 0.36]",1,0,-1.0,
2,19134535.0,15,2,2.4,"[0.28, 0.3, 0.42]","[0.32, 0.33, 0.35]",0,0,-1.0,
3,19134536.0,8,2,2.25,"[0.33, 0.19, 0.48]","[0.33, 0.32, 0.35]",2,1,1.25,
4,19134537.0,4,1,1.5,"[0.15, 0.78, 0.07]","[0.32, 0.36, 0.32]",1,1,0.5,


--- Dynamic Clamped Results ---
Decisions Made: 111 / Correct: 55
Average Decision Time: 7.00
Accuracy: 0.50

ROI: -6.23
Mean ROI: -0.06
Percentage Return: -5.61%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,17,2,1.5,"[0.19, 0.07, 0.74]","[0.3, 0.27, 0.43]",1,0,-1.0,"[0.35, 0.32, 0.35]"
1,19134534.0,4,2,1.72,"[0.15, 0.1, 0.75]","[0.32, 0.32, 0.36]",1,0,-1.0,"[0.35, 0.35, 0.35]"
2,19134535.0,15,2,2.4,"[0.28, 0.3, 0.42]","[0.32, 0.33, 0.35]",0,0,-1.0,"[0.35, 0.35, 0.35]"
3,19134536.0,8,2,2.25,"[0.33, 0.19, 0.48]","[0.33, 0.32, 0.35]",2,1,1.25,"[0.35, 0.35, 0.35]"
4,19134537.0,4,1,1.5,"[0.15, 0.78, 0.07]","[0.32, 0.36, 0.32]",1,1,0.5,"[0.35, 0.35, 0.33]"


--- Fixed Normal Results ---
Decisions Made: 111 / Correct: 88
Average Decision Time: 75.84
Accuracy: 0.79

ROI: -1.76
Mean ROI: -0.02
Percentage Return: -1.59%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,75,1,1.25,"[0.19, 0.77, 0.04]","[0.23, 0.64, 0.12]",1,1,0.25,
1,19134534.0,75,2,1.22,"[0.23, 0.03, 0.74]","[0.26, 0.12, 0.62]",1,0,-1.0,
2,19134535.0,75,0,1.72,"[0.5, 0.32, 0.18]","[0.46, 0.32, 0.22]",0,1,0.72,
3,19134536.0,77,2,1.07,"[0.14, 0.03, 0.83]","[0.2, 0.12, 0.69]",2,1,0.07,
4,19134537.0,75,1,1.1,"[0.16, 0.81, 0.02]","[0.21, 0.67, 0.11]",1,1,0.1,


--- Fixed Clamped Results ---
Decisions Made: 111 / Correct: 88
Average Decision Time: 75.84
Accuracy: 0.79

ROI: -1.76
Mean ROI: -0.02
Percentage Return: -1.59%



Unnamed: 0,match_id,decision_minute,bet,odds,real_probs,weighted_probs,actual_result,correct,roi,adjusted_thresholds
0,19134533.0,75,1,1.25,"[0.19, 0.77, 0.04]","[0.23, 0.64, 0.12]",1,1,0.25,"[0.35, 0.35, 0.26]"
1,19134534.0,75,2,1.22,"[0.23, 0.03, 0.74]","[0.26, 0.12, 0.62]",1,0,-1.0,"[0.34, 0.26, 0.35]"
2,19134535.0,75,0,1.72,"[0.5, 0.32, 0.18]","[0.46, 0.32, 0.22]",0,1,0.72,"[0.35, 0.34, 0.35]"
3,19134536.0,77,2,1.07,"[0.14, 0.03, 0.83]","[0.2, 0.12, 0.69]",2,1,0.07,"[0.3, 0.24, 0.35]"
4,19134537.0,75,1,1.1,"[0.16, 0.81, 0.02]","[0.21, 0.67, 0.11]",1,1,0.1,"[0.32, 0.35, 0.24]"


# Part 5: Results

#### Comparison matrix function

In [137]:
def create_comparison_matrix(models, metric='roi', decimals=3):
    """
    Build a models-by-approach table for a single summary metric.

    Parameters:
    - models (dict): Dictionary of model names and their results from
      `test_model_on_matches`. Each result must hold the keys 'dynamic_normal',
      'dynamic_clamped', 'fixed_normal' and 'fixed_clamped'.
    - metric (str): Key looked up inside every approach's result, e.g. 'roi',
      'accuracy', 'decisions_made', 'percentage_return' or
      'average_decision_time'.
    - decimals (int): Number of decimal places each value is rounded to
      (default 3). Pass the precision you want here instead of rounding the
      returned frame again: rounding twice can change the last digit
      (79.0549 -> 79.055 -> 79.06, whereas rounding once gives 79.05).

    Returns:
    - pd.DataFrame: One row per model, indexed by 'Model', with the columns
      'Dynamic', 'Dynamic Clamped', 'Fixed' and 'Fixed Clamped'.

    Raises:
    - KeyError: If an approach key or `metric` is missing from a result.
    """
    # Output column -> key of the corresponding approach in each results dict
    approach_by_column = {
        'Dynamic': 'dynamic_normal',
        'Dynamic Clamped': 'dynamic_clamped',
        'Fixed': 'fixed_normal',
        'Fixed Clamped': 'fixed_clamped',
    }

    # Column-wise accumulator; named `table` so it cannot be confused with the
    # notebook-level `data` DataFrame.
    table = {'Model': []}
    table.update({column: [] for column in approach_by_column})

    for model_name, results in models.items():
        table['Model'].append(model_name)
        for column, approach in approach_by_column.items():
            table[column].append(round(results[approach][metric], decimals))

    # set_index returns a new frame, so no in-place mutation is needed
    return pd.DataFrame(table).set_index('Model')

# Label every evaluated (model, threshold) run; the labels become the row
# index of the comparison matrices and keep this insertion order.
models = dict([
    ('Decision Tree t=0.85', results_dt_85),
    ('Decision Tree t=0.70', results_dt_70),
    ('Decision Tree t=0.55', results_dt_55),
    ('XGBoost t=0.35', results_xgb_35),
    ('XGBoost t=0.55', results_xgb_55),
    ('SVM t=0.75', results_svm_75),
    ('SVM t=0.55', results_svm_55),
    ('SVM t=0.35', results_svm_35),
])

### Result matrices

In [138]:
# ROI per model/threshold (rows) and decision approach (columns)
roi_matrix = create_comparison_matrix(models, 'roi')
print("ROI Comparison Matrix:")
display(roi_matrix)

ROI Comparison Matrix:


Unnamed: 0_level_0,Dynamic,Dynamic Clamped,Fixed,Fixed Clamped
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Decision Tree t=0.85,28.01,34.01,0.0,0.0
Decision Tree t=0.70,5.02,5.02,0.44,0.44
Decision Tree t=0.55,-1.72,-1.72,-2.32,-2.32
XGBoost t=0.35,-5.44,-21.32,2.89,2.89
XGBoost t=0.55,-3.05,-3.05,1.03,1.03
SVM t=0.75,-0.73,-0.73,0.0,0.0
SVM t=0.55,-7.01,-7.01,-2.78,-2.78
SVM t=0.35,-5.77,-6.23,-1.76,-1.76


In [139]:
# Accuracy per model/threshold (rows) and decision approach (columns)
accuracy_matrix = create_comparison_matrix(models, 'accuracy')
print("Accuracy Comparison Matrix:")
display(accuracy_matrix)

Accuracy Comparison Matrix:


Unnamed: 0_level_0,Dynamic,Dynamic Clamped,Fixed,Fixed Clamped
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Decision Tree t=0.85,1.0,1.0,,
Decision Tree t=0.70,0.891,0.891,0.974,0.974
Decision Tree t=0.55,0.784,0.784,0.824,0.824
XGBoost t=0.35,0.586,0.414,0.811,0.811
XGBoost t=0.55,0.923,0.923,1.0,1.0
SVM t=0.75,0.923,0.923,,
SVM t=0.55,0.721,0.721,0.816,0.816
SVM t=0.35,0.532,0.495,0.793,0.793


In [140]:
# Number of decisions made per model/threshold (rows) and decision approach (columns)
decisions_matrix = create_comparison_matrix(models, 'decisions_made')
print("Decisions Comparison Matrix:")
display(decisions_matrix)

Decisions Comparison Matrix:


Unnamed: 0_level_0,Dynamic,Dynamic Clamped,Fixed,Fixed Clamped
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Decision Tree t=0.85,33,34,0,0
Decision Tree t=0.70,110,110,38,38
Decision Tree t=0.55,111,111,91,91
XGBoost t=0.35,111,111,111,111
XGBoost t=0.55,91,91,37,37
SVM t=0.75,13,13,0,0
SVM t=0.55,111,111,98,98
SVM t=0.35,111,111,111,111


In [141]:
# Percentage return per model/threshold (rows) and decision approach (columns)
percentage_return_matrix = create_comparison_matrix(models, 'percentage_return')
print("Percentage Return Comparison Matrix:")
display(percentage_return_matrix)

Percentage Return Comparison Matrix:


Unnamed: 0_level_0,Dynamic,Dynamic Clamped,Fixed,Fixed Clamped
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Decision Tree t=0.85,84.88,100.03,,
Decision Tree t=0.70,4.56,4.56,1.16,1.16
Decision Tree t=0.55,-1.55,-1.55,-2.55,-2.55
XGBoost t=0.35,-4.9,-19.21,2.6,2.6
XGBoost t=0.55,-3.35,-3.35,2.78,2.78
SVM t=0.75,-5.62,-5.62,,
SVM t=0.55,-6.32,-6.32,-2.84,-2.84
SVM t=0.35,-5.2,-5.61,-1.59,-1.59


In [151]:
# Average decision minute per model/threshold (rows) and decision approach (columns).
# create_comparison_matrix already rounds to 3 decimals; the extra .round(2)
# rounds a second time, so the last digit can differ from rounding the raw
# value once.
average_decision_time_matrix = create_comparison_matrix(
    models, 'average_decision_time'
).round(2)
print("Average Decision Time Comparison Matrix:")
display(average_decision_time_matrix)

Average Decision Time Comparison Matrix:


Unnamed: 0_level_0,Dynamic,Dynamic Clamped,Fixed,Fixed Clamped
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Decision Tree t=0.85,93.09,92.85,75.0,75.0
Decision Tree t=0.70,79.06,79.06,75.36,75.36
Decision Tree t=0.55,58.67,58.67,75.71,75.71
XGBoost t=0.35,20.61,19.23,75.84,75.84
XGBoost t=0.55,78.74,78.74,75.37,75.37
SVM t=0.75,88.85,88.85,75.0,75.0
SVM t=0.55,52.59,52.59,75.75,75.75
SVM t=0.35,7.11,7.0,75.84,75.84
