In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
import pandas as pd
import numpy as np
import io
import random
import math
import scipy.stats
import matplotlib.pylab as plt

  from IPython.core.display import display, HTML


In [64]:
# Read in data
data = pd.read_csv('18_19tourneycs.csv')
data.head()

Unnamed: 0,Region,Seed,School,SRS
0,1,1,Duke,26.9
1,1,16,North Dakota State,-4.01
2,1,8,Virginia Commonwealth,11.96
3,1,9,Central Florida,13.37
4,1,4,Virginia Tech,19.28


In [65]:
# Rename columns of data DataFrame
data.columns = ['Region', 'Seed', 'Team', 'SRS']
data

Unnamed: 0,Region,Seed,Team,SRS
0,1,1,Duke,26.90
1,1,16,North Dakota State,-4.01
2,1,8,Virginia Commonwealth,11.96
3,1,9,Central Florida,13.37
4,1,4,Virginia Tech,19.28
...,...,...,...,...
59,4,11,Ohio State,13.89
60,4,7,Wofford,13.92
61,4,10,Seton Hall,10.25
62,4,2,Kentucky,21.43


In [56]:
num_of_sims = 1000  # Number of simulations
teams = data.to_numpy().tolist()  # Convert DataFrame to a nested list
team_dict = {}  # Dictionary to store team data

# Iterate through each team
for i in range(0, len(teams)):
    # Initialize an empty list for each team in the team_dict
    team_dict[teams[i][2]] = [0, 0, 0, 0, 0, 0, 0]
    # The team name is accessed using teams[i][2], assuming the team name is in the third column of the data

In [59]:
"""
Given a list of teams in a specific round and the round number,
find_winners randomly determines the winners of each matchup based on the
SRS difference between the teams and returns a list of the winners
"""
def find_winners(input_list, round):

    winners = []

    # Iterate over the input list of teams
    for i in range(0, len(input_list)):
        # Skips if the index is odd (represents the second team in a matchup)
        if i % 2 != 0:
            pass
        else:
            # Get the first team and the second team in the matchup
            team1 = input_list[i]
            team2 = input_list[i+1]

            # Calculate the SRS difference between the teams
            srs_diff = team1[3] - team2[3]

            # Calculate the win probability of Team 1 using the SRS difference
            team1_prob = scipy.stats.norm(0, 10.36).cdf(srs_diff)

            # Generate a random draw between 0 and 1
            draw = random.random()

            # Determine the winner based on the draw and win probability
            if draw <= team1_prob:
                # Team 1 wins
                winners.append(team1)
                team_dict[team1[2]][round] += 1
            else:
                # Team 2 wins
                winners.append(team2)
                team_dict[team2[2]][round] += 1

    return winners


In [60]:
"""
Prints out the bracket representation of the tournament rounds
based on the provided round_list
"""
def bracket_printout(round_list):

    for i in range(0, len(round_list)):
        temp_list = round_list[i]

        # Iterate over the matchups in the current round
        for j in range(0, len(temp_list)):
            # Print the team seed, name, and SRS for each matchup
            print(str(temp_list[j][1]) + " " + temp_list[j][2] + " " + str(temp_list[j][3]))

            # Print an empty line after every even-indexed matchup
            if j % 2 != 0:
                print("")

        # Print empty lines between each round
        print("")
        print("")
        print("")


In [61]:
"""
Simulates a tournament by finding winners in each round and printing the bracket
"""
def tourney_sim(teams):
    # Increment the count of tournaments for each team
    for x in team_dict:
        team_dict[x][0] += 1

    # Find winners in the Round of 64
    round_32 = find_winners(teams, 1)

    # Find winners in the Round of 32
    sweet_16 = find_winners(round_32, 2)

    # Find winners in the Sweet 16
    elite_8 = find_winners(sweet_16, 3)

    # Find winners in the Elite 8
    final_4 = find_winners(elite_8, 4)

    # Find winners in the Final 4
    championship = find_winners(final_4, 5)

    # Find the winner in the Championship
    winner = find_winners(championship, 6)

    # Print the bracket for all rounds
    # Uncomment below to see bracket format of simulation. Heads up: it is a lot of output!
    # bracket_printout([teams, round_32, sweet_16, elite_8, final_4, championship, winner])


In [62]:
%%time
# Run the monte-carlo tournament simulation
for i in range(1, num_of_sims+1):
    tourney_sim(teams)

# Calculate the average statistics for each team
for x in team_dict:
    # Divide the count of wins in each round by the total number of simulations
    for i in range(0, len(team_dict[x])):
        team_dict[x][i] = team_dict[x][i] / num_of_sims

    # Print the team name and its average statistics
    print(x + " " + str(team_dict[x]))


Duke [1.0, 0.998, 0.902, 0.718, 0.485, 0.261, 0.178]
North Dakota State [1.0, 0.002, 0.0, 0.0, 0.0, 0.0, 0.0]
Virginia Commonwealth [1.0, 0.442, 0.04, 0.011, 0.001, 0.0, 0.0]
Central Florida [1.0, 0.558, 0.058, 0.017, 0.003, 0.0, 0.0]
Virginia Tech [1.0, 0.906, 0.589, 0.176, 0.062, 0.018, 0.003]
Saint Louis [1.0, 0.094, 0.018, 0.0, 0.0, 0.0, 0.0]
Mississippi State [1.0, 0.845, 0.373, 0.078, 0.027, 0.007, 0.001]
Liberty [1.0, 0.155, 0.02, 0.0, 0.0, 0.0, 0.0]
Louisiana State [1.0, 0.86, 0.471, 0.122, 0.032, 0.005, 0.001]
Yale [1.0, 0.14, 0.027, 0.002, 0.001, 0.0, 0.0]
Maryland [1.0, 0.768, 0.435, 0.109, 0.022, 0.005, 0.001]
Belmont [1.0, 0.232, 0.067, 0.008, 0.001, 0.0, 0.0]
Louisville [1.0, 0.682, 0.138, 0.071, 0.017, 0.007, 0.004]
Minnesota [1.0, 0.318, 0.042, 0.021, 0.003, 0.0, 0.0]
Michigan State [1.0, 0.991, 0.82, 0.667, 0.346, 0.164, 0.103]
Bradley [1.0, 0.009, 0.0, 0.0, 0.0, 0.0, 0.0]
Gonzaga [1.0, 0.999, 0.914, 0.798, 0.59, 0.363, 0.247]
Fairleigh Dickinson [1.0, 0.001, 0.0, 0.0,

In [67]:
point_system = [0, 1, 2, 3, 5, 8, 13]  # Predefined point system for each round

expected_values = {}  # Dictionary to store the expected value for each team

# Calculate the expected value for each team
for team, probabilities in team_dict.items():
    expected_value = sum(probabilities[i] * point_system[i] for i in range(len(probabilities)))
    expected_values[team] = expected_value

# Sort teams by expected value in descending order
sorted_teams = sorted(expected_values.items(), key=lambda x: x[1], reverse=True)

# Print the expected value for each team
for team, expected_value in sorted_teams:
    print(f"{team}: {expected_value:.3f}")

Gonzaga: 14.286
Virginia: 11.920
Duke: 11.783
North Carolina: 10.211
Michigan State: 9.013
Texas Tech: 6.480
Kentucky: 6.418
Tennessee: 6.123
Purdue: 5.553
Michigan: 4.856
Houston: 3.641
Auburn: 3.458
Virginia Tech: 3.105
Kansas: 2.570
Florida State: 2.549
Louisiana State: 2.381
Wisconsin: 2.305
Iowa State: 2.144
Maryland: 2.128
Mississippi State: 2.029
Kansas State: 1.982
Marquette: 1.579
Louisville: 1.364
Buffalo: 1.311
Wofford: 1.168
Villanova: 1.161
Oklahoma: 1.103
Nevada: 1.002
Cincinnati: 0.954
Iowa: 0.943
Florida: 0.867
Oregon: 0.831
Ohio State: 0.776
Central Florida: 0.740
Syracuse: 0.705
Washington: 0.684
Utah State: 0.682
Saint Mary's (CA): 0.675
Baylor: 0.658
Virginia Commonwealth: 0.560
Seton Hall: 0.522
Mississippi: 0.488
Minnesota: 0.480
Murray State: 0.476
Arizona State: 0.396
Belmont: 0.395
UC-Irvine: 0.232
New Mexico State: 0.230
Yale: 0.205
Liberty: 0.195
Vermont: 0.189
Saint Louis: 0.130
Northeastern: 0.099
Georgia State: 0.075
Northern Kentucky: 0.059
Old Dominion: 

In [69]:
# Convert dictionary to DataFrame
df_expected_values = pd.DataFrame.from_dict(expected_values, orient='index', columns=['Expected Value'])

# Sort teams by expected value in descending order
df_expected_values = df_expected_values.sort_values(by='Expected Value', ascending=False)

# Reset the index
df_expected_values = df_expected_values.reset_index().rename(columns={'index': 'Team'})

# Display the DataFrame
display(df_expected_values)

Unnamed: 0,Team,Expected Value
0,Gonzaga,14.286
1,Virginia,11.920
2,Duke,11.783
3,North Carolina,10.211
4,Michigan State,9.013
...,...,...
59,Bradley,0.009
60,Gardner-Webb,0.006
61,Iona,0.005
62,North Dakota State,0.002


In [70]:
# Merge the dataframes based on the 'Team' column
df_merged = data.merge(df_expected_values, on='Team')

# Display the merged DataFrame
df_merged.head()

Unnamed: 0,Region,Seed,Team,SRS,Expected Value
0,1,1,Duke,26.9,11.783
1,1,16,North Dakota State,-4.01,0.002
2,1,8,Virginia Commonwealth,11.96,0.56
3,1,9,Central Florida,13.37,0.74
4,1,4,Virginia Tech,19.28,3.105


In [71]:
# Define the mapping of seed values to cost values
seed_cost_mapping = {
    1: 75,
    2: 40,
    3: 25,
    4: 20,
    5: 17,
    6: 15,
    7: 12,
    8: 10,
    9: 9,
    10: 8,
    11: 7,
    12: 6,
    13: 5,
    14: 4,
    15: 3,
    16: 1,
}

# Add a new column 'Cost' to df_merged based on the 'Seed' column
df_merged['Cost'] = df_merged['Seed'].map(seed_cost_mapping)

# Display the selected teams and their information
df_selected

Unnamed: 0,Region,Seed,Team,SRS,Expected Value,Cost
0,4,2,Kentucky,21.43,6.625,40
1,4,5,Auburn,20.84,3.793,17
2,3,3,Purdue,21.4,5.738,25
3,3,16,Gardner-Webb,-2.61,0.007,1
4,2,3,Texas Tech,22.79,5.866,25
5,2,16,Fairleigh Dickinson,-6.09,0.002,1
6,2,1,Gonzaga,27.79,14.06,75
7,1,2,Michigan State,24.93,8.895,40


In [72]:
# Define the knapsack function
def knapsack(items, budget):
    n = len(items)
    dp = [[0] * (budget + 1) for _ in range(n + 1)]

    for i in range(1, n + 1):
        for j in range(1, budget + 1):
            cost = items[i - 1]['Cost']
            value = items[i - 1]['Expected Value']
            if cost > j:
                dp[i][j] = dp[i - 1][j]
            else:
                dp[i][j] = max(dp[i - 1][j], value + dp[i - 1][j - cost])

    selected = []
    j = budget
    for i in range(n, 0, -1):
        if dp[i][j] != dp[i - 1][j]:
            selected.append(items[i - 1])
            j -= items[i - 1]['Cost']

    return selected

# Run the knapsack optimization on df_merged
selected_teams = knapsack(df_merged.to_dict('records'), budget=224)

# Create a new DataFrame for the selected teams
df_selected = pd.DataFrame(selected_teams)

# Display the selected teams and their information
display(df_selected)

# Calculate the total sum of the cost and expected value columns
total_cost = df_selected['Cost'].sum()
total_expected_value = df_selected['Expected Value'].sum()

# Print the total sums
print(f"Total Cost: {total_cost}")
print(f"Total Expected Value: {total_expected_value:.3f}")

Unnamed: 0,Region,Seed,Team,SRS,Expected Value,Cost
0,4,2,Kentucky,21.43,6.418,40
1,4,5,Auburn,20.84,3.458,17
2,4,16,Iona,-4.78,0.005,1
3,3,3,Purdue,21.4,5.553,25
4,3,16,Gardner-Webb,-2.61,0.006,1
5,2,3,Texas Tech,22.79,6.48,25
6,2,1,Gonzaga,27.79,14.286,75
7,1,2,Michigan State,24.93,9.013,40


Total Cost: 224
Total Expected Value: 45.219
