<a href="https://colab.research.google.com/github/AnthonyF98/NCAA-Mens-Lax/blob/main/mens_lax.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from datetime import datetime, timedelta

# Function to get game results for a specific date
def get_game_results(year, month, day, timeout=30):
    """Scrape the NCAA D1 men's lacrosse scoreboard page for one calendar date.

    Args:
        year: Four-digit year used in the scoreboard URL.
        month: Month number; zero-padded to two digits in the URL.
        day: Day of month; zero-padded to two digits in the URL.
        timeout: Seconds to wait on the HTTP request before giving up, so a
            single stalled connection cannot hang a long scraping run.

    Returns:
        A list with one row per successfully parsed game pod, each row being
        ``[date, round, status, team1, rank1, score1, team2, rank2, score2, winner]``
        (all strings; missing pieces are filled with "Unknown"/"N/A"/"Unknown Round").
        Returns ``None`` when the page could not be fetched (network error or
        a non-200 status code).
    """
    date = f"{year}-{month:02d}-{day:02d}"
    url = f"https://www.ncaa.com/scoreboard/lacrosse-men/d1/{year}/{month:02d}/{day:02d}/all-conf"
    print(f"Fetching data from: {url}")

    # A network failure is reported the same way as a bad status code so the
    # caller's loop can simply move on to the next date.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(f"Failed to retrieve data for {date}: {e}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve data for {date}: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    games = []

    # Find game containers
    for game in soup.find_all("div", class_="gamePod"):
        try:
            # A pod without a status element raises here and is skipped below.
            status = game.find("div", class_="gamePod-status").text.strip()  # e.g., "FINAL"
            round_info = game.find("span", class_="game-round")
            round_name = round_info.text.strip() if round_info else "Unknown Round"

            team1, team2 = _first_two_texts(
                game.find_all("span", class_="gamePod-game-team-name"), "Unknown")
            score1, score2 = _first_two_texts(
                game.find_all("span", class_="gamePod-game-team-score"), "N/A")
            rank1, rank2 = _first_two_texts(
                game.find_all("span", class_="gamePod-game-team-rank"), "N/A")

            # The winning side's <li> carries the "winner" class; none is present
            # when the page does not mark a winner.
            winner_class = game.find_all("li", class_="winner")
            winner = winner_class[0].find("span", class_="gamePod-game-team-name").text.strip() if winner_class else "N/A"

            games.append([date, round_name, status, team1, rank1, score1, team2, rank2, score2, winner])

        except Exception as e:
            # Best effort: one malformed pod must not discard the rest of the page.
            print(f"Error parsing game data for {date}: {e}")

    return games


def _first_two_texts(elements, default):
    """Return the stripped text of the first two elements, padding with `default`."""
    texts = [element.text.strip() for element in elements[:2]]
    return texts + [default] * (2 - len(texts))

# Set the range of years (inclusive on both ends)
START_YEAR = 2014
END_YEAR = 2025

# Loop through each year (only January 1 through May 31)
for year in range(START_YEAR, END_YEAR + 1):
    all_games = []
    start_date = datetime(year, 1, 1)  # January 1st
    # The championship game is traditionally played on Memorial Day, which can
    # fall as late as May 31 (e.g. 2021), so the window must include that day.
    end_date = datetime(year, 5, 31)  # May 31st
    current_date = start_date

    while current_date <= end_date:
        month, day = current_date.month, current_date.day
        games_data = get_game_results(year, month, day)

        # None (fetch failed) and [] (no games that day) are both skipped
        if games_data:
            all_games.extend(games_data)

        # Respectful scraping: delay between requests
        time.sleep(1.5)

        # Move to the next day
        current_date += timedelta(days=1)

    # Save data for the year (if any games were found)
    if all_games:
        df = pd.DataFrame(all_games, columns=["Date", "Round", "Status", "Team 1", "Rank 1", "Score 1", "Team 2", "Rank 2", "Score 2", "Winner"])
        filename = f"ncaa_mens_lacrosse_results_{year}.csv"
        df.to_csv(filename, index=False)
        print(f"Data for {year} (Jan-May) saved to {filename}")
    else:
        print(f"No games found for {year} (Jan-May)")

# Report the range that was actually scraped rather than a fixed season count
print(f"Scraping completed for the {START_YEAR}-{END_YEAR} seasons (Jan-May).")

Fetching data from: https://www.ncaa.com/scoreboard/lacrosse-men/d1/2025/01/01/all-conf
Failed to retrieve data for 2025-01-01: 404
Fetching data from: https://www.ncaa.com/scoreboard/lacrosse-men/d1/2025/01/02/all-conf
Failed to retrieve data for 2025-01-02: 404
Fetching data from: https://www.ncaa.com/scoreboard/lacrosse-men/d1/2025/01/03/all-conf
Failed to retrieve data for 2025-01-03: 404
Fetching data from: https://www.ncaa.com/scoreboard/lacrosse-men/d1/2025/01/04/all-conf
Failed to retrieve data for 2025-01-04: 404
Fetching data from: https://www.ncaa.com/scoreboard/lacrosse-men/d1/2025/01/05/all-conf
Failed to retrieve data for 2025-01-05: 404
Fetching data from: https://www.ncaa.com/scoreboard/lacrosse-men/d1/2025/01/06/all-conf
Failed to retrieve data for 2025-01-06: 404
Fetching data from: https://www.ncaa.com/scoreboard/lacrosse-men/d1/2025/01/07/all-conf
Failed to retrieve data for 2025-01-07: 404
Fetching data from: https://www.ncaa.com/scoreboard/lacrosse-men/d1/2025/01/

In [None]:
from google.colab import files

# Open Colab's upload prompt; the result is indexed by uploaded file name
uploaded = files.upload()

# Echo each uploaded file's name together with the length of its content
for filename, payload in uploaded.items():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=filename, length=len(payload)))




Saving ncaa_mens_lacrosse_results_2014.csv to ncaa_mens_lacrosse_results_2014.csv
Saving ncaa_mens_lacrosse_results_2015.csv to ncaa_mens_lacrosse_results_2015.csv
Saving ncaa_mens_lacrosse_results_2016.csv to ncaa_mens_lacrosse_results_2016.csv
Saving ncaa_mens_lacrosse_results_2017.csv to ncaa_mens_lacrosse_results_2017.csv
Saving ncaa_mens_lacrosse_results_2018.csv to ncaa_mens_lacrosse_results_2018.csv
Saving ncaa_mens_lacrosse_results_2019.csv to ncaa_mens_lacrosse_results_2019.csv
Saving ncaa_mens_lacrosse_results_2020.csv to ncaa_mens_lacrosse_results_2020.csv
Saving ncaa_mens_lacrosse_results_2021.csv to ncaa_mens_lacrosse_results_2021.csv
Saving ncaa_mens_lacrosse_results_2022.csv to ncaa_mens_lacrosse_results_2022.csv
Saving ncaa_mens_lacrosse_results_2023.csv to ncaa_mens_lacrosse_results_2023.csv
Saving ncaa_mens_lacrosse_results_2024.csv to ncaa_mens_lacrosse_results_2024.csv
Saving ncaa_mens_lacrosse_results_2025.csv to ncaa_mens_lacrosse_results_2025.csv
User uploaded fi

In [None]:
import pandas as pd
import io

# Parse every uploaded CSV directly from its in-memory bytes
all_data = [pd.read_csv(io.BytesIO(content)) for content in uploaded.values()]

# Stack the per-file frames into a single frame with a fresh 0..n-1 index
combined_data = pd.concat(all_data, ignore_index=True)

# Turn the 'Date' column into real datetimes so it orders chronologically
combined_data['Date'] = pd.to_datetime(combined_data['Date'])

# Order the rows by date (row labels are kept, not renumbered)
combined_data = combined_data.sort_values('Date')

# Quick sanity check of the first rows
print(combined_data.head())


         Date          Round Status      Team 1  Rank 1  Score 1     Team 2  \
0  2014-02-08  Unknown Round  Final      Denver     NaN     14.0  Air Force   
10 2014-02-08  Unknown Round    NaN  High Point     NaN      0.0     Towson   
8  2014-02-08  Unknown Round    NaN   Boston U.     NaN      0.0     Mercer   
7  2014-02-08  Unknown Round  Final         VMI     NaN      5.0       Navy   
6  2014-02-08  Unknown Round  Final    Delaware     NaN      4.0   Bucknell   

    Rank 2  Score 2    Winner  
0      NaN      8.0    Denver  
10     NaN      0.0       NaN  
8      NaN      0.0       NaN  
7      NaN     18.0      Navy  
6      NaN      6.0  Bucknell  


In [None]:
import math

# Lookup table: team name as it may appear in the data -> canonical name.
# Names that are absent from the table are left untouched by the lookups below.
team_aliases = {
    "Penn St.": "Penn State",
    "Johns Hopkins University": "Johns Hopkins",
    "UVA": "Virginia",
    "UNC": "North Carolina",
    "UMass": "Massachusetts",
    "UMBC": "UMBC",
    "LIU Brooklyn": "LIU",
    "St. Joe's": "Saint Joseph's",
    "St. John's (NY)": "St. John's",
    "Saint Joseph’s": "Saint Joseph's",
    "Mt. St. Mary's": "Mount St. Mary's",
    "Mount St. Mary's": "Mount St. Mary's",
    "NJ Inst. of Tech.": "NJIT",
    "NJIT": "NJIT",
    "Hobart and William Smith": "Hobart",
    "Boston U.": "Boston University",
    "Ohio St.": "Ohio State",
    "Detroit": "Detroit Mercy",
    "Albany (NY)": "Albany",
    "Mass.-Lowell": "UMass Lowell",
    "Loyola (Md.)": "Loyola Maryland",
    "Army West Point": "Army",
    "Cleveland St.": "Cleveland State",
    "Westmin. (Pa.)": "Westminster (PA)",
    "Greensboro": "Greensboro College",
    "SUNY Delhi": "SUNY Delhi",
    "St. Andrews": "St. Andrews University",
    "Randolph": "Randolph College",
    "Gordon": "Gordon College",
    "Regis (MA)": "Regis College (MA)",
    "Saint Joseph (CT)": "University of Saint Joseph (CT)",
    "Salem St.": "Salem State",
    "Pace": "Pace University",
    "Mitchell": "Mitchell College",
    "Chowan": "Chowan University",
    "Lindenwood University": "Lindenwood",
    "Queens University of Charlotte": "Queens (NC)",
    "Le Moyne College": "Le Moyne",
    "Merrimack College": "Merrimack",
    "Sacred Heart University": "Sacred Heart",
    "University of Richmond": "Richmond",
    "Virginia Military Institute": "VMI"
}

# Rewrite both team columns so each school appears under its canonical name
for team_column in ("Team 1", "Team 2"):
    combined_data[team_column] = combined_data[team_column].replace(team_aliases)


# Map a single team name onto its canonical spelling
def standardize_team_name(team):
    """Return the canonical name for `team`, or `team` itself when no alias is registered."""
    if team in team_aliases:
        return team_aliases[team]
    return team

# List of current NCAA Division I Men's Lacrosse teams
# Stored as a set of canonical (post-alias) names; it is passed to EloRatingSystem,
# which skips any game where either side is missing from this set.
# NOTE(review): team_aliases maps "Lindenwood University" to "Lindenwood", but
# "Lindenwood" is not listed here, so its games are never rated — confirm intent.
d1_teams = {
    "Air Force", "Albany", "Army", "Bellarmine", "Binghamton", "Boston University", "Brown", "Bryant",
    "Bucknell", "Canisius", "Cleveland State", "Colgate", "Cornell", "Dartmouth", "Delaware", "Denver",
    "Detroit Mercy", "Drexel", "Duke", "Fairfield", "Georgetown", "Hampton", "Harvard", "High Point",
    "Hobart", "Hofstra", "Holy Cross", "Jacksonville", "Johns Hopkins", "Lafayette", "Lehigh", "Le Moyne", "LIU",
    "Loyola Maryland", "Manhattan", "Marist", "Marquette", "Maryland", "Massachusetts", "Mercer",
    "Merrimack", "Michigan", "Monmouth", "Mount St. Mary's", "Navy", "NJIT", "North Carolina",
    "Notre Dame", "Ohio State", "Penn", "Penn State", "Princeton", "Providence", "Queens (NC)", "Quinnipiac",
    "Richmond", "Robert Morris", "Rutgers", "Sacred Heart", "St. Bonaventure", "St. John's",
    "Saint Joseph's", "Siena", "Stony Brook", "Syracuse", "Towson", "UMass Lowell", "UMBC", "Utah",
    "Vermont", "Villanova", "Virginia", "VMI", "Wagner", "Yale"
}

class EloRatingSystem:
    """Elo ratings with a margin-of-victory multiplier, limited to a whitelist of teams.

    Attributes:
        k: Base adjustment factor applied to every rating update.
        ratings: Dict mapping team name -> current rating; a team is added the
            first time it appears in a rated game.
        initial_rating: Rating assigned to a team on first appearance, and the
            mean that `normalize_end_of_season` regresses toward.
        d1_teams: Collection of team names eligible for rating; games involving
            any other team are ignored.
    """

    def __init__(self, k=32, initial_rating=1500, d1_teams=None):
        self.k = k  # Standard Elo adjustment factor
        self.ratings = {}  # Dictionary to store team ratings
        self.initial_rating = initial_rating  # Default initial Elo rating
        self.d1_teams = d1_teams if d1_teams else set()  # Set of Division I teams

    def expected_score(self, rating1, rating2):
        """Return the expected score (0..1) of the side rated `rating1` against `rating2`.

        Uses the logistic Elo curve with a 400-point scale, so equal ratings
        give 0.5.
        """
        return 1 / (1 + 10 ** ((rating2 - rating1) / 400))

    def update_ratings(self, team1, team2, score1, score2):
        """Apply one game result to both teams' ratings.

        Team names are first passed through `standardize_team_name`. The game
        is ignored entirely unless both standardized names are in
        `self.d1_teams`. Unrated teams start at `self.initial_rating`.

        Args:
            team1, team2: Team names (aliases are accepted).
            score1, score2: Goals scored by team1 and team2; must support
                comparison and subtraction.
        """
        # Standardize team names
        team1 = standardize_team_name(team1)
        team2 = standardize_team_name(team2)

        # Skip Elo updates if either team is not in the Division I set
        if team1 not in self.d1_teams or team2 not in self.d1_teams:
            return

        # Initialize ratings for Division I teams only
        rating1 = self.ratings.setdefault(team1, self.initial_rating)
        rating2 = self.ratings.setdefault(team2, self.initial_rating)

        exp_score1 = self.expected_score(rating1, rating2)
        exp_score2 = self.expected_score(rating2, rating1)

        # 1 for a win, 0.5 for a tie, 0 for a loss (from team1's point of view)
        actual_score1 = 1 if score1 > score2 else 0.5 if score1 == score2 else 0
        actual_score2 = 1 - actual_score1

        # Margin of Victory Multiplier: grows with the log of the goal margin and
        # shrinks as the pre-game rating gap widens. A tied score gives
        # log(1) == 0, so ties leave both ratings unchanged.
        # NOTE(review): the constants resemble FiveThirtyEight's published
        # multiplier ln(|MOV|+1) * 2.2 / ((Elo_winner - Elo_loser) * 0.001 + 2.2),
        # but this expression differs from it — confirm the deviation is intended.
        mov = abs(score1 - score2)
        mov_multiplier = math.log(mov + 1) * (2.2 / (1 + 0.001 * abs(rating1 - rating2)))

        self.ratings[team1] += self.k * mov_multiplier * (actual_score1 - exp_score1)
        self.ratings[team2] += self.k * mov_multiplier * (actual_score2 - exp_score2)

    def normalize_end_of_season(self, regression=0.25):
        """Applies regression to the mean to avoid long-term inflation/deflation.

        Every rated team is moved toward `self.initial_rating` by the fraction
        `regression` of its current distance from it; teams no longer in
        `self.d1_teams` are dropped from `self.ratings`.

        Args:
            regression: Fraction of the gap to the initial rating to remove
                (0 keeps ratings as they are, 1 resets them). Defaults to 0.25.
        """
        # Iterate over a snapshot of the keys because entries may be deleted
        for team in list(self.ratings):
            if team not in self.d1_teams:
                del self.ratings[team]  # Remove non-D1 teams explicitly
            else:
                # One formula covers teams above and below the mean: the signed
                # gap pulls high ratings down and low ratings up.
                self.ratings[team] -= (self.ratings[team] - self.initial_rating) * regression

    def get_ratings(self):
        """Returns Elo ratings, including only Division I teams."""
        return {team: rating for team, rating in self.ratings.items() if team in self.d1_teams}



# Instantiate the EloRatingSystem with the list of Division I teams
# (k and initial_rating are left at the constructor's defaults)
elo_system = EloRatingSystem(d1_teams=d1_teams)





In [None]:
# Distinct names found in the "Team 1" column, in order of first appearance
unique_team_names2 = pd.unique(combined_data["Team 1"])
print(unique_team_names2)

# Distinct names found in the "Team 2" column, in order of first appearance
unique_team_names = pd.unique(combined_data["Team 2"])
print(unique_team_names)

['Denver' 'High Point' 'Boston University' 'VMI' 'Delaware' 'Marquette'
 "Mount St. Mary's" 'Manhattan' 'Massachusetts' 'Jacksonville' 'Bryant'
 'Ohio State' 'North Carolina' 'Marist' 'UMBC' 'Canisius' 'Furman'
 'Towson' 'Robert Morris' 'Vermont' 'Fairfield' 'Georgetown'
 'Loyola Maryland' 'Siena' 'Rutgers' 'Lafayette' 'Penn' 'Bellarmine'
 "St. John's" 'Providence' 'Navy' 'Lehigh' 'Monmouth' 'Army' 'Harvard'
 'Dartmouth' 'Colgate' 'Maryland' 'Penn State' 'Hofstra' 'Hobart' 'Drexel'
 'Brown' 'Cornell' "Saint Joseph's" 'Virginia' 'Princeton' 'Holy Cross'
 'Binghamton' 'Yale' 'Villanova' 'Hartford' 'Bucknell' 'Johns Hopkins'
 'Syracuse' 'Duke' 'Sacred Heart' 'Quinnipiac' 'Detroit Mercy'
 'Notre Dame' 'Albany' 'Mercer' 'Air Force' 'Stony Brook' 'Wagner'
 'UMass Lowell' 'NJIT' 'Michigan' 'Richmond' 'Roberts Wesleyan' 'Hampton'
 'Cleveland State' 'Westminster (PA)' 'Bethany (WV)' 'St. Bonaventure'
 'Utah' 'Florida Tech' 'LIU' 'Merrimack' 'St. Thomas Aquinas'
 'Wilmington (DE)' 'Pace Universi

In [None]:
# Replay every scored game, in DataFrame row order, through the Elo system
if 'combined_data' in locals():  # Check if DataFrame exists
    # Start from empty ratings so re-running this cell does not apply every
    # game a second time on top of the previous run's results.
    elo_system.ratings.clear()

    # Only rows with both scores present can be rated
    scored_games = combined_data.dropna(subset=['Score 1', 'Score 2'])

    # NOTE(review): EloRatingSystem.normalize_end_of_season is never called in
    # the visible notebook, so ratings carry over between seasons without any
    # regression to the mean — confirm whether that is intended.
    for name1, name2, goals1, goals2 in zip(
            scored_games['Team 1'], scored_games['Team 2'],
            scored_games['Score 1'], scored_games['Score 2']):
        elo_system.update_ratings(name1, name2, int(goals1), int(goals2))
else:
    print("Error: 'combined_data' is not defined. Ensure you have loaded your dataset.")

# **Extract and Display Final Ratings**
final_ratings = elo_system.get_ratings()
final_ratings_df = pd.DataFrame(final_ratings.items(), columns=["Team", "Elo Rating"])
print(final_ratings_df)





                 Team   Elo Rating
0              Denver  1811.847246
1           Air Force  1495.636277
2          High Point  1507.369594
3              Towson  1709.828383
4   Boston University  1782.264535
..                ...          ...
70    St. Bonaventure   980.556398
71          Merrimack  1065.439526
72                LIU  1285.015719
73        Queens (NC)   774.605688
74           Le Moyne  1160.803405

[75 rows x 2 columns]


In [None]:
# Distinct team names that ended up with an Elo rating
unique_team_names2 = pd.unique(final_ratings_df["Team"])
print(unique_team_names2)

['Denver' 'Air Force' 'High Point' 'Towson' 'Boston University' 'Mercer'
 'VMI' 'Navy' 'Delaware' 'Bucknell' 'Marquette' 'Lehigh'
 "Mount St. Mary's" 'Maryland' 'Manhattan' 'Rutgers' 'Massachusetts'
 'Army' 'Jacksonville' 'Duke' 'Bryant' 'Colgate' 'Ohio State'
 'Johns Hopkins' 'Marist' 'Stony Brook' 'UMBC' 'Canisius' 'Hofstra'
 'Robert Morris' 'Bellarmine' 'Vermont' 'Fairfield' 'Georgetown'
 'Loyola Maryland' 'Penn State' 'Siena' 'Hobart' 'Wagner' 'Lafayette'
 'Penn' "Saint Joseph's" "St. John's" 'Yale' 'Providence' 'Monmouth'
 'Sacred Heart' 'Harvard' 'Holy Cross' 'Dartmouth' 'North Carolina'
 'Virginia' 'Syracuse' 'Notre Dame' 'Princeton' 'Cornell' 'Drexel'
 'Albany' 'Binghamton' 'Brown' 'Quinnipiac' 'Villanova' 'Detroit Mercy'
 'Richmond' 'Michigan' 'UMass Lowell' 'NJIT' 'Hampton' 'Cleveland State'
 'Utah' 'St. Bonaventure' 'Merrimack' 'LIU' 'Queens (NC)' 'Le Moyne']


In [None]:
def display_elo_ratings(elo_system):
    """Print every team in `elo_system.ratings`, highest rating first, to two decimals."""
    # Header first, then one "team: rating" line per entry
    lines = ["Team Names and Their Elo Ratings:"]

    # Rank entries from the highest rating down to the lowest
    ranked = sorted(elo_system.ratings.items(), key=lambda entry: entry[1], reverse=True)
    lines.extend(f"{name}: {value:.2f}" for name, value in ranked)

    print("\n".join(lines))


# Print the current ratings held by elo_system, best team first
display_elo_ratings(elo_system)


Team Names and Their Elo Ratings:
Notre Dame: 2435.43
Maryland: 2308.41
Duke: 2142.67
Cornell: 2115.51
Johns Hopkins: 2108.21
Syracuse: 2088.64
Penn State: 2080.33
Georgetown: 2078.86
Army: 2045.61
North Carolina: 2035.64
Princeton: 2035.51
Saint Joseph's: 1954.13
Virginia: 1940.89
Ohio State: 1890.32
Richmond: 1867.40
Penn: 1860.13
Harvard: 1827.23
Michigan: 1826.09
Denver: 1811.85
Yale: 1800.45
Boston University: 1782.26
Colgate: 1754.18
Lehigh: 1714.18
Towson: 1709.83
Massachusetts: 1685.72
Delaware: 1681.01
Navy: 1676.43
Jacksonville: 1675.14
Rutgers: 1663.82
Villanova: 1638.77
Fairfield: 1623.84
Utah: 1612.91
Dartmouth: 1573.79
Albany: 1508.53
High Point: 1507.37
Loyola Maryland: 1500.04
Air Force: 1495.64
Providence: 1491.72
Bryant: 1487.52
Sacred Heart: 1485.07
UMBC: 1476.08
Lafayette: 1453.57
Marquette: 1452.90
Brown: 1439.34
Vermont: 1383.04
Hofstra: 1376.23
Drexel: 1371.00
Bucknell: 1358.48
Stony Brook: 1346.80
Hobart: 1330.20
Siena: 1293.37
Robert Morris: 1288.56
LIU: 1285.0

In [None]:
# Define function to calculate win probabilities and spread
def calculate_win_probability(elo_system, team1, team2, elo_per_goal=175):
    """Estimate a head-to-head result from two teams' current Elo ratings.

    Args:
        elo_system: Object exposing a `ratings` mapping of team name -> rating.
        team1, team2: Team names exactly as they are keyed in `ratings`.
        elo_per_goal: Rating points treated as worth one goal when converting
            the rating gap into a spread. Defaults to 175.

    Returns:
        A tuple `(p_team1, p_team2, score_diff)` where the probabilities sum
        to 1 and `score_diff` is the spread in goals, positive when team2 is
        the higher-rated side. Returns `None` if either team has no rating.
    """
    ratings = elo_system.ratings
    if team1 not in ratings or team2 not in ratings:
        return None  # If one or both teams not in ratings

    rating_gap = ratings[team2] - ratings[team1]
    score_diff = rating_gap / elo_per_goal

    # Standard Elo logistic curve on a 400-point scale
    expected_score_team1 = 1 / (1 + 10 ** (rating_gap / 400))
    expected_score_team2 = 1 - expected_score_team1
    return expected_score_team1, expected_score_team2, score_diff



In [None]:
# Ratings are keyed by canonical team names, so tidy up what the user typed:
# trim stray whitespace and translate known aliases (e.g. "Penn St.") before the lookup.
team1 = standardize_team_name(input("Enter the name of Team 1: ").strip())
team2 = standardize_team_name(input("Enter the name of Team 2: ").strip())

probabilities = calculate_win_probability(elo_system, team1, team2)
if probabilities is not None:
    win_prob1, win_prob2, spread = probabilities
    print(f"Probability of {team1} winning: {win_prob1 * 100:.2f}%")
    print(f"Probability of {team2} winning: {win_prob2 * 100:.2f}%")
    # A negative spread means team1 is the favored side
    print(f"Point spread (in goals favoring {team2}): {spread:.2f}")
else:
    print("One or both of the teams are not found in the current Elo ratings.")


Enter the name of Team 1: Dartmouth
Enter the name of Team 2: Penn
Probability of Dartmouth winning: 16.13%
Probability of Penn winning: 83.87%
Point spread (in goals favoring Penn): 1.15
