In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from datetime import datetime, timedelta

# Function to get game results for a specific date
def get_game_results(year, month, day):
    url = f"https://www.ncaa.com/scoreboard/lacrosse-men/d1/{year}/{month:02d}/{day:02d}/all-conf"
    print(f"Fetching data from: {url}")

    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve data for {year}-{month:02d}-{day:02d}: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    games = []

    # Find game containers
    for game in soup.find_all("div", class_="gamePod"):
        try:
            date = f"{year}-{month:02d}-{day:02d}"
            status = game.find("div", class_="gamePod-status").text.strip()  # e.g., "FINAL"
            round_info = game.find("span", class_="game-round")
            round_name = round_info.text.strip() if round_info else "Unknown Round"

            teams = game.find_all("span", class_="gamePod-game-team-name")
            scores = game.find_all("span", class_="gamePod-game-team-score")
            ranks = game.find_all("span", class_="gamePod-game-team-rank")

            team1 = teams[0].text.strip() if len(teams) > 0 else "Unknown"
            team2 = teams[1].text.strip() if len(teams) > 1 else "Unknown"
            score1 = scores[0].text.strip() if len(scores) > 0 else "N/A"
            score2 = scores[1].text.strip() if len(scores) > 1 else "N/A"
            rank1 = ranks[0].text.strip() if len(ranks) > 0 else "N/A"
            rank2 = ranks[1].text.strip() if len(ranks) > 1 else "N/A"

            winner_class = game.find_all("li", class_="winner")
            winner = winner_class[0].find("span", class_="gamePod-game-team-name").text.strip() if winner_class else "N/A"

            games.append([date, round_name, status, team1, rank1, score1, team2, rank2, score2, winner])

        except Exception as e:
            print(f"Error parsing game data for {date}: {e}")

    return games

# Set the range of years
START_YEAR = 2014
END_YEAR = 2024

# Loop through each year (only January thru May)
for year in range(START_YEAR, END_YEAR + 1):
    all_games = []
    start_date = datetime(year, 1, 1)  # January 1st
    end_date = datetime(year, 5, 30)  # May 30st
    current_date = start_date

    while current_date <= end_date:
        month, day = current_date.month, current_date.day
        games_data = get_game_results(year, month, day)

        if games_data:
            all_games.extend(games_data)

        # Respectful scraping: delay between requests
        time.sleep(1.5)

        # Move to the next day
        current_date += timedelta(days=1)

    # Save data for the year (if any games were found)
    if all_games:
        df = pd.DataFrame(all_games, columns=["Date", "Round", "Status", "Team 1", "Rank 1", "Score 1", "Team 2", "Rank 2", "Score 2", "Winner"])
        filename = f"ncaa_mens_lacrosse_results_{year}.csv"
        df.to_csv(filename, index=False)
        print(f"Data for {year} (Jan-May) saved to {filename}")
    else:
        print(f"No games found for {year} (Jan-May)")

print("Scraping completed for the last 10 seasons (Jan-May).")

Fetching data from: https://www.ncaa.com/scoreboard/lacrosse-men/d1/2014/01/01/all-conf
Failed to retrieve data for 2014-01-01: 404
Fetching data from: https://www.ncaa.com/scoreboard/lacrosse-men/d1/2014/01/02/all-conf
Failed to retrieve data for 2014-01-02: 404
Fetching data from: https://www.ncaa.com/scoreboard/lacrosse-men/d1/2014/01/03/all-conf
Failed to retrieve data for 2014-01-03: 404
Fetching data from: https://www.ncaa.com/scoreboard/lacrosse-men/d1/2014/01/04/all-conf
Failed to retrieve data for 2014-01-04: 404
Fetching data from: https://www.ncaa.com/scoreboard/lacrosse-men/d1/2014/01/05/all-conf
Failed to retrieve data for 2014-01-05: 404
Fetching data from: https://www.ncaa.com/scoreboard/lacrosse-men/d1/2014/01/06/all-conf
Failed to retrieve data for 2014-01-06: 404
Fetching data from: https://www.ncaa.com/scoreboard/lacrosse-men/d1/2014/01/07/all-conf
Failed to retrieve data for 2014-01-07: 404
Fetching data from: https://www.ncaa.com/scoreboard/lacrosse-men/d1/2014/01/

In [2]:
import math

class EloRatingSystem:
    def __init__(self, k=32, initial_rating=1500):
        self.k = k  # Standard Elo adjustment factor
        self.ratings = {}  # Dictionary to store team ratings
        self.initial_rating = initial_rating  # Default initial Elo rating


    def expected_score(self, rating1, rating2):
        return 1 / (1 + 10 ** ((rating2 - rating1) / 400))

    def update_ratings(self, team1, team2, score1, score2):
        if team1 not in self.ratings:
            self.ratings[team1] = self.initial_rating
        if team2 not in self.ratings:
            self.ratings[team2] = self.initial_rating

        rating1, rating2 = self.ratings[team1], self.ratings[team2]
        exp_score1 = self.expected_score(rating1, rating2)
        exp_score2 = self.expected_score(rating2, rating1)

        actual_score1 = 1 if score1 > score2 else 0.5 if score1 == score2 else 0
        actual_score2 = 1 - actual_score1

        # Margin of Victory Multiplier
        mov = abs(score1 - score2)
        mov_multiplier = math.log(mov + 1) * (2.2 / (1 + 0.001 * abs(rating1 - rating2)))

        # Update ratings
        self.ratings[team1] += self.k * mov_multiplier * (actual_score1 - exp_score1)
        self.ratings[team2] += self.k * mov_multiplier * (actual_score2 - exp_score2)

    def normalize_end_of_season(self):
        """Applies regression to the mean to avoid long-term inflation/deflation."""
        for team in self.ratings:
            if self.ratings[team] > self.initial_rating:
                self.ratings[team] -= (self.ratings[team] - self.initial_rating) * 0.25
            else:
                self.ratings[team] += (self.initial_rating - self.ratings[team]) * 0.25

    def get_ratings(self):
        return self.ratings







In [3]:
from google.colab import files

# This will prompt you to upload files
uploaded = files.upload()

# Displaying the names of the uploaded files
for filename in uploaded.keys():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=filename, length=len(uploaded[filename])))




Saving ncaa_mens_lacrosse_results_2014.csv to ncaa_mens_lacrosse_results_2014.csv
Saving ncaa_mens_lacrosse_results_2015.csv to ncaa_mens_lacrosse_results_2015.csv
Saving ncaa_mens_lacrosse_results_2016.csv to ncaa_mens_lacrosse_results_2016.csv
Saving ncaa_mens_lacrosse_results_2017.csv to ncaa_mens_lacrosse_results_2017.csv
Saving ncaa_mens_lacrosse_results_2018.csv to ncaa_mens_lacrosse_results_2018.csv
Saving ncaa_mens_lacrosse_results_2019.csv to ncaa_mens_lacrosse_results_2019.csv
Saving ncaa_mens_lacrosse_results_2020.csv to ncaa_mens_lacrosse_results_2020.csv
Saving ncaa_mens_lacrosse_results_2021.csv to ncaa_mens_lacrosse_results_2021.csv
Saving ncaa_mens_lacrosse_results_2022.csv to ncaa_mens_lacrosse_results_2022.csv
Saving ncaa_mens_lacrosse_results_2023.csv to ncaa_mens_lacrosse_results_2023.csv
Saving ncaa_mens_lacrosse_results_2024.csv to ncaa_mens_lacrosse_results_2024.csv
User uploaded file "ncaa_mens_lacrosse_results_2014.csv" with length 23256 bytes
User uploaded fil

In [4]:
elo_system = EloRatingSystem()


In [35]:
import pandas as pd
import io

# Assuming 'uploaded' is the dictionary returned by files.upload() containing the content of the files
all_data = []
for filename, content in uploaded.items():
    df = pd.read_csv(io.BytesIO(content))  # Read each file from memory
    all_data.append(df)

# Concatenate all the DataFrames into one DataFrame
combined_data = pd.concat(all_data, ignore_index=True)
combined_data['Date'] = pd.to_datetime(combined_data['Date'])  # Convert 'Date' to datetime
combined_data.sort_values('Date', inplace=True)  # Sort the data by date

# Show the first few rows to confirm
print(combined_data.head())


         Date          Round Status      Team 1  Rank 1  Score 1     Team 2  \
0  2014-02-08  Unknown Round  Final      Denver     NaN     14.0  Air Force   
10 2014-02-08  Unknown Round    NaN  High Point     NaN      0.0     Towson   
8  2014-02-08  Unknown Round    NaN   Boston U.     NaN      0.0     Mercer   
7  2014-02-08  Unknown Round  Final         VMI     NaN      5.0       Navy   
6  2014-02-08  Unknown Round  Final    Delaware     NaN      4.0   Bucknell   

    Rank 2  Score 2    Winner  
0      NaN      8.0    Denver  
10     NaN      0.0       NaN  
8      NaN      0.0       NaN  
7      NaN     18.0      Navy  
6      NaN      6.0  Bucknell  


In [43]:
import pandas as pd
import difflib

# Standardized names mapping
name_corrections = {
    "Penn St.": "Penn State",
    "Ohio St.": "Ohio State",
    "Albany (NY)": "Albany",
    "UAlbany": "Albany",
    "Army West Point": "Army",
    "Loyola (Md.)": "Loyola Maryland",
    "Westminster (PA)": "Westminster (Pa.)",
    "Westmin. (Pa.)": "Westminster (Pa.)",
    "St. John's (NY)": "St. John's",
    "Detroit Mercy": "Detroit",
    "Mount St. Mary's": "Mt. St. Mary's",
    "Cleveland St.": "Cleveland State",
    "Mass.-Lowell": "UMass Lowell",
}


# Apply corrections directly to the existing columns
combined_data["Team 1"] = combined_data["Team 1"].replace(name_corrections)
combined_data["Team 2"] = combined_data["Team 2"].replace(name_corrections)

# Display the corrected dataset
print(combined_data)

           Date          Round       Status      Team 1  Rank 1  Score 1  \
0    2014-02-08  Unknown Round        Final      Denver     NaN     14.0   
10   2014-02-08  Unknown Round          NaN  High Point     NaN      0.0   
8    2014-02-08  Unknown Round          NaN   Boston U.     NaN      0.0   
7    2014-02-08  Unknown Round        Final         VMI     NaN      5.0   
6    2014-02-08  Unknown Round        Final    Delaware     NaN      4.0   
...         ...            ...          ...         ...     ...      ...   
5344 2024-05-19  Quarterfinals        FINAL      Denver     5.0     10.0   
5345 2024-05-19  Quarterfinals  FINAL (2OT)    Virginia     6.0     11.0   
5347 2024-05-25     Semifinals        FINAL    Maryland     7.0     12.0   
5346 2024-05-25     Semifinals        FINAL      Denver     5.0      6.0   
5348 2024-05-27   Championship        FINAL    Maryland     7.0      5.0   

             Team 2  Rank 2  Score 2      Winner  
0         Air Force     NaN      8.0

In [44]:
# Update ratings based on the game results
for index, row in combined_data.iterrows():
    if pd.notna(row['Score 1']) and pd.notna(row['Score 2']):
        score1, score2 = int(row['Score 1']), int(row['Score 2'])
        elo_system.update_ratings(row['Team 1'], row['Team 2'], score1, score2)

# Extract and display the final ratings
final_ratings = pd.DataFrame(list(elo_system.get_ratings().items()), columns=['Team', 'Rating']).sort_values(by='Rating', ascending=False)
print(final_ratings)



            Team       Rating
55    Notre Dame  2634.739342
13      Maryland  2344.108036
35    Georgetown  2259.497443
19          Duke  2252.379007
53      Virginia  2241.005916
..           ...          ...
84  UMass Lowell   706.004795
40        Wagner   670.090460
97   Queens (NC)   636.689513
96    Lindenwood   539.843266
70       Hampton   361.673070

[110 rows x 2 columns]


In [45]:
def display_elo_ratings(elo_system):
    # Ensure the ratings are sorted by Elo rating value. This sorts from highest to lowest by default.
    sorted_ratings = sorted(elo_system.ratings.items(), key=lambda x: x[1], reverse=True)

    # Print the sorted ratings
    print("Team Names and Their Elo Ratings:")
    for team, rating in sorted_ratings:
        print(f"{team}: {rating:.2f}")


display_elo_ratings(elo_system)


Team Names and Their Elo Ratings:
Notre Dame: 2634.74
Maryland: 2344.11
Georgetown: 2259.50
Duke: 2252.38
Virginia: 2241.01
Denver: 2214.55
Johns Hopkins: 2202.87
Princeton: 2175.17
Penn State: 2159.51
Penn St.: 2139.28
Syracuse: 2130.81
Michigan: 2117.63
Cornell: 2090.38
Army: 2067.04
Loyola (Md.): 2058.57
Yale: 2052.61
Army West Point: 2048.56
Penn: 2038.26
North Carolina: 2011.18
Saint Joseph's: 2010.02
Boston U.: 1932.36
Towson: 1929.61
Richmond: 1916.02
Lehigh: 1881.72
Rutgers: 1878.15
Villanova: 1869.53
Harvard: 1846.67
Ohio State: 1806.70
Utah: 1793.43
Loyola Maryland: 1792.09
Ohio St.: 1786.57
Delaware: 1769.00
Albany (NY): 1760.08
Jacksonville: 1736.16
Albany: 1709.65
High Point: 1704.88
UAlbany: 1691.41
Navy: 1681.49
Bryant: 1675.78
Colgate: 1663.12
Brown: 1633.13
Providence: 1623.52
Massachusetts: 1621.05
Vermont: 1606.14
Tufts: 1592.84
Air Force: 1559.58
Greensboro: 1500.51
Roberts Wesleyan: 1500.00
Thiel: 1500.00
Westminster (PA): 1500.00
Bethany (WV): 1500.00
Chowan: 1500

In [39]:
def calculate_win_probability(elo_system, team1, team2):
    if team1 in elo_system.ratings and team2 in elo_system.ratings:
        rating1 = elo_system.ratings[team1]
        rating2 = elo_system.ratings[team2]
        expected_score_team1 = 1 / (1 + 10 ** ((rating2 - rating1) / 400))
        expected_score_team2 = 1 - expected_score_team1
        return expected_score_team1, expected_score_team2
    else:
        return None  # Handle case where one or both teams are not in the ratings




In [42]:
team1 = input("Enter the name of Team 1: ")
team2 = input("Enter the name of Team 2: ")

probabilities = calculate_win_probability(elo_system, team1, team2)
if probabilities:
    print(f"Probability of {team1} winning: {probabilities[0] * 100:.2f}%")
    print(f"Probability of {team2} winning: {probabilities[1] * 100:.2f}%")
else:
    print("One or both of the teams are not found in the current Elo ratings.")


Enter the name of Team 1: UMass Lowell
Enter the name of Team 2: Wagner
Probability of UMass Lowell winning: 54.25%
Probability of Wagner winning: 45.75%
