In [3]:
import pandas as pd
from math import radians, sin, cos, sqrt, atan2

# Function to calculate distance between two geographical coordinates using Haversine formula
def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    The distance is computed with the Haversine formula on a sphere of
    radius 6371.0 km.

    Args:
        lat1: Latitude of the first point, in degrees.
        lon1: Longitude of the first point, in degrees.
        lat2: Latitude of the second point, in degrees.
        lon2: Longitude of the second point, in degrees.

    Returns:
        The distance along the sphere's surface, in kilometres.
    """
    earth_radius_km = 6371.0

    # The trigonometric functions below operate on radians.
    phi1, lam1, phi2, lam2 = map(radians, (lat1, lon1, lat2, lon2))

    delta_lam = lam2 - lam1
    delta_phi = phi2 - phi1

    # Haversine of the central angle between the two points.
    hav = sin(delta_phi / 2)**2 + cos(phi1) * cos(phi2) * sin(delta_lam / 2)**2

    # Recover the central angle, then scale it by the sphere's radius.
    central_angle = 2 * atan2(sqrt(hav), sqrt(1 - hav))
    return earth_radius_km * central_angle

# Function to get average distance traveled for a given year and team
def get_average_distance(year, team):
    """Return the average distance travelled by `team` to its away games.

    Reads ``example_data/schedules/{year}_schedule.csv``, skips the first
    line as a header, and treats the first four comma-separated fields of
    each remaining line as: year, (ignored), home team, away team. A line
    counts as a game when its year field equals `year` and its away team
    equals `team`; the game's distance is the result of
    ``calculate_distance`` between the two teams' entries in
    ``city_coordinates``.

    Args:
        year: Season to look up. Converted to ``str`` so that an integer
            year compares equal to the year field read from the file.
        team: Team name, as used for the keys of ``city_coordinates`` and
            in the schedule file.

    Returns:
        The mean distance over all counted away games, or ``None`` when no
        game was counted (no matching rows, or coordinates missing).

    Raises:
        OSError: If the schedule file cannot be opened (for example
            ``FileNotFoundError`` when there is no file for `year`).
    """
    # The file yields strings, so compare against a string year; otherwise an
    # int argument would never match and the function would silently return None.
    year = str(year)
    schedule_filename = f'example_data/schedules/{year}_schedule.csv'
    total_distance = 0
    games_played = 0

    with open(schedule_filename, 'r') as file:
        next(file, None)  # Skip header; the default avoids StopIteration on an empty file
        for line in file:
            fields = line.strip().split(',')
            if len(fields) < 4:
                # Blank or truncated line (e.g. a trailing newline at end of file)
                continue
            schedule_year, _, home_team, away_team = fields[:4]

            # Only away games of the requested team in the requested year count
            if schedule_year != year or away_team != team:
                continue

            # Skip games for which either city's coordinates are unknown
            if home_team not in city_coordinates or away_team not in city_coordinates:
                continue

            home_lat, home_lon = city_coordinates[home_team]
            away_lat, away_lon = city_coordinates[away_team]
            total_distance += calculate_distance(home_lat, home_lon, away_lat, away_lon)
            games_played += 1

    # No counted games means there is nothing to average
    if games_played == 0:
        return None
    return total_distance / games_played

# Dictionary to store the coordinates of each city
# Keyed by team name; each value is a (latitude, longitude) pair in degrees.
# The trailing comment on each entry names the team's city.
city_coordinates = {
    'Royals': (39.0513, -94.4805),         # Kansas City
    'Braves': (33.8908, -84.4679),         # Atlanta
    'Rays': (27.7684, -82.6483),           # St. Petersburg
    'Blue Jays': (43.6414, -79.3894),      # Toronto
    'Diamondbacks': (33.4455, -112.0667),  # Phoenix
    'Astros': (29.7572, -95.3554),         # Houston
    'Pirates': (40.4469, -80.0057),        # Pittsburgh
    'Dodgers': (34.0736, -118.2400),       # Los Angeles
    'Rockies': (39.7554, -104.9881),       # Denver
    'Nationals': (38.8729, -77.0074),      # Washington D.C.
    'Cardinals': (38.6226, -90.1928),      # St. Louis
    'Red Sox': (42.3467, -71.0972),        # Boston
    'Orioles': (39.2839, -76.6217),        # Baltimore
    'Giants': (37.7786, -122.3893),        # San Francisco
    'Reds': (39.0979, -84.5086),           # Cincinnati
    'Indians': (41.4959, -81.6853),        # Cleveland
    'Padres': (32.7076, -117.1570),        # San Diego
    'Phillies': (39.9054, -75.1669),       # Philadelphia
    'White Sox': (41.8301, -87.6347),      # Chicago
    'Brewers': (43.0280, -87.9712),        # Milwaukee
    'Yankees': (40.8296, -73.9262),        # New York City
    'Mets': (40.7571, -73.8458),           # New York City
    'Rangers': (32.7511, -97.0820),        # Arlington
    'Marlins': (25.7780, -80.2195),        # Miami
    'Mariners': (47.5914, -122.3325),      # Seattle
    'Twins': (44.9817, -93.2784),          # Minneapolis
    'Angels': (33.8003, -117.8827),        # Anaheim
    'Cubs': (41.9484, -87.6553),           # Chicago
    'Athletics': (37.7516, -122.2005),     # Oakland
    'Tigers': (42.3391, -83.0487),         # Detroit
}

BSS_data = pd.read_csv('example_data/batting_season_summary.csv')

# The average depends only on the (Year, team) pair, so compute it once per
# pair instead of re-opening and re-parsing the schedule file for every row.
_average_distance_cache = {}


def _cached_average_distance(row):
    """Return the memoized average away-game distance for one summary row."""
    key = (str(row['Year']), row['team'])
    if key not in _average_distance_cache:
        _average_distance_cache[key] = get_average_distance(*key)
    return _average_distance_cache[key]


# Attach the average distance traveled for each row's team and year as a new column
BSS_data['average_distance_traveled'] = BSS_data.apply(_cached_average_distance, axis=1)

# Display the updated DataFrame
print(BSS_data)


                    Name  age      team pos   PA   AB    H  2B  3B  HR  BB  \
0     Abel De Los Santos   25      Cubs  1B  721  676  151  36  28   4  45   
1       Adalberto Mendez   28  Phillies  LF  653  623  110  19  18   3  30   
2         Adam Greenberg   28   Pirates  RF  579  514  144  29  21  12  65   
3          Adam Peterson   21    Braves  1B  742  668  192  45  40  10  74   
4           Adam Shabala   34  Mariners  SS  637  600  109  26  12  10  37   
...                  ...  ...       ...  ..  ...  ...  ...  ..  ..  ..  ..   
2695       Victor Garate   29  Phillies  RF  646  582  162  40  30   7  64   
2696       Vince Belnome   31    Giants  SS  710  671  176  47  28  12  39   
2697         Wes Whisler   32      Reds  2B  722  649  217  52  25  18  73   
2698      Wilfredo Tovar   27  Mariners  1B  756  673  147  41  22   1  83   
2699   Wilking Rodriguez   34     Twins  2B  707  644  153  39  30   5  63   

       SO      P/PA        BA       OBP       SLG       OPS  Ye