In [119]:
import requests
from bs4  import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt

In [120]:
sports_teams = {
    'mens_volleyball': [
        'https://ccnyathletics.com/sports/mens-volleyball/roster?view=2',
        'https://lehmanathletics.com/sports/mens-volleyball/roster?view=2',
        'https://www.brooklyncollegeathletics.com/sports/mens-volleyball/roster?view=2',
        'https://johnjayathletics.com/sports/mens-volleyball/roster?view=2',
        'https://athletics.baruch.cuny.edu/sports/mens-volleyball/roster?view=2',
        'https://mecathletics.com/sports/mens-volleyball/roster?view=2',
        'https://www.huntercollegeathletics.com/sports/mens-volleyball/roster?view=2',
        'https://yorkathletics.com/sports/mens-volleyball/roster?view=2',
        'https://ballstatesports.com/sports/mens-volleyball/roster?view=2'
    ],
    'womens_volleyball': [
        'https://bmccathletics.com/sports/womens-volleyball/roster?view=2',
        'https://yorkathletics.com/sports/womens-volleyball/roster?view=2',
        'https://hostosathletics.com/sports/womens-volleyball/roster?view=2',
        'https://bronxbroncos.com/sports/womens-volleyball/roster/2021?view=2',
        'https://queensknights.com/sports/womens-volleyball/roster?view=2',
        'https://augustajags.com/sports/wvball/roster?view=2',
        'https://flaglerathletics.com/sports/womens-volleyball/roster?view=2',
        'https://pacersports.com/sports/womens-volleyball/roster?view=2',
        'https://www.golhu.com/sports/womens-volleyball/roster?view=2'
    ],
    'mens_swimming_diving': [
        'https://csidolphins.com/sports/mens-swimming-and-diving/roster/2023-2024?view=2',
        'https://yorkathletics.com/sports/mens-swimming-and-diving/roster?view=2',
        'https://athletics.baruch.cuny.edu/sports/mens-swimming-and-diving/roster?view=2',
        'https://www.brooklyncollegeathletics.com/sports/mens-swimming-and-diving/roster?view=2',
        'https://lindenwoodlions.com/sports/mens-swimming-and-diving/roster?view=2',
        'https://mckbearcats.com/sports/mens-swimming-and-diving/roster?view=2',
        'https://ramapoathletics.com/sports/mens-swimming-and-diving/roster?view=2',
        'https://oneontaathletics.com/sports/mens-swimming-and-diving/roster?view=2',
        'https://bubearcats.com/sports/mens-swimming-and-diving/roster/2021-22?view=2',
        'https://albrightathletics.com/sports/mens-swimming-and-diving/roster/2021-22?view=2'
    ],
    'womens_swimming_diving': [
        'https://csidolphins.com/sports/womens-swimming-and-diving/roster?view=2',
        'https://queensknights.com/sports/womens-swimming-and-diving/roster?view=2',
        'https://yorkathletics.com/sports/womens-swimming-and-diving/roster?view=2',
        'https://athletics.baruch.cuny.edu/sports/womens-swimming-and-diving/roster/2021-22?path=wswim&view=2',
        'https://www.brooklyncollegeathletics.com/sports/womens-swimming-and-diving/roster?view=2',
        'https://lindenwoodlions.com/sports/womens-swimming-and-diving/roster?view=2',
        'https://mckbearcats.com/sports/womens-swimming-and-diving/roster?view=2',
        'https://ramapoathletics.com/sports/womens-swimming-and-diving/roster?view=2',
        'https://keanathletics.com/sports/womens-swimming-and-diving/roster?view=2',
        'https://oneontaathletics.com/sports/womens-swimming-and-diving/roster?view=2'
    ]
}

In [171]:
def process_data(urls):

  names = []
  heights = []

  for url in urls:
    page = requests.get(url)
    if page.status_code == 200:

      soup = BeautifulSoup(page.content, 'html.parser')

      name_tags = soup.find_all('td', class_='sidearm-table-player-name')
      height_tags = soup.find_all('td', class_='height')

      min_length = min(len(name_tags), len(height_tags))

      for i in range(min_length):
          name_tag = name_tags[i]
          height_tag = height_tags[i]

          names.append(name_tag.get_text().strip())

          raw_height = height_tag.get_text().strip()

          if '-' in raw_height:
              parts = raw_height.split('-')
              if len(parts) == 2 and parts[0] and parts[1]:
                  try:
                      feet = float(parts[0]) * 12
                      inches = float(parts[1])
                      inches_height = feet + inches
                      heights.append(inches_height)
                  except ValueError:
                      heights.append(None)
              else:
                  heights.append(None)
          else:
              heights.append(None)

      if len(name_tags) > min_length:
          for _ in range(len(name_tags) - min_length):
              heights.append(None)
      elif len(height_tags) > min_length:
           for _ in range(len(height_tags) - min_length):
               names.append(None)

  data = {
      'Names': names,
      'Heights' : heights
  }
  df = pd.DataFrame(data)
  valid_heights = [h for h in heights if h is not None]
  avg_height = sum(valid_heights) / max(len(valid_heights), 1)
  df['Heights'] = df['Heights'].fillna('None')

  return df, avg_height

In [166]:
mens_volleyball_df, mens_volleyball_avg_height = process_data(sports_teams['mens_volleyball'])

In [172]:
mens_volleyball_df

Unnamed: 0,Names,Heights
0,Shakib Delowar,71.0
1,Brandon Green,72.0
2,Reng Chen,69.0
3,Anirudha Das,68.0
4,Gabriel Linus,75.0
...,...,...
98,George Molina,72.0
99,Stanley Sanchez,68.0
100,Sebastian Gomez,76.0
101,"David Heyliger, Jr.",73.0


In [173]:
mens_volleyball_avg_height

71.2135922330097

In [174]:
womens_volleyball_df, womens_volleyball_avg_height = process_data(sports_teams['womens_volleyball'])

In [179]:
womens_volleyball_df

Unnamed: 0,Names,Heights
0,Jasmine Vega,61.0
1,Samantha Panameno,65.0
2,Simranjit Kaur,63.0
3,Jasmine Dias,59.0
4,Devina Luckhoo,63.0
...,...,...
117,Erin Ferello,65.0
118,Alyssa Daley,75.0
119,Myka Costanzo,64.0
120,Kyleigh McDermit,71.0


In [180]:
womens_volleyball_avg_height

68.2072072072072

In [186]:
mens_swimming_diving_df, mens_swimming_diving_avg_height = process_data(sports_teams['mens_swimming_diving'])

In [187]:
mens_swimming_diving_df

Unnamed: 0,Names,Heights
0,Anthony Attenborough,70.0
1,James Curran,69.0
2,Charles Cusumano,64.0
3,David Fayngersh,70.0
4,Michael Graham,70.0
...,...,...
189,Kieran Hassard,72.0
190,Anthony Laite,69.0
191,Drew MacDonald,72.0
192,Kyle Morken,73.0


In [188]:
mens_swimming_diving_avg_height

71.07865168539325

In [189]:
womens_swimming_diving_df, womens_swimming_diving_avg_height = process_data(sports_teams['womens_swimming_diving'])

In [190]:
womens_swimming_diving_df

Unnamed: 0,Names,Heights
0,Aurelia Barbagallo,
1,Nicole Conroy,
2,Jacqueline DeFranco,
3,Alexis Doyle,
4,Katherine Ebrahim,
...,...,...
138,Paige Splendido,65.0
139,Callie Stinson,64.0
140,Alyssa Thompson,67.0
141,Avery Tomandl,


In [191]:
womens_swimming_diving_avg_height

65.75206611570248

In [147]:
df_mens_volleyball = process_data(sports_teams['mens_volleyball'])
df_mens_volleyball.to_csv('mens_volleyball.csv', index=False)

df_womens_volleyball = process_data(sports_teams['womens_volleyball'])
df_womens_volleyball.to_csv('womens_volleyball.csv', index=False)

df_mens_swimming = process_data(sports_teams['mens_swimming_diving'])
df_mens_swimming.to_csv('mens_swimming.csv', index=False)

df_womens_swimming = process_data(sports_teams['womens_swimming_diving'])
df_womens_swimming.to_csv('womens_swimming.csv', index=False)

In [194]:
def tallest_and_shortest(df, team_name):
    df_sorted = df.sort_values(by = 'Heights')

    shortest_cutoff = df_sorted['Heights'].nsmallest(5).max()
    shortest = df_sorted[df_sorted['Heights'] <= shortest_cutoff]

    # Get 5 tallest with ties
    # Change 'height_in_inches' to 'Heights' to match the column name
    tallest_cutoff = df_sorted['Heights'].nlargest(5).min()
    # Change 'height_in_inches' to 'Heights' to match the column name
    tallest = df_sorted[df_sorted['Heights'] >= tallest_cutoff]

    print(f"\nTallest athletes in {team_label}:")
    # Change 'name' to 'Names' and 'height_in_inches' to 'Heights' to match the column names
    print(tallest[['Names', 'Heights']])

    print(f"\nShortest athletes in {team_label}:")
    # Change 'name' to 'Names' and 'height_in_inches' to 'Heights' to match the column names
    print(shortest[['Names', 'Heights']])

# Apply to each DataFrame
print_tallest_and_shortest(df_mens_swimming, "Men's Swimming")
print_tallest_and_shortest(df_womens_swimming, "Women's Swimming")
print_tallest_and_shortest(df_mens_volleyball, "Men's Volleyball")
print_tallest_and_shortest(df_womens_volleyball, "Women's Volleyball")


Tallest athletes in Men's Swimming:
                    Names  Heights
83          Adam Szczerba     76.0
70      Mattia Giurgevich     76.0
187     Nicholas Griffith     76.0
116        Michael Stooke     76.0
157         Gavin Weseman     76.0
134             Joe Swede     76.0
125          Tyler Finkle     76.0
167         Brian Harding     76.0
118  Domantas  Tarnauskas     77.0
171            Eric Kroon     77.0
100         Colton Grimes     78.0
120           Tyson Upton     79.0

Shortest athletes in Men's Swimming:
                Names  Heights
96    Miles Fleischer     63.0
2    Charles Cusumano     64.0
36       Eric Mayzlin     65.0
45          Stuart Xu     65.0
103       Wyatt Kurtz     65.0
34        Samuel Liao     65.0

Tallest athletes in Women's Swimming:
                 Names  Heights
115        Grace Geyer     71.0
51     Kornelia Buszka     71.0
76     Bianka Bukovics     71.0
85   Presley Heitzmann     71.0
38   Ayatallah Elkotby     72.0
66     Maja Piotrowicz

In [197]:
from google.colab import files
files.download('Team-SMS-Project-2-Question2.ipynb')

FileNotFoundError: Cannot find file: Team-SMS-Project-2-Question2.ipynb