### Obtaining college data
To generate predictions with the college evolution model, we retrieve data for college players from the 2021/22 season. We select only players who appeared in at least 25 games that season and whose teams reached the NCAA Tournament.

In [1]:
import pandas as pd
import os

In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup, Comment
from collections import Counter

Here we retrieve the links of 48 teams that played in the NCAA Tournament: the teams listed in the first round of the first three brackets on the tournament page.

In [163]:
# Download the 2022 NCAA Tournament page and parse it. The context manager
# closes the HTTP response as soon as BeautifulSoup has read it.
with urlopen('https://www.sports-reference.com/cbb/postseason/2022-ncaa.html') as html:
    bs = BeautifulSoup(html, 'html.parser')

In [164]:
# Collect, in matching order, the school-page link and the school name of every
# team linked from the first round of the first three `bracket` divs.
# NOTE(review): only three brackets are read, which matches the 48 teams
# mentioned in the text - confirm that skipping any further brackets is intended.
link_list = []
school_list = []
brackets = bs.find_all('div', {'id': 'bracket'})
for i in range(3):
    first_round = brackets[i].find_all(class_='round')[0]
    for element in first_round.find_all('a'):
        href = element.get('href', '')
        text = element.string
        # Links without a single text child cannot name a school
        if text is None:
            continue
        # `~` is bitwise NOT: ~True == -2 and ~False == -1 are both truthy, so
        # the previous `~text.isdecimal()` never rejected anything. `not` is
        # the logical negation that the check needs.
        if 'schools' in href and not text.isdecimal():
            link_list.append(href)
            school_list.append(text)

In [165]:
# One row per collected school: indexed by school name, with a single `link`
# column holding the link gathered for that school.
df_school = pd.DataFrame({'link': link_list}, index=school_list)

For each of those teams, we select the players who have played at least 25 games. Here we collect the link to each player's main page on Sports Reference.

In [166]:
%%time
player_df = pd.DataFrame()
player_link = []
player_dict = {}
for i in range(len(link_list)):
    html = urlopen('https://www.sports-reference.com' + link_list[i])
    bs = BeautifulSoup(html, 'html.parser')
    for player in bs.find('div', attrs = {'id':'switcher_advanced_players'}).find('tbody').find_all('td', attrs = {'data-stat':'g'}):
        if int(player.string) >= 25:
            player_link.append(player.parent.find('td', attrs = {'data-stat':'player'}).find('a')['href'])
            player_dict['player_link'] = player.parent.find('td', attrs = {'data-stat':'player'}).find('a')['href']
            player_dict['school'] = school_list[i]
            player_dict['name'] = player.parent.find('td', attrs = {'data-stat':'player'}).find('a').string
            player_df = player_df.append(player_dict, ignore_index=True)

Wall time: 1min 15s


In [171]:
# Display the selected players (name, player page link and school)
player_df

Unnamed: 0,name,player_link,school
0,James Akinjo,/cbb/players/james-akinjo-1.html,Baylor
1,Adam Flagler,/cbb/players/adam-flagler-1.html,Baylor
2,Kendall Brown,/cbb/players/kendall-brown-1.html,Baylor
3,Matthew Mayer,/cbb/players/matthew-mayer-1.html,Baylor
4,Jeremy Sochan,/cbb/players/jeremy-sochan-1.html,Baylor
...,...,...,...
419,Kevin Anderson,/cbb/players/kevin-anderson-6.html,Delaware
420,Jyare Davis,/cbb/players/jyare-davis-1.html,Delaware
421,Ebby Asamoah,/cbb/players/ebby-asamoah-1.html,Delaware
422,Ryan Allen,/cbb/players/ryan-allen-3.html,Delaware


Finally, for each player we retrieve the data the model needs for its predictions and save it to a CSV file.

In [None]:
## Takes about one hour to run
%%time
player_df_stats = pd.DataFrame()
for i in range(len(player_link)):
    html = urlopen('https://www.sports-reference.com' + player_link[i])
    bs = BeautifulSoup(html, 'html.parser')
    player_dict = {}
    player_dict['mp_per_g'] = bs.find('table', attrs={'id':'players_per_game'}).find('tbody').find('tr', attrs={'id':'players_per_game.2022'}).find('td', attrs = {'data-stat':'mp_per_g'}).string
    player_dict['trb_per_g'] = bs.find('table', attrs={'id':'players_per_game'}).find('tbody').find('tr', attrs={'id':'players_per_game.2022'}).find('td', attrs = {'data-stat':'trb_per_g'}).string
    player_dict['fg3_pct'] = bs.find('table', attrs={'id':'players_per_game'}).find('tbody').find('tr', attrs={'id':'players_per_game.2022'}).find('td', attrs = {'data-stat':'fg3_pct'}).string
    player_dict['ast_per_g'] = bs.find('table', attrs={'id':'players_per_game'}).find('tbody').find('tr', attrs={'id':'players_per_game.2022'}).find('td', attrs = {'data-stat':'ast_per_g'}).string
    player_dict['pts_per_g'] = bs.find('table', attrs={'id':'players_per_game'}).find('tbody').find('tr', attrs={'id':'players_per_game.2022'}).find('td', attrs = {'data-stat':'pts_per_g'}).string
    player_dict['ft_pct'] = bs.find('table', attrs={'id':'players_per_game'}).find('tbody').find('tr', attrs={'id':'players_per_game.2022'}).find('td', attrs = {'data-stat':'ft_pct'}).string
    player_dict['g_col'] = bs.find('table', attrs={'id':'players_per_game'}).find('tbody').find('tr', attrs={'id':'players_per_game.2022'}).find('td', attrs = {'data-stat':'g'}).string
    player_dict['name'] = bs.find('h1').span.string
    player_dict['n_season'] = len(bs.find('table', attrs={'id':'players_per_game'}).find('tbody').find_all('tr'))
    player_df_stats = player_df_stats.append(player_dict, ignore_index=True)
    print(i)  

In [176]:
# Preview the first rows of the scraped per-game statistics
player_df_stats.head()

Unnamed: 0,ast_per_g,fg3_pct,ft_pct,g_col,mp_per_g,n_season,name,pts_per_g,trb_per_g
0,5.8,0.295,0.835,32,33.1,4.0,,13.5,2.8
1,3.0,0.387,0.741,31,30.7,4.0,,13.8,2.2
2,1.9,0.341,0.689,34,27.0,1.0,,9.7,4.9
3,1.0,0.324,0.7,33,22.8,4.0,,9.8,5.0
4,1.8,0.296,0.589,30,25.1,1.0,,9.2,6.4


In [174]:
# Attach the scraped statistics to each player's name/link/school row.
# The two frames are matched on their row index; the statistics frame's own
# `name` column is removed first.
stats_without_name = player_df_stats.drop('name', axis = 1)
final_df = player_df.merge(right = stats_without_name, left_index = True, right_index = True)

In [177]:
# Preview the first rows of the merged player and statistics table
final_df.head()

Unnamed: 0,name,player_link,school,ast_per_g,fg3_pct,ft_pct,g_col,mp_per_g,n_season,pts_per_g,trb_per_g
0,James Akinjo,/cbb/players/james-akinjo-1.html,Baylor,5.8,0.295,0.835,32,33.1,4.0,13.5,2.8
1,Adam Flagler,/cbb/players/adam-flagler-1.html,Baylor,3.0,0.387,0.741,31,30.7,4.0,13.8,2.2
2,Kendall Brown,/cbb/players/kendall-brown-1.html,Baylor,1.9,0.341,0.689,34,27.0,1.0,9.7,4.9
3,Matthew Mayer,/cbb/players/matthew-mayer-1.html,Baylor,1.0,0.324,0.7,33,22.8,4.0,9.8,5.0
4,Jeremy Sochan,/cbb/players/jeremy-sochan-1.html,Baylor,1.8,0.296,0.589,30,25.1,1.0,9.2,6.4


In [178]:
# Write the merged table to disk; the row index is not included in the file
final_df.to_csv(path_or_buf = 'college_2022.csv', index = False)