Imports

In [2]:
import pandas as pd
import numpy as np
import collections
import matplotlib.pyplot as plt
%matplotlib inline
import math
import sys
import csv
import urllib
import bs4
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
import Levenshtein
import difflib


Taking a look at the data

In [3]:
#set the path to the main data directory
path = '../Data/mens-machine-learning-competition-2019/DataFiles/'

In [4]:
# This one contains stats for every single regular season game played between 1985 and 2018. It mainly
# contains info on the score of the game, the IDs for each team, and where the game was played.
reg_season_compact_pd = pd.read_csv(path + 'RegularSeasonCompactResults.csv')
reg_season_compact_pd.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0
3,1985,25,1165,70,1432,54,H,0
4,1985,25,1192,86,1447,74,H,0


In [8]:
# This one expands on the previous data frame by going into more in depth stats like 3 point field goals,
# free throws, steals, blocks, etc. 
#NOTE: Whereas the previous dataframe has data from 1985, this one only has data from 2003 on!
reg_season_detailed_pd = pd.read_csv(path+'RegularSeasonDetailedResults.csv')
reg_season_detailed_pd.columns

Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
       'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
       'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF'],
      dtype='object')

In [73]:
teams_pd = pd.read_csv(path+'Teams.csv')
teams_pd.head()

Unnamed: 0,TeamID,TeamName,FirstD1Season,LastD1Season
0,1101,Abilene Chr,2014,2019
1,1102,Air Force,1985,2019
2,1103,Akron,1985,2019
3,1104,Alabama,1985,2019
4,1105,Alabama A&M,2000,2019


In [None]:
#only consider teams that were Division 1 in 2019? May just need to throw out games with an invalid teamID

In [26]:
# Don't think this data is honestly that important. Just contains the region areas for the tournament each 
# year. There isn't really a distinct "home field" advantage in the tourney because the games are supposed
# to be on neutral sites. 
# Not sure why this data set has this arcane region system. Nevertheless, gets used in the seeds data
seasons_pd = pd.read_csv(path+'Seasons.csv')
seasons_pd.head()

Unnamed: 0,Season,DayZero,RegionW,RegionX,RegionY,RegionZ
0,1985,10/29/1984,East,West,Midwest,Southeast
1,1986,10/28/1985,East,Midwest,Southeast,West
2,1987,10/27/1986,East,Southeast,Midwest,West
3,1988,11/2/1987,East,Midwest,Southeast,West
4,1989,10/31/1988,East,West,Midwest,Southeast


In [18]:
# Print Virginia because GO HOOS
print (teams_pd[teams_pd['TeamName'] == 'Virginia'])

     TeamID  TeamName  FirstD1Season  LastD1Season
337    1438  Virginia           1985          2019


In [77]:
# This one contains the stats for every single NCAA tournament game from 1985 to 2018
tourney_compact_pd = pd.read_csv(path+'NCAATourneyCompactResults.csv')
tourney_compact_pd.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0


In [1]:
# More deatiled tourney stats (except only stats from 2003 :( )
tourney_detailed_pd = pd.read_csv(path+'NCAATourneyDetailedResults.csv')
tourney_detailed_pd.columns

NameError: name 'pd' is not defined

In [25]:
# This one tells you what seed each team was for a given tournament year
tourney_seeds_pd = pd.read_csv(path+'NCAATourneySeeds.csv')
tourney_seeds_pd.head()

Unnamed: 0,Season,Seed,TeamID
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


In [29]:
# Seeing what seed Virgnina was in every tourney
virginia_id = teams_pd[teams_pd['TeamName'] == 'Virginia'].values[0][0]
tourney_seeds_pd[tourney_seeds_pd['TeamID'] == virginia_id]

Unnamed: 0,Season,Seed,TeamID
68,1986,W05,1438
180,1987,Z05,1438
308,1989,Z05,1438
358,1990,Y07,1438
438,1991,Z07,1438
517,1993,W06,1438
630,1994,Z07,1438
675,1995,Y04,1438
824,1997,Z09,1438
1077,2001,Z05,1438


In [33]:
# Don't know how helpful this is tbh, because it just tells you what the seeds of the stronger
# and weaker seeds are (assuming that the favored team wins??), so its always 1 vs 16 and then 
# 1 vs 8 and 1 vs 4..
tourney_slots_pd = pd.read_csv(path+'NCAATourneySlots.csv')
tourney_slots_pd[tourney_slots_pd['Slot']=='R3W1'].head()

Unnamed: 0,Season,Slot,StrongSeed,WeakSeed
48,1985,R3W1,R2W1,R2W4
111,1986,R3W1,R2W1,R2W4
174,1987,R3W1,R2W1,R2W4
237,1988,R3W1,R2W1,R2W4
300,1989,R3W1,R2W1,R2W4


In [37]:
conference_pd = pd.read_csv(path+'Conferences.csv')
conference_pd.head()

Unnamed: 0,ConfAbbrev,Description
0,a_sun,Atlantic Sun Conference
1,a_ten,Atlantic 10 Conference
2,aac,American Athletic Conference
3,acc,Atlantic Coast Conference
4,aec,America East Conference


In [38]:
team_conference_pd = pd.read_csv(path+'TeamConferences.csv')
team_conference_pd.head()

Unnamed: 0,Season,TeamID,ConfAbbrev
0,1985,1114,a_sun
1,1985,1147,a_sun
2,1985,1204,a_sun
3,1985,1209,a_sun
4,1985,1215,a_sun


That's pretty lackluster conference information compared to years past. Definitely want to get ahold of some SRS (Simple Rating System) and SOS (Strength of Schedule). In the past, averages per conference per year were included in the conference data. 
Question: Do we want to scrape these numbers for every team+season? Or is it sufficient to get the averages for each conference+season? Is the former that much more difficult than the latter?

In [100]:
#building a crawler for team-season SRS and SOS
def url_format_team_name(team_name):
    formatted_tn = team_name.replace(' St', '-state')
    formatted_tn = formatted_tn.replace(' Chr', '-christian')
    formatted_tn = formatted_tn.replace(' A&M', '-am')
    formatted_tn = formatted_tn.replace('Ark ', 'arkansas-',1)
    formatted_tn = formatted_tn.replace(' Intl', '-international')
    formatted_tn = formatted_tn.replace('CS ', 'cal-state-',1)
    formatted_tn = formatted_tn.replace('E ', 'east-',1)
    formatted_tn = formatted_tn.replace('W ', 'west-',1)
    formatted_tn = formatted_tn.replace('UC ', 'california-',1)
    
    formatted_tn = formatted_tn.replace(' ', '-')
    
    return formatted_tn

def url_format_team(teamID):
    team_name = teams_pd[teams_pd['TeamID'] == teamID]
    

def get_SRS_and_SOS(teamID, season):
    pass

def collect_schedule_stats():
    pass

In [98]:
#url crawler
#We want to obtain a list of team names as they are formatted in the urls of sports reference
#Then, for each team name in the teams csv, associate it with the url-formatted team name with the highest (cosine sim)
#Build a dataframe with columns teamID, teamName, URLFormattedTeamName
def crawl_urls():
    url = 'https://www.sports-reference.com/cbb/schools/'
    source = urllib.request.urlopen(url).read().decode('utf-8')
    soup = bs4.BeautifulSoup(source, "lxml")
    name_list = []
    for link in soup.find_all('a'):
        if '/cbb/schools/' in link.get('href'):
            name = link.get('href')[13:-1]
            if len(name) > 0 and '.htm' not in name:
                name_list.append(name)
    return name_list

def associate_urls():
    #in which i use tools that I don't understand
    url_name_list = crawl_urls()
    
    #tfidf_vectorizer=TfidfVectorizer(analyzer='char', vocabulary=string.printable)
    #teamName_matrix=tfidf_vectorizer.fit_transform(teams_pd['TeamName'])
    #url_tn_matrix = tfidf_vectorizer.fit_transform(url_name_list)
    
    for i in range(len(teams_pd['TeamID'])):
        #max_cs = 0
        best_url = ''
        team_row_data = teams_pd.loc[i, : ]
        team_id = team_row_data['TeamID']
        team_name = team_row_data['TeamName']
        best_url = difflib.get_close_matches(url_format_team_name(team_name), url_name_list, n=1, cutoff=0.6)
        #for j in range(len(url_name_list)):
            #print(team_name)
            #print(teamName_matrix[i])
            #print(url_name_list[j])
            #print(url_tn_matrix[j])
            #cs = cosine_similarity(teamName_matrix[i],url_tn_matrix[j])
            #if cs > max_cs:
                #max_cs = cs
                #best_url = url_name_list[j]
        print([team_id, team_name, best_url])

associate_urls()


[1101, 'Abilene Chr', ['abilene-christian']]
[1102, 'Air Force', ['air-force']]
[1103, 'Akron', ['akron']]
[1104, 'Alabama', ['alabama']]
[1105, 'Alabama A&M', ['alabama-am']]
[1106, 'Alabama St', ['alabama-state']]
[1107, 'Albany NY', ['albany-ny']]
[1108, 'Alcorn St', ['alcorn-state']]
[1109, 'Alliant Intl', ['alliant-international']]
[1110, 'American Univ', ['american']]
[1111, 'Appalachian St', ['appalachian-state']]
[1112, 'Arizona', ['arizona']]
[1113, 'Arizona St', ['arizona-state']]
[1114, 'Ark Little Rock', ['arkansas-little-rock']]
[1115, 'Ark Pine Bluff', ['arkansas-pine-bluff']]
[1116, 'Arkansas', ['arkansas']]
[1117, 'Arkansas St', ['arkansas-state']]
[1118, 'Armstrong St', ['western-state']]
[1119, 'Army', ['army']]
[1120, 'Auburn', ['auburn']]
[1121, 'Augusta', ['augustana-il']]
[1122, 'Austin Peay', ['austin-peay']]
[1123, 'Ball St', ['ball-state']]
[1124, 'Baylor', ['baylor']]
[1125, 'Belmont', ['belmont']]
[1126, 'Bethune-Cookman', ['bethune-cookman']]
[1127, 'Bingham

[1344, 'Providence', ['providence']]
[1345, 'Purdue', ['purdue']]
[1346, 'Quinnipiac', ['quinnipiac']]
[1347, 'Radford', ['radford']]
[1348, 'Rhode Island', ['rhode-island']]
[1349, 'Rice', ['rice']]
[1350, 'Richmond', ['richmond']]
[1351, 'Rider', ['rider']]
[1352, 'Robert Morris', ['robert-morris']]
[1353, 'Rutgers', ['rutgers']]
[1354, 'S Carolina St', ['south-carolina-state']]
[1355, 'S Dakota St', ['south-dakota-state']]
[1356, 'S Illinois', ['illinois']]
[1357, 'Sacred Heart', ['sacred-heart']]
[1358, 'Sam Houston St', ['sam-houston-state']]
[1359, 'Samford', ['samford']]
[1360, 'San Diego', ['san-diego']]
[1361, 'San Diego St', ['san-diego-state']]
[1362, 'San Francisco', ['san-francisco']]
[1363, 'San Jose St', ['san-jose-state']]
[1364, 'Santa Barbara', ['santa-clara']]
[1365, 'Santa Clara', ['santa-clara']]
[1366, 'Savannah St', ['savannah-state']]
[1367, 'SC Upstate', ['utah-state']]
[1368, 'SE Louisiana', ['southeastern-louisiana']]
[1369, 'SE Missouri St', ['southeast-miss

Initial analysis

Building a preliminary team-season vector