In [262]:
import pandas as pd
import numpy as np
from __future__ import division
import collections

# Visualizing All the Historical Data

Let's first take a look at all the historical data we have. These are stats from the 1985 - 2015 NCAA basketball seasons. 

In [72]:
# Compact regular-season results: one row for every regular season game played between 1985 and
# 2015. Each row mainly holds the score of the game, the IDs of the two teams, and where the game
# was played.
reg_season_compact_pd = pd.read_csv('RegularSeasonCompactResults.csv')
# Preview the first few rows.
reg_season_compact_pd.head()

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0
3,1985,25,1165,70,1432,54,H,0
4,1985,25,1192,86,1447,74,H,0


In [183]:
# Detailed regular-season results: expands on the compact data frame with more in-depth box-score
# stats such as 3 point field goals, free throws, steals, blocks, etc.
reg_season_detailed_pd = pd.read_csv('RegularSeasonDetailedResults.csv')
# List every available column (W-prefixed columns belong to the winning team, L-prefixed ones to
# the losing team).
reg_season_detailed_pd.columns

Index([u'Season', u'Daynum', u'Wteam', u'Wscore', u'Lteam', u'Lscore', u'Wloc',
       u'Numot', u'Wfgm', u'Wfga', u'Wfgm3', u'Wfga3', u'Wftm', u'Wfta',
       u'Wor', u'Wdr', u'Wast', u'Wto', u'Wstl', u'Wblk', u'Wpf', u'Lfgm',
       u'Lfga', u'Lfgm3', u'Lfga3', u'Lftm', u'Lfta', u'Lor', u'Ldr', u'Last',
       u'Lto', u'Lstl', u'Lblk', u'Lpf'],
      dtype='object')

In [14]:
# Season metadata: mostly just the region names used for the tournament each year. Probably not
# that important as a feature -- there isn't really a distinct "home field" advantage in the
# tourney because the games are supposed to be played on neutral sites.
seasons_pd = pd.read_csv('Seasons.csv')
seasons_pd.head()

Unnamed: 0,Season,Dayzero,Regionw,Regionx,Regiony,Regionz
0,1985,10/29/1984,East,West,Midwest,Southeast
1,1986,10/28/1985,East,Midwest,Southeast,West
2,1987,10/27/1986,East,Southeast,Midwest,West
3,1988,11/2/1987,East,Midwest,Southeast,West
4,1989,10/31/1988,East,West,Midwest,Southeast


In [261]:
# Lookup table pairing every Team_Id with its Team_Name.
teams_pd = pd.read_csv('Teams.csv')
# Plain Python list of all team names, in the same order as the rows of teams_pd.
teamList = teams_pd['Team_Name'].tolist()
teams_pd.head()

Unnamed: 0,Team_Id,Team_Name
0,1101,Abilene Chr
1,1102,Air Force
2,1103,Akron
3,1104,Alabama
4,1105,Alabama A&M


In [217]:
# Finding Kansas's Team_Id cuz Frank Mason is too good.
# print(...) with a single argument gives the same text output on Python 2 and Python 3,
# whereas the bare `print x` statement is a syntax error on Python 3.
print(teams_pd[teams_pd['Team_Name'] == 'Kansas'])

     Team_Id Team_Name
141     1242    Kansas


In [30]:
# Jk, Lonzo Ball > Frank Mason -- show UCLA's row (and Team_Id) instead.
# print(...) with a single argument gives the same text output on Python 2 and Python 3,
# whereas the bare `print x` statement is a syntax error on Python 3.
print(teams_pd[teams_pd['Team_Name'] == 'UCLA'])

     Team_Id Team_Name
316     1417      UCLA


In [36]:
# Compact tournament results: one row for every single NCAA tournament game from 1985 to 2015,
# with the same columns as the compact regular-season results.
tourney_compact_pd = pd.read_csv('TourneyCompactResults.csv')
tourney_compact_pd.head()

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot
0,1985,136,1116,63,1234,54,N,0
1,1985,136,1120,59,1345,58,N,0
2,1985,136,1207,68,1250,43,N,0
3,1985,136,1229,58,1425,55,N,0
4,1985,136,1242,49,1325,38,N,0


In [35]:
# More detailed tourney stats (except they only go back to the 2003 season :( )
tourney_detailed_pd = pd.read_csv('TourneyDetailedResults.csv')
tourney_detailed_pd.head()

Unnamed: 0,Season,Daynum,Wteam,Wscore,Lteam,Lscore,Wloc,Numot,Wfgm,Wfga,...,Lfga3,Lftm,Lfta,Lor,Ldr,Last,Lto,Lstl,Lblk,Lpf
0,2003,134,1421,92,1411,84,N,1,32,69,...,31,14,31,17,28,16,15,5,0,22
1,2003,136,1112,80,1436,51,N,0,31,66,...,16,7,7,8,26,12,17,10,3,15
2,2003,136,1113,84,1272,71,N,0,31,59,...,28,14,21,20,22,11,12,2,5,18
3,2003,136,1141,79,1166,73,N,0,29,53,...,17,12,17,14,17,20,21,6,6,21
4,2003,136,1143,76,1301,74,N,1,27,64,...,21,15,20,10,26,16,14,5,8,19


In [37]:
# Tournament seeds: tells you what seed each team was for a given tournament year. The Seed
# label is a region letter followed by the two-digit seed number (e.g. 'W01').
tourney_seeds_pd = pd.read_csv('TourneySeeds.csv')
tourney_seeds_pd.head()

Unnamed: 0,Season,Seed,Team
0,1985,W01,1207
1,1985,W02,1210
2,1985,W03,1228
3,1985,W04,1260
4,1985,W05,1374


In [197]:
# Look up Duke's Team_Id, then show every tournament seeding row recorded for that team.
duke_id = teams_pd.loc[teams_pd['Team_Name'].eq('Duke')].values[0][0]
tourney_seeds_pd.loc[tourney_seeds_pd['Team'].eq(duke_id)]

Unnamed: 0,Season,Seed,Team
34,1985,Y03,1181
64,1986,W01,1181
164,1987,Y05,1181
193,1988,W02,1181
257,1989,W02,1181
322,1990,W03,1181
417,1991,Y02,1181
448,1992,W01,1181
530,1993,X03,1181
593,1994,X02,1181


In [70]:
# Bracket slots: for each slot, the seeds of the stronger and weaker teams that meet there.
# Don't know how helpful this is tbh -- it seems to assume that the favored team always wins,
# so it's always 1 vs 16, then 1 vs 8, then 1 vs 4, ...
tourney_slots_pd = pd.read_csv('TourneySlots.csv')
tourney_slots_pd.head()

Unnamed: 0,Season,Slot,Strongseed,Weakseed
0,1985,R1W1,W01,W16
1,1985,R1W2,W02,W15
2,1985,R1W3,W03,W14
3,1985,R1W4,W04,W13
4,1985,R1W5,W05,W12


# Features

One of the most important parts of this competition is figuring out which features matter most for
predicting which team will win a head-to-head matchup. Some possible features:

* Number of regular season wins 
* Seed Number
* Any head to head matchup in the season?
* Number of top 25 wins in a season
* Number of top 25 games played in a season
* W-L record against common opponents
* Points per game scored
* Points per game allowed (Really important imo)
* 3-pointers made per game (Really important imo)
* 3-pointers allowed per game
* Turnovers per game
* Avg Steals/Blocks/Assists/Rebounds (How important are these stats tho??)
* Team's Conference (ACC, Big Ten, etc)
* Conference W-L record
* Conference Strength of Schedule
* Nonconference W-L record 
* Nonconference Strength of Schedule
* Regular Season Conference Champion?
* Conference Tournament Champion?
* Number of seniors on team (shows experience)
* Number of freshmen on team (can they handle the big stage)
* Number of home wins
* Number of road wins

Given that we know the above information for Team X and Team Y, can we make a model that determines who has the 
better probability of winning?

# Model

Let's frame this as a supervised training problem where our model will take in two d-dimensional vectors, one for each of the two teams playing in the tournament. Each d-dimensional vector will contain information about that team for that year. Below are the features we end up using.

* Regular Season Wins 
* Points per game season average 
* Points per game allowed season average
* Whether or not in Power 5 conference (ACC, Big Ten, Big 12, SEC, Pac 12) - Binary label
* Number of 3's per game
* Number of 3's allowed per game
* Tournament Seed

In [149]:
# Team names for each of the Power 5 conferences. These lists are tested for membership by
# exact string equality against a team's Team_Name.
# NOTE(review): every spelling here must match Teams.csv exactly to be recognised -- confirm
# entries such as 'Miami' and 'Ole Miss' actually appear in that file under these names.
# NOTE(review): the same membership is applied to every season, whatever the year.
listACCteams = [
    'North Carolina', 'Virginia', 'Florida St', 'Louisville', 'Notre Dame',
    'Syracuse', 'Duke', 'Virginia Tech', 'Georgia Tech', 'Miami',
    'Wake Forest', 'Clemson', 'NC State', 'Boston College', 'Pittsburgh',
]
listPac12teams = [
    'Arizona', 'Oregon', 'UCLA', 'California', 'USC', 'Utah',
    'Washington St', 'Stanford', 'Arizona St', 'Colorado', 'Washington',
    'Oregon St',
]
listSECteams = [
    'Kentucky', 'South Carolina', 'Florida', 'Arkansas', 'Alabama',
    'Tennessee', 'Mississippi St', 'Georgia', 'Ole Miss', 'Vanderbilt',
    'Auburn', 'Texas A&M', 'LSU', 'Missouri',
]
listBig10teams = [
    'Maryland', 'Wisconsin', 'Purdue', 'Northwestern', 'Michigan St',
    'Indiana', 'Iowa', 'Michigan', 'Penn St', 'Nebraska', 'Minnesota',
    'Illinois', 'Ohio St', 'Rutgers',
]
listBig12teams = [
    'Kansas', 'Baylor', 'West Virginia', 'Iowa St', 'TCU', 'Kansas St',
    'Texas Tech', 'Oklahoma St', 'Texas', 'Oklahoma',
]

In [192]:
def checkPower5Conference(team_id):
    """Return 1 if the team is in a Power 5 conference, otherwise 0.

    The team's name is looked up in the module-level ``teams_pd`` frame and
    then tested for membership in the five conference name lists
    (``listACCteams``, ``listBig10teams``, ``listBig12teams``,
    ``listSECteams``, ``listPac12teams``).

    Parameters
    ----------
    team_id : int
        Value to match against the ``Team_Id`` column of ``teams_pd``.

    Returns
    -------
    int
        1 when the matching ``Team_Name`` appears in any of the five lists,
        0 otherwise. An id that is not present in ``teams_pd`` also yields 0.
    """
    # Match on the Team_Id column rather than on row position. Indexing with
    # `team_id - 1101` only works while ids are contiguous, start at 1101 and
    # the frame is in file order; a smaller id would silently wrap around to a
    # row at the end of the table.
    matchingNames = teams_pd.loc[teams_pd['Team_Id'] == team_id, 'Team_Name']
    if len(matchingNames.index) == 0:
        return 0
    teamName = matchingNames.iloc[0]

    power5Conferences = (listACCteams, listBig10teams, listBig12teams,
                         listSECteams, listPac12teams)
    for conferenceTeams in power5Conferences:
        if teamName in conferenceTeams:
            return 1
    return 0

In [266]:
def getSeasonData(team_id, year):
    """Summarize one team's regular season as a list of numeric features.

    Reads the module-level frames ``reg_season_detailed_pd`` (per-game
    results) and ``tourney_seeds_pd`` (tournament seeds), and calls
    ``checkPower5Conference`` for the conference flag.

    Parameters
    ----------
    team_id : int
        Team to summarize; compared against the Wteam, Lteam and Team columns.
    year : int
        Season to summarize; compared against the Season columns.

    Returns
    -------
    list
        ``[numWins, avgPointsScored, avgPointsAllowed, power5Flag, avg3sMade,
        avg3sAllowed, tournamentSeed]``. The four averages are per game played
        and are 0 when the team has no games in that season.
        ``tournamentSeed`` is the numeric part of the seed label
        (e.g. 'W01' -> 1), or 25 when the team has no seed row that season.
    """
    # Placeholder seed for teams that did not make the tournament.
    # Not sure how best to represent that case, so it is just a value beyond any real seed.
    noTourneySeed = 25

    # Every game played in the given season, split by whether this team won or lost.
    year_data_pd = reg_season_detailed_pd[reg_season_detailed_pd['Season'] == year]
    gamesWon = year_data_pd[year_data_pd['Wteam'] == team_id]
    gamesLost = year_data_pd[year_data_pd['Lteam'] == team_id]

    # Count rows directly; DataFrame.append (previously used to stack the two
    # frames just to count them) no longer exists in pandas >= 2.0.
    numWins = len(gamesWon.index)
    numGames = numWins + len(gamesLost.index)

    # In a win the team's own numbers are in the W* columns and the opponent's
    # in the L* columns; in a loss it is the other way round.
    totalPointsScored = gamesWon['Wscore'].sum() + gamesLost['Lscore'].sum()
    totalPointsAllowed = gamesWon['Lscore'].sum() + gamesLost['Wscore'].sum()
    total3sMade = gamesWon['Wfgm3'].sum() + gamesLost['Lfgm3'].sum()
    total3sAllowed = gamesWon['Lfgm3'].sum() + gamesLost['Wfgm3'].sum()

    # Tournament seed for that year, if the team has one.
    seedRows = tourney_seeds_pd[(tourney_seeds_pd['Season'] == year)
                                & (tourney_seeds_pd['Team'] == team_id)]
    if len(seedRows.index) != 0:
        # Characters 1-2 of the label are the seed number; character 0 is the region.
        seedLabel = seedRows['Seed'].iloc[0]
        tournamentSeed = int(seedLabel[1:3])
    else:
        tournamentSeed = noTourneySeed

    # There are some teams who may have dropped to Division 2, so they won't have
    # games a certain year. In this case we don't want to divide by 0, so the
    # averages are just set to 0 instead.
    if numGames == 0:
        avgPointsScored = 0
        avgPointsAllowed = 0
        avg3sMade = 0
        avg3sAllowed = 0
    else:
        # float() keeps this a true division on Python 2 even without
        # `from __future__ import division`.
        gamesPlayed = float(numGames)
        avgPointsScored = totalPointsScored / gamesPlayed
        avgPointsAllowed = totalPointsAllowed / gamesPlayed
        avg3sMade = total3sMade / gamesPlayed
        avg3sAllowed = total3sAllowed / gamesPlayed

    return [numWins, avgPointsScored, avgPointsAllowed, checkPower5Conference(team_id),
            avg3sMade, avg3sAllowed, tournamentSeed]

In [250]:
def compareTwoTeams(id_1, id_2, year):
    """Return the element-wise difference between two teams' season features.

    Each team's feature list comes from ``getSeasonData`` for the given year;
    entry i of the result is team 1's value minus team 2's value.
    """
    first_stats = getSeasonData(id_1, year)
    second_stats = getSeasonData(id_2, year)
    deltas = []
    for first_value, second_value in zip(first_stats, second_stats):
        deltas.append(first_value - second_value)
    return deltas

In [252]:
def _lookupTeamId(team_name):
    """Return the Team_Id of the first teams_pd row whose Team_Name equals team_name.

    Raises ValueError when no row has that exact name.
    """
    # Select by column name rather than by position so the lookup does not
    # depend on Team_Id being the first column of teams_pd.
    matchingIds = teams_pd.loc[teams_pd['Team_Name'] == team_name, 'Team_Id']
    if len(matchingIds.index) == 0:
        raise ValueError('No team named %r in teams_pd' % (team_name,))
    return int(matchingIds.iloc[0])

# Ids for a handful of well-known programs, used for spot checks.
kentucky_id = _lookupTeamId('Kentucky')
duke_id = _lookupTeamId('Duke')
ucla_id = _lookupTeamId('UCLA')
kansas_id = _lookupTeamId('Kansas')
# Feature differences (Kentucky minus UCLA) for the 2012 season.
compareTwoTeams(kentucky_id, ucla_id, 2012)

[14,
 8.83272058823529,
 -3.59375,
 0,
 0.7352941176470589,
 -0.35294117647058787,
 -24]

In [267]:
def createTrainingSet(year):
    """Build the season feature vector of every team for one year.

    Parameters
    ----------
    year : int
        Season passed through to ``getSeasonData``.

    Returns
    -------
    collections.defaultdict
        Maps each name in the module-level ``teamList`` to the list returned
        by ``getSeasonData`` for that team's Team_Id and the given year.

    Raises
    ------
    KeyError
        If a name in ``teamList`` has no row in ``teams_pd``.
    """
    # Build the name -> Team_Id lookup once instead of scanning teams_pd with a
    # boolean mask for every team. setdefault keeps the first row for a name,
    # matching what taking the first masked row did.
    teamIdsByName = {}
    for teamName, teamId in zip(teams_pd['Team_Name'], teams_pd['Team_Id']):
        teamIdsByName.setdefault(teamName, teamId)

    seasonDictionary = collections.defaultdict(list)
    for team in teamList:
        seasonDictionary[team] = getSeasonData(teamIdsByName[team], year)
    return seasonDictionary
# Build and display the per-team feature vectors for the 2012 season.
createTrainingSet(2012)

defaultdict(list,
            {'Abilene Chr': [0, 0, 0, 0, 0, 0, 25],
             'Air Force': [11,
              60.666666666666664,
              63.25925925925926,
              0,
              6.2592592592592595,
              6.851851851851852,
              25],
             'Akron': [21, 71.46875, 65.625, 0, 5.90625, 4.78125, 25],
             'Alabama': [21, 64.9375, 58.125, 1, 4.03125, 4.84375, 9],
             'Alabama A&M': [5,
              61.11538461538461,
              69.65384615384616,
              0,
              5.269230769230769,
              4.653846153846154,
              25],
             'Alabama St': [10,
              57.724137931034484,
              63.55172413793103,
              0,
              6.0,
              4.482758620689655,
              25],
             'Albany NY': [19,
              71.93939393939394,
              68.54545454545455,
              0,
              7.0606060606060606,
              7.181818181818182,
              25],
