In [11]:
import numpy as np
import pandas as pd
import requests
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

pd.set_option('display.max_columns', None)

# Set up the pandas dataframe
play_data = pd.DataFrame()

# Query the API for the CPJ data. We only need to do this once, so it's commented out.
teams = requests.get("https://api.collegefootballdata.com/teams?conference=ACC").json()
for team in teams:
    team_name = team['school']
    for year in range(2007,2018):
        response = requests.get("https://api.collegefootballdata.com/plays?seasonType=both&year={0}&offense={1}".format(year, team_name))
        df = pd.io.json.json_normalize(response.json())
        play_data = pd.concat([play_data,df])

    # Data is exported to a CSV, which we will be pulling from now
    # play_data.to_csv('./data/plays/GeorgiaTech.csv')
    play_data.to_csv('./data/plays/{0}.csv'.format(team_name))


# data = pd.read_csv('./data/plays/GeorgiaTech.csv')[['home','away','offense_score','defense_score','period','clock.minutes','clock.seconds','yards_to_goal','down','distance','play_type']]
# data = pd.read_csv('./data/plays/Alabama.csv')[['home','away','offense_score','defense_score','period','clock.minutes','clock.seconds','yards_to_goal','down','distance','play_type']]
data = play_data

# Is GT playing at home?
data['is_home'] = np.where(data['home'] == 'Georgia Tech', 1, 0)

# Convert this to a single column
data['seconds_remaining'] = (data['clock.minutes'] * 60) + data['clock.seconds'] + ((data['clock.minutes'] + (4 - data['period']) * 15) * 60)

data['score_diff'] = data['offense_score'] - data['defense_score']

# I want both the result, and the playcall
pass_types = ['Pass Reception', 'Pass Interception Return', 'Pass Incompletion', 'Sack', 'Passing Touchdown', 'Interception Return Touchdown']
rush_types = ['Rush', 'Rushing Touchdown']
punt_types = ['Punt', 'Punt Return Touchdown', 'Blocked Punt', 'Blocked Punt Touchdown']
fg_types = ['Field Goal Good', 'Field Goal Missed', 'Blocked Field Goal']

def getPlayCall(x):
    if x in pass_types:
            return 'pass'
    elif x in rush_types:
        return 'rush'
    elif x in punt_types:
        return 'punt'
    elif x in fg_types:
        return 'fg'
    else:
        return None
        
data['play_call'] = data['play_type'].apply(getPlayCall)
data.dropna(subset=['play_call'], inplace=True)

plays = data[['offense_score', 'defense_score', 'period', 'yards_to_goal', 'down', 'distance', 'is_home', 'seconds_remaining', 'play_call','score_diff']].query('period <= 4')

# Refining the model
plays = plays.drop(columns=['is_home', 'period', 'offense_score', 'defense_score'])

# split the data set between our independent variables (i.e. features) and our dependent variable or output
play_calls = plays['play_call']
plays_wo_call = plays.drop(['play_call'], axis=1)

# split the data into training and validation sets
plays_train, plays_validation, calls_train, calls_validation = train_test_split(plays_wo_call, play_calls, train_size=0.8, test_size=0.2, random_state=0)
plays_train.head()
y, y_keys = pd.factorize(calls_train)
# build the classifier
classifier = RandomForestClassifier(random_state=0, n_estimators=100)

# train the classifier with our test set
classifier.fit(plays_train, y)
classifier.predict(plays_validation)
classifier.predict_proba(plays_validation)[0:10]

predicted_calls = y_keys[classifier.predict(plays_validation)]
pd.crosstab(calls_validation, predicted_calls, rownames=['Actual Calls'], colnames=['Predicted Calls'])


# plays['predicted_call'] = y_keys[classifier.predict_proba(plays_wo_call)]

x = classifier.predict_proba(plays_wo_call)
plays['rush_prob'] = x[:,0]
plays['pass_prob'] = x[:,1]
plays['punt_prob'] = x[:,2]
plays['fg_prob'] = x[:,3]
plays.head()

# Lists out the weights for each tree decision
# list(zip(plays_train, classifier.feature_importances_))

Boston College
Clemson
Duke
Florida State
Georgia Tech
Louisville
Miami
NC State
North Carolina
Pittsburgh
Syracuse
Virginia
Virginia Tech
Wake Forest


Unnamed: 0,yards_to_goal,down,distance,seconds_remaining,play_call,score_diff,rush_prob,pass_prob,punt_prob,fg_prob
0,47,1,10,4418,rush,6,0.29,0.71,0.0,0.0
2,25,1,10,383,rush,46,0.01,0.99,0.0,0.0
3,54,1,10,4275,pass,7,0.67,0.33,0.0,0.0
4,54,2,10,4270,rush,7,0.08,0.92,0.0,0.0
5,44,1,10,4194,rush,7,0.26,0.74,0.0,0.0


In [12]:
fig = px.scatter_3d(
    plays,
    x='down',
    y='distance',
    z='seconds_remaining',
    color='play_call',
    title='playcalls'
)

fig.show()

In [13]:

fourth_down_plays = plays[plays.down.eq(4)]
fourth_down_plays.head()

Unnamed: 0,yards_to_goal,down,distance,seconds_remaining,play_call,score_diff,rush_prob,pass_prob,punt_prob,fg_prob
14,9,4,0,3923,fg,7,0.06,0.11,0.0,0.83
36,40,4,8,2815,punt,18,0.02,0.0,0.97,0.01
54,4,4,4,1805,fg,21,0.0,0.0,0.0,1.0
135,2,4,2,3679,fg,21,0.01,0.09,0.0,0.9
141,79,4,14,3420,punt,21,0.0,0.0,1.0,0.0
