In [1]:
import numpy as np
import pandas as pd
import requests

In [2]:
data = pd.DataFrame()

for year in range(2015,2020):
    response = requests.get("https://api.collegefootballdata.com/plays?seasonType=both&year={0}&offense=michigan".format(year))
    df = pd.io.json.json_normalize(response.json())
    data = pd.concat([data, df])
    
data.head()

Unnamed: 0,id,offense,offense_conference,defense,defense_conference,home,away,offense_score,defense_score,drive_id,...,yard_line,yardstogoal,down,distance,yards_gained,play_type,play_text,ppa,clock.minutes,clock.seconds
0,400756883101849902,Michigan,Big Ten,Utah,Pac-12,Utah,Michigan,0,0,4007568831,...,65,65,1,10,0,Kickoff,Kenny Allen kickoff for 65 yds for a touchback,,15,0
1,400756883101914401,Michigan,Big Ten,Utah,Pac-12,Utah,Michigan,0,3,4007568832,...,64,64,1,10,0,Pass Incompletion,Jake Rudock pass incomplete to Jake Butt,-1.041155320344064,8,55
2,400756883101917001,Michigan,Big Ten,Utah,Pac-12,Utah,Michigan,0,3,4007568832,...,64,64,2,10,20,Rush,Ty Isaac run for 5 yds to the Utah 41 for a 1S...,2.7754655706093736,8,29
3,400756883101898901,Michigan,Big Ten,Utah,Pac-12,Utah,Michigan,0,3,4007568832,...,75,75,1,10,1,Rush,De'Veon Smith run for 1 yd to the Mich 26,-0.5874795431016855,10,10
4,400756883101919801,Michigan,Big Ten,Utah,Pac-12,Utah,Michigan,0,3,4007568832,...,49,49,1,15,0,Timeout,"Timeout MICHIGAN, clock 08:01",,8,1


In [3]:
data = data[['home', 'away', 'offense_score', 'defense_score', 'period', 'clock.minutes', 'clock.seconds', 'yardstogoal', 'down', 'distance', 'play_type']]
data.head()

Unnamed: 0,home,away,offense_score,defense_score,period,clock.minutes,clock.seconds,yardstogoal,down,distance,play_type
0,Utah,Michigan,0,0,1,15,0,65,1,10,Kickoff
1,Utah,Michigan,0,3,1,8,55,64,1,10,Pass Incompletion
2,Utah,Michigan,0,3,1,8,29,64,2,10,Rush
3,Utah,Michigan,0,3,1,10,10,75,1,10,Rush
4,Utah,Michigan,0,3,1,8,1,49,1,15,Timeout


In [4]:
data['is_home'] = np.where(lambda x: x.home == 'Michigan', 1, 0)
data.head()

Unnamed: 0,home,away,offense_score,defense_score,period,clock.minutes,clock.seconds,yardstogoal,down,distance,play_type,is_home
0,Utah,Michigan,0,0,1,15,0,65,1,10,Kickoff,1
1,Utah,Michigan,0,3,1,8,55,64,1,10,Pass Incompletion,1
2,Utah,Michigan,0,3,1,8,29,64,2,10,Rush,1
3,Utah,Michigan,0,3,1,10,10,75,1,10,Rush,1
4,Utah,Michigan,0,3,1,8,1,49,1,15,Timeout,1


In [5]:
data['seconds_remaining'] = (data['clock.minutes'] * 60) + data['clock.seconds']
data.head()

Unnamed: 0,home,away,offense_score,defense_score,period,clock.minutes,clock.seconds,yardstogoal,down,distance,play_type,is_home,seconds_remaining
0,Utah,Michigan,0,0,1,15,0,65,1,10,Kickoff,1,900
1,Utah,Michigan,0,3,1,8,55,64,1,10,Pass Incompletion,1,535
2,Utah,Michigan,0,3,1,8,29,64,2,10,Rush,1,509
3,Utah,Michigan,0,3,1,10,10,75,1,10,Rush,1,610
4,Utah,Michigan,0,3,1,8,1,49,1,15,Timeout,1,481


In [6]:
pass_types = ['Pass Reception', 'Pass Interception Return', 'Pass Incompletion', 'Sack', 'Passing Touchdown', 'Interception Return Touchdown']
rush_types = ['Rush', 'Rushing Touchdown']
punt_types = ['Punt', 'Punt Return Touchdown', 'Blocked Punt', 'Blocked Punt Touchdown']
fg_types = ['Field Goal Good', 'Field Goal Missed', 'Blocked Field Goal']

def getPlayCall(x):
    if x in pass_types:
            return 'pass'
    elif x in rush_types:
        return 'rush'
    elif x in punt_types:
        return 'punt'
    elif x in fg_types:
        return 'fg'
    else:
        return None

In [7]:
data['play_call'] = data['play_type'].apply(getPlayCall)
data.head()

Unnamed: 0,home,away,offense_score,defense_score,period,clock.minutes,clock.seconds,yardstogoal,down,distance,play_type,is_home,seconds_remaining,play_call
0,Utah,Michigan,0,0,1,15,0,65,1,10,Kickoff,1,900,
1,Utah,Michigan,0,3,1,8,55,64,1,10,Pass Incompletion,1,535,pass
2,Utah,Michigan,0,3,1,8,29,64,2,10,Rush,1,509,rush
3,Utah,Michigan,0,3,1,10,10,75,1,10,Rush,1,610,rush
4,Utah,Michigan,0,3,1,8,1,49,1,15,Timeout,1,481,


In [8]:
data.dropna(subset=['play_call'], inplace=True)
data.head()

Unnamed: 0,home,away,offense_score,defense_score,period,clock.minutes,clock.seconds,yardstogoal,down,distance,play_type,is_home,seconds_remaining,play_call
1,Utah,Michigan,0,3,1,8,55,64,1,10,Pass Incompletion,1,535,pass
2,Utah,Michigan,0,3,1,8,29,64,2,10,Rush,1,509,rush
3,Utah,Michigan,0,3,1,10,10,75,1,10,Rush,1,610,rush
6,Utah,Michigan,0,3,1,9,50,74,2,9,Pass Reception,1,590,pass
7,Utah,Michigan,0,3,1,9,20,68,3,3,Pass Reception,1,560,pass


In [9]:
plays = data[['offense_score', 'defense_score', 'period', 'yardstogoal', 'down', 'distance', 'is_home', 'seconds_remaining', 'play_call']]
plays.head()

Unnamed: 0,offense_score,defense_score,period,yardstogoal,down,distance,is_home,seconds_remaining,play_call
1,0,3,1,64,1,10,1,535,pass
2,0,3,1,64,2,10,1,509,rush
3,0,3,1,75,1,10,1,610,rush
6,0,3,1,74,2,9,1,590,pass
7,0,3,1,68,3,3,1,560,pass


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [11]:
# split the data set between our independent variables (i.e. features) and our dependent variable or output
play_calls = plays['play_call']
plays = plays.drop(['play_call'], axis=1)

# split the data into training and validation sets
plays_train, plays_validation, calls_train, calls_validation = train_test_split(plays, play_calls, train_size=0.8, test_size=0.2, random_state=0)
plays_train.head()

Unnamed: 0,offense_score,defense_score,period,yardstogoal,down,distance,is_home,seconds_remaining
108,14,0,2,68,1,10,1,735
1118,39,62,4,4,1,4,1,196
923,13,17,4,54,2,9,1,605
214,13,0,2,58,1,13,1,405
232,0,14,1,30,2,1,1,77


In [12]:
y, y_keys = pd.factorize(calls_train)

In [13]:
classifier = RandomForestClassifier(random_state=0, n_estimators=100)
classifier.fit(plays_train, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [15]:
classifier.predict(plays_validation)

array([0, 1, 0, 3, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 1, 1, 1, 3, 0, 1, 3, 1, 3, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 2, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 3, 0, 3, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 3, 0, 0,
       3, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 3, 1, 1, 0, 1, 3, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 2,
       1, 2, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 3, 3, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 3,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 3, 3, 3, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       3, 1, 0, 0, 0, 3, 0, 1, 0, 1, 0, 0, 1, 0, 3, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 3, 0, 0, 0, 1,

In [16]:
classifier.predict_proba(plays_validation)[0:10]

array([[0.51, 0.49, 0.  , 0.  ],
       [0.4 , 0.6 , 0.  , 0.  ],
       [0.64, 0.36, 0.  , 0.  ],
       [0.26, 0.1 , 0.  , 0.64],
       [0.68, 0.32, 0.  , 0.  ],
       [0.68, 0.32, 0.  , 0.  ],
       [0.76, 0.24, 0.  , 0.  ],
       [0.17, 0.83, 0.  , 0.  ],
       [0.78, 0.22, 0.  , 0.  ],
       [0.35, 0.64, 0.01, 0.  ]])

In [18]:
predicted_calls = y_keys[classifier.predict(plays_validation)]
predicted_calls

Index(['rush', 'pass', 'rush', 'punt', 'rush', 'rush', 'rush', 'pass', 'rush',
       'pass',
       ...
       'pass', 'rush', 'pass', 'rush', 'pass', 'rush', 'pass', 'pass', 'rush',
       'pass'],
      dtype='object', length=974)

In [20]:
pd.crosstab(calls_validation, predicted_calls, rownames=['Actual Calls'], colnames=['Predicted Calls'])

Predicted Calls,fg,pass,punt,rush
Actual Calls,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
fg,10,0,1,3
pass,2,214,1,186
punt,0,1,58,1
rush,2,119,4,372


In [21]:
list(zip(plays_train, classifier.feature_importances_))

[('offense_score', 0.10908962441087959),
 ('defense_score', 0.09218030963277997),
 ('period', 0.04304732895605593),
 ('yardstogoal', 0.21569259734963436),
 ('down', 0.20787816951763582),
 ('distance', 0.11463575320038624),
 ('is_home', 0.0),
 ('seconds_remaining', 0.21747621693262817)]

In [22]:
plays['seconds_remaining'] = ((4 - plays['period']) * 15 * 60 ) + plays['seconds_remaining']
plays = plays.drop(columns=['is_home', 'period'])

In [23]:
plays_train, plays_validation, calls_train, calls_validation = train_test_split(plays, play_calls, train_size=0.8, test_size=0.2, random_state=0)
y, y_keys = pd.factorize(calls_train)

classifier = RandomForestClassifier(n_estimators=100, random_state=0)
classifier.fit(plays_train, y)

predicted_calls = y_keys[classifier.predict(plays_validation)]

pd.crosstab(calls_validation, predicted_calls, rownames=['Actual Calls'], colnames=['Predicted Calls'])

Predicted Calls,fg,pass,punt,rush
Actual Calls,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
fg,10,0,1,3
pass,2,216,2,183
punt,0,1,59,0
rush,2,132,5,358


In [24]:
list(zip(plays_train, classifier.feature_importances_))

[('offense_score', 0.10848285077510264),
 ('defense_score', 0.09247227665770269),
 ('yardstogoal', 0.226713389154735),
 ('down', 0.2118119152945256),
 ('distance', 0.10944169730137424),
 ('seconds_remaining', 0.25107787081655986)]

In [25]:
plays['margin'] = plays['offense_score'] - plays['defense_score']
plays = plays.drop(columns=['offense_score', 'defense_score'])

In [26]:
plays_train, plays_validation, calls_train, calls_validation = train_test_split(plays, play_calls, train_size=0.8, test_size=0.2, random_state=0)
y, y_keys = pd.factorize(calls_train)

classifier = RandomForestClassifier(n_estimators=100, random_state=0)
classifier.fit(plays_train, y)

predicted_calls = y_keys[classifier.predict(plays_validation)]

pd.crosstab(calls_validation, predicted_calls, rownames=['Actual Calls'], colnames=['Predicted Calls'])

Predicted Calls,fg,pass,punt,rush
Actual Calls,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
fg,10,0,1,3
pass,2,224,1,176
punt,0,0,60,0
rush,2,131,5,359


In [27]:
list(zip(plays_train, classifier.feature_importances_))

[('yardstogoal', 0.2462038688184224),
 ('down', 0.20805538202774435),
 ('distance', 0.11410755258546777),
 ('seconds_remaining', 0.27959216801625336),
 ('margin', 0.15204102855211205)]

In [28]:
def predict_call(yards, down, distance, seconds, margin):
    test_plays = pd.DataFrame({'yardstogoal': [yards], 'down': [down], 'distance': [distance], 'seconds_remaining': [seconds], 'margin': [margin]})
    return y_keys[classifier.predict(test_plays)][0]

In [29]:
call = predict_call(50, 4, 1, 180, -4)
call

'rush'

In [30]:
call = predict_call(50, 4, 1, 180, 10)
call

'punt'