In [1]:
import numpy as np
import pandas as pd
import requests

In [2]:
# Download Michigan's offensive plays for the 2015-2019 seasons (seasonType
# "both") and flatten each season's JSON payload into its own frame.
season_frames = []

for year in range(2015, 2020):
    response = requests.get(
        "https://api.collegefootballdata.com/plays",
        # Let requests build and encode the query string.
        params={"seasonType": "both", "year": year, "offense": "michigan"},
        timeout=30,
    )
    # Fail loudly on an HTTP error instead of normalizing an error payload.
    response.raise_for_status()
    # pd.json_normalize is the public entry point (pandas >= 1.0); the
    # pd.io.json.json_normalize alias used previously is deprecated.
    season_frames.append(pd.json_normalize(response.json()))

# Concatenate once rather than growing the frame on every iteration. Row
# labels restart at 0 for each season, exactly as the in-loop concat produced.
data = pd.concat(season_frames)

data.head()

Unnamed: 0,id,offense,offense_conference,defense,defense_conference,home,away,offense_score,defense_score,drive_id,...,yard_line,yardstogoal,down,distance,yards_gained,play_type,play_text,ppa,clock.minutes,clock.seconds
0,400756883101849902,Michigan,Big Ten,Utah,Pac-12,Utah,Michigan,0,0,4007568831,...,65,65,1,10,0,Kickoff,Kenny Allen kickoff for 65 yds for a touchback,,15,0
1,400756883101917002,Michigan,Big Ten,Utah,Pac-12,Utah,Michigan,0,3,4007568832,...,44,44,1,10,-5,Penalty,"MICHIGAN Penalty, False Start (Mason Cole) to ...",,8,29
2,400756883101898901,Michigan,Big Ten,Utah,Pac-12,Utah,Michigan,0,3,4007568832,...,75,75,1,10,1,Rush,De'Veon Smith run for 1 yd to the Mich 26,-0.5874795431016855,10,10
3,400756883101914401,Michigan,Big Ten,Utah,Pac-12,Utah,Michigan,0,3,4007568832,...,64,64,1,10,0,Pass Incompletion,Jake Rudock pass incomplete to Jake Butt,-1.041155320344064,8,55
4,400756883101907901,Michigan,Big Ten,Utah,Pac-12,Utah,Michigan,0,3,4007568832,...,68,68,3,3,4,Pass Reception,Jake Rudock pass complete to Jake Butt for 4 y...,1.3197166530611415,9,20


In [3]:
# Narrow the frame to the game-context columns used in the following cells.
data = data.loc[:, [
    'home', 'away',
    'offense_score', 'defense_score',
    'period', 'clock.minutes', 'clock.seconds',
    'yardstogoal', 'down', 'distance',
    'play_type',
]]
data.head()

Unnamed: 0,home,away,offense_score,defense_score,period,clock.minutes,clock.seconds,yardstogoal,down,distance,play_type
0,Utah,Michigan,0,0,1,15,0,65,1,10,Kickoff
1,Utah,Michigan,0,3,1,8,29,44,1,10,Penalty
2,Utah,Michigan,0,3,1,10,10,75,1,10,Rush
3,Utah,Michigan,0,3,1,8,55,64,1,10,Pass Incompletion
4,Utah,Michigan,0,3,1,9,20,68,3,3,Pass Reception


In [5]:
# 1 when Michigan is listed as the home team for the play's game, 0 otherwise.
data['is_home'] = np.where(data['home'].eq('Michigan'), 1, 0)
data.head()

Unnamed: 0,home,away,offense_score,defense_score,period,clock.minutes,clock.seconds,yardstogoal,down,distance,play_type,is_home
0,Utah,Michigan,0,0,1,15,0,65,1,10,Kickoff,0
1,Utah,Michigan,0,3,1,8,29,44,1,10,Penalty,0
2,Utah,Michigan,0,3,1,10,10,75,1,10,Rush,0
3,Utah,Michigan,0,3,1,8,55,64,1,10,Pass Incompletion,0
4,Utah,Michigan,0,3,1,9,20,68,3,3,Pass Reception,0


In [6]:
# Express the clock reading (minutes + seconds) as a single number of seconds.
data['seconds_remaining'] = data['clock.minutes'].mul(60).add(data['clock.seconds'])
data.head()

Unnamed: 0,home,away,offense_score,defense_score,period,clock.minutes,clock.seconds,yardstogoal,down,distance,play_type,is_home,seconds_remaining
0,Utah,Michigan,0,0,1,15,0,65,1,10,Kickoff,0,900
1,Utah,Michigan,0,3,1,8,29,44,1,10,Penalty,0,509
2,Utah,Michigan,0,3,1,10,10,75,1,10,Rush,0,610
3,Utah,Michigan,0,3,1,8,55,64,1,10,Pass Incompletion,0,535
4,Utah,Michigan,0,3,1,9,20,68,3,3,Pass Reception,0,560


In [7]:
# Raw play_type labels, grouped by the kind of call they count as.
pass_types = [
    'Pass Reception',
    'Pass Interception Return',
    'Pass Incompletion',
    'Sack',
    'Passing Touchdown',
    'Interception Return Touchdown',
]
rush_types = [
    'Rush',
    'Rushing Touchdown',
]
punt_types = [
    'Punt',
    'Punt Return Touchdown',
    'Blocked Punt',
    'Blocked Punt Touchdown',
]
fg_types = [
    'Field Goal Good',
    'Field Goal Missed',
    'Blocked Field Goal',
]


def getPlayCall(x):
    """Translate a raw play_type label into a play call.

    Returns 'pass', 'rush', 'punt' or 'fg' according to which of the lists
    above contains ``x`` (checked in that order), or None when none does.
    """
    # The lists are looked up at call time, in the same priority order as before.
    labelled_groups = (
        ('pass', pass_types),
        ('rush', rush_types),
        ('punt', punt_types),
        ('fg', fg_types),
    )
    return next((call for call, members in labelled_groups if x in members), None)

In [8]:
# Label every play with its call; play types outside the four groups become None.
data['play_call'] = data['play_type'].map(getPlayCall)
data.head()

Unnamed: 0,home,away,offense_score,defense_score,period,clock.minutes,clock.seconds,yardstogoal,down,distance,play_type,is_home,seconds_remaining,play_call
0,Utah,Michigan,0,0,1,15,0,65,1,10,Kickoff,0,900,
1,Utah,Michigan,0,3,1,8,29,44,1,10,Penalty,0,509,
2,Utah,Michigan,0,3,1,10,10,75,1,10,Rush,0,610,rush
3,Utah,Michigan,0,3,1,8,55,64,1,10,Pass Incompletion,0,535,pass
4,Utah,Michigan,0,3,1,9,20,68,3,3,Pass Reception,0,560,pass


In [9]:
# Discard the plays that received no call label (kickoffs, penalties, ...).
data = data.dropna(subset=['play_call'])
data.head()

Unnamed: 0,home,away,offense_score,defense_score,period,clock.minutes,clock.seconds,yardstogoal,down,distance,play_type,is_home,seconds_remaining,play_call
2,Utah,Michigan,0,3,1,10,10,75,1,10,Rush,0,610,rush
3,Utah,Michigan,0,3,1,8,55,64,1,10,Pass Incompletion,0,535,pass
4,Utah,Michigan,0,3,1,9,20,68,3,3,Pass Reception,0,560,pass
5,Utah,Michigan,0,3,1,9,50,74,2,9,Pass Reception,0,590,pass
7,Utah,Michigan,0,3,1,5,48,21,3,5,Pass Interception Return,0,348,pass


In [10]:
# Model table: the situational columns plus the play_call label.
plays = data.loc[:, [
    'offense_score', 'defense_score',
    'period',
    'yardstogoal', 'down', 'distance',
    'is_home',
    'seconds_remaining',
    'play_call',
]]
plays.head()

Unnamed: 0,offense_score,defense_score,period,yardstogoal,down,distance,is_home,seconds_remaining,play_call
2,0,3,1,75,1,10,0,610,rush
3,0,3,1,64,1,10,0,535,pass
4,0,3,1,68,3,3,0,560,pass
5,0,3,1,74,2,9,0,590,pass
7,0,3,1,21,3,5,0,348,pass


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [12]:
# Separate the output we want to predict (the play call) from the input features.
play_calls = plays['play_call']
plays = plays.drop(columns=['play_call'])

# Hold out 20% of the plays for validation; the fixed random_state keeps the
# split the same on every run.
plays_train, plays_validation, calls_train, calls_validation = train_test_split(
    plays,
    play_calls,
    test_size=0.2,
    train_size=0.8,
    random_state=0,
)
plays_train.head()

Unnamed: 0,offense_score,defense_score,period,yardstogoal,down,distance,is_home,seconds_remaining
108,14,0,2,68,1,10,1,735
1117,32,62,4,59,2,8,0,225
923,13,17,4,6,1,6,0,605
212,13,0,2,50,4,5,0,307
232,0,14,1,16,1,10,0,0


In [13]:
# Encode the training labels as integer codes (y); y_keys holds the label
# that each code stands for.
y, y_keys = calls_train.factorize()

In [14]:
# Train a 100-tree random forest on the training plays, seeded for repeatability.
classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=0,
)
classifier.fit(plays_train, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [15]:
# Predicted integer class codes for the validation plays; the classifier was
# fit on the factorized labels, so each code is a position in y_keys.
classifier.predict(plays_validation)

array([1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 3, 2,
       1, 0, 2, 0, 1, 0, 1, 0, 1, 0, 3, 3, 2, 0, 1, 0, 1, 0, 0, 0, 2, 0,
       0, 2, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 2, 2, 1, 0, 0, 0, 1, 3, 2, 0, 1, 0, 0,
       1, 2, 0, 2, 1, 0, 3, 0, 0, 0, 0, 1, 2, 2, 0, 2, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, 0, 3,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 2, 2, 1, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 2, 0, 1, 0, 0, 1,
       0, 2, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 3, 1, 0, 1, 0, 0, 1, 0, 0, 2, 0, 1, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 3, 0, 3, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 1, 0, 2, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 3, 1,
       0, 0, 0, 1, 0, 0, 1, 2, 0, 0, 1, 0, 2, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,

In [16]:
# Per-class probabilities for the first ten validation plays.
classifier.predict_proba(plays_validation)[:10]

array([[0.29, 0.71, 0.  , 0.  ],
       [0.56, 0.43, 0.01, 0.  ],
       [0.33, 0.67, 0.  , 0.  ],
       [0.57, 0.41, 0.02, 0.  ],
       [0.71, 0.29, 0.  , 0.  ],
       [0.3 , 0.66, 0.04, 0.  ],
       [0.77, 0.22, 0.01, 0.  ],
       [0.61, 0.39, 0.  , 0.  ],
       [0.75, 0.25, 0.  , 0.  ],
       [0.67, 0.33, 0.  , 0.  ]])

In [17]:
# Turn the predicted integer codes back into their play-call labels.
predicted_calls = y_keys.take(classifier.predict(plays_validation))
predicted_calls

Index(['pass', 'rush', 'pass', 'rush', 'rush', 'pass', 'rush', 'rush', 'rush',
       'rush',
       ...
       'pass', 'rush', 'pass', 'rush', 'pass', 'punt', 'punt', 'rush', 'pass',
       'rush'],
      dtype='object', length=974)

In [18]:
# Confusion matrix: actual calls down the rows, predicted calls across the columns.
pd.crosstab(
    index=calls_validation,
    columns=predicted_calls,
    rownames=['Actual Calls'],
    colnames=['Predicted Calls'],
)

Predicted Calls,fg,pass,punt,rush
Actual Calls,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
fg,21,0,1,0
pass,0,209,3,183
punt,1,0,63,0
rush,2,97,1,393


In [19]:
# Pair each training column with the importance the fitted forest gives it.
list(zip(plays_train.columns, classifier.feature_importances_))

[('offense_score', 0.10444453608893221),
 ('defense_score', 0.0893383167153483),
 ('period', 0.044935906448350514),
 ('yardstogoal', 0.20916578321494264),
 ('down', 0.20201689640755077),
 ('distance', 0.1153935010966933),
 ('is_home', 0.021996550844788673),
 ('seconds_remaining', 0.21270850918339357)]

In [20]:
# Turn the clock into seconds left in a four-period game: each period still to
# come after the current one adds 15 * 60 seconds. 'period' is then redundant
# and is dropped together with 'is_home'.
# NOTE(review): a period greater than 4 (overtime, if present in the data)
# subtracts time here and can yield negative values — confirm this is intended.
plays = (
    plays
    .assign(
        seconds_remaining=lambda df: (4 - df['period']) * (15 * 60) + df['seconds_remaining']
    )
    .drop(columns=['is_home', 'period'])
)

In [21]:
# Repeat the split with the same seed, refit on the reduced feature set, and
# compare actual and predicted calls on the validation plays.
plays_train, plays_validation, calls_train, calls_validation = train_test_split(
    plays,
    play_calls,
    test_size=0.2,
    train_size=0.8,
    random_state=0,
)
y, y_keys = calls_train.factorize()

classifier = RandomForestClassifier(
    random_state=0,
    n_estimators=100,
)
classifier.fit(plays_train, y)

# Decode the predicted integer codes back into play-call labels.
predicted_calls = y_keys.take(classifier.predict(plays_validation))

pd.crosstab(
    index=calls_validation,
    columns=predicted_calls,
    rownames=['Actual Calls'],
    colnames=['Predicted Calls'],
)

Predicted Calls,fg,pass,punt,rush
Actual Calls,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
fg,21,0,1,0
pass,0,209,2,184
punt,2,0,62,0
rush,1,112,2,378


In [22]:
# Importance of each remaining feature in the refitted forest.
list(zip(plays_train.columns, classifier.feature_importances_))

[('offense_score', 0.10944135934239327),
 ('defense_score', 0.09360750163663585),
 ('yardstogoal', 0.2331170857977108),
 ('down', 0.20160154022146237),
 ('distance', 0.10974941635441526),
 ('seconds_remaining', 0.2524830966473824)]

In [23]:
# Replace the two raw scores with a single margin from the offense's point of
# view (positive when the offense is ahead).
plays = (
    plays
    .assign(margin=lambda df: df['offense_score'].sub(df['defense_score']))
    .drop(columns=['offense_score', 'defense_score'])
)

In [24]:
# Same seeded split and model as before, now trained on the margin-based
# feature set, followed by the confusion matrix on the validation plays.
plays_train, plays_validation, calls_train, calls_validation = train_test_split(
    plays,
    play_calls,
    test_size=0.2,
    train_size=0.8,
    random_state=0,
)
y, y_keys = calls_train.factorize()

classifier = RandomForestClassifier(
    random_state=0,
    n_estimators=100,
)
classifier.fit(plays_train, y)

# Decode the predicted integer codes back into play-call labels.
predicted_calls = y_keys.take(classifier.predict(plays_validation))

pd.crosstab(
    index=calls_validation,
    columns=predicted_calls,
    rownames=['Actual Calls'],
    colnames=['Predicted Calls'],
)

Predicted Calls,fg,pass,punt,rush
Actual Calls,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
fg,20,0,1,1
pass,0,217,3,175
punt,2,1,61,0
rush,1,119,2,371


In [25]:
# Importance of each feature in the final (margin-based) model.
list(zip(plays_train.columns, classifier.feature_importances_))

[('yardstogoal', 0.24991047750620787),
 ('down', 0.19781288268703204),
 ('distance', 0.11334817029606006),
 ('seconds_remaining', 0.28502300932207597),
 ('margin', 0.15390546018862403)]

In [26]:
def predict_call(yards, down, distance, seconds, margin):
    """Predict the play call for a single game situation.

    Builds a one-row frame with the columns yardstogoal, down, distance,
    seconds_remaining and margin (in that order), passes it to the global
    ``classifier``, and returns the label that ``y_keys`` holds for the
    predicted code.
    """
    feature_names = ['yardstogoal', 'down', 'distance', 'seconds_remaining', 'margin']
    test_plays = pd.DataFrame([[yards, down, distance, seconds, margin]], columns=feature_names)
    predicted_code = classifier.predict(test_plays)[0]
    return y_keys[predicted_code]

In [27]:
# 4th-and-1 with 50 yards to the goal, 180 seconds left, offense trailing by 4.
call = predict_call(yards=50, down=4, distance=1, seconds=180, margin=-4)
call

'pass'

In [28]:
# Same situation, but with the offense leading by 10.
call = predict_call(yards=50, down=4, distance=1, seconds=180, margin=10)
call

'punt'