In [52]:
import numpy as np
import pandas as pd
import requests
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

pd.set_option('display.max_columns', None)

# Set up the pandas dataframe
gt_play_data = pd.DataFrame()

# Query the API for the CPJ data. We only need to do this once, so it's commented out.
# for year in range(2005,2018):
#     response = requests.get("https://api.collegefootballdata.com/plays?seasonType=both&year={0}&offense=Georgia Tech".format(year))
#     df = pd.io.json.json_normalize(response.json())
#     gt_play_data = pd.concat([gt_play_data,df])

# Data is exported to a CSV, which we will be pulling from now
# gt_play_data.to_csv('./data/plays/GeorgiaTech.csv')

data = pd.read_csv('./data/plays/GeorgiaTech.csv')[['home','away','offense_score','defense_score','period','clock.minutes','clock.seconds','yards_to_goal','down','distance','play_type']]

# Is GT playing at home?
data['is_home'] = np.where(data['home'] == 'Georgia Tech', 1, 0)

# Convert this to a single column
data['seconds_remaining'] = (data['clock.minutes'] * 60) + data['clock.seconds'] + ((data['clock.minutes'] + (4 - data['period']) * 15) * 60)

data['score_diff'] = data['offense_score'] - data['defense_score']

# I want both the result, and the playcall
pass_types = ['Pass Reception', 'Pass Interception Return', 'Pass Incompletion', 'Sack', 'Passing Touchdown', 'Interception Return Touchdown']
rush_types = ['Rush', 'Rushing Touchdown']
punt_types = ['Punt', 'Punt Return Touchdown', 'Blocked Punt', 'Blocked Punt Touchdown']
fg_types = ['Field Goal Good', 'Field Goal Missed', 'Blocked Field Goal']

def getPlayCall(x):
    if x in pass_types:
            return 'pass'
    elif x in rush_types:
        return 'rush'
    elif x in punt_types:
        return 'punt'
    elif x in fg_types:
        return 'fg'
    else:
        return None
        
data['play_call'] = data['play_type'].apply(getPlayCall)
data.dropna(subset=['play_call'], inplace=True)

plays = data[['offense_score', 'defense_score', 'period', 'yards_to_goal', 'down', 'distance', 'is_home', 'seconds_remaining', 'play_call','score_diff']].query('period <= 4')

third_down_plays = plays[plays.down.eq(3)]
third_down_plays.head()

# Refining the model
plays = plays.drop(columns=['is_home', 'period', 'offense_score', 'defense_score'])

# split the data set between our independent variables (i.e. features) and our dependent variable or output
play_calls = plays['play_call']
plays = plays.drop(['play_call'], axis=1)

# split the data into training and validation sets
plays_train, plays_validation, calls_train, calls_validation = train_test_split(plays, play_calls, train_size=0.8, test_size=0.2, random_state=0)
plays_train.head()
y, y_keys = pd.factorize(calls_train)
# build the classifier
classifier = RandomForestClassifier(random_state=0, n_estimators=100)

# train the classifier with our test set
classifier.fit(plays_train, y)
classifier.predict(plays_validation)
classifier.predict_proba(plays_validation)[0:10]

predicted_calls = y_keys[classifier.predict(plays_validation)]
predicted_calls
pd.crosstab(calls_validation, predicted_calls, rownames=['Actual Calls'], colnames=['Predicted Calls'])

# list(zip(plays_train, classifier.feature_importances_))

Predicted Calls,fg,pass,punt,rush
Actual Calls,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
fg,38,4,1,2
pass,3,102,5,212
punt,2,0,137,2
rush,4,82,5,1530


In [53]:
fig = px.scatter(
    plays,
    x='seconds_remaining',
    y='score_diff',
    color='play_call',
    trendline='ols'
)
fig.show()

ValueError: Value of 'color' is not the name of a column in 'data_frame'. Expected one of ['yards_to_goal', 'down', 'distance', 'seconds_remaining', 'score_diff'] but received: play_call