In [1]:
import duckdb
import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
# Create the database connection
con = duckdb.connect(".config/nfl.duckdb")
#con.close()

In [3]:
# Creating dataframes with DuckDB, plays and player_play both have 50 columns, more ideal for a broad random forest
X = con.sql("""
    SELECT quarter, down, yardsToGo, yardlineNumber, preSnapHomeScore, preSnapVisitorScore,
    playNullifiedByPenalty, absoluteYardlineNumber, preSnapHomeTeamWinProbability, preSnapVisitorTeamWinProbability, expectedPoints,
    passResult_complete, passResult_incomplete, passResult_sack, passResult_interception, passResult_scramble, passLength, targetX, targetY,
    playAction, passTippedAtLine, unblockedPressure, qbSpike, qbKneel, qbSneak, penaltyYards, prePenaltyYardsGained, 
    homeTeamWinProbabilityAdded, visitorTeamWinProbilityAdded, expectedPointsAdded, isDropback, timeToThrow, timeInTackleBox, timeToSack,
    dropbackDistance, pff_runPassOption, playClockAtSnap, pff_manZone, pff_runConceptPrimary_num, pff_passCoverage_num, pff_runConceptSecondary_num
FROM silver.plays_rf
""").df()
y = np.array(con.sql("""
    SELECT yardsGained
    FROM silver.plays_rf
""").df()).ravel()

In [4]:
# Having issues with NA values, the below code does a simple count using pandas, will then go back and change the query
# As of writing this, the issue is solved; however, the dbt model for this is far from efficient
na_counts = (X == 'NA').sum()

# Optionally, filter only columns with 'NA' values for easier review
na_counts_filtered = na_counts[na_counts > 0]
print(na_counts_filtered, "\n", X.shape, "\n", y.shape) # playClockAtSnap has only 1 NA value, will just drop that row


Series([], dtype: int64) 
 (16124, 41) 
 (16124,)


In [5]:
# Instantiate the model and split the data
rf = RandomForestRegressor(warm_start=True)

selector = RFE(rf, n_features_to_select=10, step=1)
X_selected = selector.fit_transform(X, y)


In [6]:
# Begin Interpretation, first with feature importance
selected_features = X.columns[selector.support_]
print(selected_features)


Index(['yardlineNumber', 'absoluteYardlineNumber',
       'preSnapHomeTeamWinProbability', 'expectedPoints',
       'passResult_scramble', 'penaltyYards', 'prePenaltyYardsGained',
       'homeTeamWinProbabilityAdded', 'visitorTeamWinProbilityAdded',
       'expectedPointsAdded'],
      dtype='object')


In [7]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Train the model
rf.fit(X_train, y_train)

# Make predictions
y_pred = rf.predict(X_test)

# Calculate scores
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")


Mean Squared Error: 1.7769936744186046
R^2 Score: 0.9766614590863065


In [11]:
# Continue with the GridSearch
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=4)
grid_search.fit(X_train, y_train)
best_rf = grid_search.best_estimator_

# Wrap a progress bar for longer Grid Searches
"""with tqdm(total=len(param_grid['n_estimators']) * len(param_grid['max_depth']) * len(param_grid['min_samples_split']) * len(param_grid['min_samples_leaf']), desc="GridSearch Progress") as pbar:
    def callback(*args, **kwargs):
        pbar.update(1)

    # Add the callback to the grid search
    grid_search.fit(X, y, callback=callback)"""

print(grid_search.best_params_)


{'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [12]:
# Continue with the Cross Validation Score
cv_scores = cross_val_score(rf, X_selected, y, cv=5, scoring='neg_mean_squared_error')
print(f"Cross-validated MSE: {-cv_scores.mean()}")


Cross-validated MSE: 1.9303851017196607
