In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

In [2]:
file_path = '/Users/amaanrai/Desktop/NFL First Touchdown Scorers/playbyplay2024.csv'
data = pd.read_csv(file_path)

In [3]:
# Preprocessing: Selecting features and target variable
# For this example, let's assume we want to predict the play type (e.g., Pass or Rush)
# I'll create a simple binary classification for now (Pass vs. Rush), 
# and focus on columns like Quarter, Down, ToGo, YardLine, OffenseTeam, DefenseTeam

# Feature columns (excluding categorical variables for now to keep it simple)
selected_features = ['Quarter', 'Down', 'ToGo', 'YardLine']

# LabelEncoder for OffenseTeam and DefenseTeam
encoder = LabelEncoder()

# Converting categorical team names into numerical categories
data['OffenseTeam_encoded'] = encoder.fit_transform(data['OffenseTeam'].astype(str))
data['DefenseTeam_encoded'] = encoder.fit_transform(data['DefenseTeam'].astype(str))

# Adding encoded team features to the selected features list
selected_features += ['OffenseTeam_encoded', 'DefenseTeam_encoded']

# For simplicity, I'll create a binary target variable for PlayType
# Here, I assume the column 'RushDirection' indicates a rushing play, otherwise it's a passing play
# I will create a binary target: 1 for Rush, 0 for Pass (where 'RushDirection' is NaN, we assume Pass)
data['PlayType'] = data['RushDirection'].apply(lambda x: 1 if pd.notna(x) else 0)

# Preparing the feature matrix (X) and target vector (y)
X = data[selected_features].fillna(0)  # Filling missing values with 0 for now
y = data['PlayType']

In [4]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model 1: Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Model 2: Logistic Regression
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Evaluate the models
rf_accuracy = accuracy_score(y_test, y_pred_rf)
lr_accuracy = accuracy_score(y_test, y_pred_lr)

rf_report = classification_report(y_test, y_pred_rf)
lr_report = classification_report(y_test, y_pred_lr)

rf_accuracy, lr_accuracy, rf_report, lr_report

(0.7615172213341084,
 0.7402993750908298,
 '              precision    recall  f1-score   support\n\n           0       0.82      0.87      0.84      5124\n           1       0.54      0.45      0.49      1757\n\n    accuracy                           0.76      6881\n   macro avg       0.68      0.66      0.67      6881\nweighted avg       0.75      0.76      0.75      6881\n',
 '              precision    recall  f1-score   support\n\n           0       0.74      0.99      0.85      5124\n           1       0.08      0.00      0.00      1757\n\n    accuracy                           0.74      6881\n   macro avg       0.41      0.50      0.43      6881\nweighted avg       0.58      0.74      0.63      6881\n')

In [5]:
# Define the parameter grid to search over
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Instantiate the RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

# Set up the grid search with cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
                           cv=5, n_jobs=-1, verbose=2, scoring='accuracy')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
best_rf = grid_search.best_estimator_

# Predict with the best model
y_pred_best_rf = best_rf.predict(X_test)

# Evaluate the tuned Random Forest model
best_rf_accuracy = accuracy_score(y_test, y_pred_best_rf)
best_rf_report = classification_report(y_test, y_pred_best_rf)

print(f"Best Parameters: {best_params}")
print(f"Tuned Random Forest Accuracy: {best_rf_accuracy}")
print(f"Classification Report:\n{best_rf_report}")

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=10

[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, ma

[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   0.0s
[CV] END max_depth=No

540 fits failed out of a total of 1620.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
215 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/amaanrai/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/amaanrai/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/Users/amaanrai/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/Users/amaanrai/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 95, in valid

Best Parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Tuned Random Forest Accuracy: 0.7642784479000145
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.94      0.86      5124
           1       0.59      0.26      0.36      1757

    accuracy                           0.76      6881
   macro avg       0.69      0.60      0.61      6881
weighted avg       0.74      0.76      0.73      6881

