In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and echo the full path of each file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, file_name) for file_name in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# For data processing and analysis
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Suppress warnings
# NOTE: this silences every warning category for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

# Set up plotting style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*"
# and later releases dropped the old names, so plt.style.use('seaborn') can
# raise OSError. Use whichever name this Matplotlib installation provides.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(plot_style)

In [None]:
# Read the competition tables used in this notebook, in a single pass.
# Only week 1 of the tracking data is read here (more weeks can be loaded as needed).
games, plays, players, player_play, tracking_week1 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/nfl-big-data-bowl-2025/games.csv',            # game data
        '../input/nfl-big-data-bowl-2025/plays.csv',            # play data
        '../input/nfl-big-data-bowl-2025/players.csv',          # player data
        '../input/nfl-big-data-bowl-2025/player_play.csv',      # player play data
        '../input/nfl-big-data-bowl-2025/tracking_week_1.csv',  # tracking data, week 1
    )
)

In [None]:
# Pair each loaded table with the label used in the printed headings.
loaded_tables = {
    "Games": games,
    "Plays": plays,
    "Players": players,
    "Player Play": player_play,
    "Tracking Week 1": tracking_week1,
}

# First report every shape ...
for label, frame in loaded_tables.items():
    print(f"{label} dataset shape:", frame.shape)

# ... then display the first few rows of each dataset.
for label, frame in loaded_tables.items():
    print(f"\n{label} dataset preview:")
    print(frame.head())

In [None]:
def check_missing_values(df, name):
    """Print a per-column summary of missing values for one dataset.

    Args:
        df: DataFrame to inspect.
        name: Label inserted into the printed heading.

    Prints the count and percentage of missing entries for every column that
    has at least one missing value. Nothing is returned.
    """
    null_counts = df.isnull().sum()
    null_share = 100 * null_counts / len(df)
    summary = pd.concat(
        {'Missing Values': null_counts, '% Missing': null_share},
        axis=1,
    )
    has_missing = summary['Missing Values'] > 0
    print(f"\nMissing values in {name} dataset:")
    print(summary.loc[has_missing])

# Report missing values for each loaded table, in the order they were loaded.
for label, frame in (
    ('Games', games),
    ('Plays', plays),
    ('Players', players),
    ('Player Play', player_play),
    ('Tracking Week 1', tracking_week1),
):
    check_missing_values(frame, label)

In [None]:


# Build per-play pre-snap tracking features and attach them to the play data.
#
# `plays` and `tracking_week1` come from the data-loading cell earlier in the
# notebook; they are not re-read here because parsing the CSVs a second time
# only repeats work already done.

# Filter for pre-snap frames
# The tracking rows are filtered directly instead of first being joined to
# every column of `plays`: only tracking columns are aggregated below, and the
# per-play result is joined back to `plays` by key at the end, so the large
# intermediate join is unnecessary.
tracking_feature_columns = ['x', 'y', 's', 'a', 'o', 'dir']
pre_snap_data = tracking_week1.loc[
    tracking_week1['frameType'] == 'BEFORE_SNAP',
    ['gameId', 'playId'] + tracking_feature_columns,
]

# Group by play and calculate pre-snap features
# NOTE(review): 'o' and 'dir' look like angles; if so, an arithmetic mean/std
# misbehaves around the wrap-around point - confirm before relying on them.
pre_snap_features = pre_snap_data.groupby(['gameId', 'playId']).agg({
    'x': ['mean', 'std'],
    'y': ['mean', 'std'],
    's': ['mean', 'max'],
    'a': ['mean', 'max'],
    'o': ['mean', 'std'],
    'dir': ['mean', 'std']
})

# Flatten the (column, statistic) pairs into names such as "x_mean" while the
# group keys are still in the index, so they need no renaming afterwards.
pre_snap_features.columns = ['_'.join(col) for col in pre_snap_features.columns]
pre_snap_features = pre_snap_features.reset_index()

# Merge pre-snap features with play data
# how='left' keeps every play; plays with no BEFORE_SNAP rows in the loaded
# tracking file get NaN in all feature columns.
final_data = pd.merge(plays, pre_snap_features, on=['gameId', 'playId'], how='left')

# The right-hand side has one row per (gameId, playId), so a left merge must
# not change the number of plays.
assert len(final_data) == len(plays), "merge changed the number of plays"

print("Final data shape:", final_data.shape)
print("\nColumns in final_data:")
print(final_data.columns)
print("\nFirst few rows of final_data:")
print(final_data.head())

# Check for any missing values in the final dataset
missing_values = final_data.isnull().sum()
print("\nMissing values in final_data:")
print(missing_values[missing_values > 0])

In [None]:


# Derive the pass/run label and tidy the categorical columns of final_data.

# Create a new feature to indicate whether a play is a pass or run.
# The label is taken from the missingness of `passResult` itself rather than
# from a placeholder string written by a later fill. The extra != 'Unknown'
# check keeps the result unchanged if this cell is re-run after the fill below.
pass_result = final_data['passResult']
final_data['is_pass_play'] = (pass_result.notna() & (pass_result != 'Unknown')).astype(int)

# Fill missing values in categorical columns with 'Unknown'
categorical_columns = final_data.select_dtypes(include=['object']).columns
final_data[categorical_columns] = final_data[categorical_columns].fillna('Unknown')

# Numerical columns are intentionally left with their NaNs here. Filling them
# with whole-dataset means before the train/test split would let statistics
# from the test rows leak into training; the modelling pipeline applies a
# SimpleImputer(strategy='mean') that is fitted on the training rows only.

# Print summary of the updated dataset
print("Updated dataset shape:", final_data.shape)
print("\nMissing values after preprocessing:")
print(final_data.isnull().sum().sum())
print("\nSample of the updated dataset:")
print(final_data.head())
print("\nDistribution of pass vs. run plays:")
print(final_data['is_pass_play'].value_counts(normalize=True))

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer

# Assuming final_data is your current dataframe

# Train and evaluate a baseline random-forest classifier for is_pass_play.

# Select relevant pre-snap features
categorical_features = ['offenseFormation']
numeric_features = ['quarter', 'down', 'yardsToGo', 'yardlineNumber',
                    'preSnapHomeTeamWinProbability', 'preSnapVisitorTeamWinProbability',
                    'x_mean', 'y_mean', 's_mean', 'a_mean', 'o_mean', 'dir_mean']

# Create additional pre-snap features
# time_remaining = minutes in the quarters still to be played + minutes left on
# the game clock. Quarters 1-4 give 45/30/15/0 as before; any quarter value
# beyond 4 (e.g. an overtime period, if the data has one) contributes 0 instead
# of becoming NaN through a missing dictionary key.
QUARTERS_IN_REGULATION = 4
MINUTES_PER_QUARTER = 15
quarters_left = (QUARTERS_IN_REGULATION - final_data['quarter']).clip(lower=0)

# Parse "MM:SS" with a vectorised regex. Values that are not a clock string
# (missing, or a placeholder such as 'Unknown') become NaN instead of raising
# ValueError; the pipeline's imputer handles the NaN later.
clock_parts = (
    final_data['gameClock'].astype(str).str.extract(r'^(\d+):(\d+)').astype(float)
)
clock_minutes = clock_parts[0] + clock_parts[1] / 60

final_data['time_remaining'] = quarters_left * MINUTES_PER_QUARTER + clock_minutes
final_data['score_differential'] = final_data['preSnapHomeScore'] - final_data['preSnapVisitorScore']

numeric_features += ['time_remaining', 'score_differential']

# Prepare data for modeling
X = final_data[categorical_features + numeric_features]
y = final_data['is_pass_play']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create preprocessing steps
# Imputation and scaling live inside the pipeline so they are fitted on the
# training rows only.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a pipeline
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Fit the pipeline
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance
# The names are listed numeric-first, then one-hot columns, matching the
# transformer order ('num', then 'cat') given to the ColumnTransformer above.
feature_names = (numeric_features +
                 clf.named_steps['preprocessor']
                 .named_transformers_['cat']
                 .named_steps['onehot']
                 .get_feature_names_out(categorical_features).tolist())

feature_importance = pd.DataFrame({
    'feature': feature_names,
    'importance': clf.named_steps['classifier'].feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Assuming you have X and y from your previous code

# Plot the baseline importances, cross-validate, and tune the random forest.

# 1. Visualize feature importances
# DataFrame.plot opens its own figure unless it is given an Axes, so calling
# plt.figure(figsize=...) first only leaves an empty figure behind and the
# requested size is never applied. Create the Axes explicitly and pass it in.
fig, ax = plt.subplots(figsize=(10, 6))
feature_importance.plot(x='feature', y='importance', kind='bar', rot=90, ax=ax)
ax.set_title('Feature Importances')
ax.set_xlabel('Features')
ax.set_ylabel('Importance')
fig.tight_layout()
plt.show()

# 2. Perform cross-validation
# Model selection uses the training split only, so the rows in X_test stay
# unseen until the final evaluation.
cv_scores = cross_val_score(clf, X_train, y_train, cv=5)
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# 3. Hyperparameter tuning
param_dist = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=20, cv=5, random_state=42, n_jobs=-1)
random_search.fit(X_train, y_train)

print("Best parameters:", random_search.best_params_)
print("Best cross-validation score:", random_search.best_score_)

# Train the best model
# RandomizedSearchCV refits the best configuration on the data passed to
# fit() by default, so best_estimator_ is already trained on the training split.
best_model = random_search.best_estimator_

# Get feature importances from the best model
# Names are numeric-first, then one-hot columns, matching the pipeline's
# ColumnTransformer order.
feature_names = (numeric_features +
                 best_model.named_steps['preprocessor']
                 .named_transformers_['cat']
                 .named_steps['onehot']
                 .get_feature_names_out(categorical_features).tolist())

feature_importance = pd.DataFrame({
    'feature': feature_names,
    'importance': best_model.named_steps['classifier'].feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features (Best Model):")
print(feature_importance.head(10))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split

# Assuming best_model is your trained model from the previous step

# Plot the tuned model's importances and evaluate it on the hold-out split.

# 1. Visualize feature importances
# DataFrame.plot opens its own figure unless it is given an Axes, so calling
# plt.figure(figsize=...) first only leaves an empty figure behind. Create the
# Axes explicitly and pass it in so the requested size is applied.
fig, ax = plt.subplots(figsize=(12, 6))
feature_importance.plot(x='feature', y='importance', kind='bar', rot=90, ax=ax)
ax.set_title('Feature Importances')
ax.set_xlabel('Features')
ax.set_ylabel('Importance')
fig.tight_layout()
plt.show()

# 2. Create confusion matrix
# Same test_size and random_state as the split used for training, so this
# reproduces the same hold-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Refit on the training rows only, so the metrics below are measured on rows
# the model has not been trained on.
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# 3. Plot ROC curve
# Column 1 of predict_proba is the probability of the positive class (is_pass_play == 1).
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

# Print classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

# Assuming 'best_model' is your trained model and 'X' is your feature dataset

# Summarise the tuned model: top feature importances, confusion matrix, report.

# 1. Feature Importance Plot
# Stored under new names so the `feature_importance` DataFrame and the
# `feature_names` list built by earlier cells are not replaced by arrays,
# which would break those cells if they were run again.
importance_values = best_model.named_steps['classifier'].feature_importances_
transformed_names = best_model.named_steps['preprocessor'].get_feature_names_out()

# Sort features by importance, most important first, so that "top N" slices
# and the printed ranking read from the strongest feature downwards.
sorted_idx = np.argsort(importance_values)[::-1]
sorted_features = transformed_names[sorted_idx]
sorted_importance = importance_values[sorted_idx]

plt.figure(figsize=(10, 6))
sns.barplot(x=sorted_importance[:15], y=sorted_features[:15], orient='h')  # Show top 15 features
plt.title('Top 15 Feature Importance')
plt.xlabel('Importance')
plt.tight_layout()
plt.show()

# 2. Confusion Matrix
y_pred = best_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# Print classification report
print(classification_report(y_test, y_pred))

# Print top 10 most important features
print("\nTop 10 Most Important Features:")
for feature, importance in zip(sorted_features[:10], sorted_importance[:10]):
    print(f"{feature}: {importance:.4f}")