In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found,
# one path per line (directories without files produce no output).
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, name) for name in names]
    if full_paths:
        print('\n'.join(full_paths))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Since 2017, the National Football League has collaborated with Amazon Web Services to collect instantaneous player location data, speed, and acceleration across every inch of the field. The NFL's Next Gen Stats have revolutionized football insights and analytics, largely concentrating on offensive metrics. This joint venture now employs machine learning to craft a new defensive measure assessing tackle probability.**

**Information was gathered from tracking data pertaining to the ball and defensive players during each play. Game event details helped identify the defensive team, while tackle data served as the primary outcome variable in a model aimed at predicting a defender's likelihood of executing a tackle.**

**Python was the tool of choice for data preparation and model construction. Various factors—such as player speed, acceleration, distance from the player to the football, player orientation concerning the football, and player direction in relation to the football—were utilized as features for prediction. Standard scaling was applied to render these features uniform in scale. The output was a binary classification indicating whether the defender successfully tackled or failed to do so. The data was randomly divided into training (80%) and testing (20%) sets. The training data was then further split into ten non-overlapping segments (90% training and 10% validation each) to optimize the model parameters and guard against overfitting.**

**Three distinct machine learning models—Gradient Boosting, XGBoost, and a custom deep learning model—were constructed and evaluated to determine the most effective one. The custom deep learning model demonstrated superior performance and was thus chosen.**

**Python script utilizing various libraries and modules for machine learning tasks**

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

**Data loading process for the NFL Big Data Bowl 2024 competition. It utilizes Pandas, a powerful data manipulation library in Python, to read and manipulate data stored in CSV files**

In [None]:


# Load the competition CSV files into DataFrames.
# The dataset folder is named once so the path is not repeated in every call.
DATA_DIR = '/kaggle/input/nfl-big-data-bowl-2024'
# Tracking files are numbered tracking_week_1.csv ... tracking_week_9.csv.
TRACKING_WEEKS = range(1, 10)

players = pd.read_csv(f'{DATA_DIR}/players.csv')
tackles = pd.read_csv(f'{DATA_DIR}/tackles.csv')

# Stack all weekly tracking files into one frame.
# ignore_index=True gives the result a fresh, unique index; without it every
# weekly file contributes its own 0..n-1 labels and the row labels repeat.
tracking_data = pd.concat(
    [pd.read_csv(f'{DATA_DIR}/tracking_week_{week}.csv') for week in TRACKING_WEEKS],
    ignore_index=True,
)

# Game-level and play-level tables.
games = pd.read_csv(f'{DATA_DIR}/games.csv')
plays = pd.read_csv(f'{DATA_DIR}/plays.csv')



**The provided code uses the info() method in Pandas to get concise information about the dataframes players, tackles, tracking_data, games, and plays. This method provides a summary of the dataframe, including the number of entries (rows), the column names, the data type of each column, and the count of non-null values for each column.**

In [None]:

# Print the column / dtype / non-null summary of each loaded table, in load order.
for frame in (players, tackles, tracking_data, games, plays):
    frame.info()

The provided code snippet includes a function named standardize_date() and subsequent operations applied to the 'players' DataFrame. Here's a breakdown of the code:

standardize_date() Function:
Purpose: This function aims to standardize the date format in the 'birthDate' column of the 'players' DataFrame.
Method:
It attempts to parse the date string using two different date formats ('%y-%d-%m' and '%Y-%d-%m') with pd.to_datetime() and handles potential ValueError exceptions.
If the date parsing fails for both formats, it returns the original date string.
Checks if the parsed date is missing (NaN) and converts it to NaT (Not a Time) if needed.
Finally, it standardizes the parsed date to the format '%Y-%j-%m', representing year, day of the year, and month.

In [None]:
# Rewrites raw 'birthDate' values of the players DataFrame into one common textual layout.
# The code was adapted from a script by Ayush Khaire (https://www.kaggle.com/code/ayushkhaire/players-data-analysis-of-nfl)
def standardize_date(date_str):
    """Re-express a birth-date value as a '%Y-%j-%m' string.

    The value is parsed first as '%y-%d-%m' and, if that raises ValueError,
    as '%Y-%d-%m'.

    Returns:
        * the input unchanged when neither layout matches,
        * ``pd.NaT`` when parsing yields a missing value,
        * otherwise the parsed date formatted as '%Y-%j-%m'
          (year, day of year, month).

    NOTE(review): both layouts read the middle field as the day and the last
    field as the month; confirm this matches the actual file, since
    ISO 'YYYY-MM-DD' strings would have day and month swapped.
    """
    for candidate_format in ('%y-%d-%m', '%Y-%d-%m'):
        try:
            parsed = pd.to_datetime(date_str, format=candidate_format, errors='raise')
        except ValueError:
            continue
        break
    else:
        # Neither layout matched: hand the raw value back untouched.
        return date_str

    return pd.NaT if pd.isna(parsed) else parsed.strftime('%Y-%j-%m')
# Step 1: rewrite every raw value with standardize_date.
# Step 2: parse the '%Y-%j-%m' strings into datetimes; anything that does not
# fit that layout is coerced to NaT.
standardized_birth_dates = players['birthDate'].map(standardize_date)
players['birthDate'] = pd.to_datetime(
    standardized_birth_dates, format='%Y-%j-%m', errors='coerce'
)

# Median birth date per playing position (displayed as the cell output).
median_dates = players['birthDate'].groupby(players['position']).median()
median_dates

Creating a new DataFrame 'df' from the 'players' DataFrame with every row that has a missing value in any column removed (dropna() is not restricted to 'birthDate').
Recomputing the median birth date for each position from these complete rows only.

In [None]:
# Keep only player rows without any missing value, then recompute the
# per-position median birth date from them (displayed as the cell output).
df = players.dropna(how='any')
median_dates = df['birthDate'].groupby(df['position']).median()
median_dates

The function fill_birth_dates() below is applied row by row to the 'players' DataFrame: a missing 'birthDate' is replaced with the median birth date of the player's position, and existing dates are kept unchanged.

In [None]:
def fill_birth_dates(row, medians=None):
    """Return the birth date to store for one player row.

    Args:
        row: mapping-like row exposing 'birthDate' and 'position'.
        medians: optional Series of median birth dates indexed by position.
            When omitted, the notebook-level ``median_dates`` is used, which
            is the original behaviour.

    Returns:
        The row's own birth date when it is present; the median for the row's
        position when the date is missing and a median exists; otherwise the
        original missing value. The last case used to fall off the end of the
        function and return ``None`` implicitly.
    """
    if medians is None:
        medians = median_dates

    birth_date = row['birthDate']
    if pd.isna(birth_date) and row['position'] in medians.index:
        return medians[row['position']]
    # Either the date is known, or no median is available for this position.
    return birth_date

# Run fill_birth_dates over every player row and store the result as the new
# 'birthDate' column, then show which columns still contain missing values.
players['birthDate'] = players.apply(fill_birth_dates, axis='columns')
players.isna().any()

In [None]:
# Index labels of the players whose birth date is still missing; display those rows.
nan_index = players.index[players['birthDate'].isna()]
players.loc[nan_index]

In [None]:
# Import the datetime module
import datetime

# Manual correction: set the birth date of the player named 'Isaiah Simmons'.
birth_date = datetime.datetime(year=1998, month=7, day=26)

# Row labels of every player with that display name.
index = players.index[players['displayName'].eq('Isaiah Simmons')]

# Write the corrected date into those rows.
players.loc[index, 'birthDate'] = birth_date

In [None]:
# Age is approximated as the difference between a fixed reference year and the birth year.
REFERENCE_YEAR = 2024
players['age'] = REFERENCE_YEAR - players['birthDate'].dt.year

This function is designed to convert the height from an imperial format (feet and inches) to meters.

In [None]:
def convert_height_to_meters(height):
    """Convert a 'feet-inches' height string (e.g. '6-4') to meters.

    Args:
        height: string of the form '<feet>-<inches>', or a missing value
            (``None`` / ``NaN``).

    Returns:
        Height in meters as a float. A missing input yields ``NaN`` instead of
        raising, so one empty cell does not abort a whole column conversion.

    Raises:
        ValueError: if the string does not consist of exactly two integer
            parts separated by '-'.
    """
    if pd.isna(height):
        return float('nan')

    # Split the height into feet and inches
    feet, inches = (int(part) for part in height.split('-'))
    total_inches = feet * 12 + inches
    # 1 inch = 2.54 cm, 100 cm = 1 m
    return total_inches * 2.54 / 100

# Element-wise conversion of the 'height' column into meters.
players['height_m'] = players['height'].map(convert_height_to_meters)

In [None]:
# Multiply the 'weight' column by 0.45359237 (kilograms per pound).
# presumably 'weight' is recorded in pounds; verify against the data dictionary
KG_PER_LB = 0.45359237
players['weight_kg'] = players['weight'].mul(KG_PER_LB)

In [None]:
# Body-mass index: weight (kg) divided by height (m) squared.
players['bmi'] = players['weight_kg'].div(players['height_m'].pow(2))

# The raw and intermediate physical columns are no longer needed once BMI and age exist.
players = players.drop(columns=['height', 'weight', 'height_m', 'weight_kg', 'birthDate'])

# Attach player attributes to each tackle record (default inner join on nflId).
combined_df_1 = tackles.merge(players, on='nflId')

# Add the play-level information for the matching game and play.
combined_df_2 = combined_df_1.merge(plays, on=['gameId', 'playId'])

# Finally add the game-level information.
data = combined_df_2.merge(games, on='gameId')


**Selecting relevant features for analysis**

In [None]:
# Columns kept for the analysis; 'tackle' is later used as the target.
features = [
    # tackle outcome columns
    'tackle', 'assist', 'forcedFumble', 'pff_missedTackle',
    # player attributes
    'position', 'age', 'bmi',
    # play context
    'passResult', 'passLength', 'offenseFormation', 'defendersInTheBox',
    'passProbability', 'preSnapHomeTeamWinProbability',
    # game result
    'homeFinalScore', 'visitorFinalScore',
]
eda_df = data.loc[:, features]

In [None]:
# Work on an explicit copy so the assignments below never target a frame that
# pandas still treats as derived from `data` (avoids chained-assignment warnings).
sub_data = data[features].copy()
cat_columns = sub_data.select_dtypes(['object']).columns

# Replace each text column with integer codes (one code per distinct value).
# Assigning whole columns, rather than writing through .loc[:, cols], lets the
# columns take the integer dtype of the codes instead of staying object-typed.
# Note: pd.factorize encodes missing values as -1, so they are no longer NaN
# after this step.
sub_data[cat_columns] = sub_data[cat_columns].apply(lambda col: pd.factorize(col)[0])

In [None]:
# Replace every remaining missing value with 0.
# The result is assigned back: an in-place fillna on the temporary object
# returned by sub_data.loc[:, :] is not guaranteed to modify sub_data itself
# (and never does under pandas Copy-on-Write).
sub_data = sub_data.fillna(0)

In [None]:
# Target: the 'tackle' column. Features: every other selected column.
y = sub_data['tackle']
X = sub_data.drop(columns=['tackle'])



In [None]:
import seaborn as sn
# Pairwise correlation of the feature columns (pandas default: Pearson),
# drawn as an annotated heatmap.
corr_matrix = X.corr(method='pearson')
sn.heatmap(data=corr_matrix, annot=True)
plt.show()

In [None]:
# Scatter-plot matrix of every pair of feature columns.
sns.pairplot(data=X)

**Normalize the features**

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
import numpy as np
# Learn each column's minimum and maximum from X, then rescale X with them.
# NOTE(review): the scaler is fitted on every row of X, including rows that
# may later be held out for testing - confirm this is intended.
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(X)
X_train_minmax = min_max_scaler.transform(X)

In [None]:
# Split the unscaled features into training (80%) and test (20%) sets.
# A fixed random_state makes the split, and every metric computed from it,
# reproducible across runs.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the min-max scaler on the training rows only and reuse it for the test
# rows, so no statistic of the held-out data leaks into preprocessing.
# Scaled test values may therefore fall slightly outside [0, 1].
min_max_scaler = preprocessing.MinMaxScaler().fit(X_train_raw)
X_train = min_max_scaler.transform(X_train_raw)
X_test = min_max_scaler.transform(X_test_raw)

# **Gradient Boosting**

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Gradient boosting with 100 depth-1 trees and learning rate 1.0.
clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
)
clf.fit(X_train, y_train)

# Mean accuracy on the held-out test set (displayed as the cell output).
clf.score(X_test, y_test)

In [None]:
# Class predictions of the gradient-boosting model for the test set.
prediction_clf = clf.predict(X=X_test)

In [None]:

# Per-class precision / recall / F1 of the gradient-boosting predictions.
report = classification_report(y_true=y_test, y_pred=prediction_clf)
print(report)

# **xgboost**


In [None]:

import xgboost as xgb



In [None]:
# The target is a binary label, so train a classifier with a logistic
# objective. The previous XGBRegressor with the deprecated "reg:linear"
# objective fitted a squared-error regression to 0/1 labels.
# predict() now returns class labels (0 or 1), which remain compatible with a
# ">= 0.5" threshold applied by callers.
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)

xgb_model.fit(X_train, y_train)

In [None]:
# Predictions of the xgboost model for the test set.
prediction_XGB = xgb_model.predict(X=X_test)

In [None]:
import numpy as np

# Cut-off used to turn the model output into a 0/1 label
# (works for scores, probabilities, or labels that are already 0/1).
threshold = 0.5  # Adjust this threshold based on your problem

# Everything at or above the threshold becomes class 1, the rest class 0.
binary_predictions = (prediction_XGB >= threshold).astype(int)

# Per-class precision / recall / F1 of the thresholded predictions.
report = classification_report(y_test, binary_predictions)
print(report)

# **Deep learning model**

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Fully connected network: two ReLU hidden layers (128 and 64 units) followed
# by a single sigmoid unit that outputs the probability of the positive class.
# The input width is taken from the training matrix.
model = Sequential([
    Dense(128, input_dim=X_train.shape[1], activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss, Adam with its default settings, accuracy as the tracked metric.
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

# Train for 10 epochs in batches of 256; validation_split=0.2 sets aside 20%
# of the training data for validation.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=256,
    validation_split=0.2,
)

In [None]:
# One row, two panels: loss on the left, accuracy on the right.
# Each panel shows the training curve and the validation curve per epoch.
fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(10, 5))

panels = (
    (loss_ax, 'loss', 'val_loss',
     'Training Loss', 'Validation Loss',
     'Training and Validation Loss', 'Loss'),
    (accuracy_ax, 'accuracy', 'val_accuracy',
     'Training Accuracy', 'Validation Accuracy',
     'Training and Validation Accuracy', 'Accuracy'),
)
for ax, train_key, val_key, train_label, val_label, title, y_label in panels:
    ax.plot(history.history[train_key], label=train_label)
    ax.plot(history.history[val_key], label=val_label)
    ax.set_title(title)
    ax.set_xlabel('Epochs')
    ax.set_ylabel(y_label)
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Requires the trained 'model' and 'X_test' from the earlier cells.

# Predicted probabilities above 0.5 are labelled 1, all others 0.
y_pred = (model.predict(X_test) > 0.5).astype(int)

# Counts of actual vs. predicted classes.
conf_matrix = confusion_matrix(y_test, y_pred)

# Draw the counts as an annotated heatmap.
predicted_labels = ['Predicted Negative', 'Predicted Positive']
actual_labels = ['Actual Negative', 'Actual Positive']

fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=predicted_labels,
    yticklabels=actual_labels,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()


In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Model output for the test set, used as the positive-class score.
y_pred_prob = model.predict(X_test)

# Area under the ROC curve and the curve's points.
auc = roc_auc_score(y_test, y_pred_prob)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# ROC curve with the diagonal of a random classifier for reference.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
ax.plot([0, 1], [0, 1], 'k--', label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
from sklearn.metrics import precision_recall_curve,average_precision_score
# Precision/recall pairs over all thresholds, plus the average-precision summary.
average_precision = average_precision_score(y_test, y_pred_prob)
precision, recall, _ = precision_recall_curve(y_test, y_pred_prob)

# Precision as a function of recall; the legend reports the average precision.
fig, ax = plt.subplots()
ax.plot(
    recall,
    precision,
    color="blue",
    lw=2,
    label='Precision-Recall Curve (AP=%0.2f)'%average_precision,
)
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve')
ax.legend(loc='lower left')
plt.show()


In [None]:
# Flatten both the true labels and the thresholded predictions to plain 1-D
# arrays so the two columns are paired by position, not by pandas index.
y_test_1d = np.asarray(y_test).reshape(-1)
y_pred_1d = np.asarray(y_pred).reshape(-1)

# One row per test sample: actual label next to predicted label.
submission_data = pd.DataFrame({'Actual': y_test_1d, 'Predicted': y_pred_1d})

# Overwrite the file on every run. Appending (mode="a") duplicated the rows
# and repeated the header line each time the cell was re-executed.
submission_data.to_csv('submission.csv', index=False)
