In [None]:
import pandas as pd
import numpy as np

# Fix NumPy's global random seed up front so any later random step is reproducible
np.random.seed(0)

# Load the complete play-by-play table from the CSV file into a DataFrame
nfl_data = pd.read_csv("/content/NFL Play by Play 2009-2016 (v3).csv")

# Preview the first five rows
nfl_data.head(5)

Unnamed: 0,Date,GameID,Drive,qtr,down,time,TimeUnder,TimeSecs,PlayTimeDiff,SideofField,...,yacEPA,Home_WP_pre,Away_WP_pre,Home_WP_post,Away_WP_post,Win_Prob,WPA,airWPA,yacWPA,Season
0,2009-09-10,2009091000,1,1,,15:00,15,3600.0,0.0,TEN,...,,0.485675,0.514325,0.546433,0.453567,0.485675,0.060758,,,2009.0
1,2009-09-10,2009091000,1,1,1.0,14:53,15,3593.0,7.0,PIT,...,1.146076,0.546433,0.453567,0.551088,0.448912,0.546433,0.004655,-0.032244,0.036899,2009.0
2,2009-09-10,2009091000,1,1,2.0,14:16,15,3556.0,37.0,PIT,...,,0.551088,0.448912,0.510793,0.489207,0.551088,-0.040295,,,2009.0
3,2009-09-10,2009091000,1,1,3.0,13:35,14,3515.0,41.0,PIT,...,-5.031425,0.510793,0.489207,0.461217,0.538783,0.510793,-0.049576,0.106663,-0.156239,2009.0
4,2009-09-10,2009091000,1,1,4.0,13:27,14,3507.0,8.0,PIT,...,,0.461217,0.538783,0.558929,0.441071,0.461217,0.097712,,,2009.0


**What is a seed value?**

A seed value in the context of random number generation is an initial input that determines the sequence of random numbers produced by a pseudorandom number generator (PRNG).

In NumPy, the function `np.random.seed()` sets the seed of the global random number generator, so the same sequence of random numbers (and therefore the same results) is produced on every run.

**How many missing values do we have in each column?**

In [None]:
# Count the missing entries in every column (one count per column)
missing_values_count = nfl_data.isna().sum()

# Show the counts for the first ten columns only (positional slice)
missing_values_count.iloc[:10]

Unnamed: 0,0
Date,0
GameID,0
Drive,0
qtr,0
down,969
time,6
TimeUnder,0
TimeSecs,6
PlayTimeDiff,12
SideofField,11


In [None]:
# Total number of cells in the table (rows x columns) and how many of them are missing
total_cells = nfl_data.size
total_missing = missing_values_count.sum()

# Share of all cells that are missing, expressed as a percentage
percent_missing = (total_missing / total_cells) * 100
print(percent_missing)

27.754285282790907


In [None]:
# Drop every row that has a missing value in any column.
# dropna() returns a new frame and leaves nfl_data itself untouched; in the
# recorded output the result has no rows, i.e. every row is missing something.
nfl_data.dropna(axis=0, how="any")

Unnamed: 0,Date,GameID,Drive,qtr,down,time,TimeUnder,TimeSecs,PlayTimeDiff,SideofField,...,yacEPA,Home_WP_pre,Away_WP_pre,Home_WP_post,Away_WP_post,Win_Prob,WPA,airWPA,yacWPA,Season


In [None]:
# Keep only the columns that are completely filled in (no missing value anywhere)
columns_with_na_dropped = nfl_data.dropna(axis="columns")
columns_with_na_dropped.head(5)

Unnamed: 0,Date,GameID,Drive,qtr,TimeUnder,ydstogo,ydsnet,desc,PlayAttempted,Yards.Gained,...,AirYards,YardsAfterCatch,QBHit,InterceptionThrown,RushAttempt,Reception,Fumble,Sack,Challenge.Replay,Accepted.Penalty
0,2009-09-10,2009091000,1,1,15,0,0,R.Bironas kicks 67 yards from TEN 30 to PIT 3....,1,39,...,0,0,0,0,0,0,0,0,0,0
1,2009-09-10,2009091000,1,1,15,10,5,(14:53) B.Roethlisberger pass short left to H....,1,5,...,-3,8,0,0,0,1,0,0,0,0
2,2009-09-10,2009091000,1,1,15,5,2,(14:16) W.Parker right end to PIT 44 for -3 ya...,1,-3,...,0,0,0,0,1,0,0,0,0,0
3,2009-09-10,2009091000,1,1,14,8,2,(13:35) (Shotgun) B.Roethlisberger pass incomp...,1,0,...,34,0,0,0,0,0,0,0,0,0
4,2009-09-10,2009091000,1,1,14,8,2,(13:27) (Punt formation) D.Sepulveda punts 54 ...,1,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# How much data did we lose by dropping every column that had a missing value?
print("Columns in original dataset: %d \n" % len(nfl_data.columns))
print("Columns with na's dropped: %d" % len(columns_with_na_dropped.columns))

Columns in original dataset: 102 

Columns with na's dropped: 26


**Fill in missing values automatically**

In [None]:
# Take a small slice of the NFL dataset to experiment on:
# the first five rows of the columns from 'EPA' through 'Season' (inclusive)
subset_nfl_data = nfl_data.loc[:, "EPA":"Season"].head(5)
subset_nfl_data

Unnamed: 0,EPA,airEPA,yacEPA,Home_WP_pre,Away_WP_pre,Home_WP_post,Away_WP_post,Win_Prob,WPA,airWPA,yacWPA,Season
0,2.014474,,,0.485675,0.514325,0.546433,0.453567,0.485675,0.060758,,,2009.0
1,0.077907,-1.068169,1.146076,0.546433,0.453567,0.551088,0.448912,0.546433,0.004655,-0.032244,0.036899,2009.0
2,-1.40276,,,0.551088,0.448912,0.510793,0.489207,0.551088,-0.040295,,,2009.0
3,-1.712583,3.318841,-5.031425,0.510793,0.489207,0.461217,0.538783,0.510793,-0.049576,0.106663,-0.156239,2009.0
4,2.097796,,,0.461217,0.538783,0.558929,0.441071,0.461217,0.097712,,,2009.0


In [None]:
# Replace every NA with 0 (returns a new frame; subset_nfl_data is not modified)
subset_nfl_data.fillna(value=0)

Unnamed: 0,EPA,airEPA,yacEPA,Home_WP_pre,Away_WP_pre,Home_WP_post,Away_WP_post,Win_Prob,WPA,airWPA,yacWPA,Season
0,2.014474,0.0,0.0,0.485675,0.514325,0.546433,0.453567,0.485675,0.060758,0.0,0.0,2009.0
1,0.077907,-1.068169,1.146076,0.546433,0.453567,0.551088,0.448912,0.546433,0.004655,-0.032244,0.036899,2009.0
2,-1.40276,0.0,0.0,0.551088,0.448912,0.510793,0.489207,0.551088,-0.040295,0.0,0.0,2009.0
3,-1.712583,3.318841,-5.031425,0.510793,0.489207,0.461217,0.538783,0.510793,-0.049576,0.106663,-0.156239,2009.0
4,2.097796,0.0,0.0,0.461217,0.538783,0.558929,0.441071,0.461217,0.097712,0.0,0.0,2009.0


In [None]:
# Back-fill: replace each NA with the next valid value further down the same
# column, then set whatever is still NA (gaps with no later value) to 0.
# DataFrame.bfill() is used instead of fillna(method='bfill') because the
# `method` argument of fillna is deprecated and raises a FutureWarning on
# recent pandas versions; the resulting frame is the same.
subset_nfl_data.bfill(axis=0).fillna(0)

  subset_nfl_data.fillna(method='bfill', axis=0).fillna(0)


Unnamed: 0,EPA,airEPA,yacEPA,Home_WP_pre,Away_WP_pre,Home_WP_post,Away_WP_post,Win_Prob,WPA,airWPA,yacWPA,Season
0,2.014474,-1.068169,1.146076,0.485675,0.514325,0.546433,0.453567,0.485675,0.060758,-0.032244,0.036899,2009.0
1,0.077907,-1.068169,1.146076,0.546433,0.453567,0.551088,0.448912,0.546433,0.004655,-0.032244,0.036899,2009.0
2,-1.40276,3.318841,-5.031425,0.551088,0.448912,0.510793,0.489207,0.551088,-0.040295,0.106663,-0.156239,2009.0
3,-1.712583,3.318841,-5.031425,0.510793,0.489207,0.461217,0.538783,0.510793,-0.049576,0.106663,-0.156239,2009.0
4,2.097796,0.0,0.0,0.461217,0.538783,0.558929,0.441071,0.461217,0.097712,0.0,0.0,2009.0
