In [598]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder


# Data Cleaning for Supervised Analysis - Turf Data

## Analyzing the Plays from the PlayList file

The first thing to note is that this list contains all of the plays, including the exact play that will match with the injury list, therefore anything that is on both with the exception of the PlayerKey should be maintained on THIS DF so that we don't lose data on the non-injury columns

In [599]:
plays = pd.read_csv('NFL_Turf/PlayList.csv')  # 267,000 rows
plays.head()

Unnamed: 0,PlayerKey,GameID,PlayKey,RosterPosition,PlayerDay,PlayerGame,StadiumType,FieldType,Temperature,Weather,PlayType,PlayerGamePlay,Position,PositionGroup
0,26624,26624-1,26624-1-1,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Pass,1,QB,QB
1,26624,26624-1,26624-1-2,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Pass,2,QB,QB
2,26624,26624-1,26624-1-3,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Rush,3,QB,QB
3,26624,26624-1,26624-1-4,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Rush,4,QB,QB
4,26624,26624-1,26624-1-5,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Pass,5,QB,QB


PlayKey will be used as the Key to merge the datasets, so PlayerKey and GameID can be removed. While FieldType information is also in the surface column of the injuries table, we need to maintain it here, so we don't lose the data from the columns not containing injuries. 

In [600]:
plays.drop(columns=['PlayerKey', 'GameID'], inplace=True)
plays.head()

Unnamed: 0,PlayKey,RosterPosition,PlayerDay,PlayerGame,StadiumType,FieldType,Temperature,Weather,PlayType,PlayerGamePlay,Position,PositionGroup
0,26624-1-1,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Pass,1,QB,QB
1,26624-1-2,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Pass,2,QB,QB
2,26624-1-3,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Rush,3,QB,QB
3,26624-1-4,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Rush,4,QB,QB
4,26624-1-5,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Pass,5,QB,QB


In [601]:
plays.nunique()

PlayKey           267005
RosterPosition        10
PlayerDay            215
PlayerGame            32
StadiumType           29
FieldType              2
Temperature           79
Weather               63
PlayType              11
PlayerGamePlay       102
Position              23
PositionGroup         10
dtype: int64

In [602]:
objects = plays.dtypes[plays.dtypes == 'object'].index.tolist()
objects

['PlayKey',
 'RosterPosition',
 'StadiumType',
 'FieldType',
 'Weather',
 'PlayType',
 'Position',
 'PositionGroup']

- PlayKeys represent all plays, not only those where injuries occurred - these will function to merge the tables
- FieldType only has 2 values, Natural or Synthetic and can be easily changed to binary values 
- Stadium Type is also strange with 29 unique types of stadiums. These can likely be grouped in smaller categories.
- Weather - there are 63 unique types of weather.... this is odd. 
- RosterPosition, Position, and Position Group are all similar and need to be investigated
- PlayTypes should be encoded, as they are categorical such as pass, rush, kick, ... 


### Change the Field Types to Binary Values

In [603]:
# Creates a function to change the surface values
def surface_code(row):
    surface = row['FieldType']
    coded_surface = 0
    if surface == 'Natural':
        coded_surface = 0
    elif surface == 'Synthetic':
        coded_surface = 1

    return coded_surface

In [604]:
# Create a new column called CodedSurface that encodes 0 for a natural surface and 1 for a synthetic surface, then verify
plays['CodedSurface'] = plays.apply(surface_code, axis=1)
plays.head()

Unnamed: 0,PlayKey,RosterPosition,PlayerDay,PlayerGame,StadiumType,FieldType,Temperature,Weather,PlayType,PlayerGamePlay,Position,PositionGroup,CodedSurface
0,26624-1-1,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Pass,1,QB,QB,1
1,26624-1-2,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Pass,2,QB,QB,1
2,26624-1-3,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Rush,3,QB,QB,1
3,26624-1-4,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Rush,4,QB,QB,1
4,26624-1-5,Quarterback,1,1,Outdoor,Synthetic,63,Clear and warm,Pass,5,QB,QB,1


In [605]:
# The code above worked, now change the FieldType to the coded and remove the redundant column
plays['FieldType'] = plays['CodedSurface']
plays.drop(columns='CodedSurface', inplace=True)

In [606]:
plays.head()

Unnamed: 0,PlayKey,RosterPosition,PlayerDay,PlayerGame,StadiumType,FieldType,Temperature,Weather,PlayType,PlayerGamePlay,Position,PositionGroup
0,26624-1-1,Quarterback,1,1,Outdoor,1,63,Clear and warm,Pass,1,QB,QB
1,26624-1-2,Quarterback,1,1,Outdoor,1,63,Clear and warm,Pass,2,QB,QB
2,26624-1-3,Quarterback,1,1,Outdoor,1,63,Clear and warm,Rush,3,QB,QB
3,26624-1-4,Quarterback,1,1,Outdoor,1,63,Clear and warm,Rush,4,QB,QB
4,26624-1-5,Quarterback,1,1,Outdoor,1,63,Clear and warm,Pass,5,QB,QB


### Reduce the Number of Stadium Types to Something Meaningful

It turns out that there are a lot of misspelled stadium types. There are 7 unique spellings of the word 'Outdoor' alone. Also, the people of Pittsburgh seemed pretty confused as to the meaning of Stadium Type, as there are MANY entries listing the stadium type as Heinz Field. 

In [607]:
stadiums = plays.StadiumType.unique().tolist()
stadiums

['Outdoor',
 'Indoors',
 'Oudoor',
 'Outdoors',
 'Open',
 'Closed Dome',
 'Domed, closed',
 nan,
 'Dome',
 'Indoor',
 'Domed',
 'Retr. Roof-Closed',
 'Outdoor Retr Roof-Open',
 'Retractable Roof',
 'Ourdoor',
 'Indoor, Roof Closed',
 'Retr. Roof - Closed',
 'Bowl',
 'Outddors',
 'Retr. Roof-Open',
 'Dome, closed',
 'Indoor, Open Roof',
 'Domed, Open',
 'Domed, open',
 'Heinz Field',
 'Cloudy',
 'Retr. Roof - Open',
 'Retr. Roof Closed',
 'Outdor',
 'Outside']

In [608]:
# How many Stadium Types are missing? 
plays.StadiumType.isna().sum()

16910

In [609]:
# Since most stadiums are outdoor stadiums, for now, just going to change any NaN stadiums to outdoor for now
plays.StadiumType.fillna('Outdoor', inplace=True)
plays.StadiumType.isna().sum()

0

Grouping all stadiums into Outdoor, Indoor, Open Dome, or Closed Dome using a dictionary 

In [610]:
dict = {'Outdoor': 'Outdoor',
        'Indoors': 'Indoor',
        'Oudoor': 'Outdoor',
        'Outdoors': 'Outdoor',
        'Open': 'Open Dome',
        'Closed Dome': 'Closed Dome',
        'Domed, closed': 'Closed Dome',
        'Dome': 'Closed Dome',
        'Indoor': 'Indoor',
        'Domed': 'Closed Dome',
        'Retr. Roof-Closed': 'Closed Dome',
        'Outdoor Retr Roof-Open': 'Open Dome',
        'Retractable Roof': 'Open Dome',
        'Ourdoor': 'Outdoor',
        'Indoor, Roof Closed': 'Closed Dome',
        'Retr. Roof - Closed': 'Closed Dome',
        'Bowl': 'Outdoor',
        'Outddors': 'Outdoor',
        'Retr. Roof-Open': 'Open Dome',
        'Dome, closed': 'Closed Dome',
        'Indoor, Open Roof': 'Open Dome',
        'Domed, Open': 'Open Dome',
        'Domed, open': 'Open Dome',
        'Heinz Field': 'Outdoor',
        'Cloudy': 'Outdoor',
        'Retr. Roof - Open': 'Open Dome',
        'Retr. Roof Closed': 'Closed Dome',
        'Outdor': 'Outdoor',
        'Outside': 'Outdoor'}


plays.StadiumType.replace(dict, inplace=True)
plays.head()

Unnamed: 0,PlayKey,RosterPosition,PlayerDay,PlayerGame,StadiumType,FieldType,Temperature,Weather,PlayType,PlayerGamePlay,Position,PositionGroup
0,26624-1-1,Quarterback,1,1,Outdoor,1,63,Clear and warm,Pass,1,QB,QB
1,26624-1-2,Quarterback,1,1,Outdoor,1,63,Clear and warm,Pass,2,QB,QB
2,26624-1-3,Quarterback,1,1,Outdoor,1,63,Clear and warm,Rush,3,QB,QB
3,26624-1-4,Quarterback,1,1,Outdoor,1,63,Clear and warm,Rush,4,QB,QB
4,26624-1-5,Quarterback,1,1,Outdoor,1,63,Clear and warm,Pass,5,QB,QB


### For the Supervised Learning, going to initially group the Stadium Types as Outdoor, or Not Outdoor in a new column, OutdoorStadium

In [611]:
# This uses the numpy where to classify anything that meets the True condition as 1, denoting Outdoor Stadium, and False becomes 0, for all other non-outdoor stadiums
plays['OutdoorStadium'] = np.where(plays['StadiumType']=='Outdoor', 1, 0)

In [612]:
plays.head()

Unnamed: 0,PlayKey,RosterPosition,PlayerDay,PlayerGame,StadiumType,FieldType,Temperature,Weather,PlayType,PlayerGamePlay,Position,PositionGroup,OutdoorStadium
0,26624-1-1,Quarterback,1,1,Outdoor,1,63,Clear and warm,Pass,1,QB,QB,1
1,26624-1-2,Quarterback,1,1,Outdoor,1,63,Clear and warm,Pass,2,QB,QB,1
2,26624-1-3,Quarterback,1,1,Outdoor,1,63,Clear and warm,Rush,3,QB,QB,1
3,26624-1-4,Quarterback,1,1,Outdoor,1,63,Clear and warm,Rush,4,QB,QB,1
4,26624-1-5,Quarterback,1,1,Outdoor,1,63,Clear and warm,Pass,5,QB,QB,1


### Dealing with the Weather Situation

There were a lot of different entries meaning the same thing; these were grouped in a dictionary the same way the stadiums were, and can be adjusted if necessary 

In [613]:
weather_dict = {'Clear and warm': 'Clear',
                'Mostly Cloudy': 'Cloudy',
                'Sunny': 'Clear',
                'Clear': 'Clear',
                'Cloudy': 'Cloudy',
                'Cloudy, fog started developing in 2nd quarter': 'Hazy/Fog',
                'Rain': 'Rain',
                'Partly Cloudy': 'Cloudy',
                'Mostly cloudy': 'Cloudy',
                'Cloudy and cold': 'Cloudy',
                'Cloudy and Cool': 'Cloudy',
                'Rain Chance 40%': 'Rain',
                'Controlled Climate': 'Indoor',
                'Sunny and warm': 'Clear',
                'Partly cloudy': 'Cloudy',
                'Clear and Cool': 'Cloudy',
                'Clear and cold': 'Cloudy',
                'Sunny and cold': 'Clear',
                'Indoor': 'Indoor',
                'Partly Sunny': 'Clear',
                'N/A (Indoors)': 'Indoor',
                'Mostly Sunny': 'Clear',
                'Indoors': 'Indoor',
                'Clear Skies': 'Clear',
                'Partly sunny': 'Clear',
                'Showers': 'Rain',
                'N/A Indoor': 'Indoor',
                'Sunny and clear': 'Clear',
                'Snow': 'Snow',
                'Scattered Showers': 'Rain',
                'Party Cloudy': 'Cloudy',
                'Clear skies': 'Clear',
                'Rain likely, temps in low 40s.': 'Rain',
                'Hazy': 'Hazy/Fog',
                'Partly Clouidy': 'Cloudy',
                'Sunny Skies': 'Clear',
                'Overcast': 'Cloudy',
                'Cloudy, 50% change of rain': 'Cloudy',
                'Fair': 'Clear',
                'Light Rain': 'Rain',
                'Partly clear': 'Clear',
                'Mostly Coudy': 'Cloudy',
                '10% Chance of Rain': 'Cloudy',
                'Cloudy, chance of rain': 'Cloudy',
                'Heat Index 95': 'Clear',
                'Sunny, highs to upper 80s': 'Clear',
                'Sun & clouds': 'Cloudy',
                'Heavy lake effect snow': 'Snow',
                'Mostly sunny': 'Clear',
                'Cloudy, Rain': 'Rain',
                'Sunny, Windy': 'Windy',
                'Mostly Sunny Skies': 'Clear',
                'Rainy': 'Rain',
                '30% Chance of Rain': 'Rain',
                'Cloudy, light snow accumulating 1-3"': 'Snow',
                'cloudy': 'Cloudy',
                'Clear and Sunny': 'Clear',
                'Coudy': 'Cloudy',
                'Clear and sunny': 'Clear',
                'Clear to Partly Cloudy': 'Clear',
                'Cloudy with periods of rain, thunder possible. Winds shifting to WNW, 10-20 mph.': 'Windy',
                'Rain shower': 'Rain',
                'Cold': 'Clear'}

plays.Weather.replace(weather_dict, inplace=True)
plays.head()

Unnamed: 0,PlayKey,RosterPosition,PlayerDay,PlayerGame,StadiumType,FieldType,Temperature,Weather,PlayType,PlayerGamePlay,Position,PositionGroup,OutdoorStadium
0,26624-1-1,Quarterback,1,1,Outdoor,1,63,Clear,Pass,1,QB,QB,1
1,26624-1-2,Quarterback,1,1,Outdoor,1,63,Clear,Pass,2,QB,QB,1
2,26624-1-3,Quarterback,1,1,Outdoor,1,63,Clear,Rush,3,QB,QB,1
3,26624-1-4,Quarterback,1,1,Outdoor,1,63,Clear,Rush,4,QB,QB,1
4,26624-1-5,Quarterback,1,1,Outdoor,1,63,Clear,Pass,5,QB,QB,1


Assess whether the nan rows are indoor statiums, in which case, change to Indoor, otherwise remove

In [614]:
plays['Weather'].unique()


array(['Clear', 'Cloudy', 'Hazy/Fog', 'Rain', 'Indoor', nan, 'Snow',
       'Windy'], dtype=object)

In [615]:
plays.Weather.value_counts()


Cloudy      112306
Clear        96985
Indoor       20276
Rain         14280
Snow          1945
Hazy/Fog      1809
Windy          713
Name: Weather, dtype: int64

In [616]:
plays.Weather.isna().sum()

18691

In [617]:
# This line of code identifies from the plays table, where the stadium type is 'Indoor' and then fills NaN values in the 'Weather' column with 'Indoor'.
plays.loc[plays.StadiumType == 'Indoor', 'Weather'] = plays.loc[plays.StadiumType == 'Indoor', 'Weather'].fillna('Indoor')


In [618]:
# This addeda bout 7000 values to the Indoor values
plays.Weather.value_counts()

Cloudy      112306
Clear        96985
Indoor       27113
Rain         14280
Snow          1945
Hazy/Fog      1809
Windy          713
Name: Weather, dtype: int64

In [619]:
# The remaining ~ 12,000 were outdoor with no weather - going to remove these since it's impossible to predict the weather conditions
plays.Weather.isna().sum()

11854

In [620]:
# It's possible to determine the weather on those days if absolutely necessary, this looks like about 4.7% of the data...
plays = plays.loc[plays.Weather.isna() == False]
plays.Weather.isna().sum()

0

In [621]:
# Weather has been reduced from 63 different values to 7
plays.Weather.nunique()

7

Now that the Weather has been reduced to fewer than 10, it is ready to be encoded.

### Encoding the Weather in a new column called WeatherImpact

Weather can be ranked in order of impact:  Clear and Indoor= 0, Cloudy = 1,  Windy = 2, Hazy/Fog = 3, Rain = 4, Snow = 5 

In [622]:
weather_impact = {
    'Indoor': 0, 
    'Clear': 0, 
    'Cloudy': 1,
    'Windy': 2,
    'Hazy/Fog': 3, 
    'Rain': 4, 
    'Snow': 5 
}

plays['WeatherImpact'] = plays.Weather.map(weather_impact)

In [623]:
plays.head()

Unnamed: 0,PlayKey,RosterPosition,PlayerDay,PlayerGame,StadiumType,FieldType,Temperature,Weather,PlayType,PlayerGamePlay,Position,PositionGroup,OutdoorStadium,WeatherImpact
0,26624-1-1,Quarterback,1,1,Outdoor,1,63,Clear,Pass,1,QB,QB,1,0
1,26624-1-2,Quarterback,1,1,Outdoor,1,63,Clear,Pass,2,QB,QB,1,0
2,26624-1-3,Quarterback,1,1,Outdoor,1,63,Clear,Rush,3,QB,QB,1,0
3,26624-1-4,Quarterback,1,1,Outdoor,1,63,Clear,Rush,4,QB,QB,1,0
4,26624-1-5,Quarterback,1,1,Outdoor,1,63,Clear,Pass,5,QB,QB,1,0


### Looking at the Temperature Values - was determined in the PCA that some temperatures were... aberrant

In [624]:
plays.Temperature.value_counts()

-999    18032
 68     13588
 61      6744
 72      6513
 48      6068
        ...  
 34       418
 32       383
 10       292
 26       243
 9        210
Name: Temperature, Length: 79, dtype: int64

Note that 18000 temperatures were included as -999 degrees. This did impact the analysis, and for the time being, these will all be dropped for initial analysis.
Later we can check to see how many of the -999 temperatures come from indoor stadiums, and for any of those we can use the mean temperature of known indoor stadiums. 

In [625]:
plays = plays[plays['Temperature'] != -999]
plays.Temperature.value_counts()

68    13588
61     6744
72     6513
48     6068
60     5982
      ...  
34      418
32      383
10      292
26      243
9       210
Name: Temperature, Length: 78, dtype: int64

### Addressing the Positions Issue

RosterPositions are similar to the PositionGroups, only not put in abbreviations. Will need to change the Roster Positions into abbreviations first. PositionGroups can be dropped, since they are nearly identical to the Roster and actual positions. 

In [626]:
roster = plays.RosterPosition.unique()
roster

array(['Quarterback', 'Wide Receiver', 'Linebacker', 'Running Back',
       'Defensive Lineman', 'Tight End', 'Safety', 'Cornerback',
       'Offensive Lineman', 'Kicker'], dtype=object)

In [627]:
abbreviations = plays.Position.unique()
abbreviations

array(['QB', 'Missing Data', 'WR', 'ILB', 'RB', 'DE', 'TE', 'FS', 'CB',
       'G', 'T', 'OLB', 'DT', 'SS', 'MLB', 'C', 'NT', 'DB', 'K', 'LB',
       'S', 'HB', 'P'], dtype=object)

Going to change the the positions the same way, using a dictionary

In [628]:
position_dict = {
    'Quarterback': 'QB',
    'Wide Receiver': 'WR',
    'Linebacker': 'LB',
    'Running Back': 'RB',
    'Defensive Lineman': 'DL',
    'Tight End': 'TE',
    'Safety': 'S',
    'Cornerback': 'CB',
    'Offensive Lineman': 'OL',
    'Kicker': 'K'
}

plays.RosterPosition.replace(position_dict, inplace=True)
plays.head()

Unnamed: 0,PlayKey,RosterPosition,PlayerDay,PlayerGame,StadiumType,FieldType,Temperature,Weather,PlayType,PlayerGamePlay,Position,PositionGroup,OutdoorStadium,WeatherImpact
0,26624-1-1,QB,1,1,Outdoor,1,63,Clear,Pass,1,QB,QB,1,0
1,26624-1-2,QB,1,1,Outdoor,1,63,Clear,Pass,2,QB,QB,1,0
2,26624-1-3,QB,1,1,Outdoor,1,63,Clear,Rush,3,QB,QB,1,0
3,26624-1-4,QB,1,1,Outdoor,1,63,Clear,Rush,4,QB,QB,1,0
4,26624-1-5,QB,1,1,Outdoor,1,63,Clear,Pass,5,QB,QB,1,0


In [629]:
plays.Position.unique()

array(['QB', 'Missing Data', 'WR', 'ILB', 'RB', 'DE', 'TE', 'FS', 'CB',
       'G', 'T', 'OLB', 'DT', 'SS', 'MLB', 'C', 'NT', 'DB', 'K', 'LB',
       'S', 'HB', 'P'], dtype=object)

In [630]:
# Drop the Position Group column
plays = plays.drop(columns='PositionGroup')
plays.head()

Unnamed: 0,PlayKey,RosterPosition,PlayerDay,PlayerGame,StadiumType,FieldType,Temperature,Weather,PlayType,PlayerGamePlay,Position,OutdoorStadium,WeatherImpact
0,26624-1-1,QB,1,1,Outdoor,1,63,Clear,Pass,1,QB,1,0
1,26624-1-2,QB,1,1,Outdoor,1,63,Clear,Pass,2,QB,1,0
2,26624-1-3,QB,1,1,Outdoor,1,63,Clear,Rush,3,QB,1,0
3,26624-1-4,QB,1,1,Outdoor,1,63,Clear,Rush,4,QB,1,0
4,26624-1-5,QB,1,1,Outdoor,1,63,Clear,Pass,5,QB,1,0


In [631]:
plays.Position[plays.Position == "Missing Data"].value_counts()

Missing Data    45
Name: Position, dtype: int64

In [632]:
# This code identifies "Missing Data" from the Position and replaces the missing value with the RosterPosition
plays['Position'] = np.where(plays['Position'] == 'Missing Data', plays['RosterPosition'], plays['Position'])

# Verify that the missing Data values have been replaced
plays.Position[plays.Position == "Missing Data"].value_counts()

Series([], Name: Position, dtype: int64)

In [633]:
plays.Position.value_counts()
# This is binned into more than 10 groups and may not produce reliable results

WR     38461
OLB    28758
CB     26638
FS     19465
G      16531
T      14594
SS     13590
DT     12595
C      11983
DE     11707
RB     10104
ILB     7649
TE      6700
QB      6391
MLB     4567
NT      2418
LB      2404
DB      1280
K        521
S        412
HB       185
P        160
OL         4
DL         2
Name: Position, dtype: int64

In [634]:
plays.RosterPosition.value_counts()


LB    43606
OL    43112
WR    38391
S     35712
DL    26494
CB    25673
RB    10359
TE     6700
QB     6391
K       681
Name: RosterPosition, dtype: int64

The above values show how many recorded plays each player type was logged in as for all data. The positions are categorical and will be encoded using OneHotEncoder, changing them to binary columns. The Roster Position is the general class, and is redudant if we keep both position and Roster Position.

Position was initially tested, and only the WR and OLB had a high impact and were related to the frequency of the positions. 

In [635]:
# Something weird happened when trying to do a Naive Bayes... it found negative values... 
min(plays.PlayerDay)

-62

In [636]:
plays = plays.assign(DaysPlayed = lambda x: x['PlayerDay'] + 63)
plays.head()

Unnamed: 0,PlayKey,RosterPosition,PlayerDay,PlayerGame,StadiumType,FieldType,Temperature,Weather,PlayType,PlayerGamePlay,Position,OutdoorStadium,WeatherImpact,DaysPlayed
0,26624-1-1,QB,1,1,Outdoor,1,63,Clear,Pass,1,QB,1,0,64
1,26624-1-2,QB,1,1,Outdoor,1,63,Clear,Pass,2,QB,1,0,64
2,26624-1-3,QB,1,1,Outdoor,1,63,Clear,Rush,3,QB,1,0,64
3,26624-1-4,QB,1,1,Outdoor,1,63,Clear,Rush,4,QB,1,0,64
4,26624-1-5,QB,1,1,Outdoor,1,63,Clear,Pass,5,QB,1,0,64


In [637]:
min(plays.DaysPlayed)

1

In [638]:
plays.drop(columns='PlayerDay', inplace=True)

## Now To Address the Injuries Dataset

In [639]:
injuries = pd.read_csv("NFL_Turf/InjuryRecord.csv")  # 105 rows
injuries.head()

Unnamed: 0,PlayerKey,GameID,PlayKey,BodyPart,Surface,DM_M1,DM_M7,DM_M28,DM_M42
0,39873,39873-4,39873-4-32,Knee,Synthetic,1,1,1,1
1,46074,46074-7,46074-7-26,Knee,Natural,1,1,0,0
2,36557,36557-1,36557-1-70,Ankle,Synthetic,1,1,1,1
3,46646,46646-3,46646-3-30,Ankle,Natural,1,0,0,0
4,43532,43532-5,43532-5-69,Ankle,Synthetic,1,1,1,1


Evaluate all columns for na values

In [640]:
# The PlayKey column is the only one that has NaN values
injuries['PlayKey'].isna().sum()

28

In [641]:
# Drop the NaN values, since we won't be able to correlate these with the other tables
injuries = injuries.dropna(subset = ['PlayKey'])

In [642]:
injuries.nunique()

PlayerKey    74
GameID       76
PlayKey      76
BodyPart      3
Surface       2
DM_M1         1
DM_M7         2
DM_M28        2
DM_M42        2
dtype: int64

Note: there is only 1 unique value for DM_M1 - which means that every player on this list was injured for at least 1 day

In [643]:
injuries.dtypes

PlayerKey     int64
GameID       object
PlayKey      object
BodyPart     object
Surface      object
DM_M1         int64
DM_M7         int64
DM_M28        int64
DM_M42        int64
dtype: object

The Surface is the same as the Field Type from the other table, so this can be dropped. 
Note: Anyone whose injury is in the DM_M42 list is also in all of the prior lists, so there will be more of the lower values due the the encoding. Going to change this to a single column with values of 1, 7, 28, and 42

### Group the DM columns into a single Injury Duration column

In [644]:
def injury_duration(row):
    injury_duration = 0
    if row["DM_M42"] == 1:
        injury_duration = 42
    else:
        if row["DM_M28"] == 1:
            injury_duration = 28
        else:
            if row["DM_M7"] == 1:
                injury_duration = 7
            else: 
                injury_duration = 1
    
    return injury_duration

# Apply the function to all rows
injuries['InjuryDuration'] = injuries.apply(injury_duration, axis=1)
injuries.head()


Unnamed: 0,PlayerKey,GameID,PlayKey,BodyPart,Surface,DM_M1,DM_M7,DM_M28,DM_M42,InjuryDuration
0,39873,39873-4,39873-4-32,Knee,Synthetic,1,1,1,1,42
1,46074,46074-7,46074-7-26,Knee,Natural,1,1,0,0,7
2,36557,36557-1,36557-1-70,Ankle,Synthetic,1,1,1,1,42
3,46646,46646-3,46646-3-30,Ankle,Natural,1,0,0,0,1
4,43532,43532-5,43532-5-69,Ankle,Synthetic,1,1,1,1,42


In [645]:
# Remove the rows for DMs
injuries.drop(columns=['DM_M1', 'DM_M7', 'DM_M28', 'DM_M42', 'Surface'], inplace=True)
injuries.head()

Unnamed: 0,PlayerKey,GameID,PlayKey,BodyPart,InjuryDuration
0,39873,39873-4,39873-4-32,Knee,42
1,46074,46074-7,46074-7-26,Knee,7
2,36557,36557-1,36557-1-70,Ankle,42
3,46646,46646-3,46646-3-30,Ankle,1
4,43532,43532-5,43532-5-69,Ankle,42


Analyze the BodyPart of injury to verify it's ready for encoding

In [646]:
# The body parts are categorical and will be, but since each injury was logged as unique, going to use the occurrence frequency as the numerical coding instead of arbitrary numbers
knee_freq = injuries.BodyPart.value_counts()['Knee']
ankle_freq = injuries.BodyPart.value_counts()['Ankle']
foot_freq = injuries.BodyPart.value_counts()['Foot']
injuries.BodyPart.value_counts()


Knee     36
Ankle    35
Foot      6
Name: BodyPart, dtype: int64

In [647]:
# There are 74 known individual players that have been injured for at least 1 day 
injuries.PlayerKey.nunique()

74

In [648]:
# This output only 76 unique plays with only 74 players, so only 2 players were reinjured at different times of the season
injuries.PlayKey.nunique()

76

Every GameID and PlayID are unique, meaning that once that
particular player was injured during a specific game at a specific play,
they didn't return to the field. Since the GameID numbers are not in any 
chronological order and offer no information other than the PlayKey can, this column can be dropped

In [649]:
injuries.GameID.nunique()

76

Since the PlayerID, GameID, and PlayKey number are all contained within the PlayKey, the GameID and PlayerID can be dropped. 

In [650]:
injuries.drop(columns=['GameID', 'PlayerKey'], inplace=True)
injuries.head()

Unnamed: 0,PlayKey,BodyPart,InjuryDuration
0,39873-4-32,Knee,42
1,46074-7-26,Knee,7
2,36557-1-70,Ankle,42
3,46646-3-30,Ankle,1
4,43532-5-69,Ankle,42


For the supervised analysis, the injuries will need to be recorded as numerical values. We will create 2 columns:
- 'IsInjured' where 0 is not injured and 1 is injured
- 'InjuryType' where the Injury Type will be encoded by the frequency of occurrence, Knee = 36, Ankle = 35, and Foot = 6

Depeding on the type of analysis, if we're trying to predict with a binary outcome, whether or not there will be an injury, we will use 'IsInjured'. If we're trying to predict which types of injury, we'd need the numerical factors for each type of injury. 

These changes cannot be made until this table is merged with the other table, containing the non-injured player plays

## Merge the 2 dataframes

In [651]:
plays.head()

Unnamed: 0,PlayKey,RosterPosition,PlayerGame,StadiumType,FieldType,Temperature,Weather,PlayType,PlayerGamePlay,Position,OutdoorStadium,WeatherImpact,DaysPlayed
0,26624-1-1,QB,1,Outdoor,1,63,Clear,Pass,1,QB,1,0,64
1,26624-1-2,QB,1,Outdoor,1,63,Clear,Pass,2,QB,1,0,64
2,26624-1-3,QB,1,Outdoor,1,63,Clear,Rush,3,QB,1,0,64
3,26624-1-4,QB,1,Outdoor,1,63,Clear,Rush,4,QB,1,0,64
4,26624-1-5,QB,1,Outdoor,1,63,Clear,Pass,5,QB,1,0,64


- Drop the categorical columns that have been encoded for the supervised analysis
- Play Type and RosterPosition will be encoded with OneHotEncoder

In [652]:
plays.drop(columns=['StadiumType', 'Weather', 'Position'], inplace=True)

If we want, we can switch out the RosterPosition for the played position to see if there was a difference, but the actual position is more specific to the play, which may be a better indicator

In [653]:
plays.head()

Unnamed: 0,PlayKey,RosterPosition,PlayerGame,FieldType,Temperature,PlayType,PlayerGamePlay,OutdoorStadium,WeatherImpact,DaysPlayed
0,26624-1-1,QB,1,1,63,Pass,1,1,0,64
1,26624-1-2,QB,1,1,63,Pass,2,1,0,64
2,26624-1-3,QB,1,1,63,Rush,3,1,0,64
3,26624-1-4,QB,1,1,63,Rush,4,1,0,64
4,26624-1-5,QB,1,1,63,Pass,5,1,0,64


In [654]:
injuries.head()

Unnamed: 0,PlayKey,BodyPart,InjuryDuration
0,39873-4-32,Knee,42
1,46074-7-26,Knee,7
2,36557-1-70,Ankle,42
3,46646-3-30,Ankle,1
4,43532-5-69,Ankle,42


In [655]:
play_injuries = pd.merge(plays, injuries, on='PlayKey', how='outer')

In [656]:
play_injuries.head()

Unnamed: 0,PlayKey,RosterPosition,PlayerGame,FieldType,Temperature,PlayType,PlayerGamePlay,OutdoorStadium,WeatherImpact,DaysPlayed,BodyPart,InjuryDuration
0,26624-1-1,QB,1.0,1.0,63.0,Pass,1.0,1.0,0.0,64.0,,
1,26624-1-2,QB,1.0,1.0,63.0,Pass,2.0,1.0,0.0,64.0,,
2,26624-1-3,QB,1.0,1.0,63.0,Rush,3.0,1.0,0.0,64.0,,
3,26624-1-4,QB,1.0,1.0,63.0,Rush,4.0,1.0,0.0,64.0,,
4,26624-1-5,QB,1.0,1.0,63.0,Pass,5.0,1.0,0.0,64.0,,


### Add values for duration and Body Part. Change NaN to None for body part. Change Injury_Duration to 0 for all NaN values

In [657]:
play_injuries.BodyPart.fillna('NoInjury', inplace=True)
play_injuries.InjuryDuration.fillna(0, inplace=True)
play_injuries.head()


Unnamed: 0,PlayKey,RosterPosition,PlayerGame,FieldType,Temperature,PlayType,PlayerGamePlay,OutdoorStadium,WeatherImpact,DaysPlayed,BodyPart,InjuryDuration
0,26624-1-1,QB,1.0,1.0,63.0,Pass,1.0,1.0,0.0,64.0,NoInjury,0.0
1,26624-1-2,QB,1.0,1.0,63.0,Pass,2.0,1.0,0.0,64.0,NoInjury,0.0
2,26624-1-3,QB,1.0,1.0,63.0,Rush,3.0,1.0,0.0,64.0,NoInjury,0.0
3,26624-1-4,QB,1.0,1.0,63.0,Rush,4.0,1.0,0.0,64.0,NoInjury,0.0
4,26624-1-5,QB,1.0,1.0,63.0,Pass,5.0,1.0,0.0,64.0,NoInjury,0.0


Add a binary column for injury/no_injury

In [658]:
play_injuries['IsInjured'] = play_injuries['BodyPart'].apply(lambda x: 0 if x == 'NoInjury'  else 1)

In [659]:
play_injuries.head()

Unnamed: 0,PlayKey,RosterPosition,PlayerGame,FieldType,Temperature,PlayType,PlayerGamePlay,OutdoorStadium,WeatherImpact,DaysPlayed,BodyPart,InjuryDuration,IsInjured
0,26624-1-1,QB,1.0,1.0,63.0,Pass,1.0,1.0,0.0,64.0,NoInjury,0.0,0
1,26624-1-2,QB,1.0,1.0,63.0,Pass,2.0,1.0,0.0,64.0,NoInjury,0.0,0
2,26624-1-3,QB,1.0,1.0,63.0,Rush,3.0,1.0,0.0,64.0,NoInjury,0.0,0
3,26624-1-4,QB,1.0,1.0,63.0,Rush,4.0,1.0,0.0,64.0,NoInjury,0.0,0
4,26624-1-5,QB,1.0,1.0,63.0,Pass,5.0,1.0,0.0,64.0,NoInjury,0.0,0


Add the numerical frequency-based column for the InjuryTypes

In [660]:
# Rearrange the columns 
injury_type = {
    'Knee': knee_freq, 
    'Ankle': ankle_freq,
    'Foot': foot_freq, 
    'NoInjury': 0
}

play_injuries['InjuryType'] = play_injuries.BodyPart.map(injury_type)

play_injuries.head()

Unnamed: 0,PlayKey,RosterPosition,PlayerGame,FieldType,Temperature,PlayType,PlayerGamePlay,OutdoorStadium,WeatherImpact,DaysPlayed,BodyPart,InjuryDuration,IsInjured,InjuryType
0,26624-1-1,QB,1.0,1.0,63.0,Pass,1.0,1.0,0.0,64.0,NoInjury,0.0,0,0
1,26624-1-2,QB,1.0,1.0,63.0,Pass,2.0,1.0,0.0,64.0,NoInjury,0.0,0,0
2,26624-1-3,QB,1.0,1.0,63.0,Rush,3.0,1.0,0.0,64.0,NoInjury,0.0,0,0
3,26624-1-4,QB,1.0,1.0,63.0,Rush,4.0,1.0,0.0,64.0,NoInjury,0.0,0,0
4,26624-1-5,QB,1.0,1.0,63.0,Pass,5.0,1.0,0.0,64.0,NoInjury,0.0,0,0


In [661]:
play_injuries.InjuryType.value_counts()

0     237051
36        36
35        35
6          6
Name: InjuryType, dtype: int64

In [662]:
# Drop the BodyPart column and PlayKey
play_injuries.drop(columns=['PlayKey','BodyPart'], inplace=True)

In [663]:
play_injuries.dtypes

RosterPosition     object
PlayerGame        float64
FieldType         float64
Temperature       float64
PlayType           object
PlayerGamePlay    float64
OutdoorStadium    float64
WeatherImpact     float64
DaysPlayed        float64
InjuryDuration    float64
IsInjured           int64
InjuryType          int64
dtype: object

In [664]:
play_injuries.isna().sum()

RosterPosition      8
PlayerGame          8
FieldType           8
Temperature         8
PlayType          336
PlayerGamePlay      8
OutdoorStadium      8
WeatherImpact       8
DaysPlayed          8
InjuryDuration      0
IsInjured           0
InjuryType          0
dtype: int64

There seem to be 8 na values in most of the columns, except the play type - where there are 336. This won't remove a lot of the data, so going to drop the remaining nan values 

In [665]:
play_injuries = play_injuries.dropna()

In [666]:
play_injuries.head()

Unnamed: 0,RosterPosition,PlayerGame,FieldType,Temperature,PlayType,PlayerGamePlay,OutdoorStadium,WeatherImpact,DaysPlayed,InjuryDuration,IsInjured,InjuryType
0,QB,1.0,1.0,63.0,Pass,1.0,1.0,0.0,64.0,0.0,0,0
1,QB,1.0,1.0,63.0,Pass,2.0,1.0,0.0,64.0,0.0,0,0
2,QB,1.0,1.0,63.0,Rush,3.0,1.0,0.0,64.0,0.0,0,0
3,QB,1.0,1.0,63.0,Rush,4.0,1.0,0.0,64.0,0.0,0,0
4,QB,1.0,1.0,63.0,Pass,5.0,1.0,0.0,64.0,0.0,0,0


After the first run, the Pass and Rush plays had a large impact, but each of the other plays were minimal, and all of the other plays are effectively kicking plays. Going to group the other plays to reduce features.

In [667]:
play_injuries.PlayType.value_counts()

Pass                    122785
Rush                     82413
Extra Point               5279
Kickoff                   4910
Punt                      4902
Field Goal                4437
Kickoff Not Returned      4003
Punt Not Returned         3095
Kickoff Returned          2490
Punt Returned             2233
0                          245
Name: PlayType, dtype: int64

In [668]:
play_type = {
    'Pass': 'Pass',
    'Rush': 'Rush',
    'Extra Point': 'Kick',
    'Kickoff': 'Kick',
    'Punt': 'Kick',
    'Field Goal': 'Kick',
    'Kickoff Not Returned': 'Kick',
    'Punt Not Returned': 'Kick',
    'Kickoff Returned': 'Kick',
    'Punt Returned': 'Kick',
    '0': 'Kick'
}

play_injuries['PlayType'] = play_injuries.PlayType.map(play_type)
play_injuries.head()


Unnamed: 0,RosterPosition,PlayerGame,FieldType,Temperature,PlayType,PlayerGamePlay,OutdoorStadium,WeatherImpact,DaysPlayed,InjuryDuration,IsInjured,InjuryType
0,QB,1.0,1.0,63.0,Pass,1.0,1.0,0.0,64.0,0.0,0,0
1,QB,1.0,1.0,63.0,Pass,2.0,1.0,0.0,64.0,0.0,0,0
2,QB,1.0,1.0,63.0,Rush,3.0,1.0,0.0,64.0,0.0,0,0
3,QB,1.0,1.0,63.0,Rush,4.0,1.0,0.0,64.0,0.0,0,0
4,QB,1.0,1.0,63.0,Pass,5.0,1.0,0.0,64.0,0.0,0,0


In [669]:
play_injuries.PlayType.value_counts()

Pass    122785
Rush     82413
Kick     31594
Name: PlayType, dtype: int64

In [670]:
play_injuries.isna().sum()

RosterPosition    0
PlayerGame        0
FieldType         0
Temperature       0
PlayType          0
PlayerGamePlay    0
OutdoorStadium    0
WeatherImpact     0
DaysPlayed        0
InjuryDuration    0
IsInjured         0
InjuryType        0
dtype: int64

## Encode the Position and Play type using OneHotEncoder

In [671]:
# Gather the categorical variables
play_cat = play_injuries.dtypes[play_injuries.dtypes == 'object'].index.tolist()

# Create the Encoder Instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the categorical data
encode_df = pd.DataFrame(enc.fit_transform(play_injuries[play_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names_out(play_cat)
encode_df.head()

Unnamed: 0,RosterPosition_CB,RosterPosition_DL,RosterPosition_K,RosterPosition_LB,RosterPosition_OL,RosterPosition_QB,RosterPosition_RB,RosterPosition_S,RosterPosition_TE,RosterPosition_WR,PlayType_Kick,PlayType_Pass,PlayType_Rush
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [672]:
# Merge encoded features and drop the original columns
play_injuries_super = play_injuries.merge(encode_df, left_index=True, right_index=True)
play_injuries_super.drop(columns=play_cat, inplace=True)

In [673]:
play_injuries_super.head()

Unnamed: 0,PlayerGame,FieldType,Temperature,PlayerGamePlay,OutdoorStadium,WeatherImpact,DaysPlayed,InjuryDuration,IsInjured,InjuryType,...,RosterPosition_LB,RosterPosition_OL,RosterPosition_QB,RosterPosition_RB,RosterPosition_S,RosterPosition_TE,RosterPosition_WR,PlayType_Kick,PlayType_Pass,PlayType_Rush
0,1.0,1.0,63.0,1.0,1.0,0.0,64.0,0.0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,1.0,63.0,2.0,1.0,0.0,64.0,0.0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,1.0,63.0,3.0,1.0,0.0,64.0,0.0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,1.0,63.0,4.0,1.0,0.0,64.0,0.0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,1.0,63.0,5.0,1.0,0.0,64.0,0.0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## Export the Data for Analysis

In [674]:
play_injuries_super.to_csv('play_injuries_super.csv', index=False)