In [1]:
import pandas as pd
import numpy as np
#plt is the conventional alias for the pyplot interface, not the top-level matplotlib package
import matplotlib.pyplot as plt

#Import the initial dataset
dataFrame = pd.read_csv("NFLDraftData.csv")

In [2]:
#Some of the data in the dataset is not going to be used, therefore it needs to be cleaned
#We are only concerned with data about WR, RB and TE
relevantPositions = ["WR", "RB", "TE"]

#Keep only the relevant rows. A single isin() mask replaces the old per-position loop and also
#discards rows whose position is missing: NaN != NaN is always True, so filtering with
#"position != NaN" could never remove them
dataFrame = dataFrame[dataFrame["position"].isin(relevantPositions)]

#Drop all irrelevant columns
columnsToDrop = ["passCompletions", "passAttempts", "passCompletionPercentage","passingYards", 
                 "avgYardsPerPass", "passingTouchdowns", "passingInterceptions"]
dataFrame = dataFrame.drop(columns = columnsToDrop)

#Update the positions array to store only current positions (in order of first appearance)
positions = dataFrame.position.unique()

#Would be useful to know how many data points we now have
#Count every position in one pass, then report the counts in the same order as positions
#int() keeps the entries plain Python ints so the list prints as [436, 301, 200]
positionCounts = dataFrame["position"].value_counts()
numberOfDataPoints = [int(positionCounts[position]) for position in positions]

print(positions)
print(numberOfDataPoints)

['WR' 'RB' 'TE']
[436, 301, 200]


In [3]:
#Some columns contain null data points which need to be filled
#For data points relating to college statistics, a null entry means no data was recorded
#for that particular stat, so every null in these columns is replaced with 0
columnsToZeroFill = [
    "rushingAttempts", "rushingYards", "avgYardsPerRush", "rushingTouchdowns",
    "receptions", "receivingYards", "avgYardsPerReception", "receivingTouchdowns",
]
dataFrame[columnsToZeroFill] = dataFrame[columnsToZeroFill].fillna(0.0)

#For data relating to combine statistics, fill all null values with the average for that position
#Quick function to find average combine statistics for each position and return an array containing all the values
def getAverageCombineData(position):
    """Return the rounded mean of every combine drill for one position.

    Reads the module-level ``dataFrame``, keeps the rows whose 'position' equals
    ``position`` and returns a list ordered as: fortyYardDash, verticalJump,
    benchPress, broadJump, threeCone, shuttle.
    """
    #Each drill is paired with the number of decimal places its mean is rounded to
    drillPrecision = [("fortyYardDash", 2), ("verticalJump", 1), ("benchPress", 0),
                      ("broadJump", 0), ("threeCone", 2), ("shuttle", 2)]
    positionRows = dataFrame[dataFrame['position'] == position]

    return [round(positionRows[drill].mean(), decimals) for drill, decimals in drillPrecision]

columnsToAverageFill = ["fortyYardDash", "verticalJump", "benchPress", "broadJump", "threeCone", "shuttle"]

averageWRData = getAverageCombineData("WR")
averageRBData = getAverageCombineData("RB")
averageTEData = getAverageCombineData("TE")

#Each list of averages is ordered like columnsToAverageFill, so the column's index selects its average
averagesByPosition = {"WR": averageWRData, "RB": averageRBData, "TE": averageTEData}

#Iterate through each column and average fill using the appropriate value based on the position
#This is vectorised: build a per-row replacement from the row's position and let fillna() use it
#only where the column is null, instead of running three row-by-row apply() passes per column
for columnIndex, column in enumerate(columnsToAverageFill):
    averageForPosition = {position: averages[columnIndex]
                          for position, averages in averagesByPosition.items()}
    #Rows whose position is not in the mapping get NaN here and are therefore left unchanged
    replacementValues = dataFrame["position"].map(averageForPosition)
    dataFrame[column] = dataFrame[column].fillna(replacementValues)

In [4]:
#Preview the first rows of the cleaned dataFrame for reference instead of rendering the whole frame
dataFrame.head(10)

Unnamed: 0,draftRound,draftPick,playerName,position,age,height,weight,collegeAttended,rushingAttempts,rushingYards,...,receptions,receivingYards,avgYardsPerReception,receivingTouchdowns,fortyYardDash,verticalJump,benchPress,broadJump,threeCone,shuttle
0,1,4,Peter Warrick,WR,23,180.0,192,Florida St.,41.0,188.0,...,207.0,3517.0,17.0,32.0,4.58,35.8,15.0,121.0,6.92,4.19
1,1,5,Jamal Lewis,RB,21,180.0,240,Tennessee,487.0,2677.0,...,39.0,475.0,12.2,4.0,4.58,34.8,23.0,119.0,7.03,4.25
2,1,7,Thomas Jones,RB,22,178.0,220,Virginia,809.0,3998.0,...,71.0,571.0,8.0,4.0,4.45,34.8,20.0,119.0,7.03,4.25
3,1,8,Plaxico Burress,WR,23,196.0,226,Michigan St.,0.0,0.0,...,131.0,2155.0,16.5,20.0,4.59,33.0,15.0,115.0,6.92,4.19
4,1,10,Travis Taylor,WR,21,185.0,210,Florida,4.0,25.0,...,72.0,1150.0,16.0,15.0,4.43,37.0,15.0,118.0,7.15,4.19
5,1,11,Ron Dayne,RB,22,178.0,250,Wisconsin,1220.0,7125.0,...,31.0,304.0,9.8,0.0,4.65,34.8,20.0,119.0,7.03,4.25
6,1,14,Bubba Franks,TE,22,198.0,263,Miami (FL),0.0,0.0,...,77.0,1038.0,13.5,12.0,4.85,33.7,21.0,116.0,7.14,4.33
9,1,19,Shaun Alexander,RB,23,180.0,225,Alabama,727.0,3565.0,...,62.0,798.0,12.9,8.0,4.58,34.8,20.0,119.0,7.03,4.25
11,1,27,Anthony Becht,TE,23,196.0,272,West Virginia,0.0,0.0,...,83.0,1178.0,14.2,11.0,4.78,33.5,21.0,123.0,6.94,4.08
12,1,29,R. Jay Soward,WR,22,180.0,178,USC,27.0,340.0,...,161.0,2672.0,16.6,23.0,4.34,35.0,15.0,124.0,6.92,4.19


In [None]:
#Write the cleaned dataframe out to a new CSV file, leaving the row index out of the file
cleanedCsvPath = "NFLDraftDataCleaned.csv"
dataFrame.to_csv(cleanedCsvPath, index = False)