### Collin Hough, Matt Kaiser
### ECE 1170: Semester Project
The purpose of this file is to process all yearly rushing data from 2012-2021. This will require appending all yearly datasets together, adding extra data columns for more averaging statistics, and performing data cleanup.

In [1]:
import os
import glob
import pandas as pd

# Find rushing data file names
path = "../../data/raw/rushing"
files = glob.glob(path + "/*.csv")
print(files)


['../../data/raw/rushing/rushing_2018.csv', '../../data/raw/rushing/rushing_2019.csv', '../../data/raw/rushing/rushing_2021.csv', '../../data/raw/rushing/rushing_2020.csv', '../../data/raw/rushing/rushing_2012.csv', '../../data/raw/rushing/rushing_2013.csv', '../../data/raw/rushing/rushing_2017.csv', '../../data/raw/rushing/rushing_2016.csv', '../../data/raw/rushing/rushing_2014.csv', '../../data/raw/rushing/rushing_2015.csv']


In [2]:
# Load files into dataframes
df_list = (pd.read_csv(file) for file in files)

In [3]:
# Concatenate dataframes
df = pd.concat(df_list, ignore_index=True)
df.head()

Unnamed: 0,Rk,Player,Age,Pos,G,GS,Att,Yds,TD,Lng,Y/A,Y/G,Fmb,id
0,1,Ezekiel Elliott*,23,RB,15,15,304,1434,6,41,4.7,95.6,6,ElliEz00
1,2,Saquon Barkley*,21,RB,16,16,261,1307,11,78,5.0,81.7,0,BarkSa00
2,3,David Johnson,27,RB,16,16,258,940,7,53,3.6,58.8,3,JohnDa08
3,4,Todd Gurley*+,24,RB,14,14,256,1251,17,36,4.9,89.4,1,GurlTo01
4,5,Adrian Peterson,33,RB,16,16,251,1042,7,90,4.2,65.1,3,PeteAd01


In [4]:
# Check dataframe info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3310 entries, 0 to 3309
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Rk      3310 non-null   int64  
 1   Player  3310 non-null   object 
 2   Age     3310 non-null   int64  
 3   Pos     3308 non-null   object 
 4   G       3310 non-null   int64  
 5   GS      3310 non-null   int64  
 6   Att     3310 non-null   int64  
 7   Yds     3310 non-null   int64  
 8   TD      3310 non-null   int64  
 9   Lng     3310 non-null   int64  
 10  Y/A     3310 non-null   float64
 11  Y/G     3310 non-null   float64
 12  Fmb     3310 non-null   int64  
 13  id      3310 non-null   object 
dtypes: float64(2), int64(9), object(3)
memory usage: 362.2+ KB


In [5]:
# Remove rank column
df = df.drop(columns=['Rk'],axis=1)
df.head()

Unnamed: 0,Player,Age,Pos,G,GS,Att,Yds,TD,Lng,Y/A,Y/G,Fmb,id
0,Ezekiel Elliott*,23,RB,15,15,304,1434,6,41,4.7,95.6,6,ElliEz00
1,Saquon Barkley*,21,RB,16,16,261,1307,11,78,5.0,81.7,0,BarkSa00
2,David Johnson,27,RB,16,16,258,940,7,53,3.6,58.8,3,JohnDa08
3,Todd Gurley*+,24,RB,14,14,256,1251,17,36,4.9,89.4,1,GurlTo01
4,Adrian Peterson,33,RB,16,16,251,1042,7,90,4.2,65.1,3,PeteAd01


In [6]:
# Rearrange columns
cols = df.columns.tolist()
cols = cols[-1:] + cols[:-1]
df = df[cols]
df.head()

Unnamed: 0,id,Player,Age,Pos,G,GS,Att,Yds,TD,Lng,Y/A,Y/G,Fmb
0,ElliEz00,Ezekiel Elliott*,23,RB,15,15,304,1434,6,41,4.7,95.6,6
1,BarkSa00,Saquon Barkley*,21,RB,16,16,261,1307,11,78,5.0,81.7,0
2,JohnDa08,David Johnson,27,RB,16,16,258,940,7,53,3.6,58.8,3
3,GurlTo01,Todd Gurley*+,24,RB,14,14,256,1251,17,36,4.9,89.4,1
4,PeteAd01,Adrian Peterson,33,RB,16,16,251,1042,7,90,4.2,65.1,3


In [7]:
# Filter out positions except QB,RB,WR,TE
positions = ["QB","RB","WR","TE"]
df = df[df["Pos"].isin(positions)]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3096 entries, 0 to 3309
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      3096 non-null   object 
 1   Player  3096 non-null   object 
 2   Age     3096 non-null   int64  
 3   Pos     3096 non-null   object 
 4   G       3096 non-null   int64  
 5   GS      3096 non-null   int64  
 6   Att     3096 non-null   int64  
 7   Yds     3096 non-null   int64  
 8   TD      3096 non-null   int64  
 9   Lng     3096 non-null   int64  
 10  Y/A     3096 non-null   float64
 11  Y/G     3096 non-null   float64
 12  Fmb     3096 non-null   int64  
dtypes: float64(2), int64(8), object(3)
memory usage: 338.6+ KB


In [8]:
# Add columns for TD/G, Att/G
df["TD/G"] = df["TD"]/df["G"]
df["Att/G"] = df["Att"]/df["G"]
df = df.round({"TD/G":1,"Att/G":1})
df.head()

Unnamed: 0,id,Player,Age,Pos,G,GS,Att,Yds,TD,Lng,Y/A,Y/G,Fmb,TD/G,Att/G
0,ElliEz00,Ezekiel Elliott*,23,RB,15,15,304,1434,6,41,4.7,95.6,6,0.4,20.3
1,BarkSa00,Saquon Barkley*,21,RB,16,16,261,1307,11,78,5.0,81.7,0,0.7,16.3
2,JohnDa08,David Johnson,27,RB,16,16,258,940,7,53,3.6,58.8,3,0.4,16.1
3,GurlTo01,Todd Gurley*+,24,RB,14,14,256,1251,17,36,4.9,89.4,1,1.2,18.3
4,PeteAd01,Adrian Peterson,33,RB,16,16,251,1042,7,90,4.2,65.1,3,0.4,15.7


In [9]:
# Limit data to 2022 players

# Load in 2022 data and limit to desired positions
path = "../../data/raw/2022/rushing_2022.csv"
df_2022 = pd.read_csv(path)
df_2022 = df_2022[df_2022["Pos"].isin(positions)]
df_2022.Pos.unique()

array(['RB', 'QB', 'TE', 'WR'], dtype=object)

In [10]:
# Convert unique 2022 ids into a list and filter large data
ids_2022 = df_2022["id"].tolist()
df = df[df["id"].isin(ids_2022)]
df.head()

Unnamed: 0,id,Player,Age,Pos,G,GS,Att,Yds,TD,Lng,Y/A,Y/G,Fmb,TD/G,Att/G
0,ElliEz00,Ezekiel Elliott*,23,RB,15,15,304,1434,6,41,4.7,95.6,6,0.4,20.3
1,BarkSa00,Saquon Barkley*,21,RB,16,16,261,1307,11,78,5.0,81.7,0,0.7,16.3
2,JohnDa08,David Johnson,27,RB,16,16,258,940,7,53,3.6,58.8,3,0.4,16.1
5,HowaJo00,Jordan Howard,24,RB,16,15,250,935,9,42,3.7,58.4,2,0.6,15.6
7,MixoJo00,Joe Mixon,22,RB,14,13,237,1168,8,51,4.9,83.4,0,0.6,16.9


In [11]:
# Sort data by id and age
df = df.sort_values(by = ["id","Age"], ascending = [True, True])
df.head()

Unnamed: 0,id,Player,Age,Pos,G,GS,Att,Yds,TD,Lng,Y/A,Y/G,Fmb,TD/G,Att/G
3032,AbduAm00,Ameer Abdullah,22,RB,16,9,143,597,2,36,4.2,37.3,5,0.1,8.9
2489,AbduAm00,Ameer Abdullah,23,RB,2,2,18,101,0,24,5.6,50.5,0,0.0,9.0
2070,AbduAm00,Ameer Abdullah,24,RB,14,11,165,552,4,34,3.3,39.4,2,0.3,11.8
273,AbduAm00,Ameer Abdullah,25,RB,10,0,1,1,0,1,1.0,0.1,1,0.0,0.1
456,AbduAm00,Ameer Abdullah,26,RB,16,0,23,115,0,15,5.0,7.2,1,0.0,1.4


In [12]:
# Test finding a player's career data (ex: Ezekiel Elliott)
test_df = df[df["id"] == "ElliEz00"]
test_df.head(6)

Unnamed: 0,id,Player,Age,Pos,G,GS,Att,Yds,TD,Lng,Y/A,Y/G,Fmb,TD/G,Att/G
2358,ElliEz00,Ezekiel Elliott*+,21,RB,15,15,322,1631,15,60,5.1,108.7,5,1.0,21.5
2051,ElliEz00,Ezekiel Elliott,22,RB,10,10,242,983,7,30,4.1,98.3,1,0.7,24.2
0,ElliEz00,Ezekiel Elliott*,23,RB,15,15,304,1434,6,41,4.7,95.6,6,0.4,20.3
342,ElliEz00,Ezekiel Elliott*,24,RB,16,16,301,1357,12,33,4.5,84.8,3,0.8,18.8
1050,ElliEz00,Ezekiel Elliott,25,RB,15,15,244,979,6,31,4.0,65.3,6,0.4,16.3
681,ElliEz00,Ezekiel Elliott,26,RB,17,17,237,1002,10,47,4.2,58.9,1,0.6,13.9


In [13]:
# Cleanup indexes of data
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,id,Player,Age,Pos,G,GS,Att,Yds,TD,Lng,Y/A,Y/G,Fmb,TD/G,Att/G
0,AbduAm00,Ameer Abdullah,22,RB,16,9,143,597,2,36,4.2,37.3,5,0.1,8.9
1,AbduAm00,Ameer Abdullah,23,RB,2,2,18,101,0,24,5.6,50.5,0,0.0,9.0
2,AbduAm00,Ameer Abdullah,24,RB,14,11,165,552,4,34,3.3,39.4,2,0.3,11.8
3,AbduAm00,Ameer Abdullah,25,RB,10,0,1,1,0,1,1.0,0.1,1,0.0,0.1
4,AbduAm00,Ameer Abdullah,26,RB,16,0,23,115,0,15,5.0,7.2,1,0.0,1.4


In [14]:
# Re-test finding a player's career data
test_df = df[df["id"] == "ElliEz00"]
test_df.head(6)

Unnamed: 0,id,Player,Age,Pos,G,GS,Att,Yds,TD,Lng,Y/A,Y/G,Fmb,TD/G,Att/G
204,ElliEz00,Ezekiel Elliott*+,21,RB,15,15,322,1631,15,60,5.1,108.7,5,1.0,21.5
205,ElliEz00,Ezekiel Elliott,22,RB,10,10,242,983,7,30,4.1,98.3,1,0.7,24.2
206,ElliEz00,Ezekiel Elliott*,23,RB,15,15,304,1434,6,41,4.7,95.6,6,0.4,20.3
207,ElliEz00,Ezekiel Elliott*,24,RB,16,16,301,1357,12,33,4.5,84.8,3,0.8,18.8
208,ElliEz00,Ezekiel Elliott,25,RB,15,15,244,979,6,31,4.0,65.3,6,0.4,16.3
209,ElliEz00,Ezekiel Elliott,26,RB,17,17,237,1002,10,47,4.2,58.9,1,0.6,13.9


In [15]:
# Convert df into a processed csv
path = "../../data/processed/rushing.csv"
df.to_csv(path)