### Collin Hough, Matt Kaiser
### ECE 1170: Semester Project
The purpose of this file is to process all yearly receiving data from 2012-2021. This will require appending all yearly datasets together, adding extra data columns for more averaging statistics, and performing data cleanup.

In [1]:
import os
import glob
import pandas as pd

# Find rushing data file names
path = "../data/raw/receiving"
files = glob.glob(path + "/*.csv")
print(files)


['../data/raw/receiving/receiving_2014.csv', '../data/raw/receiving/receiving_2015.csv', '../data/raw/receiving/receiving_2017.csv', '../data/raw/receiving/receiving_2016.csv', '../data/raw/receiving/receiving_2012.csv', '../data/raw/receiving/receiving_2013.csv', '../data/raw/receiving/receiving_2021.csv', '../data/raw/receiving/receiving_2020.csv', '../data/raw/receiving/receiving_2018.csv', '../data/raw/receiving/receiving_2019.csv']


In [2]:
# Load files into dataframes
df_list = (pd.read_csv(file) for file in files)

In [3]:
# Concatenate dataframes
df = pd.concat(df_list, ignore_index=True)
df.head()

Unnamed: 0,Rk,Player,Age,Pos,G,GS,Tgt,Rec,Ctch%,Yds,Y/R,TD,Lng,Y/Tgt,R/G,Y/G,Fmb,id
0,1,Antonio Brown*+,26,WR,16,16,181.0,129,71.3%,1698,13.2,13,63,9.4,8.1,106.1,2.0,BrowAn04
1,2,Demaryius Thomas*,27,WR,16,16,184.0,111,60.3%,1619,14.6,11,86,8.8,6.9,101.2,0.0,ThomDe03
2,3,Julio Jones*,25,WR,15,15,163.0,104,63.8%,1593,15.3,6,79,9.8,6.9,106.2,2.0,JoneJu02
3,4,Matt Forte,29,RB,16,16,130.0,102,78.5%,808,7.9,4,56,6.2,6.4,50.5,2.0,FortMa00
4,5,Emmanuel Sanders*,27,WR,16,16,141.0,101,71.6%,1404,13.9,9,48,10.0,6.3,87.8,1.0,SandEm00


In [4]:
# Check dataframe info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5021 entries, 0 to 5020
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Rk      5021 non-null   int64  
 1   Player  5021 non-null   object 
 2   Age     5021 non-null   int64  
 3   Pos     5016 non-null   object 
 4   G       5021 non-null   int64  
 5   GS      5021 non-null   int64  
 6   Tgt     5019 non-null   float64
 7   Rec     5021 non-null   int64  
 8   Ctch%   5021 non-null   object 
 9   Yds     5021 non-null   int64  
 10  Y/R     4771 non-null   float64
 11  TD      5021 non-null   int64  
 12  Lng     5021 non-null   int64  
 13  Y/Tgt   5018 non-null   float64
 14  R/G     5021 non-null   float64
 15  Y/G     5021 non-null   float64
 16  Fmb     4909 non-null   float64
 17  id      5021 non-null   object 
dtypes: float64(6), int64(8), object(4)
memory usage: 706.2+ KB


In [5]:
# Remove rank column
df = df.drop(columns=['Rk'],axis=1)
df.head()

Unnamed: 0,Player,Age,Pos,G,GS,Tgt,Rec,Ctch%,Yds,Y/R,TD,Lng,Y/Tgt,R/G,Y/G,Fmb,id
0,Antonio Brown*+,26,WR,16,16,181.0,129,71.3%,1698,13.2,13,63,9.4,8.1,106.1,2.0,BrowAn04
1,Demaryius Thomas*,27,WR,16,16,184.0,111,60.3%,1619,14.6,11,86,8.8,6.9,101.2,0.0,ThomDe03
2,Julio Jones*,25,WR,15,15,163.0,104,63.8%,1593,15.3,6,79,9.8,6.9,106.2,2.0,JoneJu02
3,Matt Forte,29,RB,16,16,130.0,102,78.5%,808,7.9,4,56,6.2,6.4,50.5,2.0,FortMa00
4,Emmanuel Sanders*,27,WR,16,16,141.0,101,71.6%,1404,13.9,9,48,10.0,6.3,87.8,1.0,SandEm00


In [6]:
# Rearrange columns
cols = df.columns.tolist()
cols = cols[-1:] + cols[:-1]
df = df[cols]
df.head()

Unnamed: 0,id,Player,Age,Pos,G,GS,Tgt,Rec,Ctch%,Yds,Y/R,TD,Lng,Y/Tgt,R/G,Y/G,Fmb
0,BrowAn04,Antonio Brown*+,26,WR,16,16,181.0,129,71.3%,1698,13.2,13,63,9.4,8.1,106.1,2.0
1,ThomDe03,Demaryius Thomas*,27,WR,16,16,184.0,111,60.3%,1619,14.6,11,86,8.8,6.9,101.2,0.0
2,JoneJu02,Julio Jones*,25,WR,15,15,163.0,104,63.8%,1593,15.3,6,79,9.8,6.9,106.2,2.0
3,FortMa00,Matt Forte,29,RB,16,16,130.0,102,78.5%,808,7.9,4,56,6.2,6.4,50.5,2.0
4,SandEm00,Emmanuel Sanders*,27,WR,16,16,141.0,101,71.6%,1404,13.9,9,48,10.0,6.3,87.8,1.0


In [7]:
# Filter out positions except QB,RB,WR,TE
positions = ["QB","RB","WR","TE"]
df = df[df["Pos"].isin(positions)]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4756 entries, 0 to 5020
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      4756 non-null   object 
 1   Player  4756 non-null   object 
 2   Age     4756 non-null   int64  
 3   Pos     4756 non-null   object 
 4   G       4756 non-null   int64  
 5   GS      4756 non-null   int64  
 6   Tgt     4754 non-null   float64
 7   Rec     4756 non-null   int64  
 8   Ctch%   4756 non-null   object 
 9   Yds     4756 non-null   int64  
 10  Y/R     4567 non-null   float64
 11  TD      4756 non-null   int64  
 12  Lng     4756 non-null   int64  
 13  Y/Tgt   4753 non-null   float64
 14  R/G     4756 non-null   float64
 15  Y/G     4756 non-null   float64
 16  Fmb     4672 non-null   float64
dtypes: float64(6), int64(7), object(4)
memory usage: 668.8+ KB


In [8]:
# Add columns for TD/G, Att/G
df["TD/G"] = df["TD"]/df["G"]
df = df.round({"TD/G":1})
df.head()

Unnamed: 0,id,Player,Age,Pos,G,GS,Tgt,Rec,Ctch%,Yds,Y/R,TD,Lng,Y/Tgt,R/G,Y/G,Fmb,TD/G
0,BrowAn04,Antonio Brown*+,26,WR,16,16,181.0,129,71.3%,1698,13.2,13,63,9.4,8.1,106.1,2.0,0.8
1,ThomDe03,Demaryius Thomas*,27,WR,16,16,184.0,111,60.3%,1619,14.6,11,86,8.8,6.9,101.2,0.0,0.7
2,JoneJu02,Julio Jones*,25,WR,15,15,163.0,104,63.8%,1593,15.3,6,79,9.8,6.9,106.2,2.0,0.4
3,FortMa00,Matt Forte,29,RB,16,16,130.0,102,78.5%,808,7.9,4,56,6.2,6.4,50.5,2.0,0.2
4,SandEm00,Emmanuel Sanders*,27,WR,16,16,141.0,101,71.6%,1404,13.9,9,48,10.0,6.3,87.8,1.0,0.6


In [9]:
# Limit data to 2022 players

# Load in 2022 data and limit to desired positions
path = "../data/raw/2022/receiving_2022.csv"
df_2022 = pd.read_csv(path)
df_2022 = df_2022[df_2022["Pos"].isin(positions)]
df_2022.Pos.unique()

array(['WR', 'RB', 'TE', 'QB'], dtype=object)

In [10]:
# Convert unique 2022 ids into a list and filter large data
ids_2022 = df_2022["id"].tolist()
df = df[df["id"].isin(ids_2022)]
df.head()

Unnamed: 0,id,Player,Age,Pos,G,GS,Tgt,Rec,Ctch%,Yds,Y/R,TD,Lng,Y/Tgt,R/G,Y/G,Fmb,TD/G
2,JoneJu02,Julio Jones*,25,WR,15,15,163.0,104,63.8%,1593,15.3,6,79,9.8,6.9,106.2,2.0,0.4
9,CobbRa00,Randall Cobb*,24,WR,16,16,127.0,91,71.7%,1287,14.1,12,70,10.1,5.7,80.4,3.0,0.8
16,LandJa00,Jarvis Landry,22,WR,16,11,112.0,84,75.0%,758,9.0,5,25,6.8,5.3,47.4,7.0,0.3
24,AlleKe00,Keenan Allen,22,WR,14,14,121.0,77,63.6%,783,10.2,4,35,6.5,5.5,55.9,2.0,0.3
25,HopkDe00,DeAndre Hopkins,22,WR,16,16,127.0,76,59.8%,1210,15.9,6,76,9.5,4.8,75.6,2.0,0.4


In [11]:
# Sort data by id and age
df = df.sort_values(by = ["id","Age"], ascending = [True, True])
df.head()

Unnamed: 0,id,Player,Age,Pos,G,GS,Tgt,Rec,Ctch%,Yds,Y/R,TD,Lng,Y/Tgt,R/G,Y/G,Fmb,TD/G
652,AbduAm00,Ameer Abdullah,22,RB,16,9,38.0,25,65.8%,183,7.3,1,36,4.8,1.6,11.4,5.0,0.1
1781,AbduAm00,Ameer Abdullah,23,RB,2,2,5.0,5,100.0%,57,11.4,1,18,11.4,2.5,28.5,0.0,0.5
1131,AbduAm00,Ameer Abdullah,24,RB,14,11,35.0,25,71.4%,162,6.5,1,22,4.6,1.8,11.6,2.0,0.1
4371,AbduAm00,Ameer Abdullah,25,RB,10,0,4.0,3,75.0%,28,9.3,0,12,7.0,0.3,2.8,1.0,0.0
4735,AbduAm00,Ameer Abdullah,26,RB,16,0,21.0,15,71.4%,88,5.9,1,16,4.2,0.9,5.5,1.0,0.1


In [12]:
# Test finding a player's career data (ex: Ezekiel Elliott)
test_df = df[df["id"] == "ElliEz00"]
test_df.head(6)

Unnamed: 0,id,Player,Age,Pos,G,GS,Tgt,Rec,Ctch%,Yds,Y/R,TD,Lng,Y/Tgt,R/G,Y/G,Fmb,TD/G
1589,ElliEz00,Ezekiel Elliott*+,21,RB,15,15,39.0,32,82.1%,363,11.3,1,83,9.3,2.1,24.2,5.0,0.1
1128,ElliEz00,Ezekiel Elliott,22,RB,10,10,38.0,26,68.4%,269,10.3,2,72,7.1,2.6,26.9,1.0,0.2
4017,ElliEz00,Ezekiel Elliott*,23,RB,15,15,95.0,77,81.1%,567,7.4,3,38,6.0,5.1,37.8,6.0,0.2
4572,ElliEz00,Ezekiel Elliott*,24,RB,16,16,71.0,54,76.1%,420,7.8,2,27,5.9,3.4,26.3,3.0,0.1
3539,ElliEz00,Ezekiel Elliott,25,RB,15,15,71.0,52,73.2%,338,6.5,2,19,4.8,3.5,22.5,6.0,0.1
3014,ElliEz00,Ezekiel Elliott,26,RB,17,17,65.0,47,72.3%,287,6.1,2,21,4.4,2.8,16.9,1.0,0.1


In [13]:
# Cleanup indexes of data
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,id,Player,Age,Pos,G,GS,Tgt,Rec,Ctch%,Yds,Y/R,TD,Lng,Y/Tgt,R/G,Y/G,Fmb,TD/G
0,AbduAm00,Ameer Abdullah,22,RB,16,9,38.0,25,65.8%,183,7.3,1,36,4.8,1.6,11.4,5.0,0.1
1,AbduAm00,Ameer Abdullah,23,RB,2,2,5.0,5,100.0%,57,11.4,1,18,11.4,2.5,28.5,0.0,0.5
2,AbduAm00,Ameer Abdullah,24,RB,14,11,35.0,25,71.4%,162,6.5,1,22,4.6,1.8,11.6,2.0,0.1
3,AbduAm00,Ameer Abdullah,25,RB,10,0,4.0,3,75.0%,28,9.3,0,12,7.0,0.3,2.8,1.0,0.0
4,AbduAm00,Ameer Abdullah,26,RB,16,0,21.0,15,71.4%,88,5.9,1,16,4.2,0.9,5.5,1.0,0.1


In [14]:
# Re-test finding a player's career data
test_df = df[df["id"] == "ElliEz00"]
test_df.head(6)

Unnamed: 0,id,Player,Age,Pos,G,GS,Tgt,Rec,Ctch%,Yds,Y/R,TD,Lng,Y/Tgt,R/G,Y/G,Fmb,TD/G
342,ElliEz00,Ezekiel Elliott*+,21,RB,15,15,39.0,32,82.1%,363,11.3,1,83,9.3,2.1,24.2,5.0,0.1
343,ElliEz00,Ezekiel Elliott,22,RB,10,10,38.0,26,68.4%,269,10.3,2,72,7.1,2.6,26.9,1.0,0.2
344,ElliEz00,Ezekiel Elliott*,23,RB,15,15,95.0,77,81.1%,567,7.4,3,38,6.0,5.1,37.8,6.0,0.2
345,ElliEz00,Ezekiel Elliott*,24,RB,16,16,71.0,54,76.1%,420,7.8,2,27,5.9,3.4,26.3,3.0,0.1
346,ElliEz00,Ezekiel Elliott,25,RB,15,15,71.0,52,73.2%,338,6.5,2,19,4.8,3.5,22.5,6.0,0.1
347,ElliEz00,Ezekiel Elliott,26,RB,17,17,65.0,47,72.3%,287,6.1,2,21,4.4,2.8,16.9,1.0,0.1


In [15]:
# Convert df into a processed csv
path = "../data/processed/receiving.csv"
df.to_csv(path)