In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
from statistics import mean

import requests

In [2]:
# Load the per-season WR table (one row per player per season) from the project's Data folder.
WR_data_complete = pd.read_csv(filepath_or_buffer='Data/WR_data_edited.csv')

Here comes the fun part. 

We now have the complete data set of every NFL WR's statistics and Madden rating for the corresponding year. It is now time to classify these players so our KNN classifier has something to work with.

Let's take a look at Marvin Jones' stats and Madden ratings. Marvin Jones is the only active receiver from the 2012 draft class, meaning he has recorded values for every season of our dataset (on second inspection, he missed the 2014 season due to injury).

What can be done to classify his career (thus far)? At a glance, he's had a long, fairly productive career: 10 seasons (11 actually, but this data does not include the 2023 season), one season over 1100 yards, a couple more close to 1000 (usually the benchmark for Pro Bowl consideration), and a very high touchdown-to-reception ratio across his career. His stats are not Hall of Fame worthy by any stretch, but they are very solid. His Madden ratings suggest a good career as well.

Let's remember the goal: to predict ROI for the rookie class. Marvin Jones' best seasons were from 2016-2020 in Detroit, which is not the team that drafted him. If you were the general manager of the Cincinnati Bengals, Marvin Jones' success during those years is of little interest to you. So it makes sense that our model should only use data from years where the player played for the team that drafted him.

Luckily for us, NFL rookie contracts are set at 4 years. After the 4 years, players become free agents and are free to sign wherever they please, sometimes re-signing with the team that drafted them, or in Jones' case, opting to sign with another team. Thus, it makes sense to cap our model at the first four years of a player's career to determine ROI.

Of course, it is possible that a player provides middling value for their franchise during these years, then re-signs with the team and sees a major uptick in production. For example, all-time great Antonio Brown (see stats below) spent his rookie contract (2010-2013) as a rotational player before having arguably the most productive 6-year stretch in NFL history after signing his second contract. However, examples like this are few and far between, as the NFL is a very fast-moving business; teams are always looking to improve talent, so unproductive rookie-contract players are usually not brought back. Thus, we can live with the occasional Antonio Brown-like career trajectory and the model should still be accurate. Julian Edelman also fits into this category.

[scroll]

In [3]:
# Show every recorded season for Mike Evans.
WR_data_complete.loc[WR_data_complete['name'].eq('Mike Evans')]

Unnamed: 0,id,name,years_played,team,season,season_type,receptions,targets,receiving_yards,receiving_tds,...,ypr,rec_td_percentage,rec_ypg,round,overall,ht,wt,forty,vertical,Overall Rating
1715,1275,Mike Evans,,TB,2014,REG,68,123,1051,12,...,15.46,0.176,70.066667,1.0,7.0,,,,,87.0
1716,1275,Mike Evans,,TB,2015,REG,74,148,1206,3,...,16.3,0.041,80.4,,,,,,,86.0
1717,1275,Mike Evans,,TB,2016,REG,96,173,1321,12,...,13.76,0.125,82.5625,,,,,,,90.0
1718,1275,Mike Evans,,TB,2017,REG,71,136,1001,5,...,14.1,0.07,66.733333,,,,,,,89.0
1719,1275,Mike Evans,,TB,2018,REG,86,138,1524,8,...,17.72,0.093,95.25,,,,,,,91.0
1720,1275,Mike Evans,,TB,2019,REG,67,118,1157,8,...,17.27,0.119,89.0,,,,,,,92.0
1721,1275,Mike Evans,,TB,2020,REG,70,109,1006,13,...,14.37,0.186,62.875,,,,,,,91.0
1722,1275,Mike Evans,,TB,2021,REG,74,114,1035,14,...,13.99,0.189,64.6875,,,,,,,92.0
1723,1275,Mike Evans,,TB,2022,REG,77,127,1124,6,...,14.6,0.078,74.933333,,,,,,,90.0


In [4]:
# Show Antonio Brown's seasons in chronological order (the raw rows are not stored in season order).
(
    WR_data_complete
    .loc[WR_data_complete['name'].eq('Antonio Brown')]
    .sort_values(by=['name', 'season'])
)

Unnamed: 0,id,name,years_played,team,season,season_type,receptions,targets,receiving_yards,receiving_tds,...,ypr,rec_td_percentage,rec_ypg,round,overall,ht,wt,forty,vertical,Overall Rating
162,84,Antonio Brown,3.0,PIT,2012,REG,66,106,787,5,...,11.92,0.076,60.538462,6.0,195.0,10-May,186.0,4.56,33.5,86.0
163,84,Antonio Brown,4.0,PIT,2013,REG,110,166,1499,8,...,13.63,0.073,93.6875,6.0,195.0,10-May,186.0,4.56,33.5,92.0
164,84,Antonio Brown,5.0,PIT,2014,REG,129,181,1698,13,...,13.16,0.101,106.125,6.0,195.0,10-May,186.0,4.56,33.5,97.0
165,84,Antonio Brown,6.0,PIT,2015,REG,136,193,1834,10,...,13.49,0.074,114.625,6.0,195.0,10-May,186.0,4.56,33.5,97.0
166,84,Antonio Brown,7.0,PIT,2016,REG,106,154,1284,12,...,12.11,0.113,85.6,6.0,195.0,10-May,186.0,4.56,33.5,97.0
167,84,Antonio Brown,8.0,PIT,2017,REG,101,163,1533,9,...,15.18,0.089,109.5,6.0,195.0,10-May,186.0,4.56,33.5,99.0
168,84,Antonio Brown,9.0,PIT,2018,REG,104,168,1297,15,...,12.47,0.144,86.466667,6.0,195.0,10-May,186.0,4.56,33.5,98.0
161,84,Antonio Brown,,NE,2019,REG,4,8,56,1,...,14.0,0.25,56.0,,,,,,,
169,84,Antonio Brown,,TB,2020,REG,45,62,483,4,...,10.73,0.089,60.375,,,,,,,86.0
170,84,Antonio Brown,,TB,2021,REG,42,62,545,4,...,12.98,0.095,77.857143,,,,,,,


Thus, Marvin Jones' ROI for his rookie contract is given below.

In [5]:
# First four recorded seasons for Marvin Jones, in chronological order.
# Sort before taking head(4): the raw frame is not guaranteed to be stored in
# season order, so an unsorted head(4) could return the wrong seasons.
# NOTE(review): this is "first four rows", not "first four contract years" --
# because the 2014 season is missing, the fourth row is 2016 with DET rather
# than a season with the drafting team. Confirm that is intended for the
# rookie-contract ROI.
(
    WR_data_complete[WR_data_complete['name'] == 'Marvin Jones']
    .sort_values(by=['name', 'season'])
    .head(4)
)

Unnamed: 0,id,name,years_played,team,season,season_type,receptions,targets,receiving_yards,receiving_tds,...,ypr,rec_td_percentage,rec_ypg,round,overall,ht,wt,forty,vertical,Overall Rating
1639,1209,Marvin Jones,1.0,CIN,2012,REG,18,32,201,1,...,11.17,0.056,25.125,5.0,166.0,2-Jun,199.0,4.46,33.0,70.0
1640,1209,Marvin Jones,2.0,CIN,2013,REG,51,80,712,10,...,13.96,0.196,44.5,5.0,166.0,2-Jun,199.0,4.46,33.0,81.0
1641,1209,Marvin Jones,4.0,CIN,2015,REG,65,103,816,4,...,12.55,0.062,51.0,5.0,166.0,2-Jun,199.0,4.46,33.0,82.0
1642,1209,Marvin Jones,,DET,2016,REG,55,103,930,4,...,16.91,0.073,62.0,,,,,,,84.0


Something else to think about: Marvin Jones was a fifth-round pick. He was drafted 166th overall in the 2012 NFL draft, as is shown in the data. A 900-, 800-, and 700-yard season in a four-year span may not be ideal for a top-10 pick, but for a fifth-round pick that is excellent production.

This introduces a new element: if I wish to include draft position in my model (and I do), then my classifier has to represent relative success, not absolute. Furthermore, I would need to know where players of the upcoming rookie class will be selected. The 2024 NFL draft is not until April 27, so I won't know this in time. Fortunately, there is usually a consensus among top scouts about where prospects will land, e.g., top of the first round, second/third round, day 3 pick (fifth to seventh round), etc.




In [9]:
# Issue with groupby(name).mean(): the name, position and
# season_type columns were objects and not strings; this was meant to fix that:
#for col in WR_data_complete.select_dtypes(include=['object']).columns:
    #WR_data_complete[col] = WR_data_complete[col].astype('string')

#top12 = WR_data_complete[WR_data_complete['overall'] <= 12]
#top12.groupby('name', as_index=False).mean()

#not sure what's going on but, this should fix it

#top12numerical = top12.select_dtypes(include = [np.number]).columns.tolist()

#averaged_stats = top12.groupby('name')[top12numerical].mean().reset_index()

#orange = top12.groupby('name', as_index=False).head(4)

In [10]:
# Some seasons are stored out of order, so put each player's rows in
# chronological order. sort_values returns a new frame, so
# WR_data_complete itself is left unchanged.
WR_data = WR_data_complete.sort_values(by=['name', 'season'])

# 'round' and 'overall' (draft round / overall pick) are NaN on some rows.
# They are constant per player, so carry the last known value forward within
# each player's rows. GroupBy.ffill() is the supported spelling of the
# deprecated groupby(...).fillna(method='ffill') and produces the same values.
# NOTE(review): players are keyed by 'name' here; two different players who
# share a name would be filled as if they were one -- confirm, or key on 'id'.
for draft_col in ('round', 'overall'):
    WR_data[draft_col] = WR_data.groupby('name')[draft_col].ffill()

# Draft columns are now filled wherever an earlier row for the player had a value.

First, let's find the mean rookie contract output and ratings for a top 12 pick --> a top prospect.

In [17]:
# Every season row belonging to a player whose overall draft pick was 12 or lower.
top12 = WR_data.loc[WR_data['overall'].le(12)]

# Keep each player's first four rows as a stand-in for the four-year rookie contract.
# NOTE(review): this is the first four rows present in the data, which is not
# necessarily career years 1-4 for players drafted before the data begins.
top12rookie = top12.groupby('name', as_index=False).head(4)

# Average only the numeric columns; text columns (team, season_type, ...) cannot be averaged.
numeric_cols = list(top12rookie.select_dtypes(include=[np.number]).columns)
averaged_stats = (
    top12rookie
    .groupby('name')[numeric_cols]
    .mean()
    .reset_index()
)

# Drop the rows for older players who were not on rookie deals.
# NOTE(review): these are hard-coded row labels of averaged_stats (alphabetical
# by name), so they silently point at different players if the data changes.
average_current = averaged_stats.drop(index=[2, 3, 6, 18, 22])

# Inspect one player's averaged line, one stat per row.
average_current.loc[average_current['name'].eq('Julio Jones')].T

Unnamed: 0,15
name,Julio Jones
id,979.0
years_played,3.5
season,2013.5
receptions,90.0
targets,138.75
receiving_yards,1310.5
receiving_tds,6.5
receiving_yards_after_catch,472.0
receiving_first_downs,62.5
