In [11]:
import pandas as pd
import glob
from matplotlib import pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

#load the last 10 years
path = r'./data_v2/yearly' # use your path
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

df_combined = pd.concat(li, axis=0, ignore_index=True)
# Load Data and Remove unused fields
df_combined = df_combined.drop(columns=['Unnamed: 0', 'G', 'GS', 'Tgt', 'Yds', 'Yds.1', 'Yds.2', 'Int', 'Att', 'Att.1', 'FumblesLost', 'Cmp', 'Y/R'])


In [12]:
# Get total TDs and remove unnecessary fields
df_combined['TDs'] = df_combined['PassingTD'] + df_combined['RushingTD'] + df_combined['ReceivingTD']
df_combined = df_combined.drop(columns=['PassingTD', 'RushingTD', 'ReceivingTD'])
df_combined.head()

Unnamed: 0,Player,Tm,Pos,Age,Rec,Fumbles,PassingYds,PassingAtt,RushingYds,RushingAtt,ReceivingYds,FantasyPoints,TDs
0,Chris Johnson,TEN,RB,24.0,50.0,3.0,0.0,1.0,2006.0,358.0,503.0,390.9,16.0
1,Adrian Peterson,MIN,RB,24.0,43.0,7.0,0.0,0.0,1383.0,314.0,436.0,320.9,18.0
2,Maurice Jones-Drew,JAX,RB,24.0,53.0,2.0,0.0,0.0,1391.0,312.0,374.0,323.5,16.0
3,Aaron Rodgers,GNB,QB,26.0,0.0,10.0,4434.0,541.0,316.0,58.0,0.0,336.96,35.0
4,Ray Rice,BAL,RB,22.0,78.0,3.0,0.0,1.0,1339.0,254.0,702.0,326.1,8.0


In [13]:
# Get total YDS and remove unnecessary fields
df_combined['YDs'] = df_combined['PassingYds'] + df_combined['RushingYds'] + df_combined['ReceivingYds']
df_combined = df_combined.drop(columns=['PassingYds', 'RushingYds', 'ReceivingYds'])
df_combined.head()

Unnamed: 0,Player,Tm,Pos,Age,Rec,Fumbles,PassingAtt,RushingAtt,FantasyPoints,TDs,YDs
0,Chris Johnson,TEN,RB,24.0,50.0,3.0,1.0,358.0,390.9,16.0,2509.0
1,Adrian Peterson,MIN,RB,24.0,43.0,7.0,0.0,314.0,320.9,18.0,1819.0
2,Maurice Jones-Drew,JAX,RB,24.0,53.0,2.0,0.0,312.0,323.5,16.0,1765.0
3,Aaron Rodgers,GNB,QB,26.0,0.0,10.0,541.0,58.0,336.96,35.0,4750.0
4,Ray Rice,BAL,RB,22.0,78.0,3.0,1.0,254.0,326.1,8.0,2041.0


In [14]:
# Get total Touches and remove unnecessary fields
df_combined["Touches"] = df_combined['Rec'] + df_combined['PassingAtt'] + df_combined['RushingAtt']
df_combined = df_combined.drop(columns=['Rec', 'PassingAtt', 'RushingAtt'])
df_combined.head()

Unnamed: 0,Player,Tm,Pos,Age,Fumbles,FantasyPoints,TDs,YDs,Touches
0,Chris Johnson,TEN,RB,24.0,3.0,390.9,16.0,2509.0,409.0
1,Adrian Peterson,MIN,RB,24.0,7.0,320.9,18.0,1819.0,357.0
2,Maurice Jones-Drew,JAX,RB,24.0,2.0,323.5,16.0,1765.0,365.0
3,Aaron Rodgers,GNB,QB,26.0,10.0,336.96,35.0,4750.0,599.0
4,Ray Rice,BAL,RB,22.0,3.0,326.1,8.0,2041.0,333.0


In [15]:
# let's remove any player without at least 50 touches
df_combined.drop(df_combined[df_combined.Touches < 10].index, inplace=True)
df_combined.sort_values(by='Touches', ascending=False)

Unnamed: 0,Player,Tm,Pos,Age,Fumbles,FantasyPoints,TDs,YDs,Touches
4920,Matthew Stafford,DET,QB,24.0,6.0,274.58,24.0,5096.0,763.0
1215,Ben Roethlisberger,PIT,QB,36.0,7.0,333.86,37.0,5226.0,707.0
5583,Peyton Manning,IND,QB,34.0,3.0,285.80,33.0,4718.0,697.0
3146,Drew Brees,NOR,QB,37.0,5.0,332.32,39.0,5228.0,696.0
3216,Joe Flacco,BAL,QB,31.0,5.0,234.48,22.0,4375.0,693.0
...,...,...,...,...,...,...,...,...,...
3589,Dexter McCluster,SDG,WR,27.0,0.0,10.80,0.0,38.0,10.0
3561,Keith Mumphery,HOU,WR,24.0,0.0,16.90,0.0,69.0,10.0
3560,Chris Moore,BAL,WR,23.0,1.0,13.50,0.0,65.0,10.0
3539,Terrell Watson,PHI,RB,23.0,0.0,10.30,1.0,33.0,10.0


In [16]:
# 30% for test, 70% for training
print (0.2*len(df_combined))
print (0.8*len(df_combined))
df_combined = df_combined.sample(frac=1).reset_index(drop=True)
df_combined.head()

854.8000000000001
3419.2000000000003


Unnamed: 0,Player,Tm,Pos,Age,Fumbles,FantasyPoints,TDs,YDs,Touches
0,Rashard Mendenhall,PIT,RB,22.0,3.0,203.9,8.0,1369.0,267.0
1,Dede Westbrook,JAX,WR,26.0,1.0,152.7,3.0,687.0,71.0
2,Melvin Gordon,LAC,RB,24.0,1.0,288.1,12.0,1581.0,342.0
3,Jared Cook,STL,TE,27.0,0.0,133.4,3.0,634.0,53.0
4,Jeremiah Johnson,DEN,RB,24.0,0.0,20.9,0.0,139.0,21.0


In [17]:
df_train = df_combined[:3419]
df_test = df_combined[-854:]

print("Train:")
df_train.head()
print("Test:")
df_test.head()

Train:
Test:


Unnamed: 0,Player,Tm,Pos,Age,Fumbles,FantasyPoints,TDs,YDs,Touches
3420,Mike Hart,IND,RB,23.0,0.0,23.4,1.0,124.0,31.0
3421,T.J. Hockenson,DET,TE,22.0,0.0,80.7,2.0,367.0,32.0
3422,Emmanuel Sanders,DEN,WR,27.0,1.0,299.8,9.0,1448.0,109.0
3423,Brian Hoyer,NWE,QB,25.0,0.0,6.08,1.0,114.0,25.0
3424,Dez Bryant,DAL,WR,25.0,3.0,292.4,13.0,1234.0,94.0


In [18]:
# save to csv files
df_train.to_csv('./2019_train.csv')
df_test.to_csv('./2019_test.csv')