<a href="https://colab.research.google.com/github/Arvean/FantasyFootballScorePredictor/blob/main/FantasyFootballScorePredictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
CSE 163
This file contains the function that trains a model and predicts fantasy
football player scores as part of the team's CSE 163 final project. It uses
scikit-learn's decision tree regressor, together with pandas and numpy, to
train an ML model that predicts each player's fantasy score for the next
year from the previous year's data.
"""
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np


# Seasons that have a '<year>.csv' file. Each pair of consecutive seasons
# yields one (features, next-season label) dataframe.
_SEASONS = range(2000, 2020)

# Number of leading season pairs used for training: features from 2000-2015
# paired with labels from 2001-2016.
_TRAIN_PAIRS = 16

# Generic CSV column name -> descriptive name. The descriptive names already
# exist in the data (presumably as duplicates of the generic ones), so they
# are dropped before the generic columns are renamed onto them.
_STAT_ALIASES = {
    'Att': 'PassingAtt',
    'Yds': 'PassingYds',
    'Att.1': 'RushingAtt',
    'Yds.1': 'RushingYds',
    'Yds.2': 'ReceivingYds',
}

# Columns that must not be fed to the model: identifiers, the label itself,
# and the index column left behind by a previous to_csv call.
_NON_FEATURE_COLUMNS = ['Tm', 'FantasyPoints', 'Player', 'Unnamed: 0']

# Percent errors above this cap are treated as outliers and ignored.
_ERROR_CAP = 500

# Positions reported, in print order.
_REPORTED_POSITIONS = ('QB', 'RB', 'TE', 'WR')

_REPORT_COLUMNS = ['Player', 'Predicted Points', 'Actual Points', 'Error']


def _load_seasons(data_dir, years):
  """Reads one '<year>.csv' per season from data_dir, in year order."""
  import os
  return [pd.read_csv(os.path.join(data_dir, '{}.csv'.format(year)))
          for year in years]


def _tidy_stat_columns(df):
  """Returns a copy of df with duplicate stat columns dropped and the
  generically named ones ('Att', 'Yds.1', ...) given descriptive names."""
  # rename() returns a new frame, so its result has to be kept.
  deduplicated = df.drop(columns=list(_STAT_ALIASES.values()),
                         errors='ignore')
  return deduplicated.rename(columns=_STAT_ALIASES)


def _feature_matrix(df, pos_codes):
  """Returns the model inputs for df: every non-identifier column, with the
  'Pos' strings replaced by their integer code (-1 when unknown/missing)."""
  features = df.drop(columns=_NON_FEATURE_COLUMNS, errors='ignore')
  features['Pos'] = features['Pos'].map(pos_codes).fillna(-1).astype(int)
  return features


def _mean_percent_error(actual, predicted):
  """Returns the mean absolute percent error, skipping rows whose actual
  value is zero and rows whose error exceeds _ERROR_CAP."""
  actual = np.asarray(actual, dtype=float)
  predicted = np.asarray(predicted, dtype=float)
  scored = actual != 0  # a zero actual score would divide by zero
  errors = pd.Series(
      100 * np.abs((actual[scored] - predicted[scored]) / actual[scored]))
  return errors[errors <= _ERROR_CAP].mean()


def predict_best_players(data_dir='/content'):
  """
  Trains a decision tree regressor to predict a player's fantasy points for
  the next season from the current season's stats, then prints the training
  and testing error and the top five predicted quarterbacks, running backs,
  tight ends, and wide receivers for the test season.

  One csv file per season from 2000 to 2019 is read from data_dir. Each
  season's stats are joined by player name with the following season's
  'FantasyPoints'. The first 16 season pairs (stats from 2000-2015, labels
  from 2001-2016) form the training set. The last pair (2018 stats, 2019
  labels) is the test set.

  The error reported is a mean absolute percent error that skips players
  with zero actual points and individual errors above 500 percent.

  Args:
    data_dir: directory holding '2000.csv' ... '2019.csv'. Defaults to
      '/content', the Colab upload directory.

  Returns:
    None. All results are printed.
  """
  seasons = _load_seasons(data_dir, _SEASONS)

  # Pair each season's stats with the next season's fantasy points. The
  # current season's own points are kept as the 'Points' feature.
  season_pairs = [
      stats.rename(columns={'FantasyPoints': 'Points'}).merge(
          next_season[['Player', 'FantasyPoints']], on='Player')
      for stats, next_season in zip(seasons[:-1], seasons[1:])
  ]

  df_train = _tidy_stat_columns(pd.concat(season_pairs[:_TRAIN_PAIRS]))
  # The last pair is the one that predicts the final season from the season
  # before it.
  df_test = _tidy_stat_columns(season_pairs[-1])

  # Give every position a stable integer code. The built-in hash() of a
  # string changes from one interpreter run to the next, which would make
  # the fitted tree irreproducible.
  known_positions = pd.concat(
      [df_train['Pos'], df_test['Pos']]).dropna().unique()
  pos_codes = {pos: code
               for code, pos in enumerate(sorted(known_positions, key=str))}

  train_features = _feature_matrix(df_train, pos_codes)
  test_features = _feature_matrix(df_test, pos_codes)
  train_label = df_train['FantasyPoints']
  test_label = df_test['FantasyPoints']

  # Weight each training row by the player's current production.
  train_weight = (df_train['Points'] +
                  1.5 * df_train['Rec'] +
                  1.5 * df_train['PassingTD'] +
                  1.5 * df_train['RushingTD'])

  # Create ML regression tree model
  model = DecisionTreeRegressor()
  model.fit(train_features, train_label, sample_weight=train_weight)

  # Predict results for training and test data
  train_predictions = model.predict(train_features)
  test_predictions = model.predict(test_features)

  # Per-player report for the test season. Rows with an error above the cap
  # (including infinite/undefined errors from zero actual points) are dropped.
  report = pd.DataFrame()
  report['Player'] = df_test['Player']
  report['Predicted Points'] = test_predictions
  report['Actual Points'] = df_test['FantasyPoints']
  report['Error'] = 100 * np.abs(
      (report['Actual Points'] - report['Predicted Points']) /
      report['Actual Points'])
  report['Pos'] = df_test['Pos']
  report = report[report['Error'] <= _ERROR_CAP]

  # Print training and testing error for model
  print('train error:')
  print(_mean_percent_error(train_label, train_predictions))
  print('')
  print('test_error:')
  print(_mean_percent_error(test_label, test_predictions))
  print('')
  print('')

  # Print the top 5 predicted players for each position
  for shown, position in enumerate(_REPORTED_POSITIONS):
    if shown:
      print('')
    at_position = report[report['Pos'] == position]
    print("Top {}'s:".format(position))
    print(at_position.nlargest(5, 'Predicted Points')[_REPORT_COLUMNS])

def main():
  """Runs the prediction pipeline by calling predict_best_players()."""
  predict_best_players()


# Run main() only when this file is executed directly, not when imported.
if __name__ == '__main__':
  main()