# Installation
To get started, uncomment and run the following command to install all required dependencies.

In [None]:
#!pip install -q -r ./requirements.txt

# Import requirements
Here we import all external and local modules.

In [None]:
from fpldata.common import Context
from fpldata.prepare import prepare_game_weeks, get_next_gw_counts, prepare_teams, get_team_id_code_map, \
    prepare_fixtures, get_fixtures_id_code_map, prepare_players, get_players_id_code_map, \
    prepare_players_history_past, prepare_players_history, load_team_goal_stats_est
from fpldata.derive import get_player_teams, get_fixture_teams, get_players_history_fixtures, \
    get_team_score_stats, get_fixture_teams_stats, get_team_fixture_strength, get_player_team_fixture_strength, \
    get_team_future_fixtures, get_players_future_fixture_team_strengths, get_player_fixture_stats, get_players_fixture_team_eps, \
    get_players_gw_team_eps, get_player_gw_next_eps
from fpldata.backtest import get_gw_points_backtest
from fpldata.s3store import S3Store
from fpldata.export import export_dfs, add_data_sets_stats, export_data_sets
from fplpandas import FPLPandas

from shutil import copyfile
import tempfile
import os
import logging, sys
import pandas as pd
from datadict import DataDict
from typing import Dict

# Short aliases for the two pandas container types.
DF, S = pd.DataFrame, pd.Series

# Set variables
This section sets all important global variables.

In [None]:
def is_notebook():
    """Report whether the code is executing in a Jupyter notebook or qtconsole.

    Returns:
        True only when the active IPython shell class is named
        'ZMQInteractiveShell'. False for a terminal IPython session, for any
        other shell class, and when ``get_ipython`` is not defined at all.
    """
    try:
        shell_name = get_ipython().__class__.__name__
    except NameError:
        # get_ipython is not defined, so we are not running under IPython.
        return False
    return shell_name == 'ZMQInteractiveShell'

In [None]:
# Project-relative input files: the data dictionary and the list of data sets to export.
DATA_DICT_FILE = 'data/data_dictionary.csv'
DATA_SETS_FILE = 'data/data_sets.csv'
# Staging directory below the system temp directory; its content is uploaded to S3 in the last cell.
DATA_DIR = f'{tempfile.gettempdir()}/data'

TEAM_STATS_EST_FILE = 'data/team_goals_stats_estimates.csv'
LAST_SEASON = '2019-20'
CURRENT_SEASON = '2020-21'
FIXTURE_LOOK_BACK = 38  # Limit of how many fixtures to look back when calculating rolling team stats
PLAYER_FIXTURE_LOOK_BACK = 10 # Limit of how many fixtures to look back when calculating rolling player stats

# CSV files holding last season's data, located in a folder named after LAST_SEASON.
PLAYERS_FILE = f'data/{LAST_SEASON}/players.csv'
PLAYERS_HISTORY_FILE = f'data/{LAST_SEASON}/players_history.csv'
FIXTURES_FILE = f'data/{LAST_SEASON}/fixtures.csv'
TEAMS_FILE = f'data/{LAST_SEASON}/teams.csv'

# Log warnings only when running inside a notebook; log at INFO level everywhere else.
logging.basicConfig(level=(logging.WARN if is_notebook() else logging.INFO))
pd.set_option('display.max_columns', 100)

fpl = FPLPandas() # Wrapper for accessing the FPL API and mapping the data into pandas data frames.

# Load data dictionary
This section loads the data dictionary. The data dictionary contains the default ordering of fields and, for each field, a description, a default format and a mapping of the API field name to a more readable one. It is used to show data in a more user-friendly way.

In [None]:
# Collect the settings that are handed to the fpldata helper functions below in one context object.
ctx = Context()
ctx.fixtures_look_back = FIXTURE_LOOK_BACK
ctx.player_fixtures_look_back = PLAYER_FIXTURE_LOOK_BACK
ctx.last_season = LAST_SEASON
ctx.current_season = CURRENT_SEASON
# Data dictionary built from DATA_DICT_FILE.
ctx.dd = DataDict(data_dict_file=DATA_DICT_FILE)

# Load game week data
The data frame contains one row for each game week for the current season.

In [None]:
logging.info('Loading game week data ...')

gws = prepare_game_weeks(fpl.get_game_weeks(), ctx)
ctx.total_gws = gws.shape[0]
# Index value of the first row flagged as the next game week (IndexError if no row is flagged).
ctx.next_gw = gws[gws['Is Next GW?']].index.values[0]
ctx.def_next_gws = 'Next 8 GWs'
ctx.next_gw_counts = get_next_gw_counts(ctx)

# Load team data
This section loads the team data and stats from the following endpoint: https://fantasy.premierleague.com/api/bootstrap-static/ and returns it as a pandas data frame.

In [None]:
logging.info('Loading team data ...')

# Current team data: the resulting data frame contains one row for each team playing in the current season.
teams = prepare_teams(fpl.get_teams(), ctx)
team_id_code_map = get_team_id_code_map(teams)

In [None]:
# Last season's team data, read from the stored CSV file.
teams_last_season = prepare_teams(pd.read_csv(TEAMS_FILE, index_col=['id']), ctx)
teams_last_season_id_code_map = get_team_id_code_map(teams_last_season)

# One row for each team playing in the current season and the past season:
# all current teams plus those last-season rows whose index value is not among the current ones.
teams_ext = pd.concat([teams, teams_last_season.loc[~teams_last_season.index.isin(teams.index)]])

# Load fixture data
This section loads the fixture data and stats from the following endpoint: https://fantasy.premierleague.com/api/fixtures/ and returns it as a pandas data frame.

In [None]:
logging.info('Loading fixture data ...')

# Current fixture data: one row for each fixture of the current season, both past and future ones.
fixtures = prepare_fixtures(fpl.get_fixtures(), team_id_code_map, ctx).assign(Season=ctx.current_season)
fixtures_id_code_map = get_fixtures_id_code_map(fixtures)

In [None]:
# Last season's fixture data, read from the stored CSV file and tagged with last season's label.
fixtures_last_season = (
    prepare_fixtures(pd.read_csv(FIXTURES_FILE, index_col=['id']), teams_last_season_id_code_map, ctx)
    .assign(Season=ctx.last_season)
)
fixtures_last_season_id_code_map = get_fixtures_id_code_map(fixtures_last_season)

# One row for each fixture of the last and the current season, both past and future ones.
fixtures_ext = pd.concat([fixtures, fixtures_last_season])

# Load player data
This section loads the player data and stats from the following FPL API endpoint: https://fantasy.premierleague.com/api/bootstrap-static/ and returns it as a pandas data frame. **This can take a few seconds** because for each player the full history for the current season is downloaded.

In [None]:
logging.info('Loading player data ...')

# Current player data. Elements 0, 1 and 2 of the result are used below as
# the players, their past-season history and their current-season fixture history.
players_raw = fpl.get_players()
players = prepare_players(players_raw[0], ctx).assign(Season=ctx.current_season)
players_id_code_map = get_players_id_code_map(players)

# One row for every past season played in the premier league for every player in the current season.
players_history_past = prepare_players_history_past(players_raw[1], players_id_code_map, ctx)

# One row for every completed fixture in the current season for every player.
players_history = (
    prepare_players_history(players_raw[2], players_id_code_map, fixtures_id_code_map, ctx)
    .assign(Season=ctx.current_season)
)

In [None]:
# Last season's player data; the literal text 'None' in the CSV is read as a missing value.
players_last_season = (
    prepare_players(pd.read_csv(PLAYERS_FILE, index_col=['id'], na_values='None'), ctx)
    .assign(Season=ctx.last_season)
)
players_last_season_id_code_map = get_players_id_code_map(players_last_season)

players_history_last_season = (
    prepare_players_history(
        pd.read_csv(PLAYERS_HISTORY_FILE, index_col=['player_id', 'fixture']),
        players_last_season_id_code_map,
        fixtures_last_season_id_code_map,
        ctx,
    )
    .assign(Season=ctx.last_season)
)

# One row for every player in the last and the current season:
# all current players plus those last-season rows whose index value is not among the current ones.
players_ext = pd.concat([players, players_last_season.loc[~players_last_season.index.isin(players.index)]])

# One row for every completed fixture in the last and the current season.
players_history_ext = pd.concat([players_history, players_history_last_season])

In [None]:
import datetime as dt

def get_news(row: pd.Series):
    """Derives the text for the News column.

    Args:
        row: A record providing the 'News' and 'News Date' fields.

    Returns:
        None when 'News' is missing or an empty string. Otherwise the news text,
        followed by the news date formatted like ' (12 Sep 2020)' unless the
        date is missing or the literal string 'None'.
    """
    news = row['News']
    if pd.isnull(news) or news == '':
        return None

    news_date = row['News Date']
    # The null test has to be applied to the date itself. Wrapping the whole
    # `date or date == 'None'` expression in pd.isnull() let a None date or the
    # string 'None' fall through to strftime, which then raised a TypeError.
    if pd.isnull(news_date) or news_date == 'None':
        return str(news)

    return str(news) + ' (' + dt.datetime.strftime(news_date, '%d %b %Y') + ')'

# Create derived data
This section creates new data sets by combining the previously loaded ones.

In [None]:
# Progress message marking the start of the derived-data stage.
logging.info('Creating derived data sets ...')

## Players with team info

In [None]:
# get_player_teams is applied to the current players twice:
# once with the current-season teams and once with the teams of both seasons.
player_teams = get_player_teams(players, teams, ctx)
player_teams_ext = get_player_teams(players, teams_ext, ctx)

## Fixtures with team info

In [None]:
# Built from the fixtures and the teams of both seasons.
fixture_teams_ext = get_fixture_teams(fixtures_ext, teams_ext, ctx)

## Player derived fields and metrics
The section below derives a few useful player attributes but, most importantly, it calculates the total points earned by a player divided by his current cost. This can be an indicator of whether the player is undervalued or overpriced.

In [None]:
# Built from the two-season player history, the two-season fixtures and the players with team info.
players_history_fixtures_ext = get_players_history_fixtures(players_history_ext, fixtures_ext, player_teams_ext, ctx)

## Team metrics

In [None]:
# The estimates are loaded from TEAM_STATS_EST_FILE and passed on to the team score stats calculation.
team_score_stats_est = load_team_goal_stats_est(TEAM_STATS_EST_FILE, ctx)
team_score_stats_ext = get_team_score_stats(fixture_teams_ext, teams_ext, team_score_stats_est, ctx)

## Fixture stats
In order to calculate relative strengths of the teams, we aggregate the points that the team has earned so far. We later can use this information to adjust the expected points for each player.

In [None]:
# Built from the fixtures with team info and the team score stats.
fixture_teams_stats_ext = get_fixture_teams_stats(fixture_teams_ext, team_score_stats_ext, ctx)

## Calculate relative fixture strengths
Calculates the relative fixture strengths for each team. The relative strength is a factor around 1 and is used in the expected point prediction below to adjust the predicted points based on the relative strengths of the upcoming game weeks. The simple idea here is that teams with more total points so far are stronger. A value above 1 indicates that the player's team is relatively stronger and a value below 1 indicates that the team is relatively weaker.

In [None]:
# Built from the fixture stats and the teams of both seasons.
team_fixture_strength_ext = get_team_fixture_strength(fixture_teams_stats_ext, teams_ext, ctx)

## Transfer relative fixture strengths from fixtures to players
This section joins the fixture strengths data set with the player data set so that expected points can be calculated on a player basis.

In [None]:
# Built from the current players, the team fixture strengths and the two-season player history.
player_team_fixture_strength_ext = get_player_team_fixture_strength(players, team_fixture_strength_ext, players_history_ext, ctx)

## Create combined data for past and future fixtures for each player
This section concatenates two sets: one historical and one future fixture set. The reason for this is that for completed matches we need to consider the team that the player actually played for, while for future games we can assume that the player will play for the team he is currently in.

In [None]:
# Each step feeds the next: team future fixtures -> per-player future fixture strengths -> combined player fixture stats.
team_future_fixtures = get_team_future_fixtures(team_fixture_strength_ext, players_history_fixtures_ext)
players_future_fixture_team_strengths = get_players_future_fixture_team_strengths(player_teams, team_future_fixtures)
player_fixture_stats = get_player_fixture_stats(
    players_history_fixtures_ext,
    players_future_fixture_team_strengths,
    player_team_fixture_strength_ext,
)

## Calculates the expected points for the following time horizons
Calculates the cumulative expected points for all the game weeks up to the end of the season. The expected points for each time horizon are simply the sum of expected points for each game week within the time horizon.

Calculate expected points for each player and fixture combination.

In [None]:
# Derived from the combined player fixture stats alone.
players_fixture_team_eps_ext = get_players_fixture_team_eps(player_fixture_stats)

Project the fixtures to game week level to deal with game weeks when there is no fixture for a team or double fixtures.

In [None]:
# Derived from the per-fixture expected points and the players with team info.
players_gw_team_eps_ext = get_players_gw_team_eps(players_fixture_team_eps_ext, player_teams)

Calculates the expected points for the different time horizons for each player.

In [None]:
# Derived from the per-game-week expected points.
player_gw_next_eps_ext = get_player_gw_next_eps(players_gw_team_eps_ext, ctx)

## Back test the expected points

In [None]:
# Run the back test on the per-game-week expected points and average its two error columns.
gw_points_backtest = get_gw_points_backtest(players_gw_team_eps_ext, ctx)
backtest_results = dict(gw_points_backtest[['Error', 'Error Simple']].mean())
print(f'Back test results for expected points\nError: {backtest_results["Error"]:.2f}\nError Simple: {backtest_results["Error Simple"]:.2f}')
# Stop the run unless the mean 'Error' is strictly lower than the mean 'Error Simple'.
assert backtest_results['Error'] < backtest_results['Error Simple'], 'Error of the Expected Points based on the complex formula are worse than the error of the Expected Points based on the simple calculation.'

In [None]:
# Draw the back test chart only when running in a notebook.
# plotly is imported inside the branch, so the import is only executed in that case.
if is_notebook():
    import plotly.express as px
    px.line(gw_points_backtest, x='Season Game Week', y=['Avg Expected Points', 'Avg Fixture Total Points', 'Error']).show()

# Publish data sets in S3

In [None]:
logging.info('Publishing data sets to S3 ...')

# Only an ENV value of exactly 'Prod' selects the production bucket; anything else uses the test bucket.
bucket = 'fpl.177arc.net' if os.environ.get('ENV') == 'Prod' else 'fpl-test.177arc.net'
s3store = S3Store(bucket)

data_sets = pd.read_csv(DATA_SETS_FILE).set_index('Name')

# Make sure the staging directory exists before the first file is written into it.
# On a fresh machine the 'data' folder below the temp directory is not there yet.
# NOTE(review): the export helpers may already create it - the call is harmless in that case.
os.makedirs(DATA_DIR, exist_ok=True)

# Both helpers receive the notebook's global namespace, so the names of the
# module-level variables defined above are part of the export contract.
(data_sets
     .pipe(add_data_sets_stats, globals())
     .pipe(export_data_sets, f'{DATA_DIR}/{DATA_SETS_FILE.split("/")[-1]}'))

# Export data frames as CSV files.
export_dfs(globals(), data_sets, DATA_DIR, ctx)

# Copy the data dictionary file next to the exported files.
_ = copyfile(DATA_DICT_FILE, f'{DATA_DIR}/{DATA_DICT_FILE.split("/")[-1]}')

# And off we go to S3.
s3store.save_dir(DATA_DIR)

logging.info('Done!')