[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/maxi-tb22/SFM/blob/main/00_code/SFM.ipynb)

<h1 align="center"><strong><font size="6"> SFM: &emsp; Data Preparation </font></strong></h1>


<br>

This notebook prepares the data for the Soccer Factor Model (SFM) and exports it as a `.csv` file.

In [1]:
# --- The usual packages
import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm


# --- Own functions ---> for how to upload them from GitHub thanks to: https://changhsinlee.com/colab-import-python/
import requests

# --- --- download the helper module; the timeout keeps the cell from hanging forever and
#         'raise_for_status' stops here on an HTTP error instead of saving an error page as Python code
r = requests.get('https://raw.githubusercontent.com/maxi-tb22/SFM/main/00_code/auxiliaries_SFM.py', timeout=30)
r.raise_for_status()

# --- --- make sure your filename is the same as how you want to import
#         (explicit UTF-8 so the written file does not depend on the platform's default encoding)
with open('auxiliaries_SFM.py', 'w', encoding='utf-8') as f:
    f.write(r.text)

# --- --- presumably provides 'create_url', 'func_ladder', 'func_goals_scored', 'func_goals_conceded'
#         and 'func_Player', which the cells below call but this notebook does not define
from auxiliaries_SFM import *


In [2]:
# ================================ USER INTERACTION ================================ #

# --- Pick your Player(s): Some suggestions!
#     Each name is compared with '==' against 'player_name' in the players table, so spell it exactly as it appears there.
my_player = [
    'cristiano ronaldo',
    'robbie keane',
    'fernando torres',
    'didier drogba',
    'ruud van nistelrooy',
    'zlatan ibrahimovic',
    'alan shearer',
    'robbie fowler',
    'thierry henry',
    'harry kewell',
    'robert pires',
    'wayne rooney',
    'dimitar berbatov',
    'nicolas anelka',
    'jermain defoe',
    'robin van persie',
    'michael owen',
]

# --- Number of initial games per season to 'burn-in', i.e. to drop so that the factors get a short lead period
#     (a value of 0 keeps all games)
match_elim = 5

# ================================ USER INTERACTION ================================ #

<br>

## 1 &emsp; Load the Data

<br>

In [3]:
# ================================ Import Players ================================ #

# --- The Google-Drive share link is passed through 'create_url' (a helper not defined in this notebook)
#     and the result is read as a csv-file
players_url = create_url('https://drive.google.com/file/d/1199-wN3lIqLhHMaOtdBK7_4vR36tdpGB/view?usp=sharing')
players_info = pd.read_csv(players_url)

# --- Show the first rows
players_info.head()

Unnamed: 0,nateam,birth_date,birth_country_isoCode,birth_country,player_name,player_id,normal_position
0,russia,1969-01-23,ru,russia,andrei kanchelskis,324,m
1,england,1965-06-30,gb-eng,england,gary pallister,320,d
2,england,1960-12-31,gb-eng,england,steve bruce,319,d
3,wales,1963-11-01,gb-wls,wales,mark hughes,339,f
4,england,1964-04-04,gb-eng,england,paul parker,317,d


<br>

... and which data can we work with?

In [4]:
# --- Import the different Data Sets (each Google-Drive share link is passed through 'create_url' before reading):

# --- --- one row per player and match (has the columns 'player_id' and 'match_id' used below)
lineup_url = create_url('https://drive.google.com/file/d/1WU8yq9m2MewdDf6bAY7YvdgqFHmghZdm/view?usp=sharing')
lineup = pd.read_csv(lineup_url)

# --- --- match events (has the columns 'player_id' and 'description' used below)
events_url = create_url('https://drive.google.com/file/d/1HQk83DNL8MnGxVTpNUV7-I9SQco19bpf/view?usp=sharing')
events = pd.read_csv(events_url)

# --- --- matches (has the columns 'match_id', 'season', 'kickoff_dt', 'home_team_id' and 'away_team_id' used below)
games_url = create_url('https://drive.google.com/file/d/1njZWKNAo21H8DIYnagxGgjyMUSlb81k9/view?usp=sharing')
games = pd.read_csv(games_url)

## 2 &emsp; Data Engineering ##

Gather the data for each player in `my_player`.

The following cell may take a few minutes to run ...

<br>

In [10]:
# =============================== Start the Data-Preparation Loop =============================== #

# --- Create a Dictionary where to store each Player's individual Data.
#     The keys are pre-filled in the order of 'my_player'; every value is replaced inside the loop
#     (no shared mutable placeholder, which 'dict.fromkeys(..., [])' would create).
MY_PLAYER = dict.fromkeys(my_player)

for pp in tqdm(my_player):


  print(f'\nPreparing data for: {pp} ...\n\n')


  # ================================ Some Information on the Player ====================================== #
  Player_info = players_info[players_info.player_name == pp]

  # --- Stop with a readable message if the name is not in the data (e.g. misspelled)
  if Player_info.empty:
    raise ValueError(f"Player '{pp}' was not found in 'players_info'; check the spelling in 'my_player'.")

  # --- If several rows carry this name, the first one is used
  player_id = Player_info.player_id.values[0]


  # ================================ Which Games Has the Player played in? ================================ #

  Player_lineup = lineup[lineup.player_id == player_id].reset_index(drop=True)

  # --- Number of Games
  N = Player_lineup.shape[0]




  # =============================================== Events =============================================== #

  # --- Select Player's events
  Player_events = events[events.player_id == player_id]

  # --- Select Player's goals (not used further in this cell)
  Player_goals = Player_events[Player_events.description == "goal"]




  # =============================================== Games =============================================== #

  # --- Which seasons are played? & Kick-Off Date
  matches_seasons = []
  kickoff_date = []


  for match_id in Player_lineup['match_id']:

    # --- Look the match up once with a boolean mask; this does not rely on 'games' having a default 0..n-1 index.
    #     As before, the first matching row is used and a match missing from 'games' raises an IndexError.
    game_rows = games.loc[games.match_id == match_id, ['season', 'kickoff_dt']]

    matches_seasons.append(game_rows['season'].values[0])
    kickoff_date.append(game_rows['kickoff_dt'].values[0])


  # --- Get the UNIQUE seasons
  seasons = np.unique(matches_seasons)

  # --- Attach the Season & Kick-Off date
  Player_lineup['season'] = matches_seasons
  Player_lineup['kickoff_dt'] = kickoff_date

  # --- Sort the data-frame by time
  Player_lineup = Player_lineup.sort_values('kickoff_dt').reset_index(drop=True)

  # --- BEWARE: The 'age' column is probably erroneous: e.g. as of 2022, CR7 has not passed the age of 39 yet!
  Player_lineup = Player_lineup.drop('age', axis=1)


  # ================================ Prepare the Team Statistics (by Season) ================================ #

  # --- Get the Ladder
  table_dict = func_ladder(seasons,games)


  # --- Get 'Team Goals Scored'
  goals_scored_dict = func_goals_scored(seasons,games)


  # --- Get 'Team Goals Conceded'
  goals_conceded_dict = func_goals_conceded(seasons,games)


  # ================ Prepare the Player's individual Data-Frame (all Seasons in one) ======================== #

  # --- Create a dictionary with all relevant data to pass to the function:
  dict_build = {'Number of Games': N, 'Player_lineup': Player_lineup,'Lineup':lineup, 'Events': Player_events,
                'Games': games, 'Season': seasons, 'Ladder': table_dict,
                'Goals Scored': goals_scored_dict, 'Goals Conceded': goals_conceded_dict
              }

  out = func_Player(dict_build)

  # --- Unpack the Output:
  Player_df = out['Player_df']

  # --- Save your work!
  data_true = Player_df



  # ==================================== Some Further Data Preparation ======================================= #

  data = data_true.copy()

  # --- Which are the first 'match_elim' match_days per season?

  if match_elim > 0:

    # --- Collect the 'match_id's to drop over all seasons, then filter once
    burn_in_ids = []

    for ss in seasons:

      # --- Which is the Player's Team?
      #     NOTE(review): only the first team found for season 'ss' is used; a mid-season transfer is not handled -- confirm this is intended
      id_team_ss = pd.unique(data_true.id_team[data_true.season == ss].values)[0]

      # --- Which are team's games in season 'ss'?
      games_ss = games[(games.season == ss) & ((games.home_team_id == id_team_ss) | (games.away_team_id == id_team_ss))].sort_values('kickoff_dt').reset_index(drop=True)

      # --- Extract the 'match_id' of the first 'match_elim' matches:
      id_ss = games_ss.match_id[:match_elim].values

      burn_in_ids.extend(id_ss)

    # --- Eliminate all matches whose 'id_match' is one of the burn-in matches:
    data = data[~data['id_match'].isin(burn_in_ids)].reset_index(drop=True)




  # ==================================== Create Additional Features ======================================= #

  # --- Cast the inputs to float once; the columns in 'data' themselves are left untouched
  goalsscored_team = data['goalsscored_cum_team'].astype(float)
  goalsscored_opp = data['goalsscored_cum_opp'].astype(float)
  goalsconceded_team = data['goalsconceded_cum_team'].astype(float)
  goalsconceded_opp = data['goalsconceded_cum_opp'].astype(float)

  # ---- Difference in goals-scored between 'team' and 'opp': diff > 0 --> higher likelihood of Player scoring (?) --> probably debatable !
  data['goalsscored_diff'] = goalsscored_team - goalsscored_opp

  # ---- Goal-Balance Team: goals-scored - goals-conceded: diff > 0 --> higher likelihood of Player scoring
  data['goal_balance_team'] = goalsscored_team - goalsconceded_team

  # ---- Goal-Balance Opponent: goals-scored - goals-conceded: diff > 0 --> not sure about inferring anything about the likelihood of Player scoring
  data['goal_balance_opp'] = goalsscored_opp - goalsconceded_opp

  # ---- Difference of Goal-Balance Team vs Goal-Balance Opponent: diff > 0 --> higher likelihood of Player scoring
  data['goal_balance_diff'] = data['goal_balance_team'].astype(float) - data['goal_balance_opp'].astype(float)

  # ---- Difference in points between 'team' and 'opp': diff > 0 --> higher likelihood of Player scoring
  data['points_diff'] = data['points_team'].astype(float) - data['points_opp'].astype(float)

  # ---- Share of Player's goals within the team: higher share --> higher likelihood of Player scoring
  #      --> but isn't a high share rather an indication of a player's ability?
  #      (where the team's cumulative goals are 0, the float division gives NaN/inf rather than an error)
  data['goalsscored_share_player_team'] = data['goalsscored_cum_player'].astype(float) / goalsscored_team



  # ============================ THE END IS NEAR! Save your Player's Data ================================= #
  MY_PLAYER[pp] = data




## 3 &emsp; Collect the Data

Assemble the data into a single DataFrame called `data`.

In [11]:
# ========================================================================================================= #
#
#           Row-bind the individual Data-Frames of all Players in 'my_player'.
#           The same code path handles a single Player and several Players.
#
# ========================================================================================================= #

# --- Nothing to assemble without at least one Player
if len(my_player) == 0:
  raise ValueError("'my_player' is empty; specify at least one player.")

# --- Tag each Player's Data-Frame with the Player's name ('assign' returns a new frame, so the frames
#     stored in 'MY_PLAYER' stay untouched) and stack all of them with a single 'concat'
data_true = pd.concat(
  [MY_PLAYER[name].assign(name_player=name) for name in my_player],
  ignore_index=True,
)


# --- Save your work!
data = data_true.copy()

# ==================== The Universe of Features/Factors to Choose from ===================== #
data.columns

Index(['goal', 'goals_in_match', 'points_team', 'points_opp',
       'goalsscored_cum_team', 'goalsscored_cum_opp', 'goalsconceded_cum_team',
       'goalsconceded_cum_opp', 'home_pitch', 'goalsscored_rank_team',
       'goalsconceded_rank_opp', 'goalsscored_rank_team_wo_player',
       'goalsscored_cum_player', 'id_match', 'id_team', 'id_opp', 'name_team',
       'name_opp', 'season', 'kickoff_dt', 'goalsscored_rank_opp',
       'goalsconceded_rank_team', 'goalsscored_diff', 'goal_balance_team',
       'goal_balance_opp', 'goal_balance_diff', 'points_diff',
       'goalsscored_share_player_team', 'name_player'],
      dtype='object')

## 4 &emsp; Export the Data



In [13]:
# ======================================== Export the data as a csv-file ==================================== #

# --- Specify the folder and the file stem you want to export the data to:
directory = './10_data/SFM_data'

# --- Any ID you want to add to the file name?
data_ID = ''


# --- Build the target path; the file name stays '<directory>_<data_ID>.csv'
export_path = Path(f'{directory}_{data_ID}.csv')

# --- 'to_csv' does not create missing folders, so make sure the target folder exists first
export_path.parent.mkdir(parents=True, exist_ok=True)

# --- Export:
data.to_csv(export_path, index=False)
