In [42]:
# Import necessary libraries
import requests
import pandas as pd
from tqdm import tqdm

# Define API endpoints
bootstrap_url = "https://fantasy.premierleague.com/api/bootstrap-static/"
player_url_template = "https://fantasy.premierleague.com/api/element-summary/{player_id}/"

# Fetch player and team metadata.
# The timeout stops the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() turns an HTTP error response into an explicit exception
# here instead of a confusing JSON-decode error or KeyError further down.
response = requests.get(bootstrap_url, timeout=30)
response.raise_for_status()
data = response.json()

# Extract player and team data
players = data['elements']
teams = data['teams']

# Convert player data to DataFrame (one row per entry of data['elements'])
df_players = pd.DataFrame(players)

# Columns retained from the player table, grouped the same way as before:
# identifiers, core features, position-specific features, derived features.
identifier_fields = ['id', 'first_name', 'second_name', 'team', 'element_type']
core_fields = [
    'minutes', 'goals_scored', 'assists', 'clean_sheets',
    'yellow_cards', 'red_cards', 'own_goals', 'penalties_saved', 'penalties_missed',
]
position_specific_fields = [
    'saves', 'bps', 'influence', 'creativity', 'threat',
    'expected_goals', 'expected_assists', 'expected_goal_involvements', 'expected_goals_conceded',
]
derived_fields = ['ict_index']
selected_features = identifier_fields + core_fields + position_specific_fields + derived_fields

# element_type code -> readable position name; codes missing from this dict map to NaN
position_map = {
    1: "Goalkeeper",
    2: "Defender",
    3: "Midfielder",
    4: "Forward"
}

# Keep only the selected columns, then append two convenience columns:
#   player_name - first and second name joined by a single space
#   position    - element_type translated through position_map
df_selected = df_players[selected_features].assign(
    player_name=lambda frame: frame['first_name'] + " " + frame['second_name'],
    position=lambda frame: frame['element_type'].map(position_map),
)

# Fetch gameweek-specific data
# Fields copied unchanged from every entry of a player's 'history' list,
# in the order they should appear as columns.
history_fields = [
    'total_points',  # Target variable
    'minutes', 'goals_scored', 'assists', 'clean_sheets', 'saves',
    'goals_conceded', 'own_goals', 'penalties_saved', 'penalties_missed',
    'yellow_cards', 'red_cards', 'bonus', 'bps', 'influence', 'creativity',
    'threat', 'ict_index', 'value',
]

gameweek_data = []
failed_player_ids = []  # players whose summary request did not return HTTP 200
print("Fetching gameweek data for all players...")
# A single Session reuses the connection across the per-player requests.
with requests.Session() as session:
    for player_id in tqdm(df_selected['id']):
        # The timeout keeps one stalled request from blocking the whole loop forever.
        response = session.get(player_url_template.format(player_id=player_id), timeout=30)
        if response.status_code != 200:
            # Remember the failure so missing players are reported, not silently dropped.
            failed_player_ids.append(player_id)
            continue
        player_data = response.json()
        for gw in player_data.get('history', []):
            row = {'player_id': player_id, 'gameweek': gw['round']}
            row.update({field: gw[field] for field in history_fields})
            row['was_home'] = int(gw['was_home'])  # store the home flag as 0/1
            row['opponent_team'] = gw['opponent_team']
            row['season'] = '2024-25'
            gameweek_data.append(row)

if failed_player_ids:
    print(f"Warning: no gameweek data fetched for {len(failed_player_ids)} player(s): {failed_player_ids}")



Fetching gameweek data for all players...


100%|██████████| 693/693 [00:42<00:00, 16.30it/s]


In [43]:
# Mount Google Drive
# Exposes Drive under /content/drive; later cells read and write CSV files below that path.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime,
# so this cell would fail elsewhere — confirm Colab is the only intended environment.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [44]:
# Convert gameweek data to DataFrame
# gameweek_data is the list of per-fixture dicts collected above; each dict becomes one row.
df_gameweeks = pd.DataFrame(gameweek_data)
# Bare last expression: shows the resulting column index as the cell output.
df_gameweeks.columns

Index(['player_id', 'gameweek', 'total_points', 'minutes', 'goals_scored',
       'assists', 'clean_sheets', 'saves', 'goals_conceded', 'own_goals',
       'penalties_saved', 'penalties_missed', 'yellow_cards', 'red_cards',
       'bonus', 'bps', 'influence', 'creativity', 'threat', 'ict_index',
       'value', 'was_home', 'opponent_team', 'season'],
      dtype='object')

In [45]:
# Add lagged bps
# The lag is computed per player *and* season, using the same keys for the sort,
# the shift and the forward fill, so a value can never be carried from one
# season of a player into another.
lag_keys = ['player_id', 'season']
df_gameweeks = df_gameweeks.sort_values(lag_keys + ['gameweek'])
df_gameweeks['bps_last_gameweek'] = df_gameweeks.groupby(lag_keys)['bps'].shift(1)

# Forward fill within each group: a gap in the lag (the previous row's bps was
# itself missing) takes the most recent earlier value. The first row of a group
# has nothing before it and stays NaN.
df_gameweeks['bps_last_gameweek'] = df_gameweeks.groupby(lag_keys)['bps_last_gameweek'].ffill()
# Introduce a missing indicator for lagged bps (1 where no lag value exists even after the fill)
df_gameweeks['missing_bps_last_gameweek'] = df_gameweeks['bps_last_gameweek'].isna().astype(int)

# Fill NaN values in bps_last_gameweek with 0 (the indicator above still records which rows were filled)
df_gameweeks['bps_last_gameweek'] = df_gameweeks['bps_last_gameweek'].fillna(0)


# Merge player metadata with gameweek data
metadata_columns = ['id', 'first_name', 'second_name', 'team', 'position']
df_gameweeks = (
    df_gameweeks
    # Drop metadata left over from a previous run of this cell, so re-running it
    # does not produce suffixed duplicate columns (first_name_x, first_name_y, ...).
    .drop(columns=metadata_columns, errors='ignore')
    .merge(
        df_selected[metadata_columns],
        left_on='player_id',
        right_on='id',
        how='left',
        # Each gameweek row must match at most one player; a duplicated id in
        # df_selected would otherwise silently multiply rows.
        validate='many_to_one',
    )
)

# Add weighted features
position_weights = {
    "Goalkeeper": {"goal_weight": 10, "clean_sheet_weight": 4},
    "Defender": {"goal_weight": 6, "clean_sheet_weight": 4},
    "Midfielder": {"goal_weight": 5, "clean_sheet_weight": 1},
    "Forward": {"goal_weight": 4, "clean_sheet_weight": 0},
}

# Flatten the nested dict into one position -> weight lookup per feature.
goal_weight_by_position = {pos: w['goal_weight'] for pos, w in position_weights.items()}
clean_sheet_weight_by_position = {pos: w['clean_sheet_weight'] for pos, w in position_weights.items()}

# Vectorised lookup instead of a row-wise apply: much faster, and it also works on
# an empty frame. Positions absent from position_weights (including NaN) get
# weight 0, so their weighted value is 0.
row_goal_weight = df_gameweeks['position'].map(goal_weight_by_position).fillna(0).astype(int)
row_clean_sheet_weight = df_gameweeks['position'].map(clean_sheet_weight_by_position).fillna(0).astype(int)

df_gameweeks['weighted_goals'] = df_gameweeks['goals_scored'] * row_goal_weight
df_gameweeks['weighted_clean_sheets'] = df_gameweeks['clean_sheets'] * row_clean_sheet_weight

# Save the processed gameweek data to Google Drive
# The path is under the /content/drive mount point, so Drive must already be mounted.
gameweek_file_path = '/content/drive/My Drive/CPSC-171/Final Proj/fpl_gameweek_data_with_features.csv'
# index=False: the row index is not written as a column.
df_gameweeks.to_csv(gameweek_file_path, index=False)

print(f"Processed gameweek data with features saved to: {gameweek_file_path}")


Processed gameweek data with features saved to: /content/drive/My Drive/CPSC-171/Final Proj/fpl_gameweek_data_with_features.csv


In [46]:
# Preview the first rows of the gameweek table (player metadata and weighted features included)
df_gameweeks.head()

Unnamed: 0,player_id,gameweek,total_points,minutes,goals_scored,assists,clean_sheets,saves,goals_conceded,own_goals,...,season,bps_last_gameweek,missing_bps_last_gameweek,id,first_name,second_name,team,position,weighted_goals,weighted_clean_sheets
0,1,1,0,0,0,0,0,0,0,0,...,2024-25,0.0,1,1,Fábio,Ferreira Vieira,1,Midfielder,0,0
1,1,2,0,0,0,0,0,0,0,0,...,2024-25,0.0,0,1,Fábio,Ferreira Vieira,1,Midfielder,0,0
2,1,3,0,0,0,0,0,0,0,0,...,2024-25,0.0,0,1,Fábio,Ferreira Vieira,1,Midfielder,0,0
3,1,4,0,0,0,0,0,0,0,0,...,2024-25,0.0,0,1,Fábio,Ferreira Vieira,1,Midfielder,0,0
4,1,5,0,0,0,0,0,0,0,0,...,2024-25,0.0,0,1,Fábio,Ferreira Vieira,1,Midfielder,0,0


In [47]:
# List every column of the gameweek table to confirm the lag, metadata and weighted columns are present
df_gameweeks.columns

Index(['player_id', 'gameweek', 'total_points', 'minutes', 'goals_scored',
       'assists', 'clean_sheets', 'saves', 'goals_conceded', 'own_goals',
       'penalties_saved', 'penalties_missed', 'yellow_cards', 'red_cards',
       'bonus', 'bps', 'influence', 'creativity', 'threat', 'ict_index',
       'value', 'was_home', 'opponent_team', 'season', 'bps_last_gameweek',
       'missing_bps_last_gameweek', 'id', 'first_name', 'second_name', 'team',
       'position', 'weighted_goals', 'weighted_clean_sheets'],
      dtype='object')

In [48]:
# File paths
# All three CSV files sit in the same Drive project folder, so the folder is
# spelled out once and each path is derived from it.
project_dir = '/content/drive/My Drive/CPSC-171/Final Proj'
historical_file_path = f'{project_dir}/fpl_historical_gameweek_data.csv'   # past seasons (input)
current_file_path = f'{project_dir}/fpl_gameweek_data_with_features.csv'   # current season (input)
output_file_path = f'{project_dir}/fpl_combined_gameweek_data.csv'         # combined table (output)


In [49]:
# Load the datasets
df_historical = pd.read_csv(historical_file_path)
# Define the mapping for positions (short position codes -> full position names)
position_map = {
    'GK': "Goalkeeper",
    'GKP': "Goalkeeper",
    'DEF': "Defender",
    'MID': "Midfielder",
    'FWD': "Forward"
}

# Keep the raw labels so any that fail to map can be reported by name.
raw_positions = df_historical['position']

# Map the positions; labels that are not keys of position_map become NaN
df_historical['position'] = raw_positions.map(position_map)

# Check if any unmapped positions remain
unmapped_positions = df_historical['position'].isnull().sum()
if unmapped_positions > 0:
    print(f"Warning: {unmapped_positions} rows have unmapped positions. Check the original data for inconsistencies.")
    # Name the offending raw labels so the warning is actionable.
    unmapped_labels = raw_positions[df_historical['position'].isnull()].unique().tolist()
    print(f"Unmapped raw position labels: {unmapped_labels}")


In [50]:

# Define position weights
position_weights = {
    "Goalkeeper": {"goal_weight": 10, "clean_sheet_weight": 4},
    "Defender": {"goal_weight": 6, "clean_sheet_weight": 4},
    "Midfielder": {"goal_weight": 5, "clean_sheet_weight": 1},
    "Forward": {"goal_weight": 4, "clean_sheet_weight": 0},
}

# Flatten the nested dict into one position -> weight lookup per feature.
goal_weight_by_position = {pos: w['goal_weight'] for pos, w in position_weights.items()}
clean_sheet_weight_by_position = {pos: w['clean_sheet_weight'] for pos, w in position_weights.items()}

# Add derived features
# Vectorised lookup instead of a row-wise apply: far faster on the large
# historical table, and it also works on an empty frame. Positions absent from
# position_weights (including NaN) get weight 0, so their weighted value is 0.
row_goal_weight = df_historical['position'].map(goal_weight_by_position).fillna(0).astype(int)
row_clean_sheet_weight = df_historical['position'].map(clean_sheet_weight_by_position).fillna(0).astype(int)

df_historical['weighted_goals'] = df_historical['goals_scored'] * row_goal_weight
df_historical['weighted_clean_sheets'] = df_historical['clean_sheets'] * row_clean_sheet_weight
print(df_historical.columns)
print(len(df_historical.columns))

Index(['name', 'position', 'team', 'assists', 'bonus', 'bps', 'clean_sheets',
       'creativity', 'element', 'fixture', 'goals_conceded', 'goals_scored',
       'ict_index', 'influence', 'kickoff_time', 'minutes', 'opponent_team',
       'own_goals', 'penalties_missed', 'penalties_saved', 'red_cards',
       'round', 'saves', 'selected', 'team_a_score', 'team_h_score', 'threat',
       'total_points', 'transfers_balance', 'transfers_in', 'transfers_out',
       'value', 'was_home', 'yellow_cards', 'gameweek', 'season',
       'bps_last_gameweek', 'missing_bps_last_gameweek', 'weighted_goals',
       'weighted_clean_sheets'],
      dtype='object')
40


In [51]:
# Preview the first rows of the historical table after position mapping and weighted features
df_historical.head()

Unnamed: 0,name,position,team,assists,bonus,bps,clean_sheets,creativity,element,fixture,...,transfers_out,value,was_home,yellow_cards,gameweek,season,bps_last_gameweek,missing_bps_last_gameweek,weighted_goals,weighted_clean_sheets
0,Aaron Connolly,Forward,Brighton,0,0,-3,0,0.3,78,7,...,0,55,True,0,1,2020-21,0.0,1,0,0
1,Aaron Connolly,Forward,Brighton,0,2,27,1,11.3,78,16,...,6493,55,False,0,2,2020-21,-3.0,0,4,0
2,Aaron Connolly,Forward,Brighton,0,0,2,0,12.1,78,19,...,13297,55,True,0,3,2020-21,27.0,0,0,0
3,Aaron Connolly,Forward,Brighton,0,0,7,0,0.3,78,32,...,11710,55,False,0,4,2020-21,2.0,0,0,0
4,Aaron Connolly,Forward,Brighton,1,0,13,0,10.3,78,40,...,14852,55,False,0,5,2020-21,7.0,0,0,0


In [52]:
# Reload the current-season gameweek data (including its derived feature columns)
# from the CSV at current_file_path.
df_current = pd.read_csv(current_file_path)

In [53]:
# Build a single 'name' column (first name, one space, second name).
# The historical table already has a 'name' column, so this gives both tables the same one.
first_names = df_current['first_name']
second_names = df_current['second_name']
df_current['name'] = first_names + " " + second_names

# Show the column index and the number of columns, then preview the first rows.
print(df_current.columns)
print(df_current.shape[1])
df_current.head()

Index(['player_id', 'gameweek', 'total_points', 'minutes', 'goals_scored',
       'assists', 'clean_sheets', 'saves', 'goals_conceded', 'own_goals',
       'penalties_saved', 'penalties_missed', 'yellow_cards', 'red_cards',
       'bonus', 'bps', 'influence', 'creativity', 'threat', 'ict_index',
       'value', 'was_home', 'opponent_team', 'season', 'bps_last_gameweek',
       'missing_bps_last_gameweek', 'id', 'first_name', 'second_name', 'team',
       'position', 'weighted_goals', 'weighted_clean_sheets', 'name'],
      dtype='object')
34


Unnamed: 0,player_id,gameweek,total_points,minutes,goals_scored,assists,clean_sheets,saves,goals_conceded,own_goals,...,bps_last_gameweek,missing_bps_last_gameweek,id,first_name,second_name,team,position,weighted_goals,weighted_clean_sheets,name
0,1,1,0,0,0,0,0,0,0,0,...,0.0,1,1,Fábio,Ferreira Vieira,1,Midfielder,0,0,Fábio Ferreira Vieira
1,1,2,0,0,0,0,0,0,0,0,...,0.0,0,1,Fábio,Ferreira Vieira,1,Midfielder,0,0,Fábio Ferreira Vieira
2,1,3,0,0,0,0,0,0,0,0,...,0.0,0,1,Fábio,Ferreira Vieira,1,Midfielder,0,0,Fábio Ferreira Vieira
3,1,4,0,0,0,0,0,0,0,0,...,0.0,0,1,Fábio,Ferreira Vieira,1,Midfielder,0,0,Fábio Ferreira Vieira
4,1,5,0,0,0,0,0,0,0,0,...,0.0,0,1,Fábio,Ferreira Vieira,1,Midfielder,0,0,Fábio Ferreira Vieira


In [54]:
# Drop the identifier columns that are excluded from the combined table
# ('player_id', 'id', 'first_name', 'second_name'); every other column is kept.
current_only_columns = ['player_id', 'id', 'first_name', 'second_name']
columns_to_keep = [col for col in df_current.columns if col not in current_only_columns]

df_current_filtered = df_current[columns_to_keep]

# Align the historical data with the current data structure
# (raises KeyError if the historical table lacks any of the kept columns)
df_historical_filtered = df_historical[columns_to_keep]


def _was_home_as_int(frame):
    """Return `frame` with a boolean 'was_home' column converted to 0/1 integers.

    The historical CSV stores 'was_home' as True/False while the current-season
    data stores it as 0/1. Since pandas 2.0, concatenating a bool column with an
    integer column yields an object column mixing both, so the flag is
    normalised first. Frames whose 'was_home' is not boolean are returned unchanged.
    """
    if pd.api.types.is_bool_dtype(frame['was_home']):
        return frame.assign(was_home=frame['was_home'].astype(int))
    return frame


# Concatenate historical and current season data
# NOTE(review): the previews show 'team' as a club name in the historical data
# (e.g. 'Brighton') but as a numeric id in the current data (e.g. 1), so the
# combined 'team' column mixes both — confirm whether the ids should be mapped
# to names before concatenating.
df_combined = pd.concat(
    [_was_home_as_int(df_historical_filtered), _was_home_as_int(df_current_filtered)],
    ignore_index=True,
)


In [55]:
# Summarise the combined table: column index, column count, row count, then a preview.
print(df_combined.columns)
n_rows, n_cols = df_combined.shape
print(n_cols)
print(n_rows)
df_combined.head()

Index(['gameweek', 'total_points', 'minutes', 'goals_scored', 'assists',
       'clean_sheets', 'saves', 'goals_conceded', 'own_goals',
       'penalties_saved', 'penalties_missed', 'yellow_cards', 'red_cards',
       'bonus', 'bps', 'influence', 'creativity', 'threat', 'ict_index',
       'value', 'was_home', 'opponent_team', 'season', 'bps_last_gameweek',
       'missing_bps_last_gameweek', 'team', 'position', 'weighted_goals',
       'weighted_clean_sheets', 'name'],
      dtype='object')
30
115969


Unnamed: 0,gameweek,total_points,minutes,goals_scored,assists,clean_sheets,saves,goals_conceded,own_goals,penalties_saved,...,was_home,opponent_team,season,bps_last_gameweek,missing_bps_last_gameweek,team,position,weighted_goals,weighted_clean_sheets,name
0,1,1,45,0,0,0,0,2,0,0,...,1,5,2020-21,0.0,1,Brighton,Forward,0,0,Aaron Connolly
1,2,8,89,1,0,1,0,0,0,0,...,0,14,2020-21,-3.0,0,Brighton,Forward,4,0,Aaron Connolly
2,3,2,73,0,0,0,0,2,0,0,...,1,13,2020-21,27.0,0,Brighton,Forward,0,0,Aaron Connolly
3,4,2,65,0,0,0,0,3,0,0,...,0,7,2020-21,2.0,0,Brighton,Forward,0,0,Aaron Connolly
4,5,4,12,0,1,0,0,0,0,0,...,0,6,2020-21,7.0,0,Brighton,Forward,0,0,Aaron Connolly


In [56]:
# Save the combined dataset
# Reuse output_file_path from the "File paths" cell (it holds the same path that
# was previously hard-coded here) so the destination is defined in one place.
output_combined_file = output_file_path
# index=False: the row index is not written as a column.
df_combined.to_csv(output_combined_file, index=False)

print(f"Combined dataset saved to: {output_combined_file}")

Combined dataset saved to: /content/drive/My Drive/CPSC-171/Final Proj/fpl_combined_gameweek_data.csv
