In [2]:
# Attach Google Drive to the Colab runtime; the final CSV is written beneath this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [7]:
import pandas as pd

# Seasons to download, oldest first; each label is substituted into base_url's `{}` slot.
seasons = [
    '2020-21',
    '2021-22',
    '2022-23',
    '2023-24',
]

# Per-season location of the merged gameweek CSV (one placeholder for the season label).
base_url = (
    "https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League"
    "/refs/heads/master/data/{}/gws/merged_gw.csv"
)

# Columns kept from every season file. The list order is also the column order
# of the trimmed frames, so it must not be rearranged.
columns_to_extract = [
    # identifying columns
    'name', 'position', 'team',
    # per-fixture columns, alphabetical
    'assists', 'bonus', 'bps', 'clean_sheets', 'creativity',
    'element', 'fixture', 'goals_conceded', 'goals_scored',
    'ict_index', 'influence', 'kickoff_time', 'minutes',
    'opponent_team', 'own_goals', 'penalties_missed', 'penalties_saved',
    'red_cards', 'round', 'saves', 'selected',
    'team_a_score', 'team_h_score', 'threat', 'total_points',
    'transfers_balance', 'transfers_in', 'transfers_out', 'value',
    'was_home', 'yellow_cards',
    # gameweek number (renamed to `gameweek` after the seasons are merged)
    'GW',
]

# Collects one trimmed DataFrame per season that loads successfully.
season_dataframes = []

# Every season is cut down to the same columns plus the season label added below.
selected_columns = columns_to_extract + ['season']

for season in seasons:
    url = base_url.format(season)
    print(f"Fetching data for season: {season} from {url}")
    try:
        # Download the CSV, tag each row with its season, then keep only the
        # wanted columns (a missing column raises and is reported below).
        df = pd.read_csv(url).assign(season=season)[selected_columns]
    except Exception as e:
        # Best-effort: report the failure and carry on with the remaining seasons.
        print(f"Failed to fetch or process data for season {season}: {e}")
    else:
        season_dataframes.append(df)

# Merge all seasons into one DataFrame.
# If every download failed the list is empty; fail here with a clear message
# instead of letting pd.concat raise its opaque "No objects to concatenate".
if not season_dataframes:
    raise ValueError("No season data was loaded; cannot build the merged dataset.")

# Concatenate, rename GW -> gameweek, and put rows in the saved order
# (by player name, then season, then gameweek). sort_values keeps the unique
# 0..n-1 index created by ignore_index=True, which the lag assignment below
# relies on for alignment.
merged_data = (
    pd.concat(season_dataframes, ignore_index=True)
    .rename(columns={'GW': 'gameweek'})
    .sort_values(['name', 'season', 'gameweek'])
)

# Lagged bps: the bps from the same player's previous row within the season.
# Players are keyed by (season, element) rather than (name, season): two players
# who share a display name in one season would otherwise be interleaved and
# receive each other's bps as their "previous" value.
# NOTE(review): assumes `element` is the per-season player id in this dataset - confirm.
player_keys = ['season', 'element']

# Order each player's rows chronologically. kickoff_time breaks ties between
# rows that share a gameweek, so the lag does not depend on download order.
# NOTE(review): assumes kickoff_time values sort chronologically (e.g. ISO-8601 text) - confirm.
chronological = merged_data.sort_values(player_keys + ['gameweek', 'kickoff_time'])

# Previous row's bps per player-season; the first row of each group is NaN.
lagged_bps = chronological.groupby(player_keys)['bps'].shift(1)

# Forward-fill within each player-season so a missing bps value in the middle
# of a season inherits the last known lag instead of staying NaN.
lagged_bps = lagged_bps.groupby([chronological[key] for key in player_keys]).ffill()

# Assignment aligns on the index, so the chronological ordering used above
# does not disturb merged_data's row order.
merged_data['bps_last_gameweek'] = lagged_bps

# Introduce a missing indicator for lagged bps (1 where no earlier value exists).
merged_data['missing_bps_last_gameweek'] = merged_data['bps_last_gameweek'].isna().astype(int)

# Fill the remaining NaN values in bps_last_gameweek with 0.
merged_data['bps_last_gameweek'] = merged_data['bps_last_gameweek'].fillna(0)




# Write the merged frame to Google Drive as CSV, leaving out the row index,
# then report where it was written.
output_file = '/content/drive/My Drive/CPSC-171/Final Proj/fpl_historical_gameweek_data.csv'
merged_data.to_csv(path_or_buf=output_file, index=False)
print(f"Data for all seasons saved to: {output_file}")


Fetching data for season: 2020-21 from https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/refs/heads/master/data/2020-21/gws/merged_gw.csv
Fetching data for season: 2021-22 from https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/refs/heads/master/data/2021-22/gws/merged_gw.csv
Fetching data for season: 2022-23 from https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/refs/heads/master/data/2022-23/gws/merged_gw.csv
Fetching data for season: 2023-24 from https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/refs/heads/master/data/2023-24/gws/merged_gw.csv
Data for all seasons saved to: /content/drive/My Drive/CPSC-171/Final Proj/fpl_historical_gameweek_data.csv
