# 02 - Feature Engineering

In [1]:
# Import libraries and packages
import pandas as pd
import numpy as np

In [2]:
# Read in Data
df = pd.read_csv(r"/Users/emmanuel/Documents/Portfolio Projects/FA_Player_Development_Model/data/processed/Wingers.csv").drop(['Unnamed: 0'],axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 46 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   player_key_or_id        22 non-null     object 
 1   Player                  22 non-null     object 
 2   season                  22 non-null     object 
 3   Primary_Squad           22 non-null     object 
 4   Primary_Squad_t1        22 non-null     object 
 5   Primary_Squad_t2        22 non-null     object 
 6   Playing Time_Min        22 non-null     int64  
 7   gls_per90               22 non-null     float64
 8   ast_per90               22 non-null     float64
 9   xg_per90                22 non-null     float64
 10  xag_per90               22 non-null     float64
 11  npxg_per90              22 non-null     float64
 12  prgC_per90              22 non-null     float64
 13  prgP_per90              22 non-null     float64
 14  prgR_per90              22 non-null     floa

## Rolling 3 Season Averages Metrics

In [3]:
# Rolling 3-season averages for per90 metrics 
per90_metrics = ['gls_per90', 'ast_per90', 'xg_per90', 'xag_per90','npxg_per90', 'prgC_per90', 'prgP_per90', 'prgR_per90']

# create rolling 3-season averages
for col in per90_metrics:
    t1, t2 = f'{col}_t1', f'{col}_t2'
    df[f'{col}_roll3'] = df[[col, t1, t2]].mean(axis=1)

# confirm new columns
roll_cols = [f'{col}_roll3' for col in per90_metrics]
print("Added columns:", roll_cols)

# preview a few
df[['Player', 'xg_per90', 'xg_per90_t1', 'xg_per90_t2', 'xg_per90_roll3']].head()


Added columns: ['gls_per90_roll3', 'ast_per90_roll3', 'xg_per90_roll3', 'xag_per90_roll3', 'npxg_per90_roll3', 'prgC_per90_roll3', 'prgP_per90_roll3', 'prgR_per90_roll3']


Unnamed: 0,Player,xg_per90,xg_per90_t1,xg_per90_t2,xg_per90_roll3
0,Brennan Johnson,0.429949,0.410853,0.263804,0.368202
1,Cody Gakpo,0.330233,0.491207,0.382716,0.401385
2,Anthony Gordon,0.295809,0.317647,0.28445,0.299302
3,Dango Ouattara,0.377057,0.190399,0.26108,0.276179
4,Leandro Trossard,0.254517,0.43117,0.248442,0.311376


## Team Strength

In [4]:
# Read Premier League Table
pl_2425 = pd.read_csv(r"/Users/emmanuel/Documents/Portfolio Projects/FA_Player_Development_Model/data/raw/premier_league_2425.csv")
pl_2324 = pd.read_csv(r"/Users/emmanuel/Documents/Portfolio Projects/FA_Player_Development_Model/data/raw/premier_league_2324.csv")
pl_2223 = pd.read_csv(r"/Users/emmanuel/Documents/Portfolio Projects/FA_Player_Development_Model/data/raw/premier_league_2223.csv")

In [5]:
# Add season labels
pl_2223["season"] = "2022-2023"
pl_2324["season"] = "2023-2024"
pl_2425["season"] = "2024-2025"

In [6]:
# Combine into one DataFrame
standings = pd.concat([pl_2223, pl_2324, pl_2425], ignore_index=True)
# Create team_strength score (1st = strongest)
standings["team_strength"] = (21 - standings["Rank"]) / 20
standings.head()

Unnamed: 0,Rank,Club,MP,W,D,L,GF,GA,GD,Pts,season,team_strength
0,1,Manchester City,38,28,5,5,94,33,61,89,2022-2023,1.0
1,2,Arsenal,38,26,6,6,88,43,45,84,2022-2023,0.95
2,3,Manchester Utd,38,23,6,9,58,43,15,75,2022-2023,0.9
3,4,Newcastle Utd,38,19,14,5,68,33,35,71,2022-2023,0.85
4,5,Liverpool,38,19,10,9,75,47,28,67,2022-2023,0.8


In [7]:
# Merge current season standings
df = df.merge(
    standings[["season", "Club", "team_strength", "GF", "GA", "GD", "Pts"]],
    left_on=["season", "Primary_Squad"],
    right_on=["season", "Club"],
    how="left",
    suffixes=("", "_curr")
)
df = df.rename(columns={
    "GF": "team_gf_curr", "GA": "team_ga_curr",
    "GD": "team_gd_curr", "Pts": "team_pts_curr",
    "team_strength": "team_strength_curr"
})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 60 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   player_key_or_id        22 non-null     object 
 1   Player                  22 non-null     object 
 2   season                  22 non-null     object 
 3   Primary_Squad           22 non-null     object 
 4   Primary_Squad_t1        22 non-null     object 
 5   Primary_Squad_t2        22 non-null     object 
 6   Playing Time_Min        22 non-null     int64  
 7   gls_per90               22 non-null     float64
 8   ast_per90               22 non-null     float64
 9   xg_per90                22 non-null     float64
 10  xag_per90               22 non-null     float64
 11  npxg_per90              22 non-null     float64
 12  prgC_per90              22 non-null     float64
 13  prgP_per90              22 non-null     float64
 14  prgR_per90              22 non-null     floa

In [8]:
# Merge previous season (t1)
df["season_t1"] = (df["season"].str[:4].astype(int) - 1).astype(str) + "-" + df["season"].str[:4]
df = df.merge(
    standings[["season", "Club", "team_strength", "GF", "GA", "GD", "Pts"]].rename(
        columns={
            "season": "season_t1",
            "team_strength": "team_strength_t1",
            "GF": "team_gf_t1", "GA": "team_ga_t1",
            "GD": "team_gd_t1", "Pts": "team_pts_t1"
        }
    ),
    left_on=["season_t1", "Primary_Squad_t1"],
    right_on=["season_t1", "Club"],
    how="left"
).drop(columns=["Primary_Squad_t1"])

In [9]:
# Merge two seasons ago (t2)
df["season_t2"] = (df["season"].str[:4].astype(int) - 2).astype(str) + "-" + (df["season"].str[:4].astype(int) - 1).astype(str)
df = df.merge(
    standings[["season", "Club", "team_strength", "GF", "GA", "GD", "Pts"]].rename(
        columns={
            "season": "season_t2",
            "team_strength": "team_strength_t2",
            "GF": "team_gf_t2", "GA": "team_ga_t2",
            "GD": "team_gd_t2", "Pts": "team_pts_t2"
        }
    ),
    left_on=["season_t2", "Primary_Squad_t2"],
    right_on=["season_t2", "Club"],
    how="left"
).drop(columns=["Primary_Squad_t2"])

In [10]:
# Merge team strength for the club going into 25/26 season
right = (
    standings.loc[standings["season"] == "2024-2025", ["Club", "team_strength"]]
    .rename(columns={"Club": "Transfermarkt_Club"})
)

df = df.drop(columns=[c for c in ["Club", "Club_x", "Club_y"] if c in df.columns])

# merge on Transfermarkt_Club, then rename the strength column
df = df.merge(right, on="Transfermarkt_Club", how="left")
df = df.rename(columns={"team_strength": "team_strength_25_26"})


In [11]:
# Create rolling 3-season averages
df["team_strength_roll3"] = df[["team_strength_curr", "team_strength_t1", "team_strength_t2"]].mean(axis=1)
df["team_gf_roll3"] = df[["team_gf_curr", "team_gf_t1", "team_gf_t2"]].mean(axis=1)
df["team_pts_roll3"] = df[["team_pts_curr", "team_pts_t1", "team_pts_t2"]].mean(axis=1)

### Team Strength Feature Engineering

**Overview**

To capture the contextual influence of a player’s club environment, I engineered a team strength feature for each Premier League season in the dataset (2022/23 – 2024/25).
This feature quantifies the relative quality of each club based on its final league position.

**Calculation**

- Team strength was derived from the club’s final rank using the following formula:

team_strength = (21 - Rank) / 20

**Purpose**

Including team strength allows the model to account for the effect of club quality on player performance.

Players in stronger teams typically produce more xG and xA due to:
- Greater possession share
- Better teammates and tactical setups
- Increased attacking opportunities

> The rolling average smooths short-term fluctuations (e.g., managerial changes or one-off over/underperformances) and reflects a club’s long-term quality trend.

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 73 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   player_key_or_id        22 non-null     object 
 1   Player                  22 non-null     object 
 2   season                  22 non-null     object 
 3   Primary_Squad           22 non-null     object 
 4   Playing Time_Min        22 non-null     int64  
 5   gls_per90               22 non-null     float64
 6   ast_per90               22 non-null     float64
 7   xg_per90                22 non-null     float64
 8   xag_per90               22 non-null     float64
 9   npxg_per90              22 non-null     float64
 10  prgC_per90              22 non-null     float64
 11  prgP_per90              22 non-null     float64
 12  prgR_per90              22 non-null     float64
 13  gls_per90_t1            22 non-null     float64
 14  gls_per90_t2            22 non-null     floa

In [13]:
# For any player whose 25/26 club had no match, impute a default "promoted team" strength
df["team_strength_25_26"] = df["team_strength_25_26"].fillna(0.2)

### Imputing Team Strength for Newly Promoted Clubs

Because the 2025–26 season includes newly promoted teams that did not appear in the 2024–25 Premier League standings, those clubs do not have an existing team_strength value.
To ensure the model has consistent numeric input for all players, I manually imputed a 17th-place equivalent team strength (0.20) for newly promoted sides.

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 73 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   player_key_or_id        22 non-null     object 
 1   Player                  22 non-null     object 
 2   season                  22 non-null     object 
 3   Primary_Squad           22 non-null     object 
 4   Playing Time_Min        22 non-null     int64  
 5   gls_per90               22 non-null     float64
 6   ast_per90               22 non-null     float64
 7   xg_per90                22 non-null     float64
 8   xag_per90               22 non-null     float64
 9   npxg_per90              22 non-null     float64
 10  prgC_per90              22 non-null     float64
 11  prgP_per90              22 non-null     float64
 12  prgR_per90              22 non-null     float64
 13  gls_per90_t1            22 non-null     float64
 14  gls_per90_t2            22 non-null     floa

In [None]:
# Extracting Dataset
df.to_csv("/Users/emmanuel/Documents/Portfolio Projects/FA_Player_Development_Model/data/processed/Feature_Engineered.csv", index=True)