In [59]:
import pandas as pd

In [60]:
# Datasets

laliga_matches = pd.read_csv("../../Datasets/LaLiga_Matches.csv")
matches_full = pd.read_csv("../../Datasets/matches_full.csv")
matches_laliga = pd.read_csv("../../Datasets/matches_laliga.csv")
players = pd.read_csv("../../Datasets/database.csv")

## Some teams are not properly written so renamed it


In [61]:
team_mapping = {
    "Ath Bilbao": "Athletic Club",
    "Ath Madrid": "Atletico Madrid",
    "Athletic Club": "Athletic Club",
    "Atletico Madrid": "Atletico Madrid",
    "Real Madrid": "Real Madrid",
    "Barcelona": "Barcelona",
    "Valencia": "Valencia",
    "Sevilla": "Sevilla",
    "Real Sociedad": "Real Sociedad",
    "Villarreal": "Villarreal",
    "Real Betis": "Real Betis",
    "Betis": "Real Betis",
    "Espanyol": "Espanyol",
    "Celta": "Celta Vigo",
    "Celta Vigo": "Celta Vigo",
    "Getafe": "Getafe",
    "Alaves": "Alavés",
    "Alavés": "Alavés",
    "Levante": "Levante",
    "Osasuna": "Osasuna",
    "Granada": "Granada",
    "Cadiz": "Cádiz",
    "Cádiz": "Cádiz",
    "Elche": "Elche",
    "Valladolid": "Valladolid",
    "Mallorca": "Mallorca",
    "Rayo Vallecano": "Rayo Vallecano",
    "Vallecano": "Rayo Vallecano",
    "Las Palmas": "Las Palmas",
    "Girona": "Girona",
    "Leganes": "Leganés",
    "Leganés": "Leganés",
    "Eibar": "Eibar",
    "Huesca": "Huesca",
    "Oviedo": "Oviedo",
    "Sociedad": "Real Sociedad",
}

In [62]:
def standardize_team_names(df, cols):
    df = df.copy()
    for col in cols:
        if col in df.columns:
            df[col] = df[col].replace(team_mapping)
    return df


In [63]:
target_seasons = ["2019-20", "2020-21", "2021-22", "2022-23", "2023-24", "2024-25", "2025-26"]

def filter_last_5_years(df, season_col="Season"):
    if season_col in df.columns:
        return df[df[season_col].isin(target_seasons)].copy()
    return df


## 1. LaLiga Matches Dataset

**Source:** `LaLiga_Matches.csv`  
**Coverage:** Late 1990s to 2024

**Key Columns:**
- Date, HomeTeam, AwayTeam
- FTHG (Full Time Home Goals), FTAG (Full Time Away Goals)
- FTR (Full Time Result): H = Home Win, D = Draw, A = Away Win
- HTR (Half Time Result): H = Home Lead, D = Draw, A = Away Lead

**Cleaning Steps:**
1. Removed rows with missing values in critical columns (HomeTeam, AwayTeam, FTHG, FTAG)
2. Standardized team names using predefined mapping
3. Filtered to last 5 seasons (2019-20 to 2025-26)
4. Converted Date to datetime format
5. Created HomePoints and AwayPoints columns
   - **HomePoints** = 3 if FTR='H', 1 if FTR='D', 0 if FTR='A'
   - **AwayPoints** = 3 if FTR='A', 1 if FTR='D', 0 if FTR='H'

In [64]:
laliga_matches

Unnamed: 0,Season,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR
0,1995-96,02-09-1995,La Coruna,Valencia,3,0,H,2.0,0.0,H
1,1995-96,02-09-1995,Sp Gijon,Albacete,3,0,H,3.0,0.0,H
2,1995-96,03-09-1995,Ath Bilbao,Santander,4,0,H,2.0,0.0,H
3,1995-96,03-09-1995,Ath Madrid,Sociedad,4,1,H,1.0,1.0,D
4,1995-96,03-09-1995,Celta,Compostela,0,1,A,0.0,0.0,D
...,...,...,...,...,...,...,...,...,...,...
11659,2025-26,26-10-2025,Mallorca,Levante,1,1,D,0.0,1.0,A
11660,2025-26,26-10-2025,Real Madrid,Barcelona,2,1,H,2.0,1.0,H
11661,2025-26,26-10-2025,Osasuna,Celta,2,3,A,2.0,1.0,H
11662,2025-26,26-10-2025,Vallecano,Alaves,1,0,H,0.0,0.0,D


In [65]:
laliga_matches.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11664 entries, 0 to 11663
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Season    11664 non-null  object 
 1   Date      11664 non-null  object 
 2   HomeTeam  11664 non-null  object 
 3   AwayTeam  11664 non-null  object 
 4   FTHG      11664 non-null  int64  
 5   FTAG      11664 non-null  int64  
 6   FTR       11664 non-null  object 
 7   HTHG      11662 non-null  float64
 8   HTAG      11662 non-null  float64
 9   HTR       11662 non-null  object 
dtypes: float64(2), int64(2), object(6)
memory usage: 911.4+ KB


In [66]:
laliga_matches.describe()

Unnamed: 0,FTHG,FTAG,HTHG,HTAG
count,11664.0,11664.0,11662.0,11662.0
mean,1.546639,1.119084,0.686932,0.484823
std,1.300203,1.110989,0.840816,0.698339
min,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0
50%,1.0,1.0,0.0,0.0
75%,2.0,2.0,1.0,1.0
max,10.0,8.0,6.0,6.0


In [67]:
laliga_matches.isnull().sum()
laliga_matches[laliga_matches['HTR'].isna()]


Unnamed: 0,Season,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR
136,1995-96,19-11-1995,Ath Bilbao,La Coruna,1,0,H,,,
1472,1998-99,10-01-1999,Valladolid,Betis,0,3,A,,,


In [68]:
# Cleaning LaLiga Matches Dataset

matches_5y = filter_last_5_years(laliga_matches)
matches_5y = standardize_team_names(matches_5y, ["HomeTeam", "AwayTeam"])
matches_5y["Date"] = pd.to_datetime(matches_5y["Date"], format="%d-%m-%Y", errors="coerce")
matches_5y["HomePoints"] = matches_5y["FTR"].map({"H": 3, "D": 1, "A": 0})
matches_5y["AwayPoints"] = matches_5y["FTR"].map({"H": 0, "D": 1, "A": 3})
matches_5y = matches_5y.dropna(subset=["HomeTeam", "AwayTeam", "FTHG", "FTAG"])
matches_5y


Unnamed: 0,Season,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HomePoints,AwayPoints
9284,2019-20,2019-08-16,Athletic Club,Barcelona,1,0,H,0.0,0.0,D,3,0
9285,2019-20,2019-08-17,Celta Vigo,Real Madrid,1,3,A,0.0,1.0,A,0,3
9286,2019-20,2019-08-17,Valencia,Real Sociedad,1,1,D,0.0,0.0,D,1,1
9287,2019-20,2019-08-17,Mallorca,Eibar,2,1,H,1.0,0.0,H,3,0
9288,2019-20,2019-08-17,Leganés,Osasuna,0,1,A,0.0,0.0,D,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...
11659,2025-26,2025-10-26,Mallorca,Levante,1,1,D,0.0,1.0,A,1,1
11660,2025-26,2025-10-26,Real Madrid,Barcelona,2,1,H,2.0,1.0,H,3,0
11661,2025-26,2025-10-26,Osasuna,Celta Vigo,2,3,A,2.0,1.0,H,0,3
11662,2025-26,2025-10-26,Rayo Vallecano,Alavés,1,0,H,0.0,0.0,D,3,0


### Cleaned Dataset: matches_5y

**Contents:**
- Last 5 seasons only (2019-20 to 2025-26)
- Standardized team names
- No missing values in critical columns
- Date in datetime format
- Added HomePoints and AwayPoints columns

**Columns:** Season, Date, HomeTeam, AwayTeam, FTHG, FTAG, FTR, HTR, HomePoints, AwayPoints

## 2. Detailed Match Statistics

**Sources:** `matches_full.csv` + `matches_laliga.csv`

**Key Columns:**
- date, time, team, opponent
- gf (goals for), ga (goals against)
- xg (expected goals), xga (expected goals against)
- poss (possession %), attendance
- team_formation, opponent_formation
- sh (shots), sot (shots on target), dist (distance), fk (free kicks), pk (penalty kicks)

**Cleaning Steps:**
1. Combined both datasets
2. Filtered for seasons 2019-20 to 2025-26
3. Standardized team names
4. Converted date to datetime format
5. Handled missing values in numeric columns (coercion to numeric type)

In [69]:
matches_full

Unnamed: 0.1,Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,0,2024-08-17,21:30,La Liga,Matchweek 1,Sat,Away,W,2.0,1.0,...,Match Report,,17.0,5.0,18.6,1.0,1,1,2025,Barcelona
1,1,2024-08-24,19:00,La Liga,Matchweek 2,Sat,Home,W,2.0,1.0,...,Match Report,,13.0,5.0,16.6,0.0,0,0,2025,Barcelona
2,2,2024-08-27,21:30,La Liga,Matchweek 3,Tue,Away,W,2.0,1.0,...,Match Report,,22.0,5.0,19.3,1.0,0,0,2025,Barcelona
3,3,2024-08-31,17:00,La Liga,Matchweek 4,Sat,Home,W,7.0,0.0,...,Match Report,,23.0,11.0,13.7,1.0,0,0,2025,Barcelona
4,4,2024-09-15,16:15,La Liga,Matchweek 5,Sun,Away,W,4.0,1.0,...,Match Report,,20.0,9.0,19.1,0.0,0,0,2025,Barcelona
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4313,48,2020-07-05,17:00,La Liga,Matchweek 34,Sun,Home,L,0.0,1.0,...,Match Report,,7.0,2.0,18.9,1.0,0,0,2020,Espanyol
4314,49,2020-07-08,22:00,La Liga,Matchweek 35,Wed,Away,L,0.0,1.0,...,Match Report,,11.0,2.0,18.1,1.0,0,0,2020,Espanyol
4315,50,2020-07-12,14:00,La Liga,Matchweek 36,Sun,Home,L,0.0,2.0,...,Match Report,,14.0,3.0,20.8,3.0,0,0,2020,Espanyol
4316,51,2020-07-16,21:00,La Liga,Matchweek 37,Thu,Away,L,0.0,1.0,...,Match Report,,18.0,6.0,21.3,3.0,0,0,2020,Espanyol


In [70]:
matches_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4318 entries, 0 to 4317
Data columns (total 29 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     4318 non-null   int64  
 1   date           4318 non-null   object 
 2   time           4318 non-null   object 
 3   comp           4318 non-null   object 
 4   round          4318 non-null   object 
 5   day            4318 non-null   object 
 6   venue          4318 non-null   object 
 7   result         4318 non-null   object 
 8   gf             4318 non-null   float64
 9   ga             4318 non-null   float64
 10  opponent       4318 non-null   object 
 11  xg             4318 non-null   float64
 12  xga            4318 non-null   float64
 13  poss           4318 non-null   float64
 14  attendance     3344 non-null   float64
 15  captain        4318 non-null   object 
 16  formation      4318 non-null   object 
 17  opp formation  4318 non-null   object 
 18  referee 

In [71]:
matches_full.describe()

Unnamed: 0.1,Unnamed: 0,gf,ga,xg,xga,poss,attendance,notes,sh,sot,dist,fk,pk,pkatt,season
count,4318.0,4318.0,4318.0,4318.0,4318.0,4318.0,3344.0,0.0,4318.0,4318.0,4315.0,4318.0,4318.0,4318.0,4318.0
mean,21.691524,1.270032,1.270032,1.262112,1.262112,50.001158,27864.883373,,11.512043,3.785086,18.184774,0.453219,0.129921,0.169291,2022.359889
std,13.893162,1.16897,1.16897,0.774208,0.774208,11.435318,18103.653603,,4.838242,2.288167,3.413213,0.697135,0.362766,0.417713,1.646465
min,0.0,0.0,0.0,0.0,0.0,18.0,13.0,,0.0,0.0,4.8,0.0,0.0,0.0,2020.0
25%,9.0,0.0,0.0,0.7,0.7,42.0,14195.0,,8.0,2.0,15.9,0.0,0.0,0.0,2021.0
50%,21.0,1.0,1.0,1.1,1.1,50.0,20123.0,,11.0,3.0,18.0,0.0,0.0,0.0,2022.0
75%,33.0,2.0,2.0,1.7,1.7,58.0,39611.5,,14.0,5.0,20.0,1.0,0.0,0.0,2024.0
max,58.0,7.0,7.0,5.9,5.9,82.0,95745.0,,36.0,17.0,46.2,5.0,3.0,3.0,2025.0


In [72]:
matches_full.isnull().sum()

Unnamed: 0          0
date                0
time                0
comp                0
round               0
day                 0
venue               0
result              0
gf                  0
ga                  0
opponent            0
xg                  0
xga                 0
poss                0
attendance        974
captain             0
formation           0
opp formation       0
referee            54
match report        0
notes            4318
sh                  0
sot                 0
dist                3
fk                  0
pk                  0
pkatt               0
season              0
team                0
dtype: int64

In [73]:
matches_laliga

Unnamed: 0.1,Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,0,2025-08-16,19:30,La Liga,Matchweek 1,Sat,Away,W,3.0,0.0,...,Match Report,,24.0,8.0,18.9,1.0,0,0,2025,Barcelona
1,1,2025-08-23,21:30,La Liga,Matchweek 2,Sat,Away,W,3.0,2.0,...,Match Report,,26.0,10.0,17.0,1.0,0,0,2025,Barcelona
2,2,2025-08-31,21:30,La Liga,Matchweek 3,Sun,Away,D,1.0,1.0,...,Match Report,,11.0,2.0,20.3,0.0,1,1,2025,Barcelona
3,3,2025-09-14,21:00,La Liga,Matchweek 4,Sun,Home,W,6.0,0.0,...,Match Report,,24.0,10.0,18.4,0.0,0,0,2025,Barcelona
4,5,2025-09-21,21:00,La Liga,Matchweek 5,Sun,Home,W,3.0,0.0,...,Match Report,,16.0,7.0,18.2,2.0,0,0,2025,Barcelona
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4695,48,2020-07-05,17:00,La Liga,Matchweek 34,Sun,Home,L,0.0,1.0,...,Match Report,,7.0,2.0,18.9,1.0,0,0,2019,Espanyol
4696,49,2020-07-08,22:00,La Liga,Matchweek 35,Wed,Away,L,0.0,1.0,...,Match Report,,11.0,2.0,18.1,1.0,0,0,2019,Espanyol
4697,50,2020-07-12,14:00,La Liga,Matchweek 36,Sun,Home,L,0.0,2.0,...,Match Report,,14.0,3.0,20.8,3.0,0,0,2019,Espanyol
4698,51,2020-07-16,21:00,La Liga,Matchweek 37,Thu,Away,L,0.0,1.0,...,Match Report,,18.0,6.0,21.3,3.0,0,0,2019,Espanyol


In [74]:
matches_laliga.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4700 entries, 0 to 4699
Data columns (total 29 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     4700 non-null   int64  
 1   date           4700 non-null   object 
 2   time           4700 non-null   object 
 3   comp           4700 non-null   object 
 4   round          4700 non-null   object 
 5   day            4700 non-null   object 
 6   venue          4700 non-null   object 
 7   result         4700 non-null   object 
 8   gf             4700 non-null   float64
 9   ga             4700 non-null   float64
 10  opponent       4700 non-null   object 
 11  xg             4700 non-null   float64
 12  xga            4700 non-null   float64
 13  poss           4700 non-null   float64
 14  attendance     3724 non-null   float64
 15  captain        4700 non-null   object 
 16  formation      4700 non-null   object 
 17  opp formation  4700 non-null   object 
 18  referee 

In [75]:
matches_laliga.describe()

Unnamed: 0.1,Unnamed: 0,gf,ga,xg,xga,poss,attendance,notes,sh,sot,dist,fk,pk,pkatt,season
count,4700.0,4700.0,4700.0,4700.0,4700.0,4700.0,3724.0,0.0,4700.0,4700.0,4697.0,4700.0,4700.0,4700.0,4700.0
mean,22.061702,1.273617,1.273617,1.264532,1.264532,50.001064,28097.842105,,11.550213,3.804468,18.154652,0.443191,0.130426,0.168936,2021.604255
std,14.395784,1.165048,1.165048,0.776251,0.776251,11.509529,18104.760706,,4.906037,2.29637,3.386085,0.688555,0.362374,0.415661,1.784514
min,0.0,0.0,0.0,0.0,0.0,18.0,13.0,,0.0,0.0,4.8,0.0,0.0,0.0,2019.0
25%,9.0,0.0,0.0,0.7,0.7,42.0,14266.0,,8.0,2.0,15.9,0.0,0.0,0.0,2020.0
50%,22.0,1.0,1.0,1.1,1.1,50.0,20298.5,,11.0,3.0,18.0,0.0,0.0,0.0,2022.0
75%,34.0,2.0,2.0,1.7,1.7,58.0,39957.0,,14.0,5.0,20.0,1.0,0.0,0.0,2023.0
max,60.0,7.0,7.0,5.9,5.9,82.0,95745.0,,40.0,17.0,46.2,5.0,3.0,3.0,2025.0


In [76]:
matches_laliga[matches_laliga['attendance'].isna()]

Unnamed: 0.1,Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
83,6,2025-09-30,20:00,La Liga,Matchweek 7,Tue,Home,L,1.0,2.0,...,Match Report,,15.0,4.0,20.6,0.0,0,1,2025,Valencia
97,6,2025-09-30,20:00,La Liga,Matchweek 7,Tue,Away,W,2.0,1.0,...,Match Report,,10.0,4.0,19.0,1.0,0,0,2025,Oviedo
3180,0,2020-09-27,16:00,La Liga,Matchweek 3,Sun,Home,W,6.0,1.0,...,Match Report,,16.0,9.0,15.2,0.0,0,1,2020,Atletico Madrid
3181,1,2020-09-30,19:00,La Liga,Matchweek 4,Wed,Away,D,0.0,0.0,...,Match Report,,16.0,2.0,19.0,1.0,0,0,2020,Atletico Madrid
3182,2,2020-10-03,16:00,La Liga,Matchweek 5,Sat,Home,D,0.0,0.0,...,Match Report,,13.0,0.0,23.3,0.0,0,0,2020,Atletico Madrid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4695,48,2020-07-05,17:00,La Liga,Matchweek 34,Sun,Home,L,0.0,1.0,...,Match Report,,7.0,2.0,18.9,1.0,0,0,2019,Espanyol
4696,49,2020-07-08,22:00,La Liga,Matchweek 35,Wed,Away,L,0.0,1.0,...,Match Report,,11.0,2.0,18.1,1.0,0,0,2019,Espanyol
4697,50,2020-07-12,14:00,La Liga,Matchweek 36,Sun,Home,L,0.0,2.0,...,Match Report,,14.0,3.0,20.8,3.0,0,0,2019,Espanyol
4698,51,2020-07-16,21:00,La Liga,Matchweek 37,Thu,Away,L,0.0,1.0,...,Match Report,,18.0,6.0,21.3,3.0,0,0,2019,Espanyol


In [77]:
# Combined matches_full and matches_laliga
matches_detailed = pd.concat(
    [matches_full, matches_laliga], ignore_index=True
)

# Filtered Seasons
if "season" in matches_detailed.columns:
    # Extract year from season or date
    matches_detailed["year"] = pd.to_datetime(matches_detailed["date"]).dt.year
    matches_detailed = matches_detailed[
        matches_detailed["year"].isin([2019, 2020, 2021, 2022, 2023, 2024, 2025, 2026])

    ]

# Standardize team names
matches_detailed = standardize_team_names(
    matches_detailed, ["team", "opponent"]
)

# Date Dtype
matches_detailed["date"] = pd.to_datetime(
    matches_detailed["date"], errors="coerce"
)

# Handled missing values in numeric columns
numeric_cols = ["gf", "ga", "xg", "xga", "poss", "attendance"]
for col in numeric_cols:
    if col in matches_detailed.columns:
        matches_detailed[col] = pd.to_numeric(matches_detailed[col], errors="coerce")

matches_detailed

Unnamed: 0.1,Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,...,notes,sh,sot,dist,fk,pk,pkatt,season,team,year
0,0,2024-08-17,21:30,La Liga,Matchweek 1,Sat,Away,W,2.0,1.0,...,,17.0,5.0,18.6,1.0,1,1,2025,Barcelona,2024
1,1,2024-08-24,19:00,La Liga,Matchweek 2,Sat,Home,W,2.0,1.0,...,,13.0,5.0,16.6,0.0,0,0,2025,Barcelona,2024
2,2,2024-08-27,21:30,La Liga,Matchweek 3,Tue,Away,W,2.0,1.0,...,,22.0,5.0,19.3,1.0,0,0,2025,Barcelona,2024
3,3,2024-08-31,17:00,La Liga,Matchweek 4,Sat,Home,W,7.0,0.0,...,,23.0,11.0,13.7,1.0,0,0,2025,Barcelona,2024
4,4,2024-09-15,16:15,La Liga,Matchweek 5,Sun,Away,W,4.0,1.0,...,,20.0,9.0,19.1,0.0,0,0,2025,Barcelona,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9013,48,2020-07-05,17:00,La Liga,Matchweek 34,Sun,Home,L,0.0,1.0,...,,7.0,2.0,18.9,1.0,0,0,2019,Espanyol,2020
9014,49,2020-07-08,22:00,La Liga,Matchweek 35,Wed,Away,L,0.0,1.0,...,,11.0,2.0,18.1,1.0,0,0,2019,Espanyol,2020
9015,50,2020-07-12,14:00,La Liga,Matchweek 36,Sun,Home,L,0.0,2.0,...,,14.0,3.0,20.8,3.0,0,0,2019,Espanyol,2020
9016,51,2020-07-16,21:00,La Liga,Matchweek 37,Thu,Away,L,0.0,1.0,...,,18.0,6.0,21.3,3.0,0,0,2019,Espanyol,2020


### Cleaned Dataset: matches_detailed

**Contents:**
- Combined data from matches_full + matches_laliga
- Last 5 seasons only (2019-20 to 2025-26)
- Standardized team names
- Cleaned numeric columns with proper data types

**Columns:** date, time, team, opponent, gf, ga, xg, xga, poss, attendance, team_formation, opponent_formation, sh, sot, dist, fk, pk, pkatt

## Player Dataset

**Source:** `database.csv`

**Cleaning Steps:**
1. Standardized team names
2. Converted Date to datetime format
3. Filtered to 2019-2025
4. Cleaned numeric columns (comma → decimal point conversion)
5. Cleaned Pass Completion % (removed % sign, converted to numeric)

In [78]:
# Player Dataset Cleaning

# Standardize team names
players_clean = standardize_team_names(players.copy(), ["Team"])

# Date Dtype
players_clean["Date"] = pd.to_datetime(players_clean["Date"], errors="coerce")

# Filtered Season
players_clean["year"] = players_clean["Date"].dt.year
players_clean = players_clean[
    players_clean["year"].isin([2019, 2020, 2021, 2022, 2023, 2024, 2025, 2026])

]

# Cleaned numeric columns
numeric_cols = [
    "Goals",
    "Assists",
    "Expected Goals (xG)",
    "Non-Penalty xG (npxG)",
    "Expected Assists (xAG)",
    "Passes Completed",
    "Passes Attempted",
]

for col in numeric_cols:
    if col in players_clean.columns:
        # Handle comma as decimal separator
        players_clean[col] = (
            players_clean[col].astype(str).str.replace(",", ".")
        )
        players_clean[col] = pd.to_numeric(players_clean[col], errors="coerce")

# Cleaned Pass Completion % (removed comma)
if "Pass Completion %" in players_clean.columns:
    players_clean["Pass Completion %"] = (
        players_clean["Pass Completion %"]
        .astype(str)
        .str.replace(",", ".")
        .str.replace("%", "")
    )
    players_clean["Pass Completion %"] = pd.to_numeric(
        players_clean["Pass Completion %"], errors="coerce"
    )

players_clean

Unnamed: 0,Player,Team,#,Nation,Position,Age,Minutes,Goals,Assists,Penalty Shoot on Goal,...,Passes Completed,Passes Attempted,Pass Completion %,Progressive Passes,Carries,Progressive Carries,Dribble Attempts,Successful Dribbles,Date,year
0,Gorka Guruzeta,Athletic Club,12,ESP,FW,27-338,90,0,1,0,...,16,21,76.2,0,15,0,1,0,2024-08-15,2024
1,Álex Berenguer,Athletic Club,7,ESP,LW,29-042,71,0,0,0,...,12,23,52.2,1,17,3,5,2,2024-08-15,2024
2,Nico Williams,Athletic Club,10,ESP,LW,22-034,19,0,0,0,...,4,8,50.0,0,6,2,1,1,2024-08-15,2024
3,Iñaki Williams,Athletic Club,9,GHA,RW,30-061,90,0,0,0,...,17,26,65.4,1,22,1,2,1,2024-08-15,2024
4,Oihan Sancet,Athletic Club,8,ESP,AM,24-112,90,1,0,0,...,18,23,78.3,3,15,2,0,0,2024-08-15,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4653,Antonio Rüdiger,Real Madrid,22,GER,CB,31-276,90,0,0,0,...,83,89,93.3,5,65,1,0,0,2024-12-04,2024
4654,Raúl Asencio,Real Madrid,35,ESP,CB,21-295,90,0,0,0,...,71,75,94.7,1,52,0,1,1,2024-12-04,2024
4655,Lucas Vázquez,Real Madrid,17,ESP,RB,33-156,87,0,0,0,...,44,51,86.3,4,33,0,0,0,2024-12-04,2024
4656,Arda Güler,Real Madrid,15,TUR,CM,19-283,3,0,0,0,...,9,11,81.8,0,7,0,0,0,2024-12-04,2024


### Cleaned Dataset: players_clean

**Contents:**
- Last 5 years (2019-2025)
- Standardized team names
- All numeric columns properly formatted
- Pass Completion % as numeric value (without % symbol)

**Columns:** Player, Team, Date, Goals, Assists, Expected Goals (xG), Non-Penalty xG (npxG), Expected Assists (xAG), Passes Completed, Passes Attempted, Pass Completion %, Progressive Carries, Progressive Passes, Dribble Attempts, Successful Dribbles

In [79]:
# calculating Team Aggregates

team_stats_list = []
for team in matches_5y["HomeTeam"].unique():
    if pd.isna(team):
        continue
    # Home matches
    home_matches = matches_5y[matches_5y["HomeTeam"] == team]
    # Away matches
    away_matches = matches_5y[matches_5y["AwayTeam"] == team]

    # Combined and renamed columns
    all_matches = pd.concat([
        home_matches.rename(columns={
            "HomeTeam": "Team",
            "FTHG": "GoalsFor",
            "FTAG": "GoalsAgainst",
            "HomePoints": "Points",
        })[["Team", "Season", "GoalsFor", "GoalsAgainst", "Points"]],

        away_matches.rename(columns={
            "AwayTeam": "Team",
            "FTAG": "GoalsFor",
            "FTHG": "GoalsAgainst",
            "AwayPoints": "Points",
        })[["Team", "Season", "GoalsFor", "GoalsAgainst", "Points"]],
    ])

    # Per-season stats
    for season in target_seasons:
        season_matches = all_matches[all_matches["Season"] == season]

        if len(season_matches) > 0:
            team_stats_list.append({
                "Team": team,
                "Season": season,
                "Matches": len(season_matches),
                "Wins": len(season_matches[season_matches["Points"] == 3]),
                "Draws": len(season_matches[season_matches["Points"] == 1]),
                "Losses": len(season_matches[season_matches["Points"] == 0]),
                "GoalsFor": season_matches["GoalsFor"].sum(),
                "GoalsAgainst": season_matches["GoalsAgainst"].sum(),
                "Points": season_matches["Points"].sum(),
                "WinRate": len(season_matches[season_matches["Points"] == 3]) / len(season_matches),
                "PointsPerGame": season_matches["Points"].sum() / len(season_matches),
                "GoalDifference": season_matches["GoalsFor"].sum() - season_matches["GoalsAgainst"].sum(),
            })

team_stats = pd.DataFrame(team_stats_list)
team_stats


Unnamed: 0,Team,Season,Matches,Wins,Draws,Losses,GoalsFor,GoalsAgainst,Points,WinRate,PointsPerGame,GoalDifference
0,Athletic Club,2019-20,38,13,12,13,41,38,51,0.342105,1.342105,3
1,Athletic Club,2020-21,38,11,13,14,46,42,46,0.289474,1.210526,4
2,Athletic Club,2021-22,38,14,13,11,43,36,55,0.368421,1.447368,7
3,Athletic Club,2022-23,38,14,9,15,47,43,51,0.368421,1.342105,4
4,Athletic Club,2023-24,38,19,11,8,61,37,68,0.500000,1.789474,24
...,...,...,...,...,...,...,...,...,...,...,...,...
135,Girona,2024-25,38,11,8,19,44,60,41,0.289474,1.078947,-16
136,Girona,2025-26,10,1,4,5,9,22,7,0.100000,0.700000,-13
137,Las Palmas,2023-24,38,10,10,18,33,47,40,0.263158,1.052632,-14
138,Las Palmas,2024-25,38,8,8,22,40,61,32,0.210526,0.842105,-21


## 4. Team Statistics Aggregation

Aggregates detailed match statistics per team from matches_detailed dataset.

**Calculated Metrics:**
- **Avg Attendance** = Mean(attendance) per team
- **Total Attendance** = Sum(attendance) per team
- **Avg xG** = Mean(xg) per team
- **Avg xGA** = Mean(xga) per team
- **Avg Possession** = Mean(poss) per team
- **Avg Goals For** = Mean(gf) per team
- **Avg Goals Against** = Mean(ga) per team

In [80]:
detailed_stats = (
    matches_detailed.groupby("team")
    .agg({
        "attendance": ["mean", "sum", "count"],
        "xg": "mean",
        "xga": "mean",
        "poss": "mean",
        "gf": "mean",
        "ga": "mean",
    })
    .reset_index()
)
detailed_stats.columns = [
    "Team",
    "AvgAttendance",
    "TotalAttendance",
    "MatchesWithAttendance",
    "AvgxG",
    "AvgxGA",
    "AvgPossession",
    "AvgGoalsFor",
    "AvgGoalsAgainst",
]
team_stats = team_stats.merge(detailed_stats, on="Team", how="left")
team_stats


Unnamed: 0,Team,Season,Matches,Wins,Draws,Losses,GoalsFor,GoalsAgainst,Points,WinRate,PointsPerGame,GoalDifference,AvgAttendance,TotalAttendance,MatchesWithAttendance,AvgxG,AvgxGA,AvgPossession,AvgGoalsFor,AvgGoalsAgainst
0,Athletic Club,2019-20,38,13,12,13,41,38,51,0.342105,1.342105,3,35119.560563,12467444.0,355.0,1.295565,1.049667,49.543237,1.288248,1.002217
1,Athletic Club,2020-21,38,11,13,14,46,42,46,0.289474,1.210526,4,35119.560563,12467444.0,355.0,1.295565,1.049667,49.543237,1.288248,1.002217
2,Athletic Club,2021-22,38,14,13,11,43,36,55,0.368421,1.447368,7,35119.560563,12467444.0,355.0,1.295565,1.049667,49.543237,1.288248,1.002217
3,Athletic Club,2022-23,38,14,9,15,47,43,51,0.368421,1.342105,4,35119.560563,12467444.0,355.0,1.295565,1.049667,49.543237,1.288248,1.002217
4,Athletic Club,2023-24,38,19,11,8,61,37,68,0.500000,1.789474,24,35119.560563,12467444.0,355.0,1.295565,1.049667,49.543237,1.288248,1.002217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,Girona,2024-25,38,11,8,19,44,60,41,0.289474,1.078947,-16,20957.363229,4673492.0,223.0,1.440807,1.423767,54.565022,1.645740,1.421525
136,Girona,2025-26,10,1,4,5,9,22,7,0.100000,0.700000,-13,20957.363229,4673492.0,223.0,1.440807,1.423767,54.565022,1.645740,1.421525
137,Las Palmas,2023-24,38,10,10,18,33,47,40,0.263158,1.052632,-14,26562.771429,3718788.0,140.0,0.907143,1.626429,55.314286,0.971429,1.421429
138,Las Palmas,2024-25,38,8,8,22,40,61,32,0.210526,0.842105,-21,26562.771429,3718788.0,140.0,0.907143,1.626429,55.314286,0.971429,1.421429


## 5. Player Statistics Aggregation

Aggregates player performance statistics per team from players_clean dataset.

**Calculated Metrics:**
- **Total Goals** = Sum(Goals) per team
- **Total Assists** = Sum(Assists) per team
- **Total xG** = Sum(Expected Goals) per team
- **Total xAG** = Sum(Expected Assists) per team
- **Player Count** = Count(Player) per team
- **Avg Age** = Mean(Age in years) per team

In [81]:
if len(players_clean) > 0:
    players_with_age = players_clean.copy()

    if "Age" in players_with_age.columns:
        players_with_age["AgeYears"] = players_with_age["Age"].astype(str).str.split("-").str[0]
        players_with_age["AgeYears"] = pd.to_numeric(players_with_age["AgeYears"], errors="coerce")
    else:
        players_with_age["AgeYears"] = None

    player_stats = (
        players_with_age.groupby("Team")
        .agg({
            "Goals": "sum",
            "Assists": "sum",
            "Expected Goals (xG)": "sum",
            "Expected Assists (xAG)": "sum",
            "Player": "count",
            "AgeYears": "mean",
        })
        .reset_index()
    )

    player_stats.columns = [
        "Team",
        "TotalGoals",
        "TotalAssists",
        "TotalxG",
        "TotalxAG",
        "PlayerCount",
        "AvgAge",
    ]

    team_stats = team_stats.merge(player_stats, on="Team", how="left")
team_stats


Unnamed: 0,Team,Season,Matches,Wins,Draws,Losses,GoalsFor,GoalsAgainst,Points,WinRate,...,AvgxGA,AvgPossession,AvgGoalsFor,AvgGoalsAgainst,TotalGoals,TotalAssists,TotalxG,TotalxAG,PlayerCount,AvgAge
0,Athletic Club,2019-20,38,13,12,13,41,38,51,0.342105,...,1.049667,49.543237,1.288248,1.002217,24.0,20.0,22.3,15.9,256.0,26.664062
1,Athletic Club,2020-21,38,11,13,14,46,42,46,0.289474,...,1.049667,49.543237,1.288248,1.002217,24.0,20.0,22.3,15.9,256.0,26.664062
2,Athletic Club,2021-22,38,14,13,11,43,36,55,0.368421,...,1.049667,49.543237,1.288248,1.002217,24.0,20.0,22.3,15.9,256.0,26.664062
3,Athletic Club,2022-23,38,14,9,15,47,43,51,0.368421,...,1.049667,49.543237,1.288248,1.002217,24.0,20.0,22.3,15.9,256.0,26.664062
4,Athletic Club,2023-24,38,19,11,8,61,37,68,0.500000,...,1.049667,49.543237,1.288248,1.002217,24.0,20.0,22.3,15.9,256.0,26.664062
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,Girona,2024-25,38,11,8,19,44,60,41,0.289474,...,1.423767,54.565022,1.645740,1.421525,21.0,15.0,19.5,13.9,229.0,26.733624
136,Girona,2025-26,10,1,4,5,9,22,7,0.100000,...,1.423767,54.565022,1.645740,1.421525,21.0,15.0,19.5,13.9,229.0,26.733624
137,Las Palmas,2023-24,38,10,10,18,33,47,40,0.263158,...,1.626429,55.314286,0.971429,1.421429,18.0,14.0,13.9,10.4,243.0,27.407407
138,Las Palmas,2024-25,38,8,8,22,40,61,32,0.210526,...,1.626429,55.314286,0.971429,1.421429,18.0,14.0,13.9,10.4,243.0,27.407407


## Final Team Statistics Dataset

**Description:** Comprehensive aggregated statistics per team and season

**Includes:**
- Match results (Wins, Draws, Losses)
- Points and win rates
  - **Win Rate** = Wins / Total Matches
  - **Points Per Game** = Total Points / Total Matches
  - **Goal Difference** = Goals For - Goals Against
- Attendance metrics
- Expected goals (xG, xGA)
- Player contributions (goals, assists, xG, xAG)
- Average team age

## Understanding 27 Teams Across 5 Seasons

**La Liga has 20 teams per season**, but across **5 seasons (2019-20 to 2025-26)**, the dataset contains **27 unique teams** due to promotion and relegation.

### Teams Appearing in Certain Seasons Only

| Team | La Liga Participation |
|------|----------------------|
| Eibar | Only before 2020-21 |
| Huesca | 2020-21 only |
| Leganés | 2019-20 only |
| Granada | Not every season |
| Valladolid | Not every season |
| Cádiz | Promoted later |
| Mallorca | Promoted later |
| Alavés | Not every season |
| Elche | Promoted then relegated |
| Las Palmas | Promoted in 2023-24 |
| Girona | Promoted in 2022-23 |

**Summary:**
- Each season = 20 teams
- 5 seasons with promotion/relegation = 27 unique teams total
- This is **expected and correct**

In [82]:
print(f"Calculated statistics for {team_stats['Team'].nunique()} teams")
team_stats.shape


Calculated statistics for 28 teams


(140, 26)

In [83]:
matches_5y['HomeTeam'].unique()

array(['Athletic Club', 'Celta Vigo', 'Valencia', 'Mallorca', 'Leganés',
       'Villarreal', 'Alavés', 'Espanol', 'Real Betis', 'Atletico Madrid',
       'Granada', 'Levante', 'Osasuna', 'Real Madrid', 'Getafe',
       'Barcelona', 'Sevilla', 'Real Sociedad', 'Eibar', 'Valladolid',
       'Cádiz', 'Huesca', 'Elche', 'Rayo Vallecano', 'Almeria', 'Girona',
       'Las Palmas', 'Oviedo'], dtype=object)

In [84]:
team_stats.to_csv('../CleanedDatasets/Cleaning/team_stats.csv', index=False)

In [85]:
matches_detailed.to_csv('../CleanedDatasets/Cleaning/matches_detailed.csv', index=False)

In [86]:
players_clean.to_csv('../CleanedDatasets/Cleaning/players_clean.csv', index=False)

In [87]:
matches_5y.to_csv('../CleanedDatasets/Cleaning/matches_5y.csv', index=False)