In [1]:
import pandas as pd

In [2]:
matches_5y = pd.read_csv('../CleanedDatasets/Cleaning/matches_5y.csv')
matches_detailed = pd.read_csv('../CleanedDatasets/Cleaning/matches_detailed.csv')

In [3]:
print(matches_5y.shape, matches_detailed.shape)

(2380, 12) (9018, 30)


# Team Performance Analysis
Seven-season LaLiga team analysis (2019-20 to 2025-26) covering results, xG, home/away splits, and head-to-head performance.

Datasets:
- matches_5y.csv: match results and points
- matches_detailed.csv: detailed match stats with xG

## League Positions Dataset

Calculates final league positions for each team per season (2019-20 to 2025-26).

Datasets `matches_5y.csv`

**Calculated Metrics:**
- **Points:** Total points earned (Win = 3 pts, Draw = 1 pt, Loss = 0 pts)
- **GoalDifference:** Goals For - Goals Against
- **Position:** Ranked by Points → Goal Difference → Goals For

In [4]:
# League Positions Calculation

target_seasons = ["2019-20", "2020-21", "2021-22", "2022-23", "2023-24", "2024-25", "2025-26"]
positions_list = []

for season in target_seasons:
    df = matches_5y[matches_5y["Season"] == season]
    home = df[["HomeTeam", "HomePoints", "FTHG", "FTAG"]].rename(
        columns={"HomeTeam": "Team", "HomePoints": "Points", "FTHG": "GoalsFor", "FTAG": "GoalsAgainst"}
    )
    away = df[["AwayTeam", "AwayPoints", "FTAG", "FTHG"]].rename(
        columns={"AwayTeam": "Team", "AwayPoints": "Points", "FTAG": "GoalsFor", "FTHG": "GoalsAgainst"}
    )
    standings = (
        pd.concat([home, away])
        .groupby("Team", as_index=False)
        .sum()
    )
    standings["GoalDifference"] = standings["GoalsFor"] - standings["GoalsAgainst"]
    standings = standings.sort_values(
        ["Points", "GoalDifference", "GoalsFor"],
        ascending=False
    ).reset_index(drop=True)
    standings["Position"] = standings.index + 1
    standings["Season"] = season
    positions_list.append(standings)

league_positions = pd.concat(positions_list, ignore_index=True)
league_positions

Unnamed: 0,Team,Points,GoalsFor,GoalsAgainst,GoalDifference,Position,Season
0,Real Madrid,87,70,25,45,1,2019-20
1,Barcelona,82,86,38,48,2,2019-20
2,Atletico Madrid,70,51,27,24,3,2019-20
3,Sevilla,70,54,34,20,4,2019-20
4,Villarreal,60,63,49,14,5,2019-20
...,...,...,...,...,...,...,...
135,Mallorca,9,11,15,-4,16,2025-26
136,Real Sociedad,9,10,14,-4,17,2025-26
137,Valencia,9,10,16,-6,18,2025-26
138,Oviedo,7,7,19,-12,19,2025-26


## Performance Metrics

Datasets matches_5y.csv + league_positions

**Calculated Metrics:**

**Match Statistics:**
- Win Rate = Wins / Total Matches
- Draw Rate = Draws / Total Matches
- Loss Rate = Losses / Total Matches

**Goal Statistics:**
- Avg Goals For = Total Goals For / Total Matches
- Avg Goals Against = Total Goals Against / Total Matches
- Avg Goal Difference = (Total Goals For - Total Goals Against) / Total Matches

**Points:**
- Points Per Game = Total Points / Total Matches

**Position Consistency:**
- **Consistency Score** = max(0, 100 - (Position Std Dev / Avg Position × 100))
  - Higher score = more consistent league positioning
  - Score of 100 = perfect consistency (same position every season)

In [5]:
# Team Performance Metrics Calculation

performance_list = []

for team in matches_5y['HomeTeam'].unique():
    home_matches = matches_5y[matches_5y['HomeTeam'] == team]
    away_matches = matches_5y[matches_5y['AwayTeam'] == team]

    # Combined
    all_matches = pd.concat([
        home_matches.rename(
            columns={"HomeTeam": "Team", "FTHG": "GoalsFor", "FTAG": "GoalsAgainst", "HomePoints": "Points"})
        [["Team", "Season", "GoalsFor", "GoalsAgainst", "Points"]],
        away_matches.rename(
            columns={"AwayTeam": "Team", "FTAG": "GoalsFor", "FTHG": "GoalsAgainst", "AwayPoints": "Points"})
        [["Team", "Season", "GoalsFor", "GoalsAgainst", "Points"]]
    ])

    total_matches = len(all_matches)
    wins = len(all_matches[all_matches['Points'] == 3])
    draws = len(all_matches[all_matches['Points'] == 1])
    losses = len(all_matches[all_matches['Points'] == 0])
    total_goals_for = all_matches['GoalsFor'].sum()
    total_goals_against = all_matches['GoalsAgainst'].sum()

    # league positions
    team_positions = league_positions[league_positions["Team"] == team]
    avg_position = team_positions['Position'].mean()
    best_position = team_positions['Position'].min()
    worst_position = team_positions['Position'].max()
    position_std = team_positions['Position'].std()

    consistency_score = 100 - (position_std * 10) if position_std else None
    consistency_score = max(0, consistency_score) if consistency_score else None

    performance_list.append({
        'Team': team,
        'TotalMatches': total_matches,
        'Wins': wins,
        'Draws': draws,
        'Losses': losses,
        'WinRate': wins / total_matches,
        'DrawRate': draws / total_matches,
        'LossRate': losses / total_matches,
        'TotalGoalsFor': total_goals_for,
        'TotalGoalsAgainst': total_goals_against,
        'AvgGoalsFor': total_goals_for / total_matches,
        'AvgGoalsAgainst': total_goals_against / total_matches,
        'GoalDifference': total_goals_for - total_goals_against,
        'AvgGoalDifference': (total_goals_for - total_goals_against) / total_matches,
        'TotalPoints': all_matches['Points'].sum(),
        'PointsPerGame': all_matches['Points'].sum() / total_matches,
        'AvgLeaguePosition': avg_position,
        'BestPosition': best_position,
        'WorstPosition': worst_position,
        'PositionStdDev': position_std,
        'ConsistencyScore': consistency_score
    })

performance_metrics = pd.DataFrame(performance_list)
performance_metrics

Unnamed: 0,Team,TotalMatches,Wins,Draws,Losses,WinRate,DrawRate,LossRate,TotalGoalsFor,TotalGoalsAgainst,...,AvgGoalsAgainst,GoalDifference,AvgGoalDifference,TotalPoints,PointsPerGame,AvgLeaguePosition,BestPosition,WorstPosition,PositionStdDev,ConsistencyScore
0,Athletic Club,238,94,73,71,0.394958,0.306723,0.298319,301,235,...,0.987395,66,0.277311,355,1.491597,7.714286,4,11,2.429972,75.700284
1,Celta Vigo,238,71,72,95,0.298319,0.302521,0.39916,294,329,...,1.382353,-35,-0.147059,285,1.197479,11.857143,7,17,3.48466,65.153397
2,Valencia,238,72,74,92,0.302521,0.310924,0.386555,280,319,...,1.340336,-39,-0.163866,290,1.218487,11.857143,9,18,3.387653,66.123474
3,Mallorca,200,56,51,93,0.28,0.255,0.465,192,274,...,1.37,-82,-0.41,219,1.095,14.333333,9,19,3.983298,60.167015
4,Leganés,76,17,25,34,0.223684,0.328947,0.447368,69,107,...,1.407895,-38,-0.5,76,1.0,18.0,18,18,0.0,
5,Villarreal,238,108,60,70,0.453782,0.252101,0.294118,399,296,...,1.243697,103,0.432773,384,1.613445,5.714286,3,8,1.704336,82.956638
6,Alavés,200,52,52,96,0.26,0.26,0.48,184,284,...,1.42,-100,-0.5,208,1.04,14.666667,10,20,3.50238,64.976199
7,Espanol,162,39,47,76,0.240741,0.290123,0.469136,173,242,...,1.493827,-69,-0.425926,164,1.012346,14.6,5,20,5.94138,40.586197
8,Real Betis,238,97,69,72,0.407563,0.289916,0.302521,326,298,...,1.252101,28,0.117647,360,1.512605,7.285714,5,15,3.450328,65.496722
9,Atletico Madrid,238,139,58,41,0.584034,0.243697,0.172269,409,211,...,0.886555,198,0.831933,475,1.995798,3.0,1,4,1.0,90.0


## Expected Goals (xG) Metrics

Analyzes expected goals performance to measure team efficiency in creating and preventing chances.

Datasets `matches_detailed.csv`

**Calculated Metrics:**
- **xG Overperformance** = Avg Goals - Avg xG
  - Positive = scoring more than expected (clinical finishing)
  - Negative = scoring less than expected (poor finishing)
  
- **xGA Overperformance** = Avg xGA - Avg Goals Against
  - Positive = conceding more than expected (poor goalkeeping/defense)
  - Negative = conceding less than expected (strong goalkeeping/defense)
  
- **xG Difference** = Avg xG - Avg xGA
  - Indicates overall quality of chances created vs conceded

In [6]:
# xG Performance Metrics

xg_metrics_list = []

for team in matches_detailed['team'].unique():
    team_matches = matches_detailed[
        (matches_detailed['team'] == team) &
        (matches_detailed['xg'].notna()) &
        (matches_detailed['xga'].notna())
    ]

    # xG metrics
    avg_xg = team_matches['xg'].mean()
    avg_xga = team_matches['xga'].mean()
    avg_goals = team_matches['gf'].mean()
    avg_goals_against = team_matches['ga'].mean()

    # xG difference
    xg_diff = avg_xg - avg_xga

    xg_metrics_list.append({
        'Team': team,
        'AvgxG': avg_xg,
        'AvgxGA': avg_xga,
        'AvgGoals': avg_goals,
        'AvgGoalsAgainst': avg_goals_against,
        'xGDifference': xg_diff,
        'xGMatches': len(team_matches)
    })

xg_metrics = pd.DataFrame(xg_metrics_list)
xg_metrics

Unnamed: 0,Team,AvgxG,AvgxGA,AvgGoals,AvgGoalsAgainst,xGDifference,xGMatches
0,Barcelona,1.958315,1.008204,2.150776,0.94235,0.950111,451
1,Atletico Madrid,1.545233,0.952993,1.709534,0.880266,0.592239,451
2,Real Madrid,1.845676,1.01153,2.011086,0.804878,0.834146,451
3,Athletic Club,1.295565,1.049667,1.288248,1.002217,0.245898,451
4,Villarreal,1.608889,1.254222,1.671111,1.246667,0.354667,450
5,Real Betis,1.329268,1.286253,1.35255,1.24612,0.043016,451
6,Rayo Vallecano,1.160535,1.258528,1.010033,1.284281,-0.097993,299
7,Mallorca,1.0304,1.3128,0.954667,1.378667,-0.2824,375
8,Real Sociedad,1.325942,1.058093,1.283814,1.059867,0.267849,451
9,Celta Vigo,1.220399,1.251885,1.221729,1.383592,-0.031486,451


## Home vs Away Performance
Compares team performance in home and away matches.

Datasets `matches_5y.csv`

**Calculated Metrics:**
- Home Win Rate = Home Wins / Total Home Matches
- Away Win Rate = Away Wins / Total Away Matches
- Home Points Per Game = Total Home Points / Home Matches
- Away Points Per Game = Total Away Points / Away Matches
- **Home Advantage** = Home PPG - Away PPG
  - Positive = performs better at home
  - Negative = performs better away (rare)

In [7]:
# Home vs Away Performance

home_away_list = []

for team in matches_5y['HomeTeam'].unique():
    # Home matches
    home_matches = matches_5y[matches_5y['HomeTeam'] == team]
    home_wins = len(home_matches[home_matches['FTR'] == 'H'])
    home_points = home_matches['HomePoints'].sum()
    home_goals_for = home_matches['FTHG'].sum()
    home_goals_against = home_matches['FTAG'].sum()

    # Away matches
    away_matches = matches_5y[matches_5y['AwayTeam'] == team]
    away_wins = len(away_matches[away_matches['FTR'] == 'A'])
    away_points = away_matches['AwayPoints'].sum()
    away_goals_for = away_matches['FTAG'].sum()
    away_goals_against = away_matches['FTHG'].sum()

    total_home = len(home_matches)
    total_away = len(away_matches)

    home_away_list.append({
        'Team': team,
        'HomeMatches': total_home,
        'HomeWins': home_wins,
        'HomeWinRate': home_wins / total_home,
        'HomePointsPerGame': home_points / total_home,
        'HomeGoalsFor': home_goals_for,
        'HomeGoalsAgainst': home_goals_against,
        'AwayMatches': total_away,
        'AwayWins': away_wins,
        'AwayWinRate': away_wins / total_away,
        'AwayPointsPerGame': away_points / total_away,
        'AwayGoalsFor': away_goals_for,
        'AwayGoalsAgainst': away_goals_against,
        'HomeAdvantage': (home_points / total_home) - (away_points / total_away)
    })

home_away_metrics = pd.DataFrame(home_away_list)
home_away_metrics

Unnamed: 0,Team,HomeMatches,HomeWins,HomeWinRate,HomePointsPerGame,HomeGoalsFor,HomeGoalsAgainst,AwayMatches,AwayWins,AwayWinRate,AwayPointsPerGame,AwayGoalsFor,AwayGoalsAgainst,HomeAdvantage
0,Athletic Club,120,61,0.508333,1.775,182,103,118,33,0.279661,1.20339,119,132,0.57161
1,Celta Vigo,120,45,0.375,1.416667,162,145,118,26,0.220339,0.974576,132,184,0.44209
2,Valencia,119,52,0.436975,1.630252,169,121,119,20,0.168067,0.806723,111,198,0.823529
3,Mallorca,100,39,0.39,1.48,108,102,100,17,0.17,0.71,84,172,0.77
4,Leganés,38,12,0.315789,1.184211,40,53,38,5,0.131579,0.815789,29,54,0.368421
5,Villarreal,119,59,0.495798,1.764706,233,146,119,49,0.411765,1.462185,166,150,0.302521
6,Alavés,100,37,0.37,1.39,102,106,100,15,0.15,0.69,82,178,0.7
7,Espanol,82,27,0.329268,1.304878,100,112,80,12,0.15,0.7125,73,130,0.592378
8,Real Betis,119,59,0.495798,1.739496,187,133,119,38,0.319328,1.285714,139,165,0.453782
9,Atletico Madrid,119,86,0.722689,2.361345,239,95,119,53,0.445378,1.630252,170,116,0.731092


## Head-to-Head vs Top Teams

Performance analysis against the top 6 teams by league position.

DataSets: matches_5y.csv + league_positions

**Calculated Metrics:**
- **H2H Win Rate** = H2H Wins / Total H2H Matches
  - Measures performance against strongest opponents

In [8]:
# Calculate head-to-head performance against top teams

top_n = 6
top_teams = league_positions[league_positions['Position'] <= top_n]['Team'].unique()

all_h2h_details = []
h2h_list = []

for team in matches_5y['HomeTeam'].unique():
    if team not in top_teams:
        continue
    vs_top_teams = []
    for opponent in top_teams:
        if opponent == team:
            continue

        home_vs = matches_5y[(matches_5y['HomeTeam'] == team) & (matches_5y['AwayTeam'] == opponent)]
        away_vs = matches_5y[(matches_5y['HomeTeam'] == opponent) & (matches_5y['AwayTeam'] == team)]

        total = len(home_vs) + len(away_vs)
        if total == 0:
            continue

        wins = len(home_vs[home_vs['FTR'] == 'H']) + len(away_vs[away_vs['FTR'] == 'A'])
        draws = len(home_vs[home_vs['FTR'] == 'D']) + len(away_vs[away_vs['FTR'] == 'D'])
        losses = total - wins - draws

        vs_top_teams.append({
            'Team': team,
            'Opponent': opponent,
            'Matches': total,
            'Wins': wins,
            'Draws': draws,
            'Losses': losses,
            'WinRate': wins / total
        })

    all_h2h_details.extend(vs_top_teams)

    temp_df = pd.DataFrame(vs_top_teams)
    h2h_list.append({
        'Team': team,
        'TotalH2HMatches': temp_df['Matches'].sum(),
        'H2HWins': temp_df['Wins'].sum(),
        'H2HDraws': temp_df['Draws'].sum(),
        'H2HLosses': temp_df['Losses'].sum(),
        'H2HWinRate': temp_df['Wins'].sum() / temp_df['Matches'].sum() if temp_df['Matches'].sum() > 0 else 0
    })

h2h_metrics = pd.DataFrame(h2h_list)
h2h_details_df = pd.DataFrame(all_h2h_details)
h2h_details_df, h2h_metrics

(             Team         Opponent  Matches  Wins  Draws  Losses   WinRate
 0   Athletic Club      Real Madrid       12     1      2       9  0.083333
 1   Athletic Club        Barcelona       12     1      2       9  0.083333
 2   Athletic Club  Atletico Madrid       12     3      2       7  0.250000
 3   Athletic Club          Sevilla       13     6      3       4  0.461538
 4   Athletic Club       Villarreal       13     5      6       2  0.384615
 ..            ...              ...      ...   ...    ...     ...       ...
 85         Girona       Villarreal        7     1      1       5  0.142857
 86         Girona    Real Sociedad        6     0      3       3  0.000000
 87         Girona       Real Betis        6     1      2       3  0.166667
 88         Girona    Athletic Club        7     3      2       2  0.428571
 89         Girona          Espanol        5     2      3       0  0.400000
 
 [90 rows x 7 columns],
               Team  TotalH2HMatches  H2HWins  H2HDraws  H2HLo

## Results
Merged dataset combining all metrics.

In [9]:
# Merging All Metrics
result = performance_metrics.copy()
result = result.merge(xg_metrics, on="Team", how="left")
result = result.merge(home_away_metrics, on="Team", how="left")
result = result.merge(h2h_metrics, on="Team", how="left")
result

Unnamed: 0,Team,TotalMatches,Wins,Draws,Losses,WinRate,DrawRate,LossRate,TotalGoalsFor,TotalGoalsAgainst,...,AwayWinRate,AwayPointsPerGame,AwayGoalsFor,AwayGoalsAgainst,HomeAdvantage,TotalH2HMatches,H2HWins,H2HDraws,H2HLosses,H2HWinRate
0,Athletic Club,238,94,73,71,0.394958,0.306723,0.298319,301,235,...,0.279661,1.20339,119,132,0.57161,102.0,32.0,27.0,43.0,0.313725
1,Celta Vigo,238,71,72,95,0.298319,0.302521,0.39916,294,329,...,0.220339,0.974576,132,184,0.44209,,,,,
2,Valencia,238,72,74,92,0.302521,0.310924,0.386555,280,319,...,0.168067,0.806723,111,198,0.823529,,,,,
3,Mallorca,200,56,51,93,0.28,0.255,0.465,192,274,...,0.17,0.71,84,172,0.77,,,,,
4,Leganés,76,17,25,34,0.223684,0.328947,0.447368,69,107,...,0.131579,0.815789,29,54,0.368421,,,,,
5,Villarreal,238,108,60,70,0.453782,0.252101,0.294118,399,296,...,0.411765,1.462185,166,150,0.302521,104.0,35.0,30.0,39.0,0.336538
6,Alavés,200,52,52,96,0.26,0.26,0.48,184,284,...,0.15,0.69,82,178,0.7,,,,,
7,Espanol,162,39,47,76,0.240741,0.290123,0.469136,173,242,...,0.15,0.7125,73,130,0.592378,73.0,7.0,21.0,45.0,0.09589
8,Real Betis,238,97,69,72,0.407563,0.289916,0.302521,326,298,...,0.319328,1.285714,139,165,0.453782,103.0,27.0,30.0,46.0,0.262136
9,Atletico Madrid,238,139,58,41,0.584034,0.243697,0.172269,409,211,...,0.445378,1.630252,170,116,0.731092,102.0,49.0,31.0,22.0,0.480392


In [10]:
league_positions.to_csv('../CleanedDatasets/Analysis/league_positions.csv', index=False)
performance_metrics.to_csv('../CleanedDatasets/Analysis/performance_metrics.csv', index=False)
xg_metrics.to_csv('../CleanedDatasets/Analysis/xg_metrics.csv', index=False)
home_away_metrics.to_csv('../CleanedDatasets/Analysis/home_away_metrics.csv', index=False)
h2h_details_df.to_csv('../CleanedDatasets/Analysis/h2h_details.csv', index=False)
h2h_metrics.to_csv('../CleanedDatasets/Analysis/h2h_metrics.csv', index=False)
result.to_csv('../CleanedDatasets/Analysis/team_performance_summary.csv', index=False)