<a href="https://colab.research.google.com/github/AtfastrSlushyMaker/pl-standings-prediction-project/blob/main/notebooks/algorithms/SVM/svm_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from IPython.display import display

## Load and Inspect Dataset



In [6]:
# Candidate locations for the aggregated team-season CSV, tried in order:
# the working directory, /content (presumably the Colab workspace), /mnt/data,
# and a local data/ folder.
possible_paths = [
    Path('team_season_aggregated.csv'),
    Path('/content/team_season_aggregated.csv'),
    Path('/mnt/data/team_season_aggregated.csv'),
    Path('data/team_season_aggregated.csv')
]

# Load the first candidate that can be read; a missing file just moves on to the next one.
for path in possible_paths:
    try:
        data = pd.read_csv(path)
    except FileNotFoundError:
        continue
    print(f"Dataset loaded from: {path}")
    break
else:
    # for/else: this branch runs only when the loop never hit `break`,
    # i.e. none of the candidates existed. Report where we looked so the
    # failure is actionable (the relative paths depend on the working directory).
    searched = ", ".join(str(candidate) for candidate in possible_paths)
    raise FileNotFoundError(
        "team_season_aggregated.csv not found in specified paths. "
        f"Searched: {searched} (working directory: {Path.cwd()})"
    )

# Summarise the loaded frame: its dimensions, the seasons it spans, and a row preview.
print(f"Dataset shape: {data.shape}")

if 'Season' not in data.columns:
    print("No 'Season' column found in data.")
else:
    # Sorted list of the distinct season labels present in the data.
    seasons = sorted(data['Season'].unique())
    print(f"Seasons in data: {seasons}")

print("\nFirst few rows of the dataset:")
display(data.head())

# Full Time Result (FTR) breakdown: printed as a table and drawn as a bar chart.
# When the loaded frame has no 'FTR' column, only a notice is printed.
if 'FTR' not in data.columns:
    print("No 'FTR' column found in data.")
else:
    result_counts = data['FTR'].value_counts()
    print("\nFull Time Result value counts:")
    print(result_counts.to_string())

    # Explicit figure/axes pair; bars are ordered like the value_counts() index.
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(x='FTR', data=data, order=result_counts.index, ax=ax)
    ax.set_title("Distribution of Full Time Result (FTR)")
    ax.set_xlabel("Full Time Result")
    ax.set_ylabel("Match Count")
    fig.tight_layout()
    plt.show()


Dataset loaded from: team_season_aggregated.csv
Dataset shape: (500, 35)
Seasons in data: ['2000-01', '2001-02', '2002-03', '2003-04', '2004-05', '2005-06', '2006-07', '2007-08', '2008-09', '2009-10', '2010-11', '2011-12', '2012-13', '2013-14', '2014-15', '2015-16', '2016-17', '2017-18', '2018-19', '2019-20', '2020-21', '2021-22', '2022-23', '2023-24', '2024-25']

First few rows of the dataset:


Unnamed: 0,Season,Season_encoded,Team,Matches_Played,Home_Matches,Away_Matches,Wins,Draws,Losses,Home_Wins,...,Yellow_Cards,Red_Cards,Fouls,Corners,Avg_Corners,Win_Rate,Home_Win_Rate,Away_Win_Rate,Final_Position,Team_encoded
0,2000-01,0,Man United,38,19,19,24,8,6,15,...,44.0,3.0,433.0,269.0,7.078947,0.631579,0.789474,0.473684,1,27
1,2000-01,0,Arsenal,38,19,19,20,10,8,15,...,48.0,3.0,495.0,295.0,7.763158,0.526316,0.789474,0.263158,2,0
2,2000-01,0,Liverpool,38,19,19,20,9,9,13,...,50.0,4.0,473.0,243.0,6.394737,0.526316,0.684211,0.368421,3,24
3,2000-01,0,Leeds,38,19,19,20,8,10,11,...,72.0,3.0,589.0,270.0,7.105263,0.526316,0.578947,0.473684,4,22
4,2000-01,0,Ipswich,38,19,19,20,6,12,11,...,31.0,2.0,414.0,206.0,5.421053,0.526316,0.578947,0.473684,5,21


No 'FTR' column found in data.


## Train-Test Split — Feature and Target Preparation



In [8]:
# --- Feature Selection & Target Setup ---
# Picks the model inputs and the target, then holds out the 2024-25 season as the
# test set and trains on every other season.

# No earlier cell defines `df_agg`; the CSV is loaded into `data`. Bind it here so
# this cell runs on a fresh kernel (Restart & Run All) instead of raising NameError.
# A copy keeps later changes to `df_agg` from leaking back into `data`.
df_agg = data.copy()

# ✅ Define input features
# NOTE(review): several of these (Wins, Losses, Points_Per_Game, ...) look like
# end-of-season totals that closely track Final_Position — confirm this is intended.
feature_cols = [
    'Team_encoded', 'Season_encoded',
    'Wins', 'Draws', 'Losses',
    'Goals_Scored', 'Goals_Conceded', 'Goal_Difference',
    'Avg_Goals_Scored', 'Avg_Goals_Conceded',
    'Total_Shots', 'Total_Shots_On_Target',
    'Avg_Shots', 'Avg_Shots_On_Target',
    'Shot_Accuracy',
    'Clean_Sheets', 'Clean_Sheet_Rate',
    'Yellow_Cards', 'Red_Cards', 'Fouls', 'Corners',
    'Win_Rate', 'Home_Win_Rate', 'Away_Win_Rate',
    'Points_Per_Game'
]

# ✅ Check that features exist in dataset
available_features = [col for col in feature_cols if col in df_agg.columns]
missing_features = [col for col in feature_cols if col not in df_agg.columns]

if missing_features:
    # Not fatal: the split below proceeds with whichever features are present.
    print(f"⚠️ Missing features: {missing_features}")
else:
    print(f"✅ All {len(available_features)} features found.")

# 🎯 Target variable
target_col = 'Final_Position'

# ✂️ Split by season (train on all seasons except 2024-25)
train_seasons = df_agg[df_agg['Season'] != '2024-25']
test_season = df_agg[df_agg['Season'] == '2024-25']

# .copy() gives independent frames, so later edits don't touch df_agg.
X_train = train_seasons[available_features].copy()
y_train = train_seasons[target_col].copy()

X_test = test_season[available_features].copy()
y_test = test_season[target_col].copy()

# Sanity check: the two subsets together must account for every row exactly once.
assert len(X_train) + len(X_test) == len(df_agg), "train/test split lost or duplicated rows"

print("\n✅ Dataset split completed:")
print(f"Training set: {X_train.shape} | Test set: {X_test.shape}")


✅ All 25 features found.

✅ Dataset split completed:
Training set: (480, 25) | Test set: (20, 25)
