# Soccer Predictions

In [8]:
import numpy as np
from numpy import float64
from numpy.typing import NDArray
from sklearn.linear_model import LogisticRegression  # type: ignore
from sklearn.preprocessing import OneHotEncoder  # type: ignore
import requests
import csv
from abc import ABC,abstractmethod
from typing import Dict, Callable, List, Optional, cast
from dataclasses import dataclass
from enum import Enum
from typing import List, Tuple, Optional, cast
import pandas as pd
from ydata_profiling import ProfileReport

In [9]:
# Domain classes used throughout the notebook: Team, Fixture, Scenario, Outcome, Result

@dataclass(frozen=True)
class Team:
    """A team, identified solely by its name.

    The dataclass is frozen, so instances are immutable and hashable.
    """

    name: str

@dataclass(frozen=True)
class Fixture:
    """A pairing of a home team and an away team within a named league.

    Frozen, so a fixture is immutable once constructed.
    """

    home_team: Team
    away_team: Team
    league: str

@dataclass(frozen=True)
class Scenario:
    """State of a match at some point in time: minutes played and goals per side.

    Frozen, so instances are immutable and hashable.
    """

    minutes_elapsed: int
    home_goals: int
    away_goals: int

class Outcome(str, Enum):
    """The three possible match outcomes.

    Mixing in ``str`` makes every member compare equal to its string value.
    """

    HOME = "home"
    AWAY = "away"
    DRAW = "draw"

@dataclass
class Result:
    """The score of a fixture together with its outcome and season.

    Unlike the other value classes in this cell, this dataclass is not frozen.
    """

    fixture: Fixture
    outcome: Outcome
    home_goals: int
    away_goals: int
    season: int



In [10]:
@dataclass
class Prediction:
    """A predicted outcome, optionally accompanied by a confidence value.

    ``confidence`` defaults to ``None`` when the predictor does not supply one.
    """

    outcome: Outcome
    confidence: Optional[float] = None


class Predictor(ABC):
    """Abstract interface for objects that predict the outcome of a fixture."""

    @abstractmethod
    def predict(self, fixture: Fixture) -> Prediction:
        """Return a ``Prediction`` for ``fixture``; subclasses must implement this."""


class InProgressPredictor(Predictor):
    """Abstract predictor that can also take the current match state into account.

    ``predict`` from ``Predictor`` is not implemented here, so concrete
    subclasses must provide both methods.
    """

    @abstractmethod
    def predict_in_progress(self, fixture: Fixture, scenario: Scenario) -> Prediction:
        """Return a ``Prediction`` for ``fixture`` given the in-play ``scenario``."""


In [11]:
def match_outcome(home_goals: int, away_goals: int) -> Outcome:
    """Classify a scoreline.

    Returns ``Outcome.HOME`` when the home side scored more, ``Outcome.AWAY``
    when the away side scored more, and ``Outcome.DRAW`` otherwise.
    """
    return (
        Outcome.HOME if home_goals > away_goals
        else Outcome.AWAY if away_goals > home_goals
        else Outcome.DRAW
    )

In [12]:
def result_from_row(row: Dict[str, str]) -> Optional[Result]:
    """Convert one CSV row into a ``Result``, or ``None`` if the row is unusable.

    Args:
        row: Mapping of column name to raw string value. The keys read are
            ``score1``, ``score2``, ``team1``, ``team2``, ``league`` and
            ``season``.

    Returns:
        A ``Result`` whose outcome is derived from the two scores, or ``None``
        when a required column is missing or a numeric column cannot be parsed
        as an integer (for example an empty score).
    """
    try:
        home_goals = int(row['score1'])
        away_goals = int(row['score2'])
        fixture = Fixture(
            home_team=Team(row['team1']),
            away_team=Team(row['team2']),
            league=row['league'],
        )
        season = int(row['season'])
    except (KeyError, ValueError, TypeError):
        # KeyError: column absent. ValueError: non-integer text such as "".
        # TypeError: csv.DictReader pads short rows with None, and int(None)
        # raises TypeError rather than ValueError.
        return None

    return Result(
        fixture=fixture,
        outcome=match_outcome(home_goals, away_goals),
        home_goals=home_goals,
        away_goals=away_goals,
        season=season,
    )

In [13]:
def build_model(results: List[Result]) -> Tuple[LogisticRegression, OneHotEncoder]:
    """Fit a logistic-regression outcome model on one-hot encoded team names.

    Each training row is the one-hot encoding of the home team concatenated
    with the one-hot encoding of the away team. The target is the sign of the
    goal difference: ``1`` home win, ``0`` draw, ``-1`` away win.

    Args:
        results: Match results to train on.

    Returns:
        A ``(model, team_encoding)`` tuple: the fitted classifier and the
        fitted encoder needed to build feature rows for new fixtures.
    """
    home_names = np.array([r.fixture.home_team.name for r in results])
    away_names = np.array([r.fixture.away_team.name for r in results])
    home_goals = np.array([r.home_goals for r in results])
    away_goals = np.array([r.away_goals for r in results])

    # Fit the encoder on every team seen in either role so that home and away
    # encodings share one vocabulary.
    team_names = np.array(list(home_names) + list(away_names)).reshape(-1, 1)
    # `sparse_output=False` is a keyword argument; the previous
    # `sparse_output==False` compared an undefined name and raised NameError.
    team_encoding = OneHotEncoder(sparse_output=False).fit(team_names)

    encoded_home_names = team_encoding.transform(home_names.reshape(-1, 1))
    encoded_away_names = team_encoding.transform(away_names.reshape(-1, 1))

    x: NDArray[float64] = np.concatenate([encoded_home_names, encoded_away_names], 1)
    y = np.sign(home_goals - away_goals)

    model = LogisticRegression(penalty="l2", fit_intercept=False, multi_class="ovr", C=1)
    model.fit(x, y)

    return model, team_encoding


In [14]:
class LinearRegressionPredictor(Predictor):
    """Predictor backed by a fitted classifier and a one-hot team encoder.

    Despite the class name, the wrapped model is annotated as a
    ``LogisticRegression``.
    """

    def __init__(self, model: LogisticRegression, team_encoding: OneHotEncoder) -> None:
        """Store the fitted ``model`` and the ``team_encoding`` used to build its inputs."""
        self.model = model
        self.team_encoding = team_encoding

    def predict(self, fixture: Fixture) -> Prediction:
        """Predict the outcome of ``fixture``.

        If the home team cannot be encoded the prediction is an away win; if
        only the away team cannot be encoded it is a home win. Otherwise the
        sign of the model's output selects home (> 0), away (< 0) or draw.
        """
        home_vector = self.__encode_team(fixture.home_team)
        away_vector = self.__encode_team(fixture.away_team)

        # Fallbacks for teams the encoder rejects (presumably ones it was not fitted on).
        if home_vector is None:
            return Prediction(outcome=Outcome.AWAY)
        if away_vector is None:
            return Prediction(outcome=Outcome.HOME)

        features: NDArray[float64] = np.concatenate([home_vector, away_vector], 1)
        label = self.model.predict(features)

        if label > 0:
            predicted = Outcome.HOME
        elif label < 0:
            predicted = Outcome.AWAY
        else:
            predicted = Outcome.DRAW
        return Prediction(outcome=predicted)

    def __encode_team(self, team: Team) -> Optional[NDArray[float64]]:
        """Return the encoder's output for ``team``, or ``None`` if it raises ``ValueError``."""
        try:
            encoded: NDArray[float64] = self.team_encoding.transform(
                np.array(team.name).reshape(-1, 1)
            )
        except ValueError:
            return None
        return encoded

In [15]:
# URL of the match CSV that the next cell downloads with requests.get.
csv_location = 'https://projects.fivethirtyeight.com/soccer-api/club/spi_matches.csv'

In [16]:
# Download the CSV as text. A timeout stops the cell hanging indefinitely, and
# raise_for_status() turns an HTTP error into an exception instead of letting
# an error page be parsed as CSV further down.
response = requests.get(csv_location, timeout=30)
response.raise_for_status()
raw_training_data = response.text

In [18]:
# NOTE(review): no assignment to `df` is visible before this cell; the only one in view is the
# pd.read_csv(...) call in the data-summarization section further down. On a fresh kernel
# (Restart & Run All) this line would presumably raise NameError — confirm and reorder.
df.head()

Unnamed: 0,season,date,league_id,league,team1,team2,spi1,spi2,prob1,prob2,...,importance1,importance2,score1,score2,xg1,xg2,nsxg1,nsxg2,adj_score1,adj_score2
0,2016,2016-07-09,7921,FA Women's Super League,Liverpool Women,Reading,51.56,50.42,0.4389,0.2767,...,,,2.0,0.0,,,,,,
1,2016,2016-07-10,7921,FA Women's Super League,Arsenal Women,Notts County Ladies,46.61,54.03,0.3572,0.3608,...,,,2.0,0.0,,,,,,
2,2016,2016-07-10,7921,FA Women's Super League,Chelsea FC Women,Birmingham City,59.85,54.64,0.4799,0.2487,...,,,1.0,1.0,,,,,,
3,2016,2016-07-16,7921,FA Women's Super League,Liverpool Women,Notts County Ladies,53.0,52.35,0.4289,0.2699,...,,,0.0,0.0,,,,,,
4,2016,2016-07-17,7921,FA Women's Super League,Chelsea FC Women,Arsenal Women,59.43,60.99,0.4124,0.3157,...,,,1.0,2.0,,,,,,


In [19]:
# Preview only the start of the download (header plus a few rows) rather than
# echoing the whole CSV text into the notebook output.
raw_training_data[:500]

"season,date,league_id,league,team1,team2,spi1,spi2,prob1,prob2,probtie,proj_score1,proj_score2,importance1,importance2,score1,score2,xg1,xg2,nsxg1,nsxg2,adj_score1,adj_score2\n2016,2016-07-09,7921,FA Women's Super League,Liverpool Women,Reading,51.56,50.42,0.4389,0.2767,0.2844,1.39,1.05,,,2,0,,,,,,\n2016,2016-07-10,7921,FA Women's Super League,Arsenal Women,Notts County Ladies,46.61,54.03,0.3572,0.3608,0.2819,1.27,1.28,,,2,0,,,,,,\n2016,2016-07-10,7921,FA Women's Super League,Chelsea FC Women,Birmingham City,59.85,54.64,0.4799,0.2487,0.2714,1.53,1.03,,,1,1,,,,,,\n2016,2016-07-16,7921,FA Women's Super League,Liverpool Women,Notts County Ladies,53.0,52.35,0.4289,0.2699,0.3013,1.27,0.94,,,0,0,,,,,,\n2016,2016-07-17,7921,FA Women's Super League,Chelsea FC Women,Arsenal Women,59.43,60.99,0.4124,0.3157,0.2719,1.45,1.24,,,1,2,,,,,,\n2016,2016-07-24,7921,FA Women's Super League,Reading,Birmingham City,50.75,55.03,0.3821,0.32,0.2979,1.22,1.09,,,1,1,,,,,,\n2016,2016-07-24,7921,FA Women's Super 

In [20]:
# DictReader yields one dict per data row, keyed by the names on the header line.
# It is a one-shot iterator: once consumed it yields nothing further.
rows = csv.DictReader(raw_training_data.splitlines())

In [21]:
# Earliest season kept for training (was the inline expression `2023 - 2`).
MIN_SEASON = 2023 - 2

# Build a concrete list from a fresh reader so this cell can be re-run safely;
# the previous lazy filter() over the one-shot `rows` iterator produced an
# empty result on a second run. Rows that fail to parse come back as None and
# are dropped.
results = [
    result
    for result in map(result_from_row, csv.DictReader(raw_training_data.splitlines()))
    if result is not None and result.season >= MIN_SEASON
]

In [22]:
# Ensure `results` is a concrete list so it can be iterated several times in the cells below.
results = list(results)

In [23]:
# Split the results into parallel arrays, one entry per match.
home_names = np.array([match.fixture.home_team.name for match in results])
away_names = np.array([match.fixture.away_team.name for match in results])
home_goals = np.array([match.home_goals for match in results])
away_goals = np.array([match.away_goals for match in results])

In [24]:
# All four arrays should have the same length (one entry per match).
print(
    *(column.shape for column in (home_names, away_names, home_goals, away_goals)),
    sep="\n",
)


(24325,)
(24325,)
(24325,)
(24325,)


In [25]:
# Stack home names followed by away names into a single column for fitting the encoder.
team_names = np.array([*home_names, *away_names]).reshape(-1, 1)

In [26]:
# Expect (2 * number of matches, 1): home and away names stacked into one column.
team_names.shape

(48650, 1)

In [27]:
# Dense (non-sparse) one-hot encoder fitted on every team name seen as home or away.
team_encoding = OneHotEncoder(sparse_output=False)
team_encoding.fit(team_names)

In [28]:
# Display the fitted encoder object.
team_encoding

In [29]:
# Inspect the encoder's configuration; the output below shows handle_unknown='error'.
team_encoding.get_params()

{'categories': 'auto',
 'drop': None,
 'dtype': numpy.float64,
 'feature_name_combiner': 'concat',
 'handle_unknown': 'error',
 'max_categories': None,
 'min_frequency': None,
 'sparse': 'deprecated',
 'sparse_output': False}

In [30]:
# One-hot encode the home and away columns with the shared encoder.
encoded_home_names, encoded_away_names = (
    team_encoding.transform(names.reshape(-1, 1))
    for names in (home_names, away_names)
)

In [31]:
# Both encodings should be (number of matches, number of distinct teams).
print(*(encoded.shape for encoded in (encoded_away_names, encoded_home_names)), sep="\n")

(24325, 725)
(24325, 725)


In [32]:
# Feature matrix: home one-hot columns followed by away one-hot columns, one row per match.
x: NDArray[float64] = np.concatenate(
    (encoded_home_names, encoded_away_names),
    axis=1,
)


In [33]:
# Target: sign of the goal difference (1 = home scored more, 0 = level, -1 = away scored more).
y = np.sign(home_goals-away_goals)

In [34]:
# L2-penalised logistic regression configured as one-vs-rest (multi_class="ovr"),
# with no intercept term (fit_intercept=False) and regularisation parameter C=1.
model = LogisticRegression(penalty="l2", fit_intercept=False, multi_class="ovr",C=1)
model.fit(x,y)

In [35]:
# Sanity check: predict the label for the first training row (kept 2-D via slicing).
model.predict(x[:1])

array([1])

## Data Summarization with ydata-profiling

In [36]:
# Load the same CSV into a DataFrame for profiling. Reuse `csv_location` so the
# source URL is defined in exactly one place.
df = pd.read_csv(csv_location)

In [37]:
# Build a profiling report object for the DataFrame; the title is passed through to the report.
profile = ProfileReport(df, title='Soccer Prediction')

In [38]:
# Write the report to report.html (a relative path, presumably resolved against the working directory).
profile.to_file("report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'could not convert string to float: "FA Women's Super League"')
  annotation = ("{:" + self.fmt + "}").format(val)
(using `df.profile_report(missing_diagrams={"Heatmap": False}`)
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'could not convert string to float: '--'')


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]