# Soccer Predictions

In [27]:
import numpy as np
from numpy import float64
from numpy.typing import NDArray
from sklearn.linear_model import LogisticRegression  # type: ignore
from sklearn.preprocessing import OneHotEncoder  # type: ignore
import requests
import csv
from abc import ABC,abstractmethod
from typing import Dict, Callable, List, Optional, cast
from dataclasses import dataclass
from enum import Enum
from typing import List, Tuple, Optional, cast

In [3]:
# Create required classes 

@dataclass(frozen=True)
class Team(object):
    name:str

@dataclass(frozen=True)
class Fixture(object):
    home_team: Team
    away_team: Team
    league: str

@dataclass(frozen=True)
class Scenario(object):
    minutes_elapsed:int
    home_goals: int
    away_goals: int

class Outcome(str,Enum):
    HOME="home"
    AWAY="away"
    DRAW="draw"

@dataclass
class Result(object):
    fixture: Fixture
    outcome: Outcome
    home_goals: int
    away_goals: int
    season: int



In [28]:
@dataclass
class Prediction:
    outcome: Outcome
    confidence: Optional[float] = None


class Predictor(ABC):
    @abstractmethod
    def predict(self, fixture: Fixture) -> Prediction:
        pass


class InProgressPredictor(Predictor):
    @abstractmethod
    def predict_in_progress(self, fixture: Fixture, scenario: Scenario) -> Prediction:
        pass


In [4]:
def match_outcome(home_goals: int, away_goals: int) -> Outcome:
        if home_goals > away_goals:
            return Outcome.HOME
        if away_goals > home_goals:
            return Outcome.AWAY
        return Outcome.DRAW

In [5]:
def result_from_row(row: Dict[str, str]) -> Optional[Result]:
        try:
            home_goals = int(row['score1'])
            away_goals = int(row['score2'])

            return Result(
                fixture=Fixture(
                    home_team=Team(row['team1']),
                    away_team=Team(row['team2']),
                    league=row['league']
                ),
                outcome=match_outcome(home_goals, away_goals),
                home_goals=home_goals,
                away_goals=away_goals,
                season=int(row['season'])
            )
        except (KeyError, ValueError):
            return None

In [6]:
def build_model(results: List[Result]) -> Tuple[LogisticRegression, OneHotEncoder]:
    home_names = np.array([r.fixture.home_team.name for r in results])
    away_names = np.array([r.fixture.away_team.name for r in results])
    home_goals = np.array([r.home_goals for r in results])
    away_goals = np.array([r.away_goals for r in results])

    team_names = np.array(list(home_names) + list(away_names)).reshape(-1, 1)
    team_encoding = OneHotEncoder(sparse_output==False).fit(team_names)

    encoded_home_names = team_encoding.transform(home_names.reshape(-1, 1))
    encoded_away_names = team_encoding.transform(away_names.reshape(-1, 1))

    x: NDArray[float64] = np.concatenate([encoded_home_names, encoded_away_names], 1)
    y = np.sign(home_goals - away_goals)

    model = LogisticRegression(penalty="l2", fit_intercept=False, multi_class="ovr", C=1)
    model.fit(x, y)

    return model, team_encoding


In [29]:
class LinearRegressionPredictor(Predictor):
    def __init__(self, model: LogisticRegression, team_encoding: OneHotEncoder) -> None:
        self.model = model
        self.team_encoding = team_encoding

    def predict(self, fixture: Fixture) -> Prediction:
        encoded_home_name = self.__encode_team(fixture.home_team)
        encoded_away_name = self.__encode_team(fixture.away_team)

        if encoded_home_name is None:
            return Prediction(outcome=Outcome.AWAY)
        if encoded_away_name is None:
            return Prediction(outcome=Outcome.HOME)

        x: NDArray[float64] = np.concatenate([encoded_home_name, encoded_away_name], 1)
        pred = self.model.predict(x)

        if pred > 0:
            return Prediction(outcome=Outcome.HOME)
        elif pred < 0:
            return Prediction(outcome=Outcome.AWAY)
        else:
            return Prediction(outcome=Outcome.DRAW)

    def __encode_team(self, team: Team) -> Optional[NDArray[float64]]:
        try:
            result: NDArray[float64] = self.team_encoding.transform(np.array(team.name).reshape(-1, 1))
            return result
        except ValueError:
            return None

In [7]:
csv_location = 'https://projects.fivethirtyeight.com/soccer-api/club/spi_matches.csv'

In [8]:
raw_training_data = requests.get(csv_location).text

In [9]:
rows = csv.DictReader(raw_training_data.splitlines())

In [10]:
results = filter(lambda r: type(r) is Result and r.season >= 2023 - 2, map(result_from_row, rows))

In [11]:
results = list(results)

In [12]:
home_names = np.array([r.fixture.home_team.name for r in results])
away_names = np.array([r.fixture.away_team.name for r in results])
home_goals = np.array([r.home_goals for r in results])
away_goals = np.array([r.away_goals for r in results])

In [13]:
print(home_names.shape)
print(away_names.shape)
print(home_goals.shape)
print(away_goals.shape)


(24325,)
(24325,)
(24325,)
(24325,)


In [14]:
team_names = np.array(list(home_names) + list(away_names)).reshape(-1,1)

In [15]:
team_names.shape

(48650, 1)

In [16]:
team_encoding = OneHotEncoder(sparse_output=False).fit(team_names)

In [17]:
team_encoding

In [18]:
team_encoding.get_params()

{'categories': 'auto',
 'drop': None,
 'dtype': numpy.float64,
 'feature_name_combiner': 'concat',
 'handle_unknown': 'error',
 'max_categories': None,
 'min_frequency': None,
 'sparse_output': False}

In [19]:
encoded_home_names = team_encoding.transform(home_names.reshape(-1, 1))
encoded_away_names = team_encoding.transform(away_names.reshape(-1, 1))

In [20]:
print(encoded_away_names.shape)
print(encoded_home_names.shape)

(24325, 725)
(24325, 725)


In [21]:
x: NDArray[float64] = np.concatenate([encoded_home_names, encoded_away_names], 1)


In [22]:
y = np.sign(home_goals-away_goals)

In [23]:
model = LogisticRegression(penalty="l2", fit_intercept=False, multi_class="ovr",C=1)
model.fit(x,y)

In [24]:
model.predict()

TypeError: LinearClassifierMixin.predict() missing 1 required positional argument: 'X'