# Read CSV

In [1]:
import pandas as pd
import numpy as np

# Italian league

In [2]:
#data = pd.read_csv("SerieA-B/Serie_A-B_stats.csv")

# Dutch league

In [3]:
import os
data_folder = "Eredivisie"
data_frames = []
for data_file in os.listdir(data_folder):
    if data_file[-4:] == ".csv":
        data_frames.append(pd.read_csv(os.path.join(data_folder,data_file)))
        
data = pd.concat(data_frames)

In [4]:
data.head(1)

Unnamed: 0,AwayTeam,B365A,B365D,B365H,BSA,BSD,BSH,BWA,BWD,BWH,...,SJH,Unnamed: 58,Unnamed: 59,Unnamed: 60,VCA,VCD,VCH,WHA,WHD,WHH
0,Feyenoord,2.1,3.4,3.4,1.91,3.5,3.75,1.91,3.5,3.4,...,3.4,,,,1.95,3.5,3.75,1.83,3.5,3.75


# Set Index

In [5]:
all_data = data.set_index('Date')

# Select features

In [6]:
data = all_data[['HomeTeam', 'AwayTeam', 'FTR', 'FTHG', 'FTAG']] 
data.head(5)

Unnamed: 0_level_0,HomeTeam,AwayTeam,FTR,FTHG,FTAG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
05/08/11,Excelsior,Feyenoord,A,0.0,2.0
06/08/11,Heerenveen,Nijmegen,D,2.0,2.0
06/08/11,Roda,Groningen,H,2.0,1.0
06/08/11,VVV Venlo,Utrecht,D,0.0,0.0
06/08/11,Waalwijk,Heracles,D,2.0,2.0


In [7]:
data.tail(5)

Unnamed: 0_level_0,HomeTeam,AwayTeam,FTR,FTHG,FTAG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
01/10/16,Vitesse,Groningen,H,2.0,1.0
02/10/16,Ajax,Utrecht,H,3.0,2.0
02/10/16,Heracles,Twente,D,1.0,1.0
02/10/16,Sparta Rotterdam,AZ Alkmaar,D,1.0,1.0
02/10/16,Willem II,Feyenoord,A,0.0,2.0


In [8]:
input_features = data

In [9]:
g = input_features.columns.to_series().groupby(data.dtypes).groups

In [10]:
object_features = {key.name: value for key, value in g.items()}['object']

In [11]:
object_features

['HomeTeam', 'AwayTeam', 'FTR']

# Create performance history per team

1. Track winning or losing streak as feature
2. Track total wins, losses and draws

In [17]:
class Stats:
    def __init__(self):
        self.win_streak = 0
        self.lose_streak = 0
        self.total_wins = 0
        self.total_losses = 0
        self.total_draws = 0
        self.total_games = 0
        self.previous_encounters = {}
        self.result_series = []
        
    def get_win_ratio(self):
        try:
            return (float(self.total_wins) / float(self.total_games))
        except:
            return(0)
        
    def get_win_ratio_against_opponent(self, opponent):
        try:
            results = self.previous_encounters[opponent]
            return sum(results)
        except:
            return(0)
        
    def average_goals_scored_and_conceded(self, number_of_games):
        if self.total_games == 0:
            return (0,0)
        elif number_of_games > self.total_games:
            number_of_games = self.total_games
        games = self.result_series[-number_of_games:]
        goals_scored = 0
        goals_conceded = 0
        for game in games:
            goals_scored = goals_scored + game["goals_scored"]
            goals_conceded = goals_conceded + game["goals_conceded"]
            
        return (goals_scored/number_of_games, goals_conceded/number_of_games)
            
        
    #Results: 1 = win, 0 = draw, -1 = loss for this team against the opponent
    def update_previous_encounters(self, opponent, result):
        if opponent in self.previous_encounters:
            self.previous_encounters[opponent].append(result)
        else:
            self.previous_encounters[opponent] = [result]

    def update_result(self, opponent, goals_scored, goals_conceded):
        if pd.isnull(goals_scored) or pd.isnull(goals_conceded):
            return False
        result = 2
        if(goals_scored > goals_conceded):
            result = 1
            self.update_win()
        elif(goals_scored < goals_conceded):
            result = -1
            self.update_loss()
        else:
            result = 0
            self.update_draw()
        self.result_series.append({"result":result, "goals_scored":goals_scored, "goals_conceded":goals_conceded})
        self.update_previous_encounters(opponent, result)
    
    def update_win(self):
        self.win_streak = self.win_streak + 1
        self.lose_streak = 0
        self.total_wins = self.total_wins + 1
        self.total_games = self.total_games + 1
        
    def update_loss(self):
        self.lose_streak = self.lose_streak + 1
        self.win_streak = 0
        self.total_losses = self.total_losses + 1
        self.total_games = self.total_games + 1
    
    def update_draw(self):
        self.total_draws = self.total_draws + 1
        self.win_streak = 0
        self.lose_streak = 0
        self.total_games = self.total_games + 1

In [18]:
teams = input_features.HomeTeam.unique()
stats = {}
for team in teams:
    stats[team] = Stats()

## Create feature arrays as columns for data frame

In [19]:
feature_names = ["win_streak",
    "lose_streak",
    "total_wins",
    "total_losses",
    "total_draws",
    "win_ratio",
    "opponent_win_ratio",
    "total_games",
    "goals_scored",
    "goals_conceded"]

home_results = {}
away_results = {}

for feature in feature_names:
    home_results[feature] = []
    away_results[feature] = []

## Add aggregated stats from previous matches as features then update stats based on this match

In [20]:
def update_results(results, stats, opponent):
    results["win_streak"].append(stats.win_streak)
    results["lose_streak"].append(stats.lose_streak)
    results["total_wins"].append(stats.total_wins)
    results["total_losses"].append(stats.total_losses)
    results["total_draws"].append(stats.total_draws)
    results["win_ratio"].append(stats.get_win_ratio())
    results["opponent_win_ratio"].append(stats.get_win_ratio_against_opponent(opponent))
    results["total_games"].append(stats.total_games)
    goals_scored, goals_conceded = stats.average_goals_scored_and_conceded(3)
    if pd.isnull(goals_scored):
        print(stats.result_series[-3:])
    results["goals_scored"].append(goals_scored)
    results["goals_conceded"].append(goals_conceded)
    
    return results
    

for date, match in input_features.iterrows():
    home_team = match.HomeTeam
    home_stats = stats[home_team]
    
    away_team = match.AwayTeam
    away_stats = stats[away_team]
    
    home_results = update_results(home_results, home_stats, away_team)

    away_results = update_results(away_results, away_stats, home_team)
    
    #Update stats based on results of this match
    match_result = match.FTR    
    home_goals = match.FTHG
    away_goals = match.FTAG
        
    home_stats.update_result(away_team,home_goals,away_goals)
    away_stats.update_result(home_team,away_goals,home_goals)

In [21]:
    """if home_team == "Juventus" and away_team == "Roma":
        print("CHECK")
    if home_team == "Roma" and away_team == "Juventus":
        print("CHECK")"""
    
    """if home_team == "Juventus":
        print("total wins: " + str(home_stats.total_wins))
        print("total games: " + str(home_stats.total_games))
        print("win ratio: " + str(home_stats.get_win_ratio_against_opponent(away_team)))
    elif away_team == "Juventus":
        print("total wins: " + str(away_stats.total_wins))
        print("total games: " + str(away_stats.total_games))
        print("win ratio: " + str(away_stats.get_win_ratio_against_opponent(home_team)))"""

'if home_team == "Juventus":\n    print("total wins: " + str(home_stats.total_wins))\n    print("total games: " + str(home_stats.total_games))\n    print("win ratio: " + str(home_stats.get_win_ratio_against_opponent(away_team)))\nelif away_team == "Juventus":\n    print("total wins: " + str(away_stats.total_wins))\n    print("total games: " + str(away_stats.total_games))\n    print("win ratio: " + str(away_stats.get_win_ratio_against_opponent(home_team)))'

In [22]:
#stats["Juventus"].previous_encounters

In [23]:
columns = {}

for column_name, column  in home_results.iteritems():
    columns["home_" + column_name] = column
 
for column_name, column  in away_results.iteritems():
    columns["away_" + column_name] = column

In [24]:
input_features = pd.DataFrame(data=columns, index=data.index.values)

In [40]:
#input_features = input_features.drop(["away_total_losses", "home_total_losses", "away_total_wins", "home_total_wins", "away_total_draws", "home_total_draws"], 1)
input_features = input_features[['home_win_streak', 'away_win_streak', 'home_lose_streak', 'away_lose_streak']]

In [41]:
input_features.head(10)

Unnamed: 0,home_win_streak,away_win_streak,home_lose_streak,away_lose_streak
10/09/11,2,0,0,1
10/09/11,1,2,0,0
10/09/11,0,1,1,0
10/09/11,0,1,1,0
10/09/11,0,4,3,0
11/09/11,0,1,1,0
11/09/11,0,0,0,1
11/09/11,0,0,1,0
11/09/11,0,3,1,0
16/09/11,0,0,1,2


# Remove first half of a season so all stats have something to work with

In [27]:
#Season has 18 teams
#Each team plays 36 matches
#after 36 games each team should have played two games
buffer_games = 36
input_features = input_features[buffer_games:]

In [28]:
input_features.head(5)

Unnamed: 0,away_goals_conceded,away_goals_scored,away_lose_streak,away_opponent_win_ratio,away_total_games,away_win_ratio,away_win_streak,home_goals_conceded,home_goals_scored,home_lose_streak,home_opponent_win_ratio,home_total_games,home_win_ratio,home_win_streak
10/09/11,1.666667,2.333333,1,0,4,0.5,0,0.666667,2.333333,0,0,4,0.75,2
10/09/11,0.666667,1.333333,0,0,4,0.5,2,2.333333,1.666667,0,0,4,0.25,1
10/09/11,1.666667,1.333333,0,0,4,0.5,1,2.0,1.0,1,0,4,0.0,0
10/09/11,1.333333,3.666667,0,0,4,0.75,1,1.333333,1.333333,1,0,4,0.25,0
10/09/11,0.666667,3.666667,0,0,4,1.0,4,2.666667,0.333333,3,0,4,0.25,0


# Check for unusable fields

In [42]:
column_names = list(input_features.columns.values)

## Infinite values

In [43]:
for column in column_names:
    total_infinite_values = sum(np.isinf(input_features[column]))
    print (column + ": " + str(total_infinite_values))

home_win_streak: 0
away_win_streak: 0
home_lose_streak: 0
away_lose_streak: 0


In [44]:
for column in column_names:
    total_NaN_values = sum(np.isnan(input_features[column]))
    print (column + ": " + str(total_NaN_values))

home_win_streak: 0
away_win_streak: 0
home_lose_streak: 0
away_lose_streak: 0


#### Full time result as output
### First convert output to numeric for easyer processing

In [45]:
def ResultToNumeric(results):
    numericResults = []
    for result in results:
        if result == 'H':
            numericResults.append(0)
        elif result == 'A':
            numericResults.append(1)
        else:
            numericResults.append(2)
    return np.array(numericResults)

output = ResultToNumeric(data['FTR'])
output = output[buffer_games:]

# Baseline model

In [74]:
bins = np.bincount(output)
index = np.nonzero(bins)[0]
np.vstack((index,bins[index])).T

array([[  0, 721],
       [  1, 464],
       [  2, 384]], dtype=int64)

## Predict all games as a home win will result in 721/1569=0.460

# Try random forest

In [48]:
from sklearn.ensemble import RandomForestClassifier

random_forest_model = RandomForestClassifier(n_estimators=1000,
                                                   max_features=4,
                                                   max_depth=None,
                                                   min_samples_split=1,
                                                   n_jobs=-1)

## Scoring

In [67]:
def report_scoring(scores):
    print("List of scores: " + str(scores))
    print("mean scoring: " + str(scores.mean()))
    print("Standard deviation between scorings: " + str(scores.std()))

In [69]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(random_forest_model, input_features, output)

report_scoring(scores)

List of scores: [ 0.43129771  0.43977055  0.44444444]
mean scoring: 0.438504236287
Standard deviation between scorings: 0.00544131286177


## Feature importance

In [70]:
trained_random_forest_model = random_forest_model.fit(input_features,output)
pd.Series(trained_random_forest_model.feature_importances_, input_features.columns)

home_win_streak     0.265643
away_win_streak     0.299770
home_lose_streak    0.239936
away_lose_streak    0.194651
dtype: float64

# Boosted random forest

In [59]:
from sklearn.ensemble import AdaBoostClassifier

boosted_random_forest_model = AdaBoostClassifier(n_estimators=100)

## Scoring

In [68]:
scores = cross_val_score(boosted_random_forest_model, input_features, output)

report_scoring(scores)

List of scores: [ 0.4389313  0.4665392  0.4559387]
mean scoring: 0.45380306399
Standard deviation between scorings: 0.0113715939155


## Feature importance

In [62]:
trained_boosted_random_forest_model = boosted_random_forest_model.fit(input_features,output)
pd.Series(trained_boosted_random_forest_model.feature_importances_, input_features.columns)

home_win_streak     0.34
away_win_streak     0.18
home_lose_streak    0.18
away_lose_streak    0.30
dtype: float64

# SVM

In [76]:
from sklearn import svm

svm_model = svm.SVC()

## Scoring

In [77]:
scores = cross_val_score(svm_model, input_features, output)

report_scoring(scores)

List of scores: [ 0.46183206  0.47418738  0.47318008]
mean scoring: 0.469733172731
Standard deviation between scorings: 0.00560204363905


## Feature importance

In [79]:
trained_svm_model = svm_model.fit(input_features,output)
pd.Series(trained_svm_model.feature_importances_, input_features.columns)

AttributeError: 'SVC' object has no attribute 'feature_importances_'

In [78]:
from tpot import TPOTClassifier

tpot = TPOTClassifier(generations=10, population_size=40, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_soccer_pipeline.py')

Version 0.6.4 of tpot is outdated. Version 0.6.6 was released 2 days ago.




Generation 1 - Current best internal CV score: 0.581118906051




Generation 2 - Current best internal CV score: 0.582689161462




Generation 3 - Current best internal CV score: 0.588256600537




Generation 4 - Current best internal CV score: 0.588321128638




Generation 5 - Current best internal CV score: 0.588321128638




Generation 6 - Current best internal CV score: 0.588321128638




Generation 7 - Current best internal CV score: 0.588321128638




Generation 8 - Current best internal CV score: 0.588321128638




Generation 9 - Current best internal CV score: 0.588321128638




Generation 10 - Current best internal CV score: 0.589116290135





Best pipeline: LogisticRegression(SelectFromModel(input_matrix, 0.089999999999999997, 29, 0.60999999999999999), 11.0, 22, True)
0.594980465294
