# Read CSV

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the match statistics from a CSV file located relative to the working directory.
data = pd.read_csv("Serie_A-B_stats.csv")

In [3]:
data.head(1)

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,BbAv<2.5,BbAH,BbAHh,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA,PSCH,PSCD,PSCA
0,I1,24/08/13,Sampdoria,Juventus,0,1,A,0,0,D,...,1.8,19,1.0,2.09,2.04,1.88,1.83,8.38,4.53,1.47


# Set Index

In [4]:
# Use the 'Date' column as the row index. Note: this cell is not re-runnable,
# because 'Date' is no longer a column once it has become the index.
data = data.set_index('Date')

# Select features

In [5]:
# Keep only the match-identifying columns, the score/result columns and the
# in-game statistics columns; everything else in the CSV is discarded.
selected_columns = [
    'Div', 'HomeTeam', 'AwayTeam',
    'FTHG', 'FTAG', 'FTR',
    'HTHG', 'HTAG', 'HTR',
    'HS', 'AS', 'HST', 'AST',
    'HC', 'AC', 'HF', 'AF',
    'HY', 'AY', 'HR', 'AR',
]
data = data[selected_columns]
data.head(1)

Unnamed: 0_level_0,Div,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HS,...,HST,AST,HC,AC,HF,AF,HY,AY,HR,AR
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
24/08/13,I1,Sampdoria,Juventus,0,1,A,0,0,D,4,...,1,4,4,10,21,21,2,3,1,0


In [6]:
# Remove the 'Div' and 'HTR' columns. `axis` is passed by keyword because the
# positional form `drop(labels, 1)` is not accepted by pandas 2.0 and later.
input_features = data.drop(['Div', 'HTR'], axis=1)

In [7]:
# Group the column names of input_features by dtype. The grouping key is
# data.dtypes (indexed by column name), which pandas aligns to the names by label.
g = input_features.columns.to_series().groupby(data.dtypes).groups

In [8]:
# Re-key the dtype groups by dtype name and keep the columns whose dtype is 'object'.
object_features = {key.name: value for key, value in g.items()}['object']

In [9]:
object_features

['HomeTeam', 'AwayTeam', 'FTR']

# Create performance history per team

1. Track winning or losing streak as feature
2. Track total wins, losses and draws

In [10]:
class Stats:
    """Running performance record for a single team.

    Tracks the current winning/losing streak, cumulative win/loss/draw/game
    counters, the chronological list of results and the result history
    against each opponent. Results are encoded as 1 = win, 0 = draw,
    -1 = loss (from this team's point of view).
    """

    def __init__(self):
        self.win_streak = 0
        self.lose_streak = 0
        self.total_wins = 0
        self.total_losses = 0
        self.total_draws = 0
        self.total_games = 0
        # opponent name -> list of results (1 / 0 / -1) against that opponent
        self.previous_encounters = {}
        # every result of this team, in the order the matches were recorded
        self.result_series = []

    def get_win_ratio(self):
        """Return total wins divided by total games, or 0 if no game was played."""
        # Explicit guard instead of a bare `except`, so unrelated errors
        # (e.g. corrupted counters) are no longer silently reported as 0.
        if self.total_games == 0:
            return 0
        return float(self.total_wins) / float(self.total_games)

    def get_win_ratio_against_opponent(self, opponent):
        """Return the sum of recorded results against ``opponent``.

        Despite the name this is a net score, not a ratio: each win adds 1,
        each loss subtracts 1 and draws add 0. Returns 0 when the two teams
        have no recorded encounter.
        """
        return sum(self.previous_encounters.get(opponent, []))

    # Results: 1 = win, 0 = draw, -1 = loss for this team against the opponent
    def update_previous_encounters(self, opponent, result):
        """Append ``result`` to the history kept for ``opponent``."""
        self.previous_encounters.setdefault(opponent, []).append(result)

    def update_win(self, opponent):
        """Record a win against ``opponent``; extends the win streak and resets the losing streak."""
        self.win_streak += 1
        self.lose_streak = 0
        self.total_wins += 1
        self.total_games += 1
        self.update_previous_encounters(opponent, 1)
        self.result_series.append(1)

    def update_loss(self, opponent):
        """Record a loss against ``opponent``; extends the losing streak and resets the win streak."""
        self.lose_streak += 1
        self.win_streak = 0
        self.total_losses += 1
        self.total_games += 1
        self.update_previous_encounters(opponent, -1)
        self.result_series.append(-1)

    def update_draw(self, opponent):
        """Record a draw against ``opponent``; a draw resets both streaks."""
        self.total_draws += 1
        self.win_streak = 0
        self.lose_streak = 0
        self.total_games += 1
        self.update_previous_encounters(opponent, 0)
        self.result_series.append(0)

In [11]:
# Register every team that appears as either host or visitor. Looking at the
# HomeTeam column alone would leave out a team that only ever plays away,
# and the match loop would then fail with a KeyError on stats[away_team].
teams = pd.unique(input_features[['HomeTeam', 'AwayTeam']].values.ravel())
stats = {team: Stats() for team in teams}

## Create feature arrays as columns for data frame

In [12]:
# Names of the per-team aggregates recorded for every match.
feature_names = [
    "win_streak",
    "lose_streak",
    "total_wins",
    "total_losses",
    "total_draws",
    "win_ratio",
    "opponent_win_ratio",
    "total_games",
]

# One independent, initially empty column (list) per feature and per side.
home_results = {name: [] for name in feature_names}
away_results = {name: [] for name in feature_names}

## Add aggregated stats from previous matches as features, then update the stats with the result of this match

In [13]:
def update_results(results, stats, opponent):
    """Append the current values of ``stats`` to the feature columns in ``results``.

    ``results`` maps feature names to lists; one value is appended to each of
    the eight feature lists. ``opponent`` is only used for the head-to-head
    figure. The same ``results`` dict is returned after being updated in place.
    """
    snapshot = (
        ("win_streak", stats.win_streak),
        ("lose_streak", stats.lose_streak),
        ("total_wins", stats.total_wins),
        ("total_losses", stats.total_losses),
        ("total_draws", stats.total_draws),
        ("win_ratio", stats.get_win_ratio()),
        ("opponent_win_ratio", stats.get_win_ratio_against_opponent(opponent)),
        ("total_games", stats.total_games),
    )
    for feature, value in snapshot:
        results[feature].append(value)

    return results
    

# Walk the matches in row order. For each match, first record both teams'
# pre-match aggregates as that row's features, and only then fold the result
# into the running stats, so a row never sees the outcome of its own match.
for home_team, away_team, match_result in zip(input_features.HomeTeam,
                                              input_features.AwayTeam,
                                              input_features.FTR):
    home_stats = stats[home_team]
    away_stats = stats[away_team]

    home_results = update_results(home_results, home_stats, away_team)
    away_results = update_results(away_results, away_stats, home_team)

    # Update stats based on the result of this match
    if match_result == 'H':
        home_stats.update_win(away_team)
        away_stats.update_loss(home_team)
    elif match_result == 'A':
        away_stats.update_win(home_team)
        home_stats.update_loss(away_team)
    else:
        # Any value other than 'H' or 'A' is counted as a draw for both sides.
        home_stats.update_draw(away_team)
        away_stats.update_draw(home_team)

In [14]:
#stats["Juventus"].previous_encounters

In [15]:
# Merge the home and away feature lists into a single mapping of column name
# to values, prefixing each feature with the side it belongs to.
# dict.items() is used because dict.iteritems() does not exist in Python 3.
columns = {}

for column_name, column in home_results.items():
    columns["home_" + column_name] = column

for column_name, column in away_results.items():
    columns["away_" + column_name] = column

In [16]:
# Build the model input frame from the collected feature lists, reusing the
# index values of `data`. This replaces the earlier contents of input_features.
input_features = pd.DataFrame(data=columns, index=data.index.values)

In [17]:
# Drop the cumulative win/loss/draw counters for both sides. `axis` is passed
# by keyword because the positional form `drop(labels, 1)` is not accepted by
# pandas 2.0 and later.
input_features = input_features.drop(
    ["away_total_losses", "home_total_losses",
     "away_total_wins", "home_total_wins",
     "away_total_draws", "home_total_draws"],
    axis=1,
)

In [18]:
input_features.tail(10)

Unnamed: 0,away_lose_streak,away_opponent_win_ratio,away_total_games,away_win_ratio,away_win_streak,home_lose_streak,home_opponent_win_ratio,home_total_games,home_win_ratio,home_win_streak
17/05/14,2,1,37,0.324324,0,0,-1,37,0.324324,0
18/05/14,0,1,37,0.405405,1,0,-1,37,0.189189,2
18/05/14,0,0,37,0.405405,1,0,0,37,0.243243,1
18/05/14,0,0,37,0.405405,0,0,0,37,0.513514,1
18/05/14,2,1,37,0.702703,0,1,-1,37,0.27027,0
18/05/14,2,-1,37,0.243243,0,0,1,37,0.864865,6
18/05/14,1,0,37,0.135135,0,1,0,37,0.378378,0
18/05/14,0,1,37,0.243243,2,1,-1,37,0.405405,0
18/05/14,0,-1,37,0.432432,0,0,1,37,0.594595,2
18/05/14,6,-1,37,0.162162,0,0,1,37,0.378378,0


## Full-time result as output
### First convert output to numeric

In [19]:
def ResultToNumeric(results):
    """Encode match results as integers: 'H' -> 0, 'A' -> 1, anything else -> 2.

    Returns a NumPy array holding one code per element of ``results``,
    in the same order.
    """
    codes = [0 if outcome == 'H' else (1 if outcome == 'A' else 2)
             for outcome in results]
    return np.array(codes)

# Encode the 'FTR' (full-time result) column as the numeric target array.
output = ResultToNumeric(data['FTR'])

## Create train and test set

In [20]:
# train_test_split is imported from sklearn.model_selection; the former
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split

# Hold out 25% of the matches for evaluation; the rest is used for training.
X_train, X_test, y_train, y_test = train_test_split(input_features, output,
                                                    train_size=0.75, test_size=0.25)

In [21]:
from tpot import TPOTClassifier

# Run the TPOT pipeline search on the training split (10 generations,
# population size 40), print the score obtained on the held-out split, then
# export the best pipeline found as a standalone Python script.
tpot = TPOTClassifier(generations=10, population_size=40, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_soccer_pipeline.py')



Generation 1 - Current best internal CV score: 0.54600477622




Generation 2 - Current best internal CV score: 0.54600477622




Generation 3 - Current best internal CV score: 0.555150633766




Generation 4 - Current best internal CV score: 0.55679971996




Generation 5 - Current best internal CV score: 0.557328704904




Generation 6 - Current best internal CV score: 0.561827836979




Generation 7 - Current best internal CV score: 0.561827836979




Generation 8 - Current best internal CV score: 0.561827836979




Generation 9 - Current best internal CV score: 0.570015517609




Generation 10 - Current best internal CV score: 0.570015517609





Best pipeline: BernoulliNB(GaussianNB(input_matrix), 0.53000000000000003, 0.53000000000000003)
0.600563937146
