In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report 
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('FootballDataset.csv')
df.shape[0]

6840

In [3]:
df.columns

Index(['Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG',
       'HTAG', 'HTR', 'Attendance', 'Referee', 'HS', 'AS', 'HST', 'AST', 'HHW',
       'AHW', 'HC', 'AC', 'HF', 'AF', 'HO', 'AO', 'HY', 'AY', 'HR', 'AR',
       'Unnamed: 28', 'Unnamed: 29', 'Unnamed: 30', 'Unnamed: 31',
       'Unnamed: 32', 'Unnamed: 33', 'Unnamed: 34', 'Unnamed: 35',
       'Unnamed: 36', 'Unnamed: 37', 'Unnamed: 38', 'Unnamed: 39',
       'Unnamed: 40', 'Unnamed: 41', 'Unnamed: 42', 'Unnamed: 43',
       'Unnamed: 44', 'Unnamed: 45', 'Unnamed: 46', 'Unnamed: 47',
       'Unnamed: 48', 'Unnamed: 49', 'Unnamed: 50', 'Unnamed: 51',
       'Unnamed: 52', 'Unnamed: 53', 'Unnamed: 54', 'Unnamed: 55',
       'Unnamed: 56', 'Unnamed: 57', 'Unnamed: 58', 'Unnamed: 59',
       'Unnamed: 60', 'Unnamed: 61', 'Unnamed: 62', 'Unnamed: 63',
       'Unnamed: 64', 'Unnamed: 65', 'Unnamed: 66', 'Unnamed: 67',
       'Unnamed: 68', 'Unnamed: 69', 'Unnamed: 70'],
      dtype='object')

In [4]:
df.drop(['HomeTeam', 'AwayTeam', 'Div', 'Date', 'FTHG', 'FTAG', 'HTHG',
         'HTAG', 'HTR', 'Attendance', 'Referee', 'HHW', 'AHW','HO', 'AO',
         'HY', 'AY', 'HR', 'AR', 'Unnamed: 28', 'Unnamed: 29', 
         'Unnamed: 30','Unnamed: 31','Unnamed: 32', 'Unnamed: 33', 
         'Unnamed: 34', 'Unnamed: 35','Unnamed: 36', 'Unnamed: 37', 
         'Unnamed: 38', 'Unnamed: 39', 'Unnamed: 40','Unnamed: 41', 
         'Unnamed: 42', 'Unnamed: 43','Unnamed: 44', 'Unnamed: 45',
         'Unnamed: 46', 'Unnamed: 47', 'Unnamed: 48', 'Unnamed: 49', 
         'Unnamed: 50', 'Unnamed: 51', 'Unnamed: 52', 'Unnamed: 53', 
         'Unnamed: 54', 'Unnamed: 55', 'Unnamed: 56', 'Unnamed: 57', 
         'Unnamed: 58', 'Unnamed: 59', 'Unnamed: 60', 'Unnamed: 61', 
         'Unnamed: 62', 'Unnamed: 63', 'Unnamed: 64', 'Unnamed: 65', 
         'Unnamed: 66', 'Unnamed: 67', 'Unnamed: 68', 'Unnamed: 69',
         'Unnamed: 70'],1, inplace=True)
display(df.head())

Unnamed: 0,FTR,HS,AS,HST,AST,HC,AC,HF,AF
0,H,17,8,14,4,6,6,13,12
1,H,17,12,10,5,7,7,19,14
2,A,6,16,3,9,8,4,15,21
3,D,6,13,4,6,5,8,11,13
4,H,17,12,8,6,6,4,21,20


In [5]:
def transformResult(row):
    '''Converts results (H,A or D) into numeric values'''
    if(row.FTR == 'H'):
        return 1
    elif(row.FTR == 'A'):
        return -1
    else:
        return 0

In [6]:
df["FTR"] = df.apply(lambda row: transformResult(row),axis=1)
df.head()

Unnamed: 0,FTR,HS,AS,HST,AST,HC,AC,HF,AF
0,1,17,8,14,4,6,6,13,12
1,1,17,12,10,5,7,7,19,14
2,-1,6,16,3,9,8,4,15,21
3,0,6,13,4,6,5,8,11,13
4,1,17,12,8,6,6,4,21,20


In [7]:
totalMatches = df.shape[0]
totalFeatures = df.shape[1] - 1
homeWins = len(df[df.FTR == 1])
draw = len(df[df.FTR == 0])
awayWins = len(df[df.FTR == -1])
homeRate = (float(homeWins) / (totalMatches)) * 100
drawRate = (float(draw) / (totalMatches)) * 100
awayRate = (float(awayWins) / (totalMatches)) * 100


print ("Total number of matches: ", totalMatches)
print ("Number of features: ", totalFeatures,'\n')
print ("Number of matches won by home team: ", homeWins)
print ("Win rate of home team: ", homeRate,'\n')
print ("Number of matches drawn: ", draw)
print ("Draw rate of the total matches: ", drawRate,'\n')
print ("Number of matches won by away team: ", awayWins)
print ("Win rate of away team: ", awayRate)

Total number of matches:  6840
Number of features:  8 

Number of matches won by home team:  3176
Win rate of home team:  46.4327485380117 

Number of matches drawn:  1751
Draw rate of the total matches:  25.599415204678362 

Number of matches won by away team:  1913
Win rate of away team:  27.96783625730994


In [8]:
X_all= df.drop(['FTR'],1)
y_all= df['FTR']

In [9]:
from sklearn.cross_validation import train_test_split

# Shuffle and split the dataset into training and testing set.
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, 
                                                    test_size = 700,
                                                    random_state = 13)




In [10]:
from sklearn.preprocessing import StandardScaler


In [11]:
X_train.shape[0]

6140

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier

  from numpy.core.umath_tests import inner1d


In [13]:
clfDT = DecisionTreeClassifier(criterion='gini', splitter='best', min_samples_split=3)
clfGNB = GaussianNB()
clfMLP = MLPClassifier(activation='relu', solver='lbfgs', alpha= 10.0,
                       hidden_layer_sizes=(35,), random_state=12)
clfKNN = KNeighborsClassifier(n_neighbors=18, weights='distance', algorithm='brute')
clfSVC = LinearSVC(random_state=100, max_iter=1000)
clfGBC = GradientBoostingClassifier(loss='deviance', learning_rate=0.2, n_estimators=118)

In [66]:
predDT = clfDT.fit(X_train,y_train).predict(X_test)
print("Accuracy=",accuracy_score(predDT,y_test))
matrixDT = confusion_matrix(y_test, predDT)
matrixDT

Accuracy= 0.4257142857142857


array([[ 71,  68,  62],
       [ 61,  44,  63],
       [ 73,  75, 183]], dtype=int64)

In [15]:
predGNB = clfGNB.fit(X_train,y_train).predict(X_test)
print("Accuracy=",accuracy_score(predGNB,y_test))
matrixGNB = confusion_matrix(y_test, predGNB)
matrixGNB

Accuracy= 0.49142857142857144


array([[ 75,  12, 114],
       [ 55,   9, 104],
       [ 58,  13, 260]], dtype=int64)

In [16]:
predMLP = clfMLP.fit(X_train,y_train).predict(X_test)
print("Accuracy=",accuracy_score(predMLP,y_test))
matrixMLP = confusion_matrix(y_test, predMLP)
matrixMLP

Accuracy= 0.5485714285714286


array([[101,   9,  91],
       [ 51,  11, 106],
       [ 49,  10, 272]], dtype=int64)

In [17]:
predKNN = clfKNN.fit(X_train,y_train).predict(X_test)
print("Accuracy=",accuracy_score(predKNN,y_test))
matrixKNN = confusion_matrix(y_test, predKNN)
matrixKNN

Accuracy= 0.52


array([[ 85,  26,  90],
       [ 44,  29,  95],
       [ 45,  36, 250]], dtype=int64)

In [18]:
predSVC = clfSVC.fit(X_train,y_train).predict(X_test)
print("Accuracy=",accuracy_score(predSVC,y_test))
matrixSVC = confusion_matrix(y_test, predSVC)
matrixSVC

Accuracy= 0.24


array([[  0, 201,   0],
       [  0, 168,   0],
       [  0, 331,   0]], dtype=int64)

In [19]:
predGBC = clfGBC.fit(X_train,y_train).predict(X_test)
print("Accuracy=",accuracy_score(predGBC,y_test))
matrixGBC = confusion_matrix(y_test, predGBC)
matrixGBC

Accuracy= 0.5585714285714286


array([[102,   9,  90],
       [ 41,  21, 106],
       [ 48,  15, 268]], dtype=int64)

In [20]:
df2 = pd.read_csv('FootballDataset.csv')
ht="Chelsea"
at="Man United"

In [21]:
homeData = df2["HomeTeam"]
homePlayed = 0
for i in homeData:
    if i == ht:
        homePlayed= homePlayed + 1

awayData = df2["AwayTeam"]
awayPlayed = 0
for i in awayData:
    if i == at:
        awayPlayed= awayPlayed + 1

homePlayed,awayPlayed

(342, 342)

In [22]:
HS= (df2[df2["HomeTeam"] == ht].sum().HS)/homePlayed
AS= (df2[df2["AwayTeam"] == at].sum().AS)/awayPlayed
HST= (df2[df2["HomeTeam"] == ht].sum().HST)/homePlayed
AST= (df2[df2["AwayTeam"] == at].sum().AST)/awayPlayed
HC= (df2[df2["HomeTeam"] == ht].sum().HC)/homePlayed
AC= (df2[df2["AwayTeam"] == at].sum().AC)/awayPlayed
HF= (df2[df2["HomeTeam"] == ht].sum().HF)/homePlayed
AF= (df2[df2["AwayTeam"] == at].sum().AF)/awayPlayed

In [23]:
p= [[HS,AS,HST,AST,HC,AC,HF,AF]]
p

[[9.026315789473685,
  6.12280701754386,
  4.099415204678363,
  11.400584795321638,
  4.084795321637427,
  2.1842105263157894,
  3.0964912280701755,
  1.3830409356725146]]

In [24]:
Result = clfDT.predict(p)
Result

array([-1], dtype=int64)

In [25]:
import pickle
filename = 'decisionTreeModel.sav'
pickle.dump(clfDT, open(filename, 'wb'))