# Model Predicting the Winner before the Game without Team Features

In [2]:
import pandas as pd
import numpy as np
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, CategoricalNB, ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, confusion_matrix, classification_report
pd.set_option('display.max_columns', None)   # Print all columns

# Preprocessing

In [3]:
# Load the data after general preprocessing
data = pd.read_csv('./data/pjt02_data_processed.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,ground,team1,team2,neutral_venue,home_venue,first_innings,toss_winner,toss_choice,dbMktPriceTeam1,dbMktPriceTeam2,victor,derby,start_month,month_02,month_12
0,0,Manuka Oval,Perth Scorchers,Sydney Sixers,1,0,Sydney Sixers,Sydney Sixers,Bat,1.893,2.02,Perth Scorchers,0,1,0,0
1,4,W.A.C.A. Ground,Perth Scorchers,Melbourne Stars,0,1,Perth Scorchers,Perth Scorchers,Bat,1.775,2.17,Perth Scorchers,0,1,0,0
2,5,Adelaide Oval,Adelaide Strikers,Hobart Hurricanes,0,1,Hobart Hurricanes,Hobart Hurricanes,Bat,2.22,1.741,Adelaide Strikers,0,12,0,1
3,6,Docklands Stadium,Melbourne Renegades,Sydney Thunder,0,1,Sydney Thunder,Melbourne Renegades,Bowl,1.69,2.31,Melbourne Renegades,0,12,0,1
4,7,Sydney Cricket Ground,Sydney Sixers,Perth Scorchers,0,1,Perth Scorchers,Perth Scorchers,Bat,1.633,2.43,Perth Scorchers,0,12,0,1


In [4]:
# Dataframe of team1
t1 = data.drop('team2', axis=1)
t1.head()

Unnamed: 0.1,Unnamed: 0,ground,team1,neutral_venue,home_venue,first_innings,toss_winner,toss_choice,dbMktPriceTeam1,dbMktPriceTeam2,victor,derby,start_month,month_02,month_12
0,0,Manuka Oval,Perth Scorchers,1,0,Sydney Sixers,Sydney Sixers,Bat,1.893,2.02,Perth Scorchers,0,1,0,0
1,4,W.A.C.A. Ground,Perth Scorchers,0,1,Perth Scorchers,Perth Scorchers,Bat,1.775,2.17,Perth Scorchers,0,1,0,0
2,5,Adelaide Oval,Adelaide Strikers,0,1,Hobart Hurricanes,Hobart Hurricanes,Bat,2.22,1.741,Adelaide Strikers,0,12,0,1
3,6,Docklands Stadium,Melbourne Renegades,0,1,Sydney Thunder,Melbourne Renegades,Bowl,1.69,2.31,Melbourne Renegades,0,12,0,1
4,7,Sydney Cricket Ground,Sydney Sixers,0,1,Perth Scorchers,Perth Scorchers,Bat,1.633,2.43,Perth Scorchers,0,12,0,1


In [5]:
# Convert first_innings as binary
t1['first_innings'] = t1.apply(lambda x: 1 if x['team1'] == x['first_innings'] else 0, axis=1)
t1.head()

Unnamed: 0.1,Unnamed: 0,ground,team1,neutral_venue,home_venue,first_innings,toss_winner,toss_choice,dbMktPriceTeam1,dbMktPriceTeam2,victor,derby,start_month,month_02,month_12
0,0,Manuka Oval,Perth Scorchers,1,0,0,Sydney Sixers,Bat,1.893,2.02,Perth Scorchers,0,1,0,0
1,4,W.A.C.A. Ground,Perth Scorchers,0,1,1,Perth Scorchers,Bat,1.775,2.17,Perth Scorchers,0,1,0,0
2,5,Adelaide Oval,Adelaide Strikers,0,1,0,Hobart Hurricanes,Bat,2.22,1.741,Adelaide Strikers,0,12,0,1
3,6,Docklands Stadium,Melbourne Renegades,0,1,0,Melbourne Renegades,Bowl,1.69,2.31,Melbourne Renegades,0,12,0,1
4,7,Sydney Cricket Ground,Sydney Sixers,0,1,0,Perth Scorchers,Bat,1.633,2.43,Perth Scorchers,0,12,0,1


In [6]:
# Convert toss_winner as binary
t1['toss_winner'] = t1.apply(lambda x: 1 if x['team1'] == x['toss_winner'] else 0, axis=1)
t1.head()

Unnamed: 0.1,Unnamed: 0,ground,team1,neutral_venue,home_venue,first_innings,toss_winner,toss_choice,dbMktPriceTeam1,dbMktPriceTeam2,victor,derby,start_month,month_02,month_12
0,0,Manuka Oval,Perth Scorchers,1,0,0,0,Bat,1.893,2.02,Perth Scorchers,0,1,0,0
1,4,W.A.C.A. Ground,Perth Scorchers,0,1,1,1,Bat,1.775,2.17,Perth Scorchers,0,1,0,0
2,5,Adelaide Oval,Adelaide Strikers,0,1,0,0,Bat,2.22,1.741,Adelaide Strikers,0,12,0,1
3,6,Docklands Stadium,Melbourne Renegades,0,1,0,1,Bowl,1.69,2.31,Melbourne Renegades,0,12,0,1
4,7,Sydney Cricket Ground,Sydney Sixers,0,1,0,0,Bat,1.633,2.43,Perth Scorchers,0,12,0,1


In [7]:
# Convert victor as binary
t1['victor'] = t1.apply(lambda x: 1 if x['team1'] == x['victor'] else 0, axis=1)
t1.head()

Unnamed: 0.1,Unnamed: 0,ground,team1,neutral_venue,home_venue,first_innings,toss_winner,toss_choice,dbMktPriceTeam1,dbMktPriceTeam2,victor,derby,start_month,month_02,month_12
0,0,Manuka Oval,Perth Scorchers,1,0,0,0,Bat,1.893,2.02,1,0,1,0,0
1,4,W.A.C.A. Ground,Perth Scorchers,0,1,1,1,Bat,1.775,2.17,1,0,1,0,0
2,5,Adelaide Oval,Adelaide Strikers,0,1,0,0,Bat,2.22,1.741,1,0,12,0,1
3,6,Docklands Stadium,Melbourne Renegades,0,1,0,1,Bowl,1.69,2.31,1,0,12,0,1
4,7,Sydney Cricket Ground,Sydney Sixers,0,1,0,0,Bat,1.633,2.43,0,0,12,0,1


In [8]:
# Extract only necessary columns
t1 = t1[['neutral_venue', 'home_venue', 'first_innings', 'toss_winner', 'derby', 'month_02', 'month_12', 'victor']]
t1.head()

Unnamed: 0,neutral_venue,home_venue,first_innings,toss_winner,derby,month_02,month_12,victor
0,1,0,0,0,0,0,0,1
1,0,1,1,1,0,0,0,1
2,0,1,0,0,0,0,1,1
3,0,1,0,1,0,0,1,1
4,0,1,0,0,0,0,1,0


In [9]:
# Do the same for team2
# Dataframe of team2
t2 = data.drop('team1', axis=1)
t2['first_innings'] = t2.apply(lambda x: 1 if x['team2'] == x['first_innings'] else 0, axis=1)
t2['toss_winner'] = t2.apply(lambda x: 1 if x['team2'] == x['toss_winner'] else 0, axis=1)
t2['victor'] = t2.apply(lambda x: 1 if x['team2'] == x['victor'] else 0, axis=1)

# Additional process for team2 - 0 for home_venue
t2['home_venue'] = 0

# Extract only necessary columns
t2 = t2[['neutral_venue', 'home_venue', 'first_innings', 'toss_winner', 'derby', 'month_02', 'month_12', 'victor']]
t2.head()

Unnamed: 0,neutral_venue,home_venue,first_innings,toss_winner,derby,month_02,month_12,victor
0,1,0,1,1,0,0,0,0
1,0,0,0,0,0,0,0,0
2,0,0,1,1,0,0,1,0
3,0,0,1,0,0,0,1,0
4,0,0,1,1,0,0,1,1


In [10]:
# Combine team1 and team2
data2 = pd.concat([t1, t2], axis=0, ignore_index=True)
data2.tail()

Unnamed: 0,neutral_venue,home_venue,first_innings,toss_winner,derby,month_02,month_12,victor
937,1,0,0,0,0,0,0,0
938,1,0,0,0,0,0,0,0
939,1,0,0,0,0,0,0,0
940,0,0,1,0,0,0,0,0
941,1,0,0,1,0,0,0,0


In [11]:
# Train-test split
X = data2[['neutral_venue', 'home_venue', 'first_innings', 'toss_winner', 'derby', 'month_02', 'month_12']]
y = data2['victor']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)
X_train.shape, y_test.shape

((753, 7), (189,))

# Feature Selection

## Select K best by chi-squared
Due to the high p-values, no features can be selected.


In [12]:
SelectKBest(chi2, k=7).fit(X, y).scores_    # rank: toss_winner, home_venue, first_innings

array([0.        , 0.12158809, 0.10403397, 1.12314225, 0.        ,
       0.        , 0.        ])

In [13]:
SelectKBest(chi2, k=3).fit(X, y).get_feature_names_out()   # name of the features of 3 high scores

array(['home_venue', 'first_innings', 'toss_winner'], dtype=object)

In [14]:
SelectKBest(chi2, k=7).fit(X, y).pvalues_    # to big to be valid

array([1.        , 0.72731842, 0.74704139, 0.28924285, 1.        ,
       1.        , 1.        ])

# Classification Analysis

## Logistic Regression

### Permutation importance

In [15]:
# Feature selection
logr = LogisticRegression().fit(X_train, y_train)
result = permutation_importance(logr, X_train, y_train, scoring='f1', n_repeats=10, random_state=0)

for i in result.importances_mean.argsort()[::-1]:
    if result.importances_mean[i] - 2*result.importances_std[i] > 0:
        print(i, X_train.columns[i], result.importances_mean[i]) 

4 derby 0.013682335496290342


In [16]:
# Cross validation
X_5 = X.iloc[:, [4]]
cross_validate(LogisticRegression(), X_5, y, cv=10, scoring='f1')['test_score'].mean()

0.14396375259403163

In [17]:
# Use all features
cross_validate(LogisticRegression(), X, y, cv=10, scoring='f1')['test_score'].mean()

0.4330280083148443

### Select from model

In [18]:
logr = LogisticRegression()
selector = SelectFromModel(logr).fit(X_train, y_train)
selector.get_support()    # Feature 4, 5, 6 -> 4, 5 or 4, 5, 6, 7

array([False, False, False,  True,  True,  True, False])

In [19]:
# Evaluation with feature 4, 5
X_45 = X.iloc[:, [3, 4]]
cross_validate(logr, X_45, y, cv=10, scoring='f1')['test_score'].mean()

0.5213386472090447

In [20]:
# Use feature 4, 5, 6, 7
X_4567 = X.iloc[:, [3, 4, 5, 6]]
cross_validate(logr, X_4567, y, cv=10, scoring='f1')['test_score'].mean()

0.5131043864364285

## SVM

### Permutation importance

In [21]:
svc = SVC().fit(X_train, y_train)
result = permutation_importance(svc, X_train, y_train, scoring='f1', n_repeats=10, random_state=0)

for i in result.importances_mean.argsort()[::-1]:
    if result.importances_mean[i] - 2*result.importances_std[i] > 0:
        print(i, X_train.columns[i], result.importances_mean[i])

3 toss_winner 0.035597563162168985
4 derby 0.03169169006249824
1 home_venue 0.029680483312681756
6 month_12 0.02866407496913296
5 month_02 0.010493926985335817


In [22]:
# Use feature 2, 4, 5, 6, 7
X_24567 = X.iloc[:, [1, 3, 4, 5, 6]]
cross_validate(SVC(), X_24567, y, cv=10, scoring='f1')['test_score'].mean()

0.4722165445794618

### Select from model

In [23]:
svc = SVC(kernel='linear')      # To use coef_ attribute 
selector = SelectFromModel(svc).fit(X_train, y_train)
selector.get_support()          # Feature 4, 5
cross_validate(svc, selector.transform(X), y, cv=10, scoring='f1')['test_score'].mean()

0.5213386472090447

In [24]:
# Use all features
cross_validate(SVC(kernel='linear'), X, y, cv=10, scoring='f1')['test_score'].mean()

0.48648963259231587

## Decision Tree

### Permutation importance

In [25]:
dt = DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_leaf=5, random_state=0).fit(X_train, y_train)
result = permutation_importance(dt, X_train, y_train, scoring='f1', n_repeats=10, random_state=0)

for i in result.importances_mean.argsort()[::-1]:
    if result.importances_mean[i] - 2*result.importances_std[i] > 0:
        print(i, X_train.columns[i], result.importances_mean[i])    

2 first_innings 0.05195846446397721
3 toss_winner 0.03031218735927641
4 derby 0.024830070808176286
1 home_venue 0.023902889729842537
0 neutral_venue 0.01955687293514511


In [26]:
# Use feature 1, 2, 3, 4, 5
dt = DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_leaf=5, random_state=0)
X_12345 = X.iloc[:, [1, 2, 3, 4, 5]]
cross_validate(dt, X_12345, y, cv=10, scoring='f1')['test_score'].mean()

0.47579582425724454

### Select from model

In [27]:
selector = SelectFromModel(dt).fit(X_train, y_train)
selector.get_support()        # Feature 2, 3, 5 
cross_validate(dt, selector.transform(X), y, cv=10, scoring='f1')['test_score'].mean()

0.46556051609891175

In [28]:
# Use all features
cross_validate(dt, X, y, cv=10, scoring='f1')['test_score'].mean()

0.4532040773235783

## AdaBoost

### Permutation importance

In [29]:
ada = AdaBoostClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)
result = permutation_importance(ada, X_train, y_train, scoring='f1', n_repeats=10, random_state=0)

for i in result.importances_mean.argsort()[::-1]:
    if result.importances_mean[i] - 2*result.importances_std[i] > 0:
        print(i, X_train.columns[i], result.importances_mean[i])   # No features returned

4 derby 0.012636125866387137


In [30]:
# Use feature 5
ada = AdaBoostClassifier(n_estimators=100, random_state=0)
cross_validate(ada, X_5, y, cv=10, scoring='f1')['test_score'].mean()

0.14396375259403163

### Select from model

In [31]:
selector = SelectFromModel(ada).fit(X_train, y_train)
selector.get_support()       # Feature 1, 2, 3, 5
cross_validate(ada, selector.transform(X), y, cv=10, scoring='f1')['test_score'].mean()

0.37863411683567066

In [32]:
# Use all features
cross_validate(ada, X, y, cv=10, scoring='f1')['test_score'].mean()

0.4318825329425648

## Random Forest

### Permutation importance

In [33]:
rf = RandomForestClassifier(criterion='entropy', max_depth=5, min_samples_leaf=5, random_state=0, max_samples=200).fit(X_train, y_train)
result = permutation_importance(rf, X_train, y_train, scoring='f1', n_repeats=10, random_state=0)

for i in result.importances_mean.argsort()[::-1]:
    if result.importances_mean[i] - 2*result.importances_std[i] > 0:
        print(i, X_train.columns[i], result.importances_mean[i])    

1 home_venue 0.03190725538521132
4 derby 0.023808230149773745


In [34]:
# Use feature 1, 2, 3
rf = RandomForestClassifier(criterion='entropy', max_depth=5, min_samples_leaf=5, random_state=0, max_samples=200)
X_123 = X.iloc[:, [0, 1, 2]]
cross_validate(rf, X_123, y, cv=10, scoring='f1')['test_score'].mean()

0.42456844191553367

### Select from model

In [35]:
selector = SelectFromModel(rf).fit(X_train, y_train)
selector.get_support()      # Feature 3, 4, 7 -> Feature 3, 4 OR 3, 4, 6, 7

array([False, False,  True,  True, False, False,  True])

In [36]:
# Use feature 3, 4
X_34 = X.iloc[:, [2, 3]]
cross_validate(rf, X_34, y, cv=10, scoring='f1')['test_score'].mean()

0.4709272963217751

In [37]:
# Use feature 3, 4, 6, 7
X_3467 = X.iloc[:, [2, 3, 5, 6]]
cross_validate(rf, X_3467, y, cv=10, scoring='f1')['test_score'].mean()

0.48367196149262204

In [38]:
# Use all features
cross_validate(rf, X, y, cv=10, scoring='f1')['test_score'].mean()

0.4840358510770481

## Bernoulli NB
for Multivatiate Bernoulli models: binary features

### Permutation importance

In [39]:
bnb = BernoulliNB(binarize=None).fit(X_train, y_train)
result = permutation_importance(bnb, X_train, y_train, scoring='f1', n_repeats=10, random_state=0)

for i in result.importances_mean.argsort()[::-1]:
    if result.importances_mean[i] - 2*result.importances_std[i] > 0:
        print(i, X_train.columns[i], result.importances_mean[i])

4 derby 0.014203293532846573


In [40]:
# Use feature 5
bnb = BernoulliNB(binarize=None)
cross_validate(bnb, X_5, y, cv=10, scoring='f1')['test_score'].mean()

0.14396375259403163

In [41]:
# Use all features
cross_validate(bnb, X, y, cv=10, scoring='f1')['test_score'].mean()

0.43330192338635687

## Categorical NB
For binary variables, the same results as BernoulliNB

### Permutation importance

In [42]:
ca_nb = CategoricalNB().fit(X_train, y_train)
result = permutation_importance(ca_nb, X_train, y_train, scoring='f1', n_repeats=10, random_state=0)

for i in result.importances_mean.argsort()[::-1]:
    if result.importances_mean[i] - 2*result.importances_std[i] > 0:
        print(i, X_train.columns[i], result.importances_mean[i]) 

4 derby 0.014203293532846573


In [43]:
# Use feature 5
cross_validate(CategoricalNB(), X_5, y, cv=10, scoring='f1')['test_score'].mean()

0.14396375259403163

In [44]:
# Use all features
cross_validate(CategoricalNB(), X, y, cv=10, scoring='f1')['test_score'].mean()

0.43330192338635687

## Complement NB

### Permutation importance

In [47]:
co_nb = ComplementNB().fit(X_train, y_train)
result = permutation_importance(co_nb, X_train, y_train, scoring='f1', n_repeats=10, random_state=0)

for i in result.importances_mean.argsort()[::-1]:
    if result.importances_mean[i] - 2*result.importances_std[i] > 0:
        print(i, X_train.columns[i], result.importances_mean[i])  # No features returned

In [48]:
# Use all features
cross_validate(co_nb, X, y, cv=10, scoring='f1')['test_score'].mean()

0.47832213896018433

## KNN

### Permutation importance

In [127]:
knn = KNeighborsClassifier(weights='distance').fit(X_train, y_train)
result = permutation_importance(knn, X_train, y_train, scoring='f1', n_repeats=10, random_state=0)

for i in result.importances_mean.argsort()[::-1]:
    if result.importances_mean[i] - 2*result.importances_std[i] > 0:
        print(i, X_train.columns[i], result.importances_mean[i]) 

4 derby 0.018864490053721546
5 month_02 0.010967198182377952


In [130]:
# Use feature 5
knn = KNeighborsClassifier(weights='distance')
cross_validate(knn, X_5, y, cv=10, scoring='f1')['test_score'].mean()

0.5818199740976031

In [132]:
# Use feature 5, 6, 7
X_567 = X.iloc[:, [4, 5, 6]]
cross_validate(knn, X_567, y, cv=10, scoring='f1')['test_score'].mean()

0.47731339043446186

In [133]:
# Use all features
cross_validate(knn, X, y, cv=10, scoring='f1')['test_score'].mean()

0.4818354341717036

### Select from model
Impossible for this model

## Neural Network

In [51]:
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Activation, Dropout

# Set random seed for reproducibility
tf.random.set_seed(50)

f1 = []

# Cross validation
for train_idx, test_idx in StratifiedKFold(n_splits=10).split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    # Transform data for neural network
    # One node predicted all instances to class 1
    y_train_nn = to_categorical(y_train, 2)

    # Define network
    net = Sequential()
    net.add(Dense(16, input_shape=(7,)))
    net.add(Activation('swish'))
    net.add(Dropout(rate=0.5))
    
    net.add(Dense(16))
    net.add(Activation('swish'))
    net.add(Dropout(rate=0.5))
    
    net.add(Dense(8))
    net.add(Activation('swish'))
    
    net.add(Dense(4))
    net.add(Activation('swish'))
    
    net.add(Dense(2, activation='softmax'))

    # Compile the network
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['Precision', 'Recall'])
    
    # Fit the network with data
    net.fit(X_train, y_train_nn, epochs=20, batch_size=32, validation_split=0.25, verbose=0)
    outputs = net.predict(X_test)
    y_pred = np.argmax(outputs, axis=1)
    score = f1_score(y_test, y_pred)
    f1.append(score)
    
print(np.mean(f1))   

0.5844163986144484
