In [3]:
%config IPCompleter.greedy=True
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import glob, os
from IPython.display import display

In [4]:
# pre-processing data
def format(file):
    # skip reading all ball-related data
    df = pd.read_csv(file, sep=',', skiprows=1, warn_bad_lines=False, error_bad_lines=False, index_col=None, mangle_dupe_cols=True)
    # first column not required
    df.drop(columns=["info"], inplace=True)
    # df = pd.read_csv("C:\Users\alizo\OneDrive - University of Waterloo\Desktop\Ali\Universities\Data Science\Cricket ML\Source\1211660.csv")
    # data format is transposed
    df = df.transpose(copy=True)
    headers = df.iloc[0]
    df = df[1:]
    df.columns = headers
    # multiple columns with same names
    df.columns = pd.io.parsers.ParserBase({'names':df.columns})._maybe_dedup_names(df.columns)
    df.reset_index(inplace=True)
    df.rename(columns={ df.columns[0]: "team1", df.columns[1]: "team2", "umpire" : "umpire1", "umpire.1" : "umpire2"}, inplace=True)
    
    return df



In [5]:
df = pd.DataFrame()
path = "C:/Users/alizo/OneDrive - University of Waterloo/Desktop/Ali/Universities/Data Science/Cricket ML/Source"
all_files = glob.glob(os.path.join(path, "*.csv"))
for i in all_files:
    new_df = format(i)
    df = pd.concat([df, new_df], axis=0, ignore_index=True)
    
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142 entries, 0 to 141
Data columns (total 24 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   team1            142 non-null    object
 1   team2            142 non-null    object
 2   gender           142 non-null    object
 3   season           142 non-null    object
 4   date             142 non-null    object
 5   competition      142 non-null    object
 6   match_number     126 non-null    object
 7   venue            142 non-null    object
 8   city             96 non-null     object
 9   neutralvenue     23 non-null     object
 10  toss_winner      142 non-null    object
 11  toss_decision    142 non-null    object
 12  player_of_match  139 non-null    object
 13  umpire1          142 non-null    object
 14  umpire2          142 non-null    object
 15  reserve_umpire   117 non-null    object
 16  tv_umpire        139 non-null    object
 17  match_referee    134 non-null    ob

Unnamed: 0,team1,team2,gender,season,date,competition,match_number,venue,city,neutralvenue,...,umpire2,reserve_umpire,tv_umpire,match_referee,winner,winner_wickets,method,winner_runs,outcome,eliminator
0,Islamabad United,Peshawar Zalmi,male,2016/17,2017/02/09,Pakistan Super League,1,Dubai International Cricket Stadium,,True,...,Shozab Raza,Asif Yaqoob,Rashid Riaz,RS Mahanama,Islamabad United,7.0,D/L,,,
1,Lahore Qalandars,Quetta Gladiators,male,2016/17,2017/02/10,Pakistan Super League,2,Dubai International Cricket Stadium,,True,...,Shozab Raza,Asif Yaqoob,Ahsan Raza,Mohammed Anees,Quetta Gladiators,,,8.0,,
2,Karachi Kings,Peshawar Zalmi,male,2016/17,2017/02/10,Pakistan Super League,3,Dubai International Cricket Stadium,,True,...,Ahsan Raza,Shozab Raza,Asif Yaqoob,RS Mahanama,Peshawar Zalmi,7.0,,,,
3,Islamabad United,Lahore Qalandars,male,2016/17,2017/02/11,Pakistan Super League,4,Dubai International Cricket Stadium,,True,...,Shozab Raza,Rashid Riaz,Ahsan Raza,RS Mahanama,Lahore Qalandars,6.0,,,,
4,Karachi Kings,Quetta Gladiators,male,2016/17,2017/02/11,Pakistan Super League,5,Dubai International Cricket Stadium,,True,...,Rashid Riaz,Asif Yaqoob,Ahmed Shahab,Mohammed Anees,Quetta Gladiators,7.0,,,,


In [6]:
# city dictionary
city_map = {'Dubai International Cricket Stadium' : 'Dubai',
            'Sharjah Cricket Stadium' : 'Sharjah',
            'Gaddafi Stadium' : 'Lahore',
            'National Stadium' : 'Karachi',
            'Sheikh Zayed Stadium' : 'Abu Dhabi',
            'Multan Cricket Stadium' : 'Multan',
            'Rawalpindi Cricket Stadium' : 'Rawalpindi'}
df['city'] = df.apply(lambda row: city_map[row['venue']], axis=1)
# df.head()

In [8]:
# tie games
df['winner'] = df.apply(
    lambda row: row['eliminator'] if pd.isnull(row['winner']) and pd.notnull(row['eliminator']) else row['winner'],
    axis=1
)
# games decided by D/L (washed out) 
df['outcome'] = df.apply(
    lambda row: "D/L" if pd.isnull(row['outcome']) and (row['method'] == "D/L") else row['outcome'],
    axis=1
)
# neutralvenue column
df.neutralvenue.fillna(value=False, inplace=True)
df.neutralvenue.replace({'true' : True}, inplace=True)

# taking care of NA values
df.outcome.fillna("Result", inplace=True)
df.winner.fillna("Draw", inplace=True)    
df.drop(columns=["match_number", "eliminator", "method"], inplace=True)

In [10]:
# normalizing data
mapping = {"team1" : {"Islamabad United" : 1, "Quetta Gladiators" : 2, "Karachi Kings" : 3, "Peshawar Zalmi" : 4, "Multan Sultans" : 5, "Lahore Qalandars" : 6 },
           "team2" : {"Islamabad United" : 1, "Quetta Gladiators" : 2, "Karachi Kings" : 3, "Peshawar Zalmi" : 4, "Multan Sultans" : 5, "Lahore Qalandars" : 6 },
           "toss_winner" : {"Islamabad United" : 1, "Quetta Gladiators" : 2, "Karachi Kings" : 3, "Peshawar Zalmi" : 4, "Multan Sultans" : 5, "Lahore Qalandars" : 6 },
           "winner" : {"Islamabad United" : 1, "Quetta Gladiators" : 2, "Karachi Kings" : 3, "Peshawar Zalmi" : 4, "Multan Sultans" : 5, "Lahore Qalandars" : 6, "Draw" : 7}}

df.replace(mapping, inplace=True)

In [11]:
city_list = df['city']
city_map, city_index = pd.Series(city_list).factorize()
print(city_map)
print(city_index)

[0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 1 1 0 2 0 0 0 0 0 0 0 0 1 1 1 1 1
 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 2 2 3 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0
 0 0 0 0 0 4 4 4 4 3 3 3 3 3 3 3 3 3 3 2 3 2 3 2 5 6 5 6 5 6 6 2 2 6 2 6 2
 6 2 2 3 3 3 2 3 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0]
Index(['Dubai', 'Sharjah', 'Lahore', 'Karachi', 'Abu Dhabi', 'Multan',
       'Rawalpindi'],
      dtype='object')


In [12]:
venue_list = df['venue']
venue_map, venue_index = pd.Series(venue_list).factorize()
print(venue_map)
print(venue_index)

[0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 1 1 0 2 0 0 0 0 0 0 0 0 1 1 1 1 1
 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 2 2 3 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0
 0 0 0 0 0 4 4 4 4 3 3 3 3 3 3 3 3 3 3 2 3 2 3 2 5 6 5 6 5 6 6 2 2 6 2 6 2
 6 2 2 3 3 3 2 3 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0]
Index(['Dubai International Cricket Stadium', 'Sharjah Cricket Stadium',
       'Gaddafi Stadium', 'National Stadium', 'Sheikh Zayed Stadium',
       'Multan Cricket Stadium', 'Rawalpindi Cricket Stadium'],
      dtype='object')


In [13]:
toss_decision_list = df['toss_decision']
toss_decision_map, toss_decision_index = pd.Series(toss_decision_list).factorize()
print(toss_decision_map)
print(toss_decision_index)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 1
 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0]
Index(['field', 'bat'], dtype='object')


In [14]:
# categorical to numerical
from sklearn import preprocessing, metrics
from sklearn.model_selection import train_test_split
var = ['team1', 'team2', 'venue', 'neutralvenue', 'city', 'toss_winner', 'toss_decision', 'winner']
matches = df[var]
le = preprocessing.LabelEncoder()
for att in var:
    matches[att] = le.fit_transform(matches[att])
    


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [15]:
# function to run models
def run_model(model, data, attributes, labels, test_size, count, print_predictions):
    maximum = 0
    total = 0
    for i in range(count):
        x_train, x_test, y_train, y_test = train_test_split(data[attributes], data[labels], test_size=test_size) 
        model.fit(x_train, y_train.values.ravel())
        predictions = model.predict(x_test)
        if print_predictions: print(predictions)
        accuracy = metrics.accuracy_score(predictions, y_test)
        total = total + accuracy
        print('Accuracy : %s' % '{0:.3%}'.format(accuracy))
        if maximum < accuracy: maximum = accuracy
    print("Average Accuracy: "+'%s' % '{0:.3%}'.format(total/count))
    print("Highest Accuracy: "+'%s' % '{0:.3%}'.format(maximum))

In [16]:
# create function uses test_train split
# run model multiple times and calculate average accuracy
# for KNN run muliple times with different neighbours from 1-7...


from sklearn.linear_model import LogisticRegression
# Logistic Regression

attributes = ['team1', 'team2', 'venue', 'neutralvenue', 'city', 'toss_winner', 'toss_decision']
labels = ['winner']


model = LogisticRegression(max_iter=1000)
run_model(model, matches, attributes, labels, 0.15, 10, False)

Accuracy : 31.818%
Accuracy : 27.273%
Accuracy : 18.182%
Accuracy : 18.182%
Accuracy : 18.182%
Accuracy : 22.727%
Accuracy : 13.636%
Accuracy : 18.182%
Accuracy : 36.364%
Accuracy : 27.273%
Average Accuracy: 23.182%
Highest Accuracy: 36.364%


In [17]:
# KNearestNeighbors
from sklearn.neighbors import KNeighborsClassifier

attributes = ['team1', 'team2', 'venue', 'neutralvenue', 'city', 'toss_winner', 'toss_decision']
labels = ['winner']

model = KNeighborsClassifier(n_neighbors=2)

run_model(model, matches, attributes, labels, 0.15, 10, False)

Accuracy : 50.000%
Accuracy : 50.000%
Accuracy : 31.818%
Accuracy : 45.455%
Accuracy : 36.364%
Accuracy : 36.364%
Accuracy : 54.545%
Accuracy : 50.000%
Accuracy : 50.000%
Accuracy : 31.818%
Average Accuracy: 43.636%
Highest Accuracy: 54.545%


In [18]:
# Random Forest
from sklearn import tree
attributes = ['team1', 'team2', 'venue', 'neutralvenue', 'city', 'toss_winner', 'toss_decision']
labels = ['winner']

print("Measure of Node Impurity: Gini")
model_gini = tree.DecisionTreeClassifier(criterion='gini', min_samples_leaf=3)
run_model(model_gini, matches, attributes, labels, 0.15, 5, False)
print(model_gini.feature_importances_)
print(model_gini.tree_)

print("Measure of Node Impurity: Entropy")
model_entropy = tree.DecisionTreeClassifier(criterion='entropy', min_samples_leaf=3)
run_model(model_entropy, matches, attributes, labels, 0.15, 5, False)
print(model_entropy.feature_importances_)
print(model_entropy.tree_)


Measure of Node Impurity: Gini
Accuracy : 45.455%
Accuracy : 54.545%
Accuracy : 54.545%
Accuracy : 40.909%
Accuracy : 54.545%
Average Accuracy: 50.000%
Highest Accuracy: 54.545%
[0.15198251 0.43554463 0.10247678 0.06199521 0.04093239 0.20706849
 0.        ]
<sklearn.tree._tree.Tree object at 0x00000289B3DE7CE0>
Measure of Node Impurity: Entropy
Accuracy : 68.182%
Accuracy : 40.909%
Accuracy : 63.636%
Accuracy : 59.091%
Accuracy : 54.545%
Average Accuracy: 57.273%
Highest Accuracy: 68.182%
[0.21003778 0.25239891 0.03006269 0.00900839 0.15462413 0.33696278
 0.00690533]
<sklearn.tree._tree.Tree object at 0x00000289B3DE76C0>


In [23]:
# Gradient Boosted Classification
from sklearn.ensemble import GradientBoostingClassifier
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate=learning_rate, max_features=2, max_depth=2, random_state=0)
    print("Learning rate: ", learning_rate)
    run_model(gb_clf, matches, attributes, labels, 0.2, 3, False)

Learning rate:  0.05
Accuracy : 55.172%
Accuracy : 55.172%
Accuracy : 34.483%
Average Accuracy: 48.276%
Highest Accuracy: 55.172%
Learning rate:  0.075
Accuracy : 37.931%
Accuracy : 55.172%
Accuracy : 55.172%
Average Accuracy: 49.425%
Highest Accuracy: 55.172%
Learning rate:  0.1
Accuracy : 44.828%
Accuracy : 55.172%
Accuracy : 51.724%
Average Accuracy: 50.575%
Highest Accuracy: 55.172%
Learning rate:  0.25
Accuracy : 68.966%
Accuracy : 55.172%
Accuracy : 58.621%
Average Accuracy: 60.920%
Highest Accuracy: 68.966%
Learning rate:  0.5
Accuracy : 51.724%
Accuracy : 48.276%
Accuracy : 41.379%
Average Accuracy: 47.126%
Highest Accuracy: 51.724%
Learning rate:  0.75
Accuracy : 48.276%
Accuracy : 44.828%
Accuracy : 48.276%
Average Accuracy: 47.126%
Highest Accuracy: 48.276%
Learning rate:  1
Accuracy : 13.793%
Accuracy : 55.172%
Accuracy : 13.793%
Average Accuracy: 27.586%
Highest Accuracy: 55.172%
