In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import glob, os
from IPython.display import display

In [2]:
# pre-processing data
def format(file):
    # skip reading all ball-related data
    df = pd.read_csv(file, sep=',', skiprows=1, warn_bad_lines=False, error_bad_lines=False, index_col=None, mangle_dupe_cols=True) 
    df = df[df["info"] != "ball"] 
    # first column not required
    df.drop(columns=["info"], inplace=True)
    # df = pd.read_csv("C:\Users\alizo\OneDrive - University of Waterloo\Desktop\Ali\Universities\Data Science\Cricket ML\Source\1211660.csv")
    # data format is transposed
    df = df.transpose(copy=True)
    headers = df.iloc[0]
    df = df[1:]
    df.columns = headers
    # multiple columns with same names
    df.columns = pd.io.parsers.ParserBase({'names':df.columns})._maybe_dedup_names(df.columns)
    df.reset_index(inplace=True)
    df.rename(columns={ df.columns[0]: "team1", df.columns[1]: "team2", "umpire" : "umpire1", "umpire.1" : "umpire2"}, inplace=True)
    return df



In [3]:
df = pd.DataFrame()
path = "C:/Users/alizo/OneDrive - University of Waterloo/Desktop/Ali/Universities/Data Science/Cricket ML/Source"
all_files = glob.glob(os.path.join(path, "*.csv"))
for i in all_files:
    new_df = format(i)
    df = pd.concat([df, new_df], axis=0, ignore_index=True)    

df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142 entries, 0 to 141
Data columns (total 24 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   team1            142 non-null    object
 1   team2            142 non-null    object
 2   gender           142 non-null    object
 3   season           142 non-null    object
 4   date             142 non-null    object
 5   competition      142 non-null    object
 6   match_number     126 non-null    object
 7   venue            142 non-null    object
 8   city             96 non-null     object
 9   neutralvenue     23 non-null     object
 10  toss_winner      142 non-null    object
 11  toss_decision    142 non-null    object
 12  player_of_match  139 non-null    object
 13  umpire1          142 non-null    object
 14  umpire2          142 non-null    object
 15  reserve_umpire   117 non-null    object
 16  tv_umpire        139 non-null    object
 17  match_referee    134 non-null    ob

Unnamed: 0,team1,team2,gender,season,date,competition,match_number,venue,city,neutralvenue,...,umpire2,reserve_umpire,tv_umpire,match_referee,winner,winner_wickets,method,winner_runs,outcome,eliminator
0,Islamabad United,Peshawar Zalmi,male,2016/17,2017/02/09,Pakistan Super League,1,Dubai International Cricket Stadium,,True,...,Shozab Raza,Asif Yaqoob,Rashid Riaz,RS Mahanama,Islamabad United,7.0,D/L,,,
1,Lahore Qalandars,Quetta Gladiators,male,2016/17,2017/02/10,Pakistan Super League,2,Dubai International Cricket Stadium,,True,...,Shozab Raza,Asif Yaqoob,Ahsan Raza,Mohammed Anees,Quetta Gladiators,,,8.0,,
2,Karachi Kings,Peshawar Zalmi,male,2016/17,2017/02/10,Pakistan Super League,3,Dubai International Cricket Stadium,,True,...,Ahsan Raza,Shozab Raza,Asif Yaqoob,RS Mahanama,Peshawar Zalmi,7.0,,,,
3,Islamabad United,Lahore Qalandars,male,2016/17,2017/02/11,Pakistan Super League,4,Dubai International Cricket Stadium,,True,...,Shozab Raza,Rashid Riaz,Ahsan Raza,RS Mahanama,Lahore Qalandars,6.0,,,,
4,Karachi Kings,Quetta Gladiators,male,2016/17,2017/02/11,Pakistan Super League,5,Dubai International Cricket Stadium,,True,...,Rashid Riaz,Asif Yaqoob,Ahmed Shahab,Mohammed Anees,Quetta Gladiators,7.0,,,,


In [4]:
# city dictionary
city_map = {'Dubai International Cricket Stadium' : 'Dubai',
            'Sharjah Cricket Stadium' : 'Sharjah',
            'Gaddafi Stadium' : 'Lahore',
            'National Stadium' : 'Karachi',
            'Sheikh Zayed Stadium' : 'Abu Dhabi',
            'Multan Cricket Stadium' : 'Multan',
            'Rawalpindi Cricket Stadium' : 'Rawalpindi'}
df['city'] = df.apply(lambda row: city_map[row['venue']], axis=1)
# df.head()

In [5]:
df.toss_decision.unique()
# df.outcome.unique()
df[pd.isnull(df['winner'])]


Unnamed: 0,team1,team2,gender,season,date,competition,match_number,venue,city,neutralvenue,...,umpire2,reserve_umpire,tv_umpire,match_referee,winner,winner_wickets,method,winner_runs,outcome,eliminator
8,Peshawar Zalmi,Quetta Gladiators,male,2016/17,2017/02/17,Pakistan Super League,9,Sharjah Cricket Stadium,Sharjah,True,...,Asif Yaqoob,Ahmed Shahab,Rashid Riaz,Mohammed Anees,,,,,no result,
34,Lahore Qalandars,Islamabad United,male,2017/18,2018/03/02,Pakistan Super League,12,Sharjah Cricket Stadium,Sharjah,,...,Khalid Mahmood,Rashid Riaz,Ahmed Shahab,Mohammed Anees,,,,,tie,Islamabad United
46,Karachi Kings,Lahore Qalandars,male,2017/18,2018/03/11,Pakistan Super League,24,Dubai International Cricket Stadium,Dubai,,...,Aleem Dar,Asif Yaqoob,,RS Mahanama,,,,,tie,Lahore Qalandars
108,Karachi Kings,Multan Sultans,male,2019/20,2020/03/06,Pakistan Super League,19,Gaddafi Stadium,Lahore,,...,MA Gough,Nasir Hussain,Asif Yaqoob,Aziz-ur-Rehman,,,,,no result,


In [6]:
# tie games
df['winner'] = df.apply(
    lambda row: row['eliminator'] if pd.isnull(row['winner']) and pd.notnull(row['eliminator']) else row['winner'],
    axis=1
)
# games decided by D/L (washed out) 
df['outcome'] = df.apply(
    lambda row: "D/L" if pd.isnull(row['outcome']) and (row['method'] == "D/L") else row['outcome'],
    axis=1
)
df.outcome.fillna("Result", inplace=True)
df.winner.fillna("Draw", inplace=True)
df.drop(columns=["match_number", "eliminator", "method"], inplace=True)

In [7]:
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142 entries, 0 to 141
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   team1            142 non-null    object
 1   team2            142 non-null    object
 2   gender           142 non-null    object
 3   season           142 non-null    object
 4   date             142 non-null    object
 5   competition      142 non-null    object
 6   venue            142 non-null    object
 7   city             142 non-null    object
 8   neutralvenue     23 non-null     object
 9   toss_winner      142 non-null    object
 10  toss_decision    142 non-null    object
 11  player_of_match  139 non-null    object
 12  umpire1          142 non-null    object
 13  umpire2          142 non-null    object
 14  reserve_umpire   117 non-null    object
 15  tv_umpire        139 non-null    object
 16  match_referee    134 non-null    object
 17  winner           142 non-null    ob

Unnamed: 0,team1,team2,gender,season,date,competition,venue,city,neutralvenue,toss_winner,...,player_of_match,umpire1,umpire2,reserve_umpire,tv_umpire,match_referee,winner,winner_wickets,winner_runs,outcome
count,142,142,142,142,142,142,142,142,23,142,...,139,142,142,117,139,134,142,89,49,142
unique,6,6,1,5,101,1,7,7,1,6,...,78,12,12,14,12,6,7,10,32,4
top,Islamabad United,Quetta Gladiators,male,2018/19,2018/03/03,Pakistan Super League,Dubai International Cricket Stadium,Dubai,true,Quetta Gladiators,...,L Ronchi,REJ Martinesz,Rashid Riaz,Khalid Mahmood,Shozab Raza,RS Mahanama,Peshawar Zalmi,5,1,Result
freq,38,44,142,34,2,142,60,60,23,32,...,7,36,28,28,38,66,30,23,5,135


In [8]:
# normalizing data
mapping = {"team1" : {"Islamabad United" : 1, "Quetta Gladiators" : 2, "Karachi Kings" : 3, "Peshawar Zalmi" : 4, "Multan Sultans" : 5, "Lahore Qalandars" : 6 },
           "team2" : {"Islamabad United" : 1, "Quetta Gladiators" : 2, "Karachi Kings" : 3, "Peshawar Zalmi" : 4, "Multan Sultans" : 5, "Lahore Qalandars" : 6 },
           "toss_winner" : {"Islamabad United" : 1, "Quetta Gladiators" : 2, "Karachi Kings" : 3, "Peshawar Zalmi" : 4, "Multan Sultans" : 5, "Lahore Qalandars" : 6 },
           "winner" : {"Islamabad United" : 1, "Quetta Gladiators" : 2, "Karachi Kings" : 3, "Peshawar Zalmi" : 4, "Multan Sultans" : 5, "Lahore Qalandars" : 6, "Draw" : 7}}

df.replace(mapping, inplace=True)

In [9]:
city_list = df['city']
city_map, city_index = pd.Series(city_list).factorize()
print(city_map)
print(city_index)

[0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 1 1 0 2 0 0 0 0 0 0 0 0 1 1 1 1 1
 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 2 2 3 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0
 0 0 0 0 0 4 4 4 4 3 3 3 3 3 3 3 3 3 3 2 3 2 3 2 5 6 5 6 5 6 6 2 2 6 2 6 2
 6 2 2 3 3 3 2 3 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0]
Index(['Dubai', 'Sharjah', 'Lahore', 'Karachi', 'Abu Dhabi', 'Multan',
       'Rawalpindi'],
      dtype='object')


In [10]:
venue_list = df['venue']
venue_map, venue_index = pd.Series(venue_list).factorize()
print(venue_map)
print(venue_index)

[0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 1 1 0 2 0 0 0 0 0 0 0 0 1 1 1 1 1
 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 2 2 3 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0
 0 0 0 0 0 4 4 4 4 3 3 3 3 3 3 3 3 3 3 2 3 2 3 2 5 6 5 6 5 6 6 2 2 6 2 6 2
 6 2 2 3 3 3 2 3 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0]
Index(['Dubai International Cricket Stadium', 'Sharjah Cricket Stadium',
       'Gaddafi Stadium', 'National Stadium', 'Sheikh Zayed Stadium',
       'Multan Cricket Stadium', 'Rawalpindi Cricket Stadium'],
      dtype='object')


In [11]:
toss_decision_list = df['toss_decision']
toss_decision_map, toss_decision_index = pd.Series(toss_decision_list).factorize()
print(toss_decision_map)
print(toss_decision_index)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 1
 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0]
Index(['field', 'bat'], dtype='object')


In [12]:
# categorical to numerical
from sklearn import preprocessing, metrics
var = ['team1', 'team2', 'venue', 'city', 'toss_winner', 'toss_decision', 'winner']
matches = df[var]
le = preprocessing.LabelEncoder()
for att in var:
    df[att] = le.fit_transform(df[att])
    
matches = df[['team1', 'team2', 'venue', 'city', 'toss_winner', 'toss_decision', 'winner']]

In [13]:
from sklearn.linear_model import LogisticRegression
# Logistic Regression

attributes = ['team1', 'team2', 'venue', 'city', 'toss_winner', 'toss_decision']
labels = ['winner']

model = LogisticRegression()
model.fit(matches[attributes], matches[labels])

predictions = model.predict(matches[attributes])
print(predictions)
accuracy = metrics.accuracy_score(predictions, matches[labels])
print('Accuracy : %s' % '{0:.3%}'.format(accuracy))

[0 4 3 4 1 5 0 3 0 0 3 0 3 4 1 3 0 3 1 0 3 0 3 0 4 0 5 0 2 1 3 3 0 3 3 3 5
 0 3 1 2 1 3 1 3 1 4 3 3 3 0 5 0 0 0 3 2 2 3 1 2 3 0 5 1 3 5 2 3 0 3 0 2 4
 0 3 3 2 1 1 2 3 1 2 1 1 3 2 1 1 2 1 2 5 1 0 1 1 5 0 3 5 3 0 3 1 5 1 5 2 5
 2 5 5 3 3 1 5 1 0 3 1 1 5 0 1 1 1 0 5 2 0 5 0 3 1 3 1 2 1 2 0]
Accuracy : 35.915%


  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [14]:
df.info()
df.describe()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142 entries, 0 to 141
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   team1            142 non-null    int64 
 1   team2            142 non-null    int64 
 2   gender           142 non-null    object
 3   season           142 non-null    object
 4   date             142 non-null    object
 5   competition      142 non-null    object
 6   venue            142 non-null    int32 
 7   city             142 non-null    int32 
 8   neutralvenue     23 non-null     object
 9   toss_winner      142 non-null    int64 
 10  toss_decision    142 non-null    int32 
 11  player_of_match  139 non-null    object
 12  umpire1          142 non-null    object
 13  umpire2          142 non-null    object
 14  reserve_umpire   117 non-null    object
 15  tv_umpire        139 non-null    object
 16  match_referee    134 non-null    object
 17  winner           142 non-null    in

Unnamed: 0,team1,team2,gender,season,date,competition,venue,city,neutralvenue,toss_winner,...,player_of_match,umpire1,umpire2,reserve_umpire,tv_umpire,match_referee,winner,winner_wickets,winner_runs,outcome
0,0,3,male,2016/17,2017/02/09,Pakistan Super League,0,1,True,0,...,BJ Haddin,Ahsan Raza,Shozab Raza,Asif Yaqoob,Rashid Riaz,RS Mahanama,0,7.0,,D/L
1,5,1,male,2016/17,2017/02/10,Pakistan Super League,0,1,True,5,...,Hassan Khan,Rashid Riaz,Shozab Raza,Asif Yaqoob,Ahsan Raza,Mohammed Anees,1,,8.0,Result
2,2,3,male,2016/17,2017/02/10,Pakistan Super League,0,1,True,3,...,EJG Morgan,Ahmed Shahab,Ahsan Raza,Shozab Raza,Asif Yaqoob,RS Mahanama,3,7.0,,Result
3,0,5,male,2016/17,2017/02/11,Pakistan Super League,0,1,True,5,...,JJ Roy,Asif Yaqoob,Shozab Raza,Rashid Riaz,Ahsan Raza,RS Mahanama,5,6.0,,Result
4,2,1,male,2016/17,2017/02/11,Pakistan Super League,0,1,True,1,...,RR Rossouw,Aleem Dar,Rashid Riaz,Asif Yaqoob,Ahmed Shahab,Mohammed Anees,1,7.0,,Result
