In [1]:
import io

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

In [2]:
# Download the ball-by-ball ODI dataset and parse it into a DataFrame.
url='https://raw.githubusercontent.com/codophobia/CricketScorePredictor/master/data/odi.csv'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of handing an error page to read_csv.
response.raise_for_status()
s = response.content
df = pd.read_csv(io.StringIO(s.decode('utf-8')))


# data representation

In [3]:
# Preview the first 100 rows to see what a single record looks like.
df.head(100)

Unnamed: 0,mid,date,venue,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
0,1,2006-06-13,"Civil Service Cricket Club, Stormont",England,Ireland,ME Trescothick,DT Johnston,0,0,0.1,0,0,0,0,301
1,1,2006-06-13,"Civil Service Cricket Club, Stormont",England,Ireland,ME Trescothick,DT Johnston,0,0,0.2,0,0,0,0,301
2,1,2006-06-13,"Civil Service Cricket Club, Stormont",England,Ireland,ME Trescothick,DT Johnston,4,0,0.3,4,0,0,0,301
3,1,2006-06-13,"Civil Service Cricket Club, Stormont",England,Ireland,ME Trescothick,DT Johnston,6,0,0.4,6,0,0,0,301
4,1,2006-06-13,"Civil Service Cricket Club, Stormont",England,Ireland,ME Trescothick,DT Johnston,6,0,0.5,6,0,0,0,301
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1,2006-06-13,"Civil Service Cricket Club, Stormont",England,Ireland,ME Trescothick,KJ O'Brien,94,3,14.5,31,1,50,0,301
96,1,2006-06-13,"Civil Service Cricket Club, Stormont",England,Ireland,ME Trescothick,KJ O'Brien,95,3,14.6,31,1,51,0,301
97,1,2006-06-13,"Civil Service Cricket Club, Stormont",England,Ireland,ME Trescothick,JF Mooney,99,3,15.1,32,1,55,0,301
98,1,2006-06-13,"Civil Service Cricket Club, Stormont",England,Ireland,ME Trescothick,JF Mooney,101,3,15.2,33,1,57,0,301


In [4]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350899 entries, 0 to 350898
Data columns (total 15 columns):
mid               350899 non-null int64
date              350899 non-null object
venue             350899 non-null object
bat_team          350899 non-null object
bowl_team         350899 non-null object
batsman           350899 non-null object
bowler            350899 non-null object
runs              350899 non-null int64
wickets           350899 non-null int64
overs             350899 non-null float64
runs_last_5       350899 non-null int64
wickets_last_5    350899 non-null int64
striker           350899 non-null int64
non-striker       350899 non-null int64
total             350899 non-null int64
dtypes: float64(1), int64(8), object(6)
memory usage: 40.2+ MB


In [5]:
## selecting the required features

In [None]:
# Using only the first 5000 rows; training on more data took too long.

In [6]:
# Row cap kept in one named place; only the first N_ROWS rows (in file order)
# are used to keep training time manageable.
N_ROWS = 5000

# Select columns by name rather than by position so the cell keeps working --
# or fails loudly with a KeyError -- if the CSV column order ever changes.
FEATURE_COLUMNS = ['runs', 'wickets', 'overs', 'striker', 'non-striker']
TARGET_COLUMN = 'total'

# NOTE(review): `total` has the same value on every row of a match in the
# preview above, and these rows are taken in file order, so they presumably
# cover only a few matches. A random row-level train/test split would then put
# balls from the same innings on both sides, which likely inflates the test
# score -- TODO confirm and consider splitting by `mid` instead.
X = df[FEATURE_COLUMNS].iloc[:N_ROWS].values  # Input features
y = df[TARGET_COLUMN].iloc[:N_ROWS].values  # Label

In [7]:
# splitting the dataset into train and test 

In [10]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for evaluation; the fixed seed makes the
# shuffle, and therefore the split, repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [11]:
# data preprocessing using StandardScaler

In [12]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply that
# same fitted transform to both splits so the test rows do not influence it.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Implementing SVM(SVR) 

In [13]:
from sklearn.svm import SVR

In [14]:
# using rbf and default C and gamma values

In [15]:
# Baseline support-vector regressor: RBF kernel, with C and gamma left at the
# library defaults.
model = SVR(kernel='rbf')

In [16]:
# Train the baseline regressor on the scaled training split.
model.fit(X_train,y_train)

SVR()

In [17]:
# Predictions for the held-out rows (kept for inspection; not used by the
# cells shown here).
pred = model.predict(X_test)
# For a regressor, .score() is the coefficient of determination (R^2), not a
# classification accuracy; express it as a percentage.
score = 100 * model.score(X_test, y_test)

In [18]:
# printing the R² score (as a percentage) using the rbf kernel and default C and gamma values

In [19]:
# Baseline test-set score (the value of model.score(...) multiplied by 100).
print(score)

54.08839233086103


# Using GridSearchCV to optimize the model and find the best C and gamma values

In [20]:
from sklearn.model_selection import GridSearchCV

In [21]:
# Candidate hyper-parameters for the search: five values of C and five of
# gamma, all with the RBF kernel (25 combinations in total).
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
}

In [22]:
# Exhaustive search over param_grid; refit=True retrains the best candidate
# once the search is done so `grid` can be used directly as a model.
grid = GridSearchCV(
    estimator=SVR(),
    param_grid=param_grid,
    refit=True,
)

In [23]:
# Run the hyper-parameter search on the scaled training split.
grid.fit(X_train,y_train)

GridSearchCV(estimator=SVR(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']})

In [24]:
# Parameter combination that the search selected as best.
grid.best_params_

{'C': 1000, 'gamma': 1, 'kernel': 'rbf'}

In [25]:
# The estimator built from the selected parameters.
grid.best_estimator_

SVR(C=1000, gamma=1)

In [26]:
# Score the tuned search object on the held-out rows, as a percentage.
# Note: this overwrites the baseline value previously stored in `score`.
score = 100 * grid.score(X_test, y_test)

In [27]:
# printing the score with the best C and gamma values 

In [28]:
# Test-set score of the tuned model (the value of grid.score(...) multiplied by 100).
print(score)

85.9050253834259


In [29]:
# The tuned model reaches a best R² score of about 85.9% on the held-out test split, using the first 5000 rows of the dataset.

In [30]:
# I tried different kernels: the linear kernel gave a poorer score than rbf, and the poly kernel could not handle the 5000-row dataset.