## **Problem Statement**
Entire data of previous IPLs is given as reference. Using the given data to create a model to predict runs scored in the first six overs in both the innings of the upcoming IPL matches.

In [1]:
import numpy as np
import pandas as pd 

## Collecting Data

In [2]:
dataset = pd.read_csv('all_matches.csv')
dataset.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.1,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,BB McCullum,...,1,,,,1.0,,,,,
1,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.2,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,0,,,,,,,,,
2,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.3,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,1,1.0,,,,,,,,
3,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.4,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,0,,,,,,,,,
4,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.5,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,0,,,,,,,,,


In [3]:
dataset.columns

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed'],
      dtype='object')

## Data Preprocessing

In [4]:
# Adding 'runs_off_bat' and 'extras' to a single quantity - 'total_runs'
dataset['total_runs'] = dataset['runs_off_bat'] + dataset['extras']
dataset.head()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed,total_runs
0,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.1,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,BB McCullum,...,,,,1.0,,,,,,1
1,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.2,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,,,,,,,,,,0
2,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.3,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,1.0,,,,,,,,,1
3,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.4,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,,,,,,,,,,0
4,335982,2007/08,2008-04-18,M Chinnaswamy Stadium,1,0.5,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,SC Ganguly,...,,,,,,,,,,0


In [5]:
# Taking only data within first 6 overs
dataset = dataset.loc[dataset['ball'] < 6.0]

In [6]:
# Removing unwanted columns
dataset = dataset.drop(columns = ['non_striker', 'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes', 'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type', 'other_player_dismissed'])

In [7]:
set(dataset['innings'])

{1, 2, 3, 4, 5, 6}

In the dataset obtained, the some values of innings is more than 2 (which is not possible in cricket). So we are removing the matches in which innings have wrong values.

In [8]:
improper_matches = set(dataset[dataset.innings > 2].match_id)
for match in improper_matches:
    indexNames = dataset[dataset.match_id == match].index
    dataset = dataset.drop(indexNames)

In [9]:
set(dataset.innings)

{1, 2}

Now the dataset contains only matches with innings is less than 2.

In [10]:
# Creating a dataset containing average runs of each batsmen in first six overs
batsmen_runs = dataset.groupby(['match_id', 'striker']).total_runs.sum().reset_index()
batsmen_data = batsmen_runs.groupby(['striker']).total_runs.mean().reset_index()
batsmen_data.columns = ['striker', 'avg_runs']
batsmen_data.head()

Unnamed: 0,striker,avg_runs
0,A Ashish Reddy,6.0
1,A Chopra,5.8
2,A Flintoff,21.0
3,A Mishra,1.0
4,A Mukund,15.0


In [11]:
# Creating a dataset containing runs per ball of each bowler
bowler_runs = dataset.groupby(['bowler']).total_runs.sum().reset_index()
bowler_ball_count = dataset.groupby(['bowler']).ball.count().reset_index()
bowler_data = pd.merge(bowler_runs, bowler_ball_count, on='bowler')
bowler_data['runs_per_ball'] = bowler_data['total_runs']/bowler_data['ball']
bowler_data.columns = ['bowler', 'runs_given', 'balls', 'runs_per_ball']
bowler_data.head()

Unnamed: 0,bowler,runs_given,balls,runs_per_ball
0,A Ashish Reddy,20,6,3.333333
1,A Chandila,139,144,0.965278
2,A Choudhary,46,37,1.243243
3,A Dananjaya,12,6,2.0
4,A Flintoff,21,12,1.75


In [12]:
set(dataset.venue)

{'Arun Jaitley Stadium',
 'Barabati Stadium',
 'Brabourne Stadium',
 'Buffalo Park',
 'De Beers Diamond Oval',
 'Dr DY Patil Sports Academy',
 'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium',
 'Dubai International Cricket Stadium',
 'Eden Gardens',
 'Feroz Shah Kotla',
 'Green Park',
 'Himachal Pradesh Cricket Association Stadium',
 'Holkar Cricket Stadium',
 'JSCA International Stadium Complex',
 'Kingsmead',
 'M Chinnaswamy Stadium',
 'M.Chinnaswamy Stadium',
 'MA Chidambaram Stadium',
 'MA Chidambaram Stadium, Chepauk',
 'MA Chidambaram Stadium, Chepauk, Chennai',
 'Maharashtra Cricket Association Stadium',
 'Nehru Stadium',
 'New Wanderers Stadium',
 'Newlands',
 'OUTsurance Oval',
 'Punjab Cricket Association IS Bindra Stadium',
 'Punjab Cricket Association IS Bindra Stadium, Mohali',
 'Punjab Cricket Association Stadium, Mohali',
 'Rajiv Gandhi International Stadium',
 'Rajiv Gandhi International Stadium, Uppal',
 'Sardar Patel Stadium, Motera',
 'Saurashtra Cricket Associa

As we can see here, some of venues of are repeated with slightly varied names.
All such venues must be converted to single name for convenience.

In [13]:
dataset.loc[dataset.venue.str.contains('Chinnaswamy'), 'venue'] = 'M Chinnaswamy Stadium'
dataset.loc[dataset.venue.str.contains('Chidambaram'), 'venue'] = 'MA Chidambaram Stadium'
dataset.loc[dataset.venue.str.contains('Punjab Cricket'), 'venue'] = 'Punjab Cricket Association IS Bindra Stadium'
dataset.loc[dataset.venue.str.contains('Rajiv Gandhi'), 'venue'] = 'Rajiv Gandhi International Stadium'
dataset.loc[dataset.venue.str.contains('Wankhede'), 'venue'] = 'Wankhede Stadium'

In [14]:
set(dataset.venue)

{'Arun Jaitley Stadium',
 'Barabati Stadium',
 'Brabourne Stadium',
 'Buffalo Park',
 'De Beers Diamond Oval',
 'Dr DY Patil Sports Academy',
 'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium',
 'Dubai International Cricket Stadium',
 'Eden Gardens',
 'Feroz Shah Kotla',
 'Green Park',
 'Himachal Pradesh Cricket Association Stadium',
 'Holkar Cricket Stadium',
 'JSCA International Stadium Complex',
 'Kingsmead',
 'M Chinnaswamy Stadium',
 'MA Chidambaram Stadium',
 'Maharashtra Cricket Association Stadium',
 'Nehru Stadium',
 'New Wanderers Stadium',
 'Newlands',
 'OUTsurance Oval',
 'Punjab Cricket Association IS Bindra Stadium',
 'Rajiv Gandhi International Stadium',
 'Sardar Patel Stadium, Motera',
 'Saurashtra Cricket Association Stadium',
 'Sawai Mansingh Stadium',
 'Shaheed Veer Narayan Singh International Stadium',
 'Sharjah Cricket Stadium',
 'Sheikh Zayed Stadium',
 "St George's Park",
 'Subrata Roy Sahara Stadium',
 'SuperSport Park',
 'Vidarbha Cricket Association Stadiu

Now all the venues are given unique names

In [15]:
# Creating a dataset containing average runs scored in each stadium i
venue_data = dataset.groupby(['match_id', 'innings', 'venue']).total_runs.sum().reset_index()
venue_data = venue_data.groupby(['venue']).total_runs.mean().reset_index()
venue_data.columns = ['venue', 'avg_runs']
venue_data.head()

Unnamed: 0,venue,avg_runs
0,Arun Jaitley Stadium,51.461538
1,Barabati Stadium,43.571429
2,Brabourne Stadium,51.863636
3,Buffalo Park,39.5
4,De Beers Diamond Oval,40.0


In [16]:
# Creating a dataset containing average runs scored by each team in first six overs
batting_team_data = dataset.groupby(['match_id', 'batting_team']).total_runs.sum().reset_index()
batting_team_data = batting_team_data.groupby(['batting_team']).total_runs.mean().reset_index()
batting_team_data.columns = ['batting_team', 'avg_runs']
batting_team_data.head()

Unnamed: 0,batting_team,avg_runs
0,Chennai Super Kings,45.162921
1,Deccan Chargers,45.56
2,Delhi Capitals,48.65625
3,Delhi Daredevils,45.6625
4,Gujarat Lions,51.896552


In [17]:
# Creating a dataset containing average runs given by each team during bowling in first six overs
bowling_team_data = dataset.groupby(['match_id', 'bowling_team']).total_runs.sum().reset_index()
bowling_team_data = bowling_team_data.groupby(['bowling_team']).total_runs.mean().reset_index()
bowling_team_data.columns = ['bowling_team', 'avg_runs']
bowling_team_data.head()

Unnamed: 0,bowling_team,avg_runs
0,Chennai Super Kings,46.011236
1,Deccan Chargers,43.04
2,Delhi Capitals,47.46875
3,Delhi Daredevils,46.685535
4,Gujarat Lions,49.275862


In [18]:
final_data = dataset.groupby(['match_id', 'venue', 'innings', 'batting_team', 'bowling_team']).total_runs.sum().reset_index()
final_data.head()

Unnamed: 0,match_id,venue,innings,batting_team,bowling_team,total_runs
0,335982,M Chinnaswamy Stadium,1,Kolkata Knight Riders,Royal Challengers Bangalore,61
1,335982,M Chinnaswamy Stadium,2,Royal Challengers Bangalore,Kolkata Knight Riders,26
2,335983,Punjab Cricket Association IS Bindra Stadium,1,Chennai Super Kings,Kings XI Punjab,53
3,335983,Punjab Cricket Association IS Bindra Stadium,2,Kings XI Punjab,Chennai Super Kings,63
4,335984,Feroz Shah Kotla,1,Rajasthan Royals,Delhi Daredevils,40


In [19]:
# Creating a dataset containing names of all batsmen and bowler in each innings
bowlers_batsmen = dataset.groupby(['match_id', 'innings'])['striker', 'bowler'].agg(set).reset_index()
bowlers_batsmen.head()

  bowlers_batsmen = dataset.groupby(['match_id', 'innings'])['striker', 'bowler'].agg(set).reset_index()


Unnamed: 0,match_id,innings,striker,bowler
0,335982,1,"{SC Ganguly, BB McCullum, RT Ponting}","{Z Khan, P Kumar, AA Noffke}"
1,335982,2,"{W Jaffer, CL White, R Dravid, JH Kallis, V Ko...","{I Sharma, AB Agarkar, AB Dinda}"
2,335983,1,"{PA Patel, MEK Hussey, ML Hayden}","{B Lee, JR Hopes, S Sreesanth}"
3,335983,2,"{K Goel, JR Hopes}","{MS Gony, JDP Oram}"
4,335984,1,"{M Kaif, YK Pathan, SR Watson, T Kohli}","{B Geeves, GD McGrath, MF Maharoof}"


In [20]:
# Batsmen and bowlers names are added to final data
final_data = pd.merge(final_data, bowlers_batsmen, on=['match_id', 'innings'])
final_data = final_data.drop(columns=['match_id'])
final_data = final_data[['venue', 'innings', 'batting_team', 'bowling_team', 'striker', 'bowler', 'total_runs']]
final_data.head()

Unnamed: 0,venue,innings,batting_team,bowling_team,striker,bowler,total_runs
0,M Chinnaswamy Stadium,1,Kolkata Knight Riders,Royal Challengers Bangalore,"{SC Ganguly, BB McCullum, RT Ponting}","{Z Khan, P Kumar, AA Noffke}",61
1,M Chinnaswamy Stadium,2,Royal Challengers Bangalore,Kolkata Knight Riders,"{W Jaffer, CL White, R Dravid, JH Kallis, V Ko...","{I Sharma, AB Agarkar, AB Dinda}",26
2,Punjab Cricket Association IS Bindra Stadium,1,Chennai Super Kings,Kings XI Punjab,"{PA Patel, MEK Hussey, ML Hayden}","{B Lee, JR Hopes, S Sreesanth}",53
3,Punjab Cricket Association IS Bindra Stadium,2,Kings XI Punjab,Chennai Super Kings,"{K Goel, JR Hopes}","{MS Gony, JDP Oram}",63
4,Feroz Shah Kotla,1,Rajasthan Royals,Delhi Daredevils,"{M Kaif, YK Pathan, SR Watson, T Kohli}","{B Geeves, GD McGrath, MF Maharoof}",40


In [21]:
final2 = final_data.copy()
for i in range(len(final_data)):
    final_data.iloc[i, 0] = venue_data.loc[venue_data['venue'] == final_data.iloc[i, 0], 'avg_runs']
    final_data.iloc[i, 2] = batting_team_data.loc[batting_team_data['batting_team'] == final_data.iloc[i, 2], 'avg_runs']
    final_data.iloc[i, 3] = bowling_team_data.loc[bowling_team_data['bowling_team'] == final_data.iloc[i, 3], 'avg_runs']
    strikers_avg = 0
    for striker in final_data.iloc[i, 4]:
        strikers_avg += float(batsmen_data.loc[batsmen_data['striker']==striker, 'avg_runs'])
    final_data.iloc[i, 4] = strikers_avg
    bowlers_avg = 0
    for bowler in final_data.iloc[i, 5]:
        bowlers_avg += float(bowler_data.loc[bowler_data['bowler']==bowler, 'runs_per_ball'])
    final_data.iloc[i, 5] = bowlers_avg

final_data.head()

Unnamed: 0,venue,innings,batting_team,bowling_team,striker,bowler,total_runs
0,45.840764,1,46.380952,46.380208,34.086246,5.498325,61
1,45.840764,2,44.549223,46.005291,64.536448,3.858576,26
2,48.428571,1,45.162921,48.129032,51.484716,3.774984,53
3,48.428571,2,47.005376,46.011236,26.738095,2.954909,63
4,47.092437,1,44.643312,46.685535,38.68875,4.169147,40


## Train Data

In [22]:
X = final_data.iloc[:, 0:6].values
y = final_data.iloc[:, 6].values

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2)

In [24]:
from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor(n_estimators=200, random_state=2)
regressor.fit(X_train, y_train)

RandomForestRegressor(n_estimators=200, random_state=2)

## Test the model

In [25]:
from sklearn import metrics

y_pred = regressor.predict(X_test)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 8.02496894409938
Mean Squared Error: 108.09274813664597
Root Mean Squared Error: 10.396766234586886


**Using Random Forest Classifier, we got Mean Absolute error of 8.02**