In [23]:
# Import the libraries used in this notebook
import pandas as pd
import pickle 
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings("ignore")

In [24]:
# Load the IPL data set from the local CSV export.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs on a machine where the file exists at exactly this location.
csv_path = r'C:\Users\Lenovo\Documents\Live Session\Cricket\IPL.csv'
df = pd.read_csv(csv_path)

In [54]:
# Preview the first rows of df.
# NOTE(review): this cell's execution count (54) is higher than that of the
# cells that follow it, so the output shown below reflects df after the later
# cleaning steps (it already has 'Year' and only overs > 5), not the raw CSV.
df.head()

Unnamed: 0,bat_team,bowl_team,runs,wickets,overs,runs_last_5,wickets_last_5,total,Year,venue
32,Kolkata Knight Riders,Royal Challengers Bangalore,61,0,5.1,59,0,222,2008,M Chinnaswamy Stadium
33,Kolkata Knight Riders,Royal Challengers Bangalore,61,1,5.2,59,1,222,2008,M Chinnaswamy Stadium
34,Kolkata Knight Riders,Royal Challengers Bangalore,61,1,5.3,59,1,222,2008,M Chinnaswamy Stadium
35,Kolkata Knight Riders,Royal Challengers Bangalore,61,1,5.4,59,1,222,2008,M Chinnaswamy Stadium
36,Kolkata Knight Riders,Royal Challengers Bangalore,61,1,5.5,58,1,222,2008,M Chinnaswamy Stadium


In [26]:
# Columns to be removed: mid, batsman, bowler, striker, non-striker.
columns_to_remove = ['mid','batsman','bowler','striker','non-striker']
# Rebind df instead of dropping in place, and ignore columns that are already
# gone, so that re-running this cell on an already-trimmed frame does not
# raise KeyError (the execution counts show the notebook was run out of order).
df = df.drop(columns=columns_to_remove, errors='ignore')

In [27]:
# Relabel 'Deccan Chargers' as 'Sunrisers Hyderabad' everywhere in the frame,
# then keep only rows where both the batting and the bowling side are among
# the selected (current) teams.
df = df.replace(to_replace='Deccan Chargers', value='Sunrisers Hyderabad')
selected_teams = [
    'Kolkata Knight Riders',
    'Chennai Super Kings',
    'Rajasthan Royals',
    'Mumbai Indians',
    'Kings XI Punjab',
    'Royal Challengers Bangalore',
    'Delhi Daredevils',
    'Sunrisers Hyderabad',
]
batting_side_kept = df['bat_team'].isin(selected_teams)
bowling_side_kept = df['bowl_team'].isin(selected_teams)
df = df[batting_side_kept & bowling_side_kept]

In [28]:
# Matches played from 2008 to 2016 are used as the training set and the 2017
# matches as the test set; derive the calendar year from 'date' for that split.
match_dates = pd.DatetimeIndex(df['date'])
df['Year'] = match_dates.year

In [29]:
# Drop the first 5 overs of every innings: a prediction requires that at least
# 5 overs have already been bowled, so only rows with overs > 5 are kept.
enough_overs_bowled = df['overs'].gt(5)
df = df.loc[enough_overs_bowled]

In [30]:
# Unify the two spellings of the Mohali stadium under a single venue name.
mohali_alias = 'Punjab Cricket Association IS Bindra Stadium, Mohali'
mohali_name = 'Punjab Cricket Association Stadium, Mohali'
df['venue'] = df['venue'].replace(to_replace=mohali_alias, value=mohali_name)


In [31]:
# Keep the listed home venues as they are and collapse every other venue into
# the single label 'other'. The result is written to 'new_venue'; the original
# 'venue' column and the no-longer-needed 'date' column are then dropped.
home_venues = [
    'M Chinnaswamy Stadium',
    'Punjab Cricket Association Stadium, Mohali',
    'Feroz Shah Kotla',
    'Wankhede Stadium',
    'Eden Gardens',
    'Sawai Mansingh Stadium',
    'MA Chidambaram Stadium, Chepauk',
    'Rajiv Gandhi International Stadium, Uppal',
]
is_home_venue = df['venue'].isin(home_venues)
df['new_venue'] = np.where(is_home_venue, df['venue'], 'other')
df.drop(['venue', 'date'], axis='columns', inplace=True)

In [32]:
# Give the collapsed venue column the original 'venue' name again.
df.rename({'new_venue': 'venue'}, axis='columns', inplace=True)

In [33]:
# Display the column labels of df after the cleaning steps above.
df.columns

Index(['bat_team', 'bowl_team', 'runs', 'wickets', 'overs', 'runs_last_5',
       'wickets_last_5', 'total', 'Year', 'venue'],
      dtype='object')

In [34]:
# One-hot encode the categorical columns. drop_first=True omits the first
# level of each encoded column, so that level becomes the implicit baseline.
categorical_columns = ['bat_team', 'bowl_team', 'venue']
df_new = pd.get_dummies(df, columns=categorical_columns, drop_first=True)


In [35]:
# Display the column labels produced by the dummy encoding.
df_new.columns

Index(['runs', 'wickets', 'overs', 'runs_last_5', 'wickets_last_5', 'total',
       'Year', 'bat_team_Delhi Daredevils', 'bat_team_Kings XI Punjab',
       'bat_team_Kolkata Knight Riders', 'bat_team_Mumbai Indians',
       'bat_team_Rajasthan Royals', 'bat_team_Royal Challengers Bangalore',
       'bat_team_Sunrisers Hyderabad', 'bowl_team_Delhi Daredevils',
       'bowl_team_Kings XI Punjab', 'bowl_team_Kolkata Knight Riders',
       'bowl_team_Mumbai Indians', 'bowl_team_Rajasthan Royals',
       'bowl_team_Royal Challengers Bangalore',
       'bowl_team_Sunrisers Hyderabad', 'venue_Feroz Shah Kotla',
       'venue_M Chinnaswamy Stadium', 'venue_MA Chidambaram Stadium, Chepauk',
       'venue_Punjab Cricket Association Stadium, Mohali',
       'venue_Rajiv Gandhi International Stadium, Uppal',
       'venue_Sawai Mansingh Stadium', 'venue_Wankhede Stadium',
       'venue_other'],
      dtype='object')

In [36]:
# Reorder the columns: batting-team dummies, bowling-team dummies, venue
# dummies, then the numeric match-state columns, the target and the year.
encoded_teams = [
    'Delhi Daredevils',
    'Kings XI Punjab',
    'Kolkata Knight Riders',
    'Mumbai Indians',
    'Rajasthan Royals',
    'Royal Challengers Bangalore',
    'Sunrisers Hyderabad',
]
encoded_venues = [
    'Feroz Shah Kotla',
    'M Chinnaswamy Stadium',
    'MA Chidambaram Stadium, Chepauk',
    'Punjab Cricket Association Stadium, Mohali',
    'Rajiv Gandhi International Stadium, Uppal',
    'Sawai Mansingh Stadium',
    'Wankhede Stadium',
    'other',
]
trailing_columns = ['runs', 'wickets', 'overs', 'runs_last_5', 'wickets_last_5', 'total', 'Year']
column_order = (
    ['bat_team_' + team for team in encoded_teams]
    + ['bowl_team_' + team for team in encoded_teams]
    + ['venue_' + venue for venue in encoded_venues]
    + trailing_columns
)
df_new = df_new[column_order]

In [37]:
# Display the column labels again to confirm the new ordering.
df_new.columns

Index(['bat_team_Delhi Daredevils', 'bat_team_Kings XI Punjab',
       'bat_team_Kolkata Knight Riders', 'bat_team_Mumbai Indians',
       'bat_team_Rajasthan Royals', 'bat_team_Royal Challengers Bangalore',
       'bat_team_Sunrisers Hyderabad', 'bowl_team_Delhi Daredevils',
       'bowl_team_Kings XI Punjab', 'bowl_team_Kolkata Knight Riders',
       'bowl_team_Mumbai Indians', 'bowl_team_Rajasthan Royals',
       'bowl_team_Royal Challengers Bangalore',
       'bowl_team_Sunrisers Hyderabad', 'venue_Feroz Shah Kotla',
       'venue_M Chinnaswamy Stadium', 'venue_MA Chidambaram Stadium, Chepauk',
       'venue_Punjab Cricket Association Stadium, Mohali',
       'venue_Rajiv Gandhi International Stadium, Uppal',
       'venue_Sawai Mansingh Stadium', 'venue_Wankhede Stadium', 'venue_other',
       'runs', 'wickets', 'overs', 'runs_last_5', 'wickets_last_5', 'total',
       'Year'],
      dtype='object')

In [38]:
# Split by season: rows from 2017 form the test set, all other years the
# training set.
is_2017 = df_new['Year'] == 2017
df_train = df_new[~is_2017]
df_test = df_new[is_2017]

In [39]:
# Training features are every column except the target ('total') and the
# split key ('Year'); the target is the 'total' column.
non_feature_columns = ['total', 'Year']
X_train = df_train.drop(non_feature_columns, axis=1)
y_train = df_train['total']

In [40]:
# Test features are every column except the target ('total') and the split
# key ('Year'); the target is the 'total' column.
excluded_from_features = ['total', 'Year']
X_test = df_test.drop(excluded_from_features, axis=1)
y_test = df_test['total']

In [41]:
# Create a linear regression estimator with scikit-learn's default settings.
from sklearn.linear_model import LinearRegression
regression = LinearRegression()

In [42]:
# Fit the model on the training features against the 'total' target.
regression.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [43]:
# Save the fitted linear regression model to disk with pickle.
filename = 'ipl_score_pred.pkl'
# Open the file in a context manager so the handle is flushed and closed
# deterministically, even if pickling raises.
with open(filename, 'wb') as model_file:
    pickle.dump(regression, model_file)

In [44]:
# Predict the target for every row of the held-out test features.
y_pred = regression.predict(X_test)

In [45]:
# Root mean squared error of the predictions on the test set.
from sklearn import metrics
test_mse = metrics.mean_squared_error(y_test, y_pred)
np.sqrt(test_mse)

16.359394035797013

In [46]:
# Display the feature columns, in the order the model was trained on.
X_train.columns

Index(['bat_team_Delhi Daredevils', 'bat_team_Kings XI Punjab',
       'bat_team_Kolkata Knight Riders', 'bat_team_Mumbai Indians',
       'bat_team_Rajasthan Royals', 'bat_team_Royal Challengers Bangalore',
       'bat_team_Sunrisers Hyderabad', 'bowl_team_Delhi Daredevils',
       'bowl_team_Kings XI Punjab', 'bowl_team_Kolkata Knight Riders',
       'bowl_team_Mumbai Indians', 'bowl_team_Rajasthan Royals',
       'bowl_team_Royal Challengers Bangalore',
       'bowl_team_Sunrisers Hyderabad', 'venue_Feroz Shah Kotla',
       'venue_M Chinnaswamy Stadium', 'venue_MA Chidambaram Stadium, Chepauk',
       'venue_Punjab Cricket Association Stadium, Mohali',
       'venue_Rajiv Gandhi International Stadium, Uppal',
       'venue_Sawai Mansingh Stadium', 'venue_Wankhede Stadium', 'venue_other',
       'runs', 'wickets', 'overs', 'runs_last_5', 'wickets_last_5'],
      dtype='object')

In [52]:
# Predict the total for one hand-written match state.
# The feature row is built by column name against X_train.columns rather than
# as a positional list of 27 literals, so it stays aligned with the training
# features if their order or count changes. Features not listed default to 0.
sample_features = {
    'bat_team_Delhi Daredevils': 1,
    'bowl_team_Mumbai Indians': 1,
    'venue_Feroz Shah Kotla': 1,
    'runs': 50,
    'wickets': 2,
    'overs': 5.3,
    'runs_last_5': 48,
    'wickets_last_5': 1,
}
# Fail loudly on a misspelled or missing feature instead of silently scoring
# a row in which that feature is 0.
unknown_features = sorted(set(sample_features) - set(X_train.columns))
if unknown_features:
    raise KeyError('features not present in X_train: {}'.format(unknown_features))
a = [sample_features.get(column, 0) for column in X_train.columns]
# One row, as many columns as the model was trained on.
b = np.reshape(a, (1, len(a)))
regression.predict(b)[0]

164.95284893845385