In [1]:
# Importing essential libraries
import pandas as pd
import numpy as np

In [2]:
# Print the installed scikit-learn version so the environment used for this run is recorded
import sklearn
print(sklearn.__version__)

1.5.0


In [3]:
# Loading the dataset
# 'ipl.csv' is a relative path, so it is resolved against the kernel's working directory
df = pd.read_csv('ipl.csv')
# Bare last expression: the notebook renders the frame as this cell's output
df

Unnamed: 0,mid,date,venue,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
0,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,1,0,0.1,1,0,0,0,222
1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,1,0,0.2,1,0,0,0,222
2,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.2,2,0,0,0,222
3,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.3,2,0,0,0,222
4,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.4,2,0,0,0,222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76009,617,2017-05-21,"Rajiv Gandhi International Stadium, Uppal",Mumbai Indians,Rising Pune Supergiant,KH Pandya,DT Christian,121,7,19.2,40,0,40,12,129
76010,617,2017-05-21,"Rajiv Gandhi International Stadium, Uppal",Mumbai Indians,Rising Pune Supergiant,KH Pandya,DT Christian,127,7,19.3,46,0,46,12,129
76011,617,2017-05-21,"Rajiv Gandhi International Stadium, Uppal",Mumbai Indians,Rising Pune Supergiant,KH Pandya,DT Christian,128,7,19.4,47,0,47,12,129
76012,617,2017-05-21,"Rajiv Gandhi International Stadium, Uppal",Mumbai Indians,Rising Pune Supergiant,MG Johnson,DT Christian,129,7,19.5,48,0,47,13,129


In [4]:
# Removing unwanted columns
# These four columns are not used as model features in the cells below.
columns_to_remove = ['mid', 'venue','striker', 'non-striker']

print('Before removing unwanted columns: {}'.format(df.shape))
# Rebind the name instead of using inplace=True: the frame object that was
# loaded (and displayed) earlier is left untouched, and `columns=` states the
# axis explicitly instead of relying on labels=/axis=1.
df = df.drop(columns=columns_to_remove)
print('After removing unwanted columns: {}'.format(df.shape))

Before removing unwanted columns: (76014, 15)
After removing unwanted columns: (76014, 11)


In [5]:
# Keep only the rows whose 'overs' value is at least 5.0,
# i.e. discard the first 5 overs' data.
print('Before removing first 5 overs data: {}'.format(df.shape))
after_first_five_overs = df['overs'] >= 5.0
df = df[after_first_five_overs]
print('After removing first 5 overs data: {}'.format(df.shape))

Before removing first 5 overs data: (76014, 11)
After removing first 5 overs data: (56707, 11)


In [6]:
# Teams to keep; rows whose batting or bowling side is not in this list
# are filtered out of the dataset.
consistent_teams = [
    'Kolkata Knight Riders',
    'Chennai Super Kings',
    'Rajasthan Royals',
    'Mumbai Indians',
    'Kings XI Punjab',
    'Royal Challengers Bangalore',
    'Delhi Daredevils',
    'Sunrisers Hyderabad',
]

In [7]:
# Keep a row only when both the batting side and the bowling side
# appear in consistent_teams.
print('Before removing inconsistent teams: {}'.format(df.shape))
batting_side_ok = df['bat_team'].isin(consistent_teams)
bowling_side_ok = df['bowl_team'].isin(consistent_teams)
df = df[batting_side_ok & bowling_side_ok]
print('After removing inconsistent teams: {}'.format(df.shape))

Before removing inconsistent teams: (56707, 11)
After removing inconsistent teams: (40108, 11)


In [8]:
# Converting the 'date' column from 'YYYY-MM-DD' strings to datetime values.
# The import is no longer needed by this cell; it is kept in case another cell
# relies on the name `datetime` being in scope.
from datetime import datetime
# 'date' is the first column, so iloc[0, 0] samples one of its values.
print("Before converting 'date' column from string to datetime object: {}".format(type(df.iloc[0,0])))
# df is a filtered slice of the loaded frame, so assigning a column on it in
# place triggers SettingWithCopyWarning. assign() returns a new frame instead
# (column order is preserved). pd.to_datetime is vectorised and, unlike
# strptime, accepts an already-converted column, so the cell can be re-run.
df = df.assign(date=pd.to_datetime(df['date'], format='%Y-%m-%d'))
print("After converting 'date' column from string to datetime object: {}".format(type(df.iloc[0,0])))

Before converting 'date' column from string to datetime object: <class 'str'>
After converting 'date' column from string to datetime object: <class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [9]:
# One-hot encode the team and player columns; every other column is passed through unchanged.
categorical_columns = ['bat_team', 'bowl_team', 'batsman', 'bowler']
encoded_df = pd.get_dummies(df, columns=categorical_columns)

In [10]:
# Selecting and ordering the columns.
# Only the columns listed below are kept, in exactly this order; any other
# column present in encoded_df is dropped by this selection.

bat_team_columns = [
    'bat_team_Chennai Super Kings', 'bat_team_Delhi Daredevils', 'bat_team_Kings XI Punjab',
    'bat_team_Kolkata Knight Riders', 'bat_team_Mumbai Indians', 'bat_team_Rajasthan Royals',
    'bat_team_Royal Challengers Bangalore', 'bat_team_Sunrisers Hyderabad',
]

bowl_team_columns = [
    'bowl_team_Chennai Super Kings', 'bowl_team_Delhi Daredevils', 'bowl_team_Kings XI Punjab',
    'bowl_team_Kolkata Knight Riders', 'bowl_team_Mumbai Indians', 'bowl_team_Rajasthan Royals',
    'bowl_team_Royal Challengers Bangalore', 'bowl_team_Sunrisers Hyderabad',
]

batsman_columns = [
    'batsman_SC Ganguly', 'batsman_BB McCullum', 'batsman_RT Ponting', 'batsman_DJ Hussey',
    'batsman_Mohammad Hafeez', 'batsman_PA Patel', 'batsman_ML Hayden', 'batsman_MEK Hussey',
    'batsman_MJ Guptill', 'batsman_JC Buttler', 'batsman_KH Pandya', 'batsman_KJ Abbott',
    'batsman_TM Head', 'batsman_KW Richardson', 'batsman_NS Naik', 'batsman_SW Billings',
    'batsman_AC Gilchrist', 'batsman_Sunny Singh', 'batsman_RG Sharma', 'batsman_A Symonds',
    'batsman_MS Dhoni', 'batsman_SK Raina', 'batsman_JDP Oram', 'batsman_S Badrinath',
    'batsman_T Kohli', 'batsman_YK Pathan', 'batsman_SR Watson', 'batsman_M Kaif',
    'batsman_DS Lehmann', 'batsman_RA Jadeja', 'batsman_M Rawat', 'batsman_D Salunkhe',
    'batsman_SK Warne', 'batsman_SK Trivedi', 'batsman_BE Hendricks', 'batsman_ST Jayasuriya',
    'batsman_DJ Thornely', 'batsman_RV Uthappa', 'batsman_PR Shah', 'batsman_AM Nayar',
    'batsman_Imran Tahir', 'batsman_MM Sharma', 'batsman_DJ Hooda', 'batsman_CH Morris',
    'batsman_SS Iyer', 'batsman_SA Abbott', 'batsman_AN Ahmed', 'batsman_YS Chahal',
    'batsman_J Suchith', 'batsman_P Negi', 'batsman_RG More', 'batsman_Anureet Singh',
    'batsman_HH Pandya', 'batsman_NM Coulter-Nile', 'batsman_PV Tambe', 'batsman_MJ McClenaghan',
    'batsman_DJ Muthuswami', 'batsman_SN Thakur', 'batsman_SN Khan', 'batsman_PJ Cummins',
]

bowler_columns = [
    'bowler_P Kumar', 'bowler_Z Khan', 'bowler_AA Noffke', 'bowler_JH Kallis',
    'bowler_SB Joshi', 'bowler_CL White', 'bowler_B Lee', 'bowler_S Sreesanth',
    'bowler_JR Hopes', 'bowler_IK Pathan', 'bowler_Bipul Sharma', 'bowler_DJ Bravo',
    'bowler_S Ladda', 'bowler_UT Yadav', 'bowler_MC Henriques', 'bowler_R McLaren',
    'bowler_J Theron', 'bowler_S Narwal', 'bowler_Sohail Tanvir', 'bowler_RS Bopara',
    'bowler_Yuvraj Singh', 'bowler_YS Chahal', 'bowler_Y Venugopal Rao', 'bowler_A Mishra',
    'bowler_SP Narine', 'bowler_Abdur Razzak', 'bowler_RR Powar', 'bowler_M Ntini',
    'bowler_GJ Maxwell', 'bowler_BJ Hodge', 'bowler_YA Abdulla', 'bowler_PP Chawla',
    'bowler_RA Jadeja', 'bowler_M Muralitharan', 'bowler_TM Dilshan', 'bowler_VS Malik',
    'bowler_D du Preez', 'bowler_RE van der Merwe', 'bowler_DL Vettori', 'bowler_R Ashwin',
]

# Numeric match-state columns, with the 'total' column placed last.
match_state_columns = ['overs', 'runs', 'wickets', 'runs_last_5', 'wickets_last_5', 'total']

selected_columns = (
    ['date']
    + bat_team_columns
    + bowl_team_columns
    + batsman_columns
    + bowler_columns
    + match_state_columns
)

# Selecting with a list raises KeyError if any listed column is missing from encoded_df.
encoded_df = encoded_df[selected_columns]

In [11]:
# Splitting the data into train and test set
# The split is by year of the 'date' column: rows dated 2016 or earlier are
# used for training, rows dated 2017 or later are used for testing.
match_year = encoded_df['date'].dt.year
train_mask = match_year <= 2016
test_mask = match_year >= 2017

# 'total' is the prediction target and 'date' is only needed for the split, so
# both are removed from the feature matrix. `columns=` names the axis
# explicitly (the previous `axis=True` only worked because True == 1), and
# dropping before slicing avoids in-place edits on filtered slices.
features = encoded_df.drop(columns=['total', 'date'])
X_train = features[train_mask]
X_test = features[test_mask]

y_train = encoded_df.loc[train_mask, 'total'].values
y_test = encoded_df.loc[test_mask, 'total'].values

print("Training set: {} and Test set: {}".format(X_train.shape, X_test.shape))


Training set: (37330, 121) and Test set: (2778, 121)


In [12]:

# Decision Tree Regression Model
from sklearn.tree import DecisionTreeRegressor
# scikit-learn permutes the candidate features at every split, so splits with
# equal improvement can be chosen differently from run to run. Fixing
# random_state makes the fitted tree reproducible across kernel restarts.
decision_regressor = DecisionTreeRegressor(random_state=42)
# Last expression: fit() returns the estimator, which the notebook displays.
decision_regressor.fit(X_train,y_train)


In [13]:
# Predicting results
# Predictions of decision_regressor for the rows of X_test
y_pred_dt = decision_regressor.predict(X_test)

In [14]:
# Decision Tree Regression - Model Evaluation
# This evaluation was previously commented out and referred to a random-forest
# prediction array (y_pred_rf) that is never created. It now scores the
# decision tree predictions held in y_pred_dt. accuracy_score is a
# classification metric and was unused, so it is no longer imported.
from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse, r2_score

print("---- Decision Tree Regression - Model Evaluation ----")
print("Mean Absolute Error (MAE): {}".format(mae(y_test, y_pred_dt)))
print("Mean Squared Error (MSE): {}".format(mse(y_test, y_pred_dt)))
# RMSE is derived from MSE with np.sqrt so it works on any scikit-learn version.
print("Root Mean Squared Error (RMSE): {}".format(np.sqrt(mse(y_test, y_pred_dt))))
r2 = r2_score(y_test, y_pred_dt)
print("R-squared (R2) Score:", r2)

In [15]:
# Persist the fitted model to 'model.pkl' in the working directory.
import pickle
# The with-block closes the file handle even if pickling fails; the previous
# bare open() was never closed explicitly.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(decision_regressor, model_file)