In [None]:
import pandas as pd
import pickle

Loading the dataset

In [None]:
# Read the dataset from ipl.csv, resolved relative to the current working directory.
ipl_csv_path = './ipl.csv'
df = pd.read_csv(ipl_csv_path)
# Peek at both ends of the raw frame (a notebook cell only renders its
# final expression, so only the tail is actually shown).
df.head()
df.tail()

Data Cleaning

In [None]:
# Retained although the date parsing below no longer needs it, in case
# another cell references `datetime`.
from datetime import datetime

# Removing unwanted columns.
# `errors='ignore'` keeps this cell re-runnable once the columns are gone
# (an in-place drop raises KeyError on the second run).
columns_to_remove = ['mid', 'venue', 'batsman', 'bowler', 'striker',
                     'non-striker']
df = df.drop(columns=columns_to_remove, errors='ignore')

# Keeping the consistent teams.
# The names must match the spelling used in the data exactly (spaces
# included); `isin` silently drops every row of a team that is misspelled.
consistent_teams = [
    'Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
    'Mumbai Indians', 'Kings XI Punjab', 'Royal Challengers Bangalore',
    'Delhi Daredevils', 'Sunrisers Hyderabad',
]
# A row is kept only when BOTH the batting and the bowling side are in the list.
both_teams_consistent = (
    df['bat_team'].isin(consistent_teams) & df['bowl_team'].isin(consistent_teams)
)

# Removing the first 5 overs data in every match.
past_first_five_overs = df['overs'] >= 5.0

# `.copy()` gives an independent frame, so the column assignment below does
# not write into a view of the unfiltered data (SettingWithCopyWarning).
df = df[both_teams_consistent & past_first_five_overs].copy()

print(df['bat_team'].unique())
print(df['bowl_team'].unique())

# Converting the column 'date' from string into datetime.
# `pd.to_datetime` is vectorised and, unlike a per-row `strptime`, does not
# fail if the column has already been converted by an earlier run.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df.head()

Data Preprocessing

In [None]:
# Converting categorical features using one-hot encoding.
encoded_df = pd.get_dummies(data=df, columns=['bat_team', 'bowl_team'])

# Rearranging the columns: date, batting-team dummies, bowling-team dummies,
# match-state features, target.
# The dummy columns are taken from the frame itself (sorted, i.e. alphabetical
# by team name) instead of a hand-typed list, so a misspelled column name can
# no longer raise a KeyError. Code that builds feature rows by hand (e.g.
# `predict_score`) must use this same alphabetical team order.
bat_team_columns = sorted(c for c in encoded_df.columns if c.startswith('bat_team_'))
bowl_team_columns = sorted(c for c in encoded_df.columns if c.startswith('bowl_team_'))
match_state_columns = ['overs', 'runs', 'wickets', 'runs_last_5', 'wickets_last_5']
encoded_df = encoded_df[
    ['date'] + bat_team_columns + bowl_team_columns + match_state_columns + ['total']
]

# Splitting the data into train and test set by year:
# rows dated up to 2016 are used for training, rows from 2017 on for testing.
# Requires `date` to already be a datetime column.
match_year = encoded_df['date'].dt.year
is_train = match_year <= 2016
is_test = match_year >= 2017

# 'date' is only needed for the split and 'total' is the target, so neither
# is part of the feature matrix.
feature_frame = encoded_df.drop(columns=['date', 'total'])
X_train = feature_frame[is_train]
X_test = feature_frame[is_test]
y_train = encoded_df.loc[is_train, 'total'].values
y_test = encoded_df.loc[is_test, 'total'].values

print("Training set: {} and Test set: {}".format(X_train.shape,X_test.shape))

Model Building and Testing

1. Linear Regression

In [None]:
from sklearn import metrics
from sklearn.linear_model import LinearRegression
import numpy as np
import seaborn as sns

# Fit an ordinary least-squares model on the training split and predict
# the totals for the test split.
regressor = LinearRegression()
regressor.fit(X_train, y_train)
prediction_linear = regressor.predict(X_test)

# Distribution of the residuals (actual minus predicted).
# NOTE(review): `distplot` is deprecated in recent seaborn releases;
# consider `histplot`/`displot` once the minimum seaborn version is confirmed.
linear_residuals = y_test - prediction_linear
sns.distplot(linear_residuals)

# Persist the fitted model to disk.
with open('score_linear.pkl', 'wb') as model_file:
    pickle.dump(regressor, model_file)

# Error metrics on the test split; RMSE is the square root of the MSE,
# so the MSE is computed once and reused.
linear_mse = metrics.mean_squared_error(y_test, prediction_linear)
linear_report = (
    ('MAE:', metrics.mean_absolute_error(y_test, prediction_linear)),
    ('MSE:', linear_mse),
    ('RMSE:', np.sqrt(linear_mse)),
)
for metric_label, metric_value in linear_report:
    print(metric_label, metric_value)

2. Decision Tree

In [None]:
# Decision Tree Regression Model
from sklearn.tree import DecisionTreeRegressor

# A fixed random_state makes the fitted tree, and therefore the metrics
# printed below, identical on every run of the notebook.
decision_regressor = DecisionTreeRegressor(random_state=0)
decision_regressor.fit(X_train, y_train)

# Predicting results
prediction_decision = decision_regressor.predict(X_test)

# Decision Tree Regression - Model Evaluation
# (`metrics` and `np` are imported in the linear-regression cell above.)
decision_mse = metrics.mean_squared_error(y_test, prediction_decision)
print("---- Decision Tree Regression - Model Evaluation ----")
print('MAE:', metrics.mean_absolute_error(y_test, prediction_decision))
print('MSE:', decision_mse)
print('RMSE:', np.sqrt(decision_mse))

Predictions

In [None]:
# Order of the teams inside each one-hot segment of the feature row.
# It must match the order of the one-hot team columns the model was trained on.
_TEAM_ORDER = (
    'Chennai Super Kings',
    'Delhi Daredevils',
    'Kings XI Punjab',
    'Kolkata Knight Riders',
    'Mumbai Indians',
    'Rajasthan Royals',
    'Royal Challengers Bangalore',
    'Sunrisers Hyderabad',
)


def _one_hot_team(team, role):
    """Return the one-hot list (length len(_TEAM_ORDER)) for `team`.

    `role` is only used to name the offending argument in the error message.
    Raises ValueError when `team` is not one of `_TEAM_ORDER`.
    """
    if team not in _TEAM_ORDER:
        raise ValueError(
            "Unknown {} {!r}; expected one of: {}".format(
                role, team, ', '.join(_TEAM_ORDER)
            )
        )
    return [1 if name == team else 0 for name in _TEAM_ORDER]


def predict_score(batting_team='Chennai Super Kings',
                  bowling_team='Mumbai Indians', overs=5.1, runs=50, wickets=0,
                  runs_in_prev_5=50, wickets_in_prev_5=0, model=None):
    """Predict the final innings total for the given match situation.

    The feature row is laid out as: batting-team one-hot, bowling-team
    one-hot, then overs, runs, wickets, runs_in_prev_5, wickets_in_prev_5.

    Parameters
    ----------
    batting_team, bowling_team : str
        Team names; each must be one of `_TEAM_ORDER`.
    overs, runs, wickets : number
        Current overs bowled, runs scored and wickets fallen.
    runs_in_prev_5, wickets_in_prev_5 : number
        Runs scored and wickets fallen in the previous five overs.
    model : object with a `predict` method, optional
        Fitted estimator to use. Defaults to the notebook-level `regressor`.

    Returns
    -------
    int
        The model's prediction, truncated to an integer.

    Raises
    ------
    ValueError
        If either team name is not recognised. Previously this produced a
        feature row that was too short and only failed inside the model.
    """
    feature_row = (
        _one_hot_team(batting_team, 'batting_team')
        + _one_hot_team(bowling_team, 'bowling_team')
        + [overs, runs, wickets, runs_in_prev_5, wickets_in_prev_5]
    )

    if model is None:
        # Resolved at call time so the function can be defined before or
        # after the linear model is fitted.
        model = regressor

    # The estimator expects a 2-D array: one row per sample.
    return int(model.predict(np.array([feature_row]))[0])