In [180]:
# Import numpy and pandas library
import pandas as pd
import numpy as np

#Import models from scikit learn module:
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold   #For K-fold cross validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn import metrics
from sklearn.linear_model import LinearRegression as lr

# Read the ball-by-ball deliveries, the match table (training) and the test data
deliveries = pd.read_csv("Data/deliveries.csv")
match = pd.read_csv("Data/matches.csv")
IPL_2018 = pd.read_csv("Data/test_data.csv")
# Keep only the match-level fields used by the current model
matches = match.loc[:, [
    "city", "venue", "season", "toss_winner", "toss_decision",
    "result", "dl_applied", "team1", "team2", "winner",
]]


In [181]:

# Drop every match in which either side is a team that has since withdrawn
withdrawn_teams = ['Deccan Chargers', 'Gujarat Lions', 'Kochi Tuskers Kerala', 'Pune Warriors', 'Rising Pune Supergiant', 'Rising Pune Supergiants']
involves_withdrawn = matches["team1"].isin(withdrawn_teams) | matches["team2"].isin(withdrawn_teams)
matches = matches[~involves_withdrawn].reset_index(drop=True)


In [182]:
# Addition of customized fields: first-innings extra runs, total runs and
# number of half-centuries for each match
match_by_ball = deliveries[["match_id", "inning", "batsman","batsman_runs", "extra_runs", "total_runs"]]

# Keep first-innings deliveries only. Filtering and dropping in one chained
# expression yields a new frame instead of mutating a slice in place.
match_by_ball = match_by_ball[match_by_ball["inning"] == 1].drop(["inning"], axis=1)

# Runs per batsman per match. The columns are selected with a list: selecting
# several groupby columns with a bare tuple is rejected by current pandas.
extra_stats = match_by_ball.groupby(["match_id", "batsman"])[["batsman_runs", "extra_runs", "total_runs"]].sum()

# Flag innings of 50 or more; summing the booleans below counts them per match
extra_stats["half_century"] = extra_stats["batsman_runs"] >= 50

newdf = extra_stats.groupby("match_id").sum().drop("batsman_runs", axis=1).reset_index(drop = True)
# NOTE(review): newdf has one row per match_id renumbered 0..n-1, whereas
# `matches` has already had rows removed and its index reset. This concat
# therefore pairs rows by position, not by match, and the surplus newdf rows
# become all-NaN match rows. The match identifier needed for a keyed join is
# not carried in `matches` -- confirm the key (presumably an id column in
# matches.csv) and merge on it instead.
matches = pd.concat([matches, newdf], axis=1)

In [183]:
# Restrict the test data to the match-level fields plus the three derived
# first-innings fields (total_runs, half_century, extra_runs)
IPL_2018 = IPL_2018.loc[:, [
    "city", "venue", "season", "toss_winner", "toss_decision",
    "result", "dl_applied", "team1", "team2", "winner",
    "total_runs", "half_century", "extra_runs",
]]

In [184]:
# Removing matches resulting in tie, having no results or being finalized via D/L method.
# Each frame is filtered with a mask built from its OWN columns: indexing
# IPL_2018 with a mask derived from `matches` selects rows by index alignment
# with an unrelated frame rather than by the test rows' own result/dl_applied.
matches_decided = (
    (matches["result"] != "tie")
    & (matches["result"] != "no result")
    & (matches["dl_applied"] == 0)
)
matches = matches[matches_decided]

test_decided = (
    (IPL_2018["result"] != "tie")
    & (IPL_2018["result"] != "no result")
    & (IPL_2018["dl_applied"] == 0)
)
IPL_2018 = IPL_2018[test_decided]

  """
  
  import sys


In [185]:
# Merging training and test data for preprocessing (test rows first, so they
# can be split off again by position later)
matches = pd.concat([IPL_2018, matches]).reset_index(drop=True)

# result / dl_applied were only needed for filtering. `axis` is passed by
# keyword (positional axis is rejected by current pandas) and the drop returns
# a new frame instead of mutating in place.
matches = matches.drop(["dl_applied", "result"], axis=1)

In [186]:
def factorize_fields(matches):
    '''Encode the categorical columns of `matches` as integer codes.

    The four team-valued columns (team1, team2, toss_winner, winner) share one
    code table, so a given team maps to the same integer in each of them.
    venue, city and toss_decision are each factorized independently.
    Missing values receive the code -1 (pandas convention).

    Parameters
    ----------
    matches : pandas.DataFrame
        Must contain team1, team2, toss_winner, winner, venue, city and
        toss_decision. The input frame is not modified.

    Returns
    -------
    list
        [factors, encoded] where `factors` is the array of team names
        (position == integer code) and `encoded` is a new frame holding the
        non-team columns followed by the four encoded team columns.
    '''
    team_columns = ["team1", "team2", "toss_winner", "winner"]
    teams = matches[team_columns]

    # One vocabulary across all four columns, in column-by-column order of
    # first appearance
    factors = pd.factorize(teams.values.T.reshape(-1, ))[1]
    factorized_fields = teams.apply(lambda column: pd.Categorical(column, categories=factors).codes)

    # drop() returns a new frame, so the assignments below never touch the
    # caller's data
    encoded = matches.drop(team_columns, axis=1)
    for column in ["venue", "city", "toss_decision"]:
        encoded[column] = encoded[column].factorize()[0]

    encoded = pd.concat([encoded, factorized_fields], axis=1)
    return [factors, encoded]
#[["city","venue","season","toss_winner","toss_decision"]]

In [187]:
# Converting data into factors; `factors` maps integer team codes back to names
factors, matches = factorize_fields(matches)

# Removing season: Include if time model made.
# `axis` is passed by keyword (positional axis is rejected by current pandas)
# and the result is reassigned rather than dropped in place.
matches = matches.drop(["season"], axis=1)

In [188]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric
# column of the encoded data
matches.describe()

Unnamed: 0,city,extra_runs,half_century,toss_decision,total_runs,venue,team1,team2,toss_winner,winner
count,441.0,441.0,441.0,441.0,441.0,441.0,441.0,441.0,441.0,441.0
mean,7.029478,8.546485,0.736961,0.430839,157.913832,10.276644,3.281179,3.600907,3.410431,3.258503
std,6.532892,4.440336,0.731573,0.495756,29.963718,8.691198,2.348662,2.45216,2.43213,2.343526
min,-1.0,0.0,0.0,0.0,56.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,5.0,0.0,0.0,139.0,3.0,1.0,1.0,1.0,1.0
50%,6.0,8.0,1.0,0.0,160.0,9.0,3.0,4.0,4.0,3.0
75%,9.0,11.0,1.0,1.0,178.0,13.0,5.0,6.0,6.0,5.0
max,26.0,27.0,3.0,1.0,246.0,33.0,7.0,7.0,7.0,7.0


In [189]:
# Train - test split after pre processing. The test rows were concatenated
# first, so the leading len(IPL_2018) rows are the test set.
ind = len(IPL_2018)
# Take explicit copies so that later column assignments act on independent
# frames instead of slices of `matches` (source of SettingWithCopyWarning).
test_data = matches.iloc[:ind].copy()
train_data = matches.iloc[ind:].copy()

In [190]:
def remap(item):
    '''Look up the original field name stored at position `item` of `factors`.'''
    original_name = factors[item]
    return original_name

In [191]:
def model_building(model, predictors, outcome, data, test_data):
    '''Fit `model`, cross-validate it on `data` and predict the rows of `test_data`.

    Parameters
    ----------
    model : estimator exposing fit / predict / score.
    predictors : column labels used as features.
    outcome : column label(s) of the target.
    data : DataFrame used for the 3-fold cross-validation and the final fit.
    test_data : DataFrame to predict. Must also contain the team1, team2 and
        winner columns. It is left unmodified.

    Returns
    -------
    list
        [df, accuracy, cv_error]: `df` holds the decoded Team1 / Team2 /
        Actual Winner / Predicted Winner columns for the test rows,
        `accuracy` is the accuracy on `test_data`, and `cv_error` is the mean
        of the per-fold `model.score` values (a score, despite its name).
    '''
    # KFold is called with the signature of the class this notebook imports
    # from sklearn.cross_validation (sample count first, n_folds keyword).
    kf = KFold(data.shape[0], n_folds = 3)
    fold_scores = []
    for train, test in kf:
        model.fit(data[predictors].iloc[train, :], data[outcome].iloc[train])
        fold_scores.append(model.score(data[predictors].iloc[test, :], data[outcome].iloc[test]))
    cv_error = np.mean(fold_scores)

    # Refit on every training row before predicting the test rows
    model.fit(data[predictors], data[outcome])

    # Regressors return floats (possibly as a column vector): round to the
    # nearest integer code and flatten to one value per test row.
    # np.round replaces the np.round_ alias, which NumPy 2.0 removed.
    predictions = np.int_(np.round(model.predict(test_data[predictors]))).ravel()
    accuracy = metrics.accuracy_score(predictions, test_data[outcome])

    # Build the report on a private copy so the caller's test_data is not
    # modified (and no SettingWithCopyWarning is raised).
    scored = test_data.copy()
    scored["predicted_winner"] = predictions
    scored["Team1"] = scored["team1"].apply(remap)
    scored["Team2"] = scored["team2"].apply(remap)
    scored["Actual Winner"] = scored["winner"].apply(remap)
    scored["Predicted Winner"] = scored["predicted_winner"].apply(remap)
    df = scored[["Team1","Team2","Actual Winner", "Predicted Winner"]]
    return [df, accuracy, cv_error]

In [192]:
# Models tested
# Seed for the tree-based models so repeated runs give the same comparison
RANDOM_STATE = 42

model1 = lr()                         #linear regression
# The one-vs-all models name the liblinear solver explicitly: it supports both
# penalties, so they do not depend on the library's default solver.
model2 = LogisticRegression(solver='liblinear')                 #L2 regularization, one vs all
model3 = LogisticRegression(penalty='l1', solver='liblinear')   #L1 regularization, one vs all
model4 = LogisticRegression(solver='newton-cg', multi_class='multinomial')  #Multinomial
model5 = SVC(kernel = "linear")
model6 = DTC(random_state=RANDOM_STATE)
model7 = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
models = [model1, model2, model3, model4, model5, model6, model7]
# Display names, in the same order as `models`
model_names = ["Linear Regression(Nearest integer round off)", "LogisticRegression(One vs All) L2 reg", "LogisticRegression(One vs All) L1 reg", "MultinomialRegression", "SVM", "DecisionTree", "Random Forest"]

output = ['winner']
# Every column except the target is a predictor (`axis` passed by keyword;
# positional axis is rejected by current pandas)
predictors = matches.drop(["winner"], axis=1).columns

results = []
accuracies = []
cv_errors = []
for model in models:
    [result, accuracy, cv_error] = model_building(model, predictors, output, train_data, test_data)
    results.append(result)
    accuracies.append(accuracy)
    cv_errors.append(cv_error)

# One row per model. The "Cross Validation Errors" column holds the mean fold
# score returned by model_building.
model_comparison = pd.DataFrame(
    {
        "Model Names": model_names,
        "Accuracy": accuracies,
        "Cross Validation Errors": cv_errors,
    },
    columns=["Model Names", "Accuracy", "Cross Validation Errors"],
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [193]:
# Final result: one row per model. "Accuracy" is measured on the test rows;
# "Cross Validation Errors" holds the mean 3-fold score from model_building.
model_comparison

Unnamed: 0,Model Names,Accuracy,Cross Validation Errors
0,Linear Regression(Nearest integer round off),0.0,0.376332
1,LogisticRegression(One vs All) L2 reg,0.0,0.283136
2,LogisticRegression(One vs All) L1 reg,0.0,0.280821
3,MultinomialRegression,0.0,0.276159
4,SVM,0.0,0.310914
5,DecisionTree,0.4,0.382705
6,Random Forest,0.3,0.515054


In [194]:
# Post model fit analysis: multinomial logit of the winner on all predictors.
# MNLogit is the array-interface model class (endog, exog), so it is taken
# from statsmodels.api rather than from the formula namespace.
# No constant column is added to the predictors here.
import statsmodels.api as sm
model = sm.MNLogit(train_data[output], train_data[predictors])
mod = model.fit()

Optimization terminated successfully.
         Current function value: 1.707727
         Iterations 7


In [195]:
# Call form of print: valid under both Python 2 and Python 3
print(mod.summary())

                          MNLogit Regression Results                          
Dep. Variable:                 winner   No. Observations:                  431
Model:                        MNLogit   Df Residuals:                      368
Method:                           MLE   Df Model:                           56
Date:                Wed, 18 Apr 2018   Pseudo R-squ.:                  0.1670
Time:                        15:48:14   Log-Likelihood:                -736.03
converged:                       True   LL-Null:                       -883.55
                                        LLR p-value:                 3.495e-34
     winner=1       coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
city              0.2076      0.068      3.048      0.002       0.074       0.341
extra_runs        0.0320      0.045      0.710      0.478      -0.056       0.120
half_century     -0.0543      0.311     

In [196]:
# Per-match predictions of the decision tree (position 5 in `models` / `model_names`)
results[model_names.index("DecisionTree")]

Unnamed: 0,Team1,Team2,Actual Winner,Predicted Winner
0,Mumbai Indians,Chennai Super Kings,Chennai Super Kings,Mumbai Indians
1,Kings XI Punjab,Delhi Daredevils,Kings XI Punjab,Chennai Super Kings
2,Kolkata Knight Riders,Royal Challengers Bangalore,Kolkata Knight Riders,Kolkata Knight Riders
3,Sunrisers Hyderabad,Rajasthan Royals,Sunrisers Hyderabad,Sunrisers Hyderabad
4,Chennai Super Kings,Kolkata Knight Riders,Chennai Super Kings,Chennai Super Kings
5,Rajasthan Royals,Delhi Daredevils,Rajasthan Royals,Delhi Daredevils
6,Sunrisers Hyderabad,Mumbai Indians,Sunrisers Hyderabad,Mumbai Indians
7,Royal Challengers Bangalore,Kings XI Punjab,Royal Challengers Bangalore,Kings XI Punjab
8,Mumbai Indians,Delhi Daredevils,Delhi Daredevils,Delhi Daredevils
9,Kolkata Knight Riders,Sunrisers Hyderabad,Sunrisers Hyderabad,Kolkata Knight Riders
