In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csgraph
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, RidgeClassifier, RidgeClassifierCV,\
SGDClassifier, Perceptron, PassiveAggressiveClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid, RadiusNeighborsClassifier
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier

In [2]:
def labelencode_dataframe(dataframe_columns, classlist, column_names):
    le = LabelEncoder().fit(classlist)
    if len(dataframe_columns.shape) == 1:
        df_new = pd.DataFrame(le.transform(dataframe_columns), columns=column_names)
    else:
        df_new = dataframe_columns.apply(le.transform)
    return df_new

In [3]:
def dataframe_tolist(dataframe):
    ret_list = []
    headers = list(dataframe)
    for header in headers:
        ret_list.extend(list(set(dataframe[header])))
    return list(set(ret_list))

In [4]:
data = pd.read_csv('psl_data2.csv', header=0, delimiter=',')
#Separating all features for encoding
teams = list(set(sum([list(data[['team1']]), list(data['team2'])], [])))
venues = list(set((data['venue'])))
toss_decisions = list(set(data['toss_decision']))
players = dataframe_tolist(data.iloc[:,2:24])
#encoding.
team1_new = labelencode_dataframe(data['team1'], teams, ['team1'])
team2_new = labelencode_dataframe(data['team2'], teams, ['team2'])
toss_winner_new = labelencode_dataframe(data['toss_winner'], teams, ['toss_winner'])
winner_new = labelencode_dataframe(data['winner'], teams, ['winner'])
venue_new = labelencode_dataframe(data['venue'], venues, ['venue'])
toss_decision_new = labelencode_dataframe(data['toss_decision'], toss_decisions, ['toss_decision'])
players_new = labelencode_dataframe(data.iloc[:,2:24], players, list(data.iloc[:,2:24].columns))
#New Encoded Data
data_new = pd.concat([team1_new, team2_new, players_new, venue_new, toss_winner_new, toss_decision_new, winner_new], axis=1)
#Features
X = data_new.iloc[:,0:27]
#Labels
Y = pd.DataFrame(np.where(data['team1']==data['winner'], 1, 0))

In [5]:
cvs = cross_val_score(LogisticRegression(), X, Y, cv=5)
print "Cross Validation Logistic Regression: ", cvs.mean(), '+/-', cvs.std()

Cross Validation Logistic Regression:  0.5015196078431372 +/- 0.07538737586513299


  y = column_or_1d(y, warn=True)


In [6]:
cvs = cross_val_score(RandomForestClassifier(max_depth=5, random_state=0), X, Y, cv=5)
print "Cross Validation RandomForest: ", cvs.mean(), '+/-', cvs.std()

  estimator.fit(X_train, y_train, **fit_params)


Cross Validation RandomForest:  0.4999509803921569 +/- 0.05233526726714126


In [7]:
cvs = cross_val_score(BernoulliNB(), X, Y, cv=5)
print "Cross Validation BernoulliNB: ", cvs.mean(), '+/-', cvs.std()

Cross Validation BernoulliNB:  0.5789215686274509 +/- 0.1782616085322609


In [8]:
cvs = cross_val_score(DecisionTreeClassifier(), X, Y, cv=5)
print "Cross Validation DecisionTreeClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation DecisionTreeClassifier:  0.49838235294117644 +/- 0.1304382750522864


In [9]:
cvs = cross_val_score(ExtraTreeClassifier(), X, Y, cv=5)
print "Cross Validation ExtraTreeClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation ExtraTreeClassifier:  0.5093137254901962 +/- 0.10822847822386844


In [10]:
cvs = cross_val_score(ExtraTreesClassifier(), X, Y, cv=5)
print "Cross Validation ExtraTreesClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation ExtraTreesClassifier:  0.5516176470588235 +/- 0.09001321510052601


In [11]:
cvs = cross_val_score(KNeighborsClassifier(), X, Y, cv=5)
print "Cross Validation KNeighborsClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation KNeighborsClassifier:  0.5540196078431372 +/- 0.11922820254859202


  estimator.fit(X_train, y_train, **fit_params)


In [12]:
cvs = cross_val_score(LabelPropagation(), X, Y, cv=5).mean()
print "Cross Validation LabelPropagation: ", cvs.mean(), '+/-', cvs.std()

Cross Validation LabelPropagation:  0.5500490196078431 +/- 0.0


  probabilities /= normalizer


In [13]:
cvs = cross_val_score(LabelSpreading(), X, Y, cv=5)
print "Cross Validation LabelSpreading: ", cvs.mean(), '+/-', cvs.std()

Cross Validation LabelSpreading:  0.5500490196078431 +/- 0.015299615653302954


In [14]:
cvs = cross_val_score(LinearDiscriminantAnalysis(), X, Y, cv=5)
print "Cross Validation LinearDiscriminantAnalysis: ", cvs.mean(), '+/-', cvs.std()

Cross Validation LinearDiscriminantAnalysis:  0.5141176470588236 +/- 0.09068463167563359


In [15]:
cvs = cross_val_score(LinearSVC(), X, Y, cv=5)
print "Cross Validation LinearSVC: ", cvs.mean(), '+/-', cvs.std()

Cross Validation LinearSVC:  0.5266176470588235 +/- 0.09217245383342433


In [16]:
cvs = cross_val_score(LogisticRegressionCV(), X, Y, cv=5)
print "Cross Validation LogisticCV Regression: ", cvs.mean(), '+/-', cvs.std()

Cross Validation LogisticCV Regression:  0.5147549019607843 +/- 0.09075254600241692


In [35]:
cvs = cross_val_score(MLPClassifier(solver='lbfgs'), X, Y, cv=5)
print "Cross Validation ANN: ", cvs.mean(), '+/-', cvs.std()

Cross Validation ANN:  0.513921568627451 +/- 0.09508507593068602


In [18]:
cvs = cross_val_score(NearestCentroid(), X, Y, cv=5)
print "Cross Validation NearestCentroid: ", cvs.mean(), '+/-', cvs.std()

Cross Validation NearestCentroid:  0.5874509803921569 +/- 0.11428061354094865


In [19]:
cvs = cross_val_score(QuadraticDiscriminantAnalysis(), X, Y, cv=5)
print "Cross Validation QuadraticDiscriminantAnalysis: ", cvs.mean(), '+/-', cvs.std()

Cross Validation QuadraticDiscriminantAnalysis:  0.5249509803921569 +/- 0.04600468650382549


In [20]:
cvs = cross_val_score(RadiusNeighborsClassifier(radius=290), X, Y, cv=5)
print "Cross Validation RadiusNeighborsClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation RadiusNeighborsClassifier:  0.591421568627451 +/- 0.12167235454622698


In [21]:
cvs = cross_val_score(RidgeClassifier(), X, Y, cv=5)
print "Cross Validation RidgeClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation RidgeClassifier:  0.5141176470588236 +/- 0.09068463167563359


  y = column_or_1d(y, warn=True)


In [22]:
cvs = cross_val_score(RidgeClassifierCV(), X, Y, cv=5)
print "Cross Validation RidgeClassifierCV: ", cvs.mean(), '+/-', cvs.std()

  y = column_or_1d(y, warn=True)


Cross Validation RidgeClassifierCV:  0.5508823529411765 +/- 0.08600288681080645


In [23]:
cvs = cross_val_score(NuSVC(nu=0.31), X, Y, cv=5)
print "Cross Validation NuSVC: ", cvs.mean(), '+/-', cvs.std()

Cross Validation NuSVC:  0.5500490196078431 +/- 0.015299615653302954


In [24]:
cvs = cross_val_score(SVC(), X, Y, cv=5)
print "Cross Validation SVC: ", cvs.mean(), '+/-', cvs.std()

Cross Validation SVC:  0.5500490196078431 +/- 0.015299615653302954


In [25]:
cvs = cross_val_score(GaussianProcessClassifier(), X, Y, cv=5)
print "Cross Validation GaussianProcessClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation GaussianProcessClassifier:  0.5500490196078431 +/- 0.015299615653302954


In [26]:
cvs = cross_val_score(GradientBoostingClassifier(), X, Y, cv=5)
print "Cross Validation GradientBoostingClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation GradientBoostingClassifier:  0.48911764705882355 +/- 0.08211019897117838


In [27]:
cvs = cross_val_score(SGDClassifier(), X, Y, cv=5)
print "Cross Validation SGDClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation SGDClassifier:  0.5507843137254902 +/- 0.07502456985732106




In [28]:
cvs = cross_val_score(Perceptron(), X, Y, cv=5)
print "Cross Validation Perceptron: ", cvs.mean(), '+/-', cvs.std()

Cross Validation Perceptron:  0.5624509803921569 +/- 0.08621257246424965




In [29]:
cvs = cross_val_score(PassiveAggressiveClassifier(), X, Y, cv=5)
print "Cross Validation PassiveAggressiveClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation PassiveAggressiveClassifier:  0.4654901960784314 +/- 0.09524339507827374


