In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csgraph
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, RidgeClassifier, RidgeClassifierCV,\
SGDClassifier, Perceptron, PassiveAggressiveClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid, RadiusNeighborsClassifier
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier

In [2]:
def labelencode_dataframe(dataframe_column, classlist, column_name):
    le = LabelEncoder().fit(classlist)
    df_new = pd.DataFrame(le.transform(dataframe_column), columns=[column_name])
    return df_new

In [3]:
data = pd.read_csv('psl_data.csv', header=0, delimiter=',')
#Separating all features for encoding
teams = list(set(sum([list(data[['team1']]), list(data['team2'])], [])))
venues = list(set((data['venue'])))
toss_decisions = list(set(data['toss_decision']))
#encoding.
team1_new = labelencode_dataframe(data['team1'], teams, 'team1')
team2_new = labelencode_dataframe(data['team2'], teams, 'team2')
toss_winner_new = labelencode_dataframe(data['toss_winner'], teams, 'toss_winner')
winner_new = labelencode_dataframe(data['winner'], teams, 'winner')
venue_new = labelencode_dataframe(data['venue'], venues, 'venue')
toss_decision_new = labelencode_dataframe(data['toss_decision'], toss_decisions, 'toss_decision')
#New Encoded Data
data_new = pd.concat([team1_new, team2_new, venue_new, toss_winner_new, toss_decision_new, winner_new], axis=1)
#Features
X = data_new.iloc[:,0:5]
#Labels
Y = pd.DataFrame(np.where(data['team1']==data['winner'], 1, 0))

In [4]:
cvs = cross_val_score(LogisticRegression(), X, Y, cv=5)
print "Cross Validation Logistic Regression: ", cvs.mean(), '+/-', cvs.std()

Cross Validation Logistic Regression:  0.5797549019607844 +/- 0.13787513922997682


  y = column_or_1d(y, warn=True)


In [5]:
cvs = cross_val_score(RandomForestClassifier(max_depth=5, random_state=0), X, Y, cv=5)
print "Cross Validation RandomForest: ", cvs.mean(), '+/-', cvs.std()

  estimator.fit(X_train, y_train, **fit_params)


Cross Validation RandomForest:  0.5915196078431373 +/- 0.11927444694792255


In [6]:
cvs = cross_val_score(BernoulliNB(), X, Y, cv=5)
print "Cross Validation BernoulliNB: ", cvs.mean(), '+/-', cvs.std()

Cross Validation BernoulliNB:  0.5687254901960784 +/- 0.2233092848867853


In [7]:
cvs = cross_val_score(DecisionTreeClassifier(), X, Y, cv=5)
print "Cross Validation DecisionTreeClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation DecisionTreeClassifier:  0.5133823529411765 +/- 0.052679910880530374


In [8]:
cvs = cross_val_score(ExtraTreeClassifier(), X, Y, cv=5)
print "Cross Validation ExtraTreeClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation ExtraTreeClassifier:  0.4279901960784314 +/- 0.08614272466040584


In [9]:
cvs = cross_val_score(ExtraTreesClassifier(), X, Y, cv=5)
print "Cross Validation ExtraTreesClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation ExtraTreesClassifier:  0.5907843137254902 +/- 0.09523711278122356


In [10]:
cvs = cross_val_score(KNeighborsClassifier(), X, Y, cv=5)
print "Cross Validation KNeighborsClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation KNeighborsClassifier:  0.5648529411764706 +/- 0.1483490459907538


  estimator.fit(X_train, y_train, **fit_params)


In [11]:
cvs = cross_val_score(LabelPropagation(), X, Y, cv=5).mean()
print "Cross Validation LabelPropagation: ", cvs.mean(), '+/-', cvs.std()

Cross Validation LabelPropagation:  0.5791176470588235 +/- 0.0


In [12]:
cvs = cross_val_score(LabelSpreading(), X, Y, cv=5)
print "Cross Validation LabelSpreading: ", cvs.mean(), '+/-', cvs.std()

Cross Validation LabelSpreading:  0.5782843137254903 +/- 0.1349619038619445


In [13]:
cvs = cross_val_score(LinearDiscriminantAnalysis(), X, Y, cv=5)
print "Cross Validation LinearDiscriminantAnalysis: ", cvs.mean(), '+/-', cvs.std()

Cross Validation LinearDiscriminantAnalysis:  0.5280882352941176 +/- 0.16790640414717128


In [14]:
cvs = cross_val_score(LinearSVC(), X, Y, cv=5)
print "Cross Validation LinearSVC: ", cvs.mean(), '+/-', cvs.std()

Cross Validation LinearSVC:  0.5155882352941177 +/- 0.14733386045300706


In [15]:
cvs = cross_val_score(LogisticRegressionCV(), X, Y, cv=5)
print "Cross Validation LogisticCV Regression: ", cvs.mean(), '+/-', cvs.std()

Cross Validation LogisticCV Regression:  0.4772549019607843 +/- 0.07478397056174287


In [16]:
cvs = cross_val_score(MLPClassifier(solver='lbfgs',  hidden_layer_sizes=(6,10,12,6)), X, Y, cv=5)
print "Cross Validation ANN: ", cvs.mean(), '+/-', cvs.std()

  y = column_or_1d(y, warn=True)


Cross Validation ANN:  0.6274509803921569 +/- 0.15944376291738865


In [17]:
cvs = cross_val_score(NearestCentroid(), X, Y, cv=5)
print "Cross Validation NearestCentroid: ", cvs.mean(), '+/-', cvs.std()

Cross Validation NearestCentroid:  0.5359803921568628 +/- 0.07191819677985627


In [18]:
cvs = cross_val_score(QuadraticDiscriminantAnalysis(), X, Y, cv=5)
print "Cross Validation QuadraticDiscriminantAnalysis: ", cvs.mean(), '+/-', cvs.std()

Cross Validation QuadraticDiscriminantAnalysis:  0.46392156862745104 +/- 0.08906923446111698


In [19]:
cvs = cross_val_score(RadiusNeighborsClassifier(radius=3.7), X, Y, cv=5)
print "Cross Validation RadiusNeighborsClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation RadiusNeighborsClassifier:  0.47568627450980394 +/- 0.08624475868489138


In [20]:
cvs = cross_val_score(RidgeClassifier(), X, Y, cv=5)
print "Cross Validation RidgeClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation RidgeClassifier:  0.5155882352941177 +/- 0.14733386045300706


  y = column_or_1d(y, warn=True)


In [21]:
cvs = cross_val_score(RidgeClassifierCV(), X, Y, cv=5)
print "Cross Validation RidgeClassifierCV: ", cvs.mean(), '+/-', cvs.std()

Cross Validation RidgeClassifierCV:  0.5405882352941177 +/- 0.1310643150185445


  y = column_or_1d(y, warn=True)


In [22]:
cvs = cross_val_score(NuSVC(nu=0.31), X, Y, cv=5)
print "Cross Validation NuSVC: ", cvs.mean(), '+/-', cvs.std()

Cross Validation NuSVC:  0.605686274509804 +/- 0.1679699190174622


In [23]:
cvs = cross_val_score(SVC(), X, Y, cv=5)
print "Cross Validation SVC: ", cvs.mean(), '+/-', cvs.std()

Cross Validation SVC:  0.5907843137254902 +/- 0.11729282864226397


In [24]:
cvs = cross_val_score(GaussianProcessClassifier(), X, Y, cv=5)
print "Cross Validation GaussianProcessClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation GaussianProcessClassifier:  0.5791176470588235 +/- 0.12884451212410358


In [25]:
cvs = cross_val_score(GradientBoostingClassifier(), X, Y, cv=5)
print "Cross Validation GradientBoostingClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation GradientBoostingClassifier:  0.5806862745098039 +/- 0.1532912503054985


In [26]:
cvs = cross_val_score(SGDClassifier(), X, Y, cv=5)
print "Cross Validation SGDClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation SGDClassifier:  0.47411764705882353 +/- 0.0553679888279429




In [27]:
cvs = cross_val_score(Perceptron(), X, Y, cv=5)
print "Cross Validation Perceptron: ", cvs.mean(), '+/-', cvs.std()

Cross Validation Perceptron:  0.47558823529411764 +/- 0.08539218500258422




In [28]:
cvs = cross_val_score(PassiveAggressiveClassifier(), X, Y, cv=5)
print "Cross Validation PassiveAggressiveClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation PassiveAggressiveClassifier:  0.5374509803921569 +/- 0.09965017145249346




In [29]:
LogisticRegression().fit(X,Y).predict([[5,0,0,5, 1]])

array([0])