In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csgraph
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, RidgeClassifier, RidgeClassifierCV,\
SGDClassifier, Perceptron, PassiveAggressiveClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid, RadiusNeighborsClassifier
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier

In [2]:
def labelencode_dataframe(dataframe_column, classlist, column_name):
    """Encode a column of categorical values as integer labels.

    A ``LabelEncoder`` is fitted on ``classlist`` and then used to
    transform ``dataframe_column``.

    Parameters
    ----------
    dataframe_column : values to encode; handed to ``LabelEncoder.transform``.
    classlist : collection of class values the encoder is fitted on.
    column_name : name given to the single column of the returned frame.

    Returns
    -------
    pandas.DataFrame
        One-column frame named ``column_name`` holding the encoded labels.
        It is built from the transformed array, so it carries a fresh
        default index rather than the index of ``dataframe_column``.
    """
    encoder = LabelEncoder()
    encoder.fit(classlist)
    encoded_values = encoder.transform(dataframe_column)
    return pd.DataFrame(encoded_values, columns=[column_name])

In [3]:
# Load the raw match data (first row is the header).
data = pd.read_csv('psl_data.csv', header=0, delimiter=',')
#Separating all features for encoding
# The team class list is the union of the values in both team columns.
# Select each column as a Series (data['team1']), not as a one-column
# DataFrame (data[['team1']]): iterating a DataFrame yields its column
# labels, which would put the literal string 'team1' in the class list
# instead of the actual team names.
teams = list(set(data['team1']) | set(data['team2']))
venues = list(set(data['venue']))
toss_decisions = list(set(data['toss_decision']))
#encoding.
# All team-valued columns share one class list so a given team gets the
# same integer code in every column.
team1_new = labelencode_dataframe(data['team1'], teams, 'team1')
team2_new = labelencode_dataframe(data['team2'], teams, 'team2')
toss_winner_new = labelencode_dataframe(data['toss_winner'], teams, 'toss_winner')
winner_new = labelencode_dataframe(data['winner'], teams, 'winner')
venue_new = labelencode_dataframe(data['venue'], venues, 'venue')
toss_decision_new = labelencode_dataframe(data['toss_decision'], toss_decisions, 'toss_decision')
#New Encoded Data
# Column order matters: the first five columns are the features and the
# sixth (winner) is the label, as sliced positionally below.
data_new = pd.concat([team1_new, team2_new, venue_new, toss_winner_new, toss_decision_new, winner_new], axis=1)
#Features
X = data_new.iloc[:, 0:5]
#Labels
Y = data_new.iloc[:, 5]

In [4]:
cvs = cross_val_score(LogisticRegression(multi_class='multinomial', solver='lbfgs'), X, Y, cv=5)
print "Cross Validation Logistic Regression: ", cvs.mean(), '+/-', cvs.std()

Cross Validation Logistic Regression:  0.34698717948717944 +/- 0.07054416552728389




In [5]:
cvs = cross_val_score(RandomForestClassifier(max_depth=5, random_state=0), X, Y, cv=5)
print "Cross Validation RandomForest: ", cvs.mean(), '+/-', cvs.std()

Cross Validation RandomForest:  0.4943589743589743 +/- 0.07314681517994745


In [6]:
cvs = cross_val_score(BernoulliNB(), X, Y, cv=5)
print "Cross Validation BernoulliNB: ", cvs.mean(), '+/-', cvs.std()

Cross Validation BernoulliNB:  0.4068162393162392 +/- 0.052240351742768684


In [7]:
cvs = cross_val_score(DecisionTreeClassifier(), X, Y, cv=5)
print "Cross Validation DecisionTreeClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation DecisionTreeClassifier:  0.48446581196581195 +/- 0.11533180969580066


In [8]:
cvs = cross_val_score(ExtraTreeClassifier(), X, Y, cv=5)
print "Cross Validation ExtraTreeClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation ExtraTreeClassifier:  0.3688034188034188 +/- 0.10785498498699046


In [9]:
cvs = cross_val_score(ExtraTreesClassifier(), X, Y, cv=5)
print "Cross Validation ExtraTreesClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation ExtraTreesClassifier:  0.47963675213675205 +/- 0.11502324703812493


In [10]:
cvs = cross_val_score(KNeighborsClassifier(), X, Y, cv=5)
print "Cross Validation KNeighborsClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation KNeighborsClassifier:  0.41008547008547 +/- 0.10698506063768601


In [11]:
cvs = cross_val_score(LabelPropagation(), X, Y, cv=5).mean()
print "Cross Validation LabelPropagation: ", cvs.mean(), '+/-', cvs.std()

Cross Validation LabelPropagation:  0.4700213675213675 +/- 0.0


In [12]:
cvs = cross_val_score(LabelSpreading(), X, Y, cv=5)
print "Cross Validation LabelSpreading: ", cvs.mean(), '+/-', cvs.std()

Cross Validation LabelSpreading:  0.48196581196581195 +/- 0.1321165164014551


In [13]:
cvs = cross_val_score(LinearDiscriminantAnalysis(), X, Y, cv=5)
print "Cross Validation LinearDiscriminantAnalysis: ", cvs.mean(), '+/-', cvs.std()

Cross Validation LinearDiscriminantAnalysis:  0.3692094017094017 +/- 0.07965519647253737


In [14]:
cvs = cross_val_score(LinearSVC(), X, Y, cv=5)
print "Cross Validation LinearSVC: ", cvs.mean(), '+/-', cvs.std()

Cross Validation LinearSVC:  0.39737179487179486 +/- 0.0983510497863792


In [15]:
cvs = cross_val_score(LogisticRegressionCV(multi_class='multinomial', solver='lbfgs'), X, Y, cv=5)
print "Cross Validation LogisticCV Regression: ", cvs.mean(), '+/-', cvs.std()

Cross Validation LogisticCV Regression:  0.38542735042735043 +/- 0.06155988468507649


In [16]:
cvs = cross_val_score(MLPClassifier(solver='lbfgs',  hidden_layer_sizes=(6,10,12,6)), X, Y, cv=5)
print "Cross Validation ANN: ", cvs.mean(), '+/-', cvs.std()

Cross Validation ANN:  0.3745940170940171 +/- 0.15191307503109425


In [17]:
cvs = cross_val_score(NearestCentroid(), X, Y, cv=5)
print "Cross Validation NearestCentroid: ", cvs.mean(), '+/-', cvs.std()

Cross Validation NearestCentroid:  0.27226495726495725 +/- 0.0919246961119696


In [18]:
cvs = cross_val_score(QuadraticDiscriminantAnalysis(), X, Y, cv=5)
print "Cross Validation QuadraticDiscriminantAnalysis: ", cvs.mean(), '+/-', cvs.std()

Cross Validation QuadraticDiscriminantAnalysis:  0.2932051282051282 +/- 0.13523385264428212


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])


In [19]:
cvs = cross_val_score(RadiusNeighborsClassifier(radius=3.7), X, Y, cv=5)
print "Cross Validation RadiusNeighborsClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation RadiusNeighborsClassifier:  0.38542735042735043 +/- 0.0505475172605475


In [20]:
cvs = cross_val_score(RidgeClassifier(), X, Y, cv=5)
print "Cross Validation RidgeClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation RidgeClassifier:  0.41209401709401716 +/- 0.10472262114580541


In [21]:
cvs = cross_val_score(RidgeClassifierCV(), X, Y, cv=5)
print "Cross Validation RidgeClassifierCV: ", cvs.mean(), '+/-', cvs.std()

Cross Validation RidgeClassifierCV:  0.42525641025641026 +/- 0.059243224600301014


In [22]:
cvs = cross_val_score(NuSVC(nu=0.31), X, Y, cv=5)
print "Cross Validation NuSVC: ", cvs.mean(), '+/-', cvs.std()

Cross Validation NuSVC:  0.5208547008547009 +/- 0.07390785646321124


In [23]:
cvs = cross_val_score(SVC(), X, Y, cv=5)
print "Cross Validation SVC: ", cvs.mean(), '+/-', cvs.std()

Cross Validation SVC:  0.47314102564102567 +/- 0.04257339084459247


In [24]:
cvs = cross_val_score(GaussianProcessClassifier(multi_class = 'one_vs_one'), X, Y, cv=5)
print "Cross Validation GaussianProcessClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation GaussianProcessClassifier:  0.4657478632478632 +/- 0.08362658242997921


In [25]:
cvs = cross_val_score(GradientBoostingClassifier(), X, Y, cv=5)
print "Cross Validation GradientBoostingClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation GradientBoostingClassifier:  0.5046367521367521 +/- 0.06121247048175202


In [26]:
cvs = cross_val_score(GaussianProcessClassifier(multi_class = 'one_vs_rest'), X, Y, cv=5)
print "Cross Validation GaussianProcessClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation GaussianProcessClassifier:  0.4657478632478632 +/- 0.08362658242997921


In [27]:
cvs = cross_val_score(LinearSVC(multi_class='ovr'), X, Y, cv=5)
print "Cross Validation LinearSVC: ", cvs.mean(), '+/-', cvs.std()

Cross Validation LinearSVC:  0.3715384615384615 +/- 0.0957698314607187


In [28]:
cvs = cross_val_score(LogisticRegressionCV(multi_class='ovr'), X, Y, cv=5)
print "Cross Validation LogisticCV Regression: ", cvs.mean(), '+/-', cvs.std()

Cross Validation LogisticCV Regression:  0.360982905982906 +/- 0.03493406121837463


In [29]:
cvs = cross_val_score(LogisticRegression(multi_class='ovr'), X, Y, cv=5)
print "Cross Validation Logistic Regression: ", cvs.mean(), '+/-', cvs.std()

Cross Validation Logistic Regression:  0.33393162393162396 +/- 0.1006696839810616


In [30]:
cvs = cross_val_score(SGDClassifier(), X, Y, cv=5)
print "Cross Validation SGDClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation SGDClassifier:  0.2932051282051282 +/- 0.12772190986295226




In [31]:
cvs = cross_val_score(Perceptron(), X, Y, cv=5)
print "Cross Validation Perceptron: ", cvs.mean(), '+/-', cvs.std()

Cross Validation Perceptron:  0.26333333333333336 +/- 0.14434184390271093




In [32]:
cvs = cross_val_score(PassiveAggressiveClassifier(), X, Y, cv=5)
print "Cross Validation PassiveAggressiveClassifier: ", cvs.mean(), '+/-', cvs.std()

Cross Validation PassiveAggressiveClassifier:  0.3161538461538461 +/- 0.0981940811889348




In [33]:
# Fit a default LogisticRegression on the full encoded data set and predict
# the encoded winner for one hand-written match. The feature order follows
# X's columns: [team1, team2, venue, toss_winner, toss_decision], all given
# as label-encoded integers.
LogisticRegression().fit(X=X, y=Y).predict(X=[[5, 0, 0, 5, 1]])

array([2], dtype=int64)