# Poker Hands UCI Classifications
The Poker Hands dataset is taken from here: https://archive.ics.uci.edu/ml/datasets/Poker+Hand

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold
import sklearn.tree   
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import svm

In [None]:
# Read both UCI Poker Hand splits. The raw files carry no header row, so the
# columns stay integer-labelled until they are named in the following cells.
data_train = pd.read_csv('poker-hand-training-true.data', header=None)
print(data_train.shape)

data_test = pd.read_csv('poker-hand-testing.data', header=None)
print(data_test.shape)


In [None]:
# Five (suit, card) column pairs in file order, followed by the class label.
data_train.columns = [
    f"{kind} {position}" for position in range(1, 6) for kind in ("Suit", "Card")
] + ["Poker Hand"]
print(data_train.head())

In [None]:
# Same layout as the training frame: five (suit, card) pairs, then the label.
data_test.columns = [
    f"{kind} {position}" for position in range(1, 6) for kind in ("Suit", "Card")
] + ["Poker Hand"]
print(data_test.head(10))

Extracting Poker Hand type and features:

In [None]:
# The label is the last column; every column before it is a feature.
y_train = data_train.loc[:, "Poker Hand"]
X_train = data_train.iloc[:, :-1]

y_test = data_test.loc[:, "Poker Hand"]
X_test = data_test.iloc[:, :-1]




In [None]:
# Show the size and the first ten labels of each split (train first, then test).
for labels in (y_train, y_test):
    print(labels.shape)
    print(labels.head(10))


As we can see, the Poker Hand dataset has 11 columns: five (suit, card) pairs that describe the hand, plus the "Poker Hand" class label.

In [None]:
# Baseline on the raw columns: a depth-9 tree and a depth-2 tree for comparison.
clf = sklearn.tree.DecisionTreeClassifier(criterion="gini", max_depth=9, random_state=0)
clf.fit(X_train, y_train)
clf2 = sklearn.tree.DecisionTreeClassifier(criterion="gini", max_depth=2, random_state=0)
clf2.fit(X_train, y_train)

# normalize=True is accuracy_score's default, so each print is the fraction correct.
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

y_pred2 = clf2.predict(X_test)
print(accuracy_score(y_test, y_pred2))

Using the DecisionTree classifier with a max_depth of 9 results in an accuracy of 0.52719. To equalize its accuracy with that of Logistic Regression, we select max_depth = 2.

In [None]:
# One-vs-rest logistic regression fitted on the current X_train / y_train.
clf = LogisticRegression(random_state=0, solver="lbfgs", max_iter=100, multi_class="ovr")
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# accuracy_score takes (y_true, y_pred); pass them in that order like the other
# cells do. The value is unchanged because accuracy is symmetric.
accuracy_score(y_test, y_pred)

Using the LogisticRegression classifier: slightly better than the DecisionTree on the same data set.

In [None]:
# SVM with a linear kernel, fitted on the current X_train / y_train.
clf = svm.SVC(kernel='linear')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# accuracy_score takes (y_true, y_pred); keep the order consistent with the
# other cells. The value is unchanged because accuracy is symmetric.
accuracy_score(y_test, y_pred)


In [None]:
# LinearSVC on the same features. `svm` is already imported in the notebook's
# import cell, so it is not re-imported here.
clf = svm.LinearSVC()
# The feature frames are named X_train / X_test (capital X); the lowercase
# names used previously are undefined and raise NameError on a fresh kernel.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# accuracy_score takes (y_true, y_pred).
accuracy_score(y_test, y_pred)

The accuracy_score we got from the SVM is 0.559363 (it took a while to train).

We will now preprocess the data to achieve a higher accuracy

In [None]:
RANK_COLS = ["Card 1", "Card 2", "Card 3", "Card 4", "Card 5"]
SUIT_COLS = ["Suit 1", "Suit 2", "Suit 3", "Suit 4", "Suit 5"]


def sort_ranks_first(hands):
    """Return a copy of `hands` with each row's card values sorted ascending.

    The five "Card" columns are sorted within each row so that hands holding
    the same values in a different order share one representation. The "Suit"
    columns are left in their original order. Columns are then reordered to
    cards, suits, label.

    Parameters
    ----------
    hands : pd.DataFrame
        Frame with the "Card 1".."Card 5", "Suit 1".."Suit 5" and
        "Poker Hand" columns.

    Returns
    -------
    pd.DataFrame
        A new frame; `hands` itself is not modified.
    """
    processed = hands.copy()
    # np.sort returns a new array, so this does not depend on `.values` being
    # a writable view (it is read-only under pandas Copy-on-Write).
    processed[RANK_COLS] = np.sort(processed[RANK_COLS].to_numpy(), axis=1)
    return processed[RANK_COLS + SUIT_COLS + ["Poker Hand"]]


X_train_processed = sort_ranks_first(data_train)
X_test_processed = sort_ranks_first(data_test)

# drop() returns independent frames, so later cells can add feature columns
# without touching the processed frames.
X_train = X_train_processed.drop(columns="Poker Hand")
X_test = X_test_processed.drop(columns="Poker Hand")


Now doing a 10 fold cv for the DecisionTree Classifier

In [None]:
# 10-fold cross-validation of an unconstrained gini decision tree on the
# current training features.
clf = sklearn.tree.DecisionTreeClassifier(random_state=1, criterion='gini')
# Seed the shuffle so the fold assignment (and the mean below) is reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=1)

acc = []
for fold, (train_index, test_index) in enumerate(kf.split(X_train, y_train), start=1):
    print('{}-Fold'.format(fold))
    fX_train, fX_test = X_train.iloc[train_index], X_train.iloc[test_index]
    # KFold yields positional indices, so index the labels positionally too
    # rather than relying on the Series index labels matching positions.
    fy_train, fy_test = y_train.iloc[train_index], y_train.iloc[test_index]
    clf.fit(fX_train, fy_train)
    fy_pred = clf.predict(fX_test)
    acc.append(accuracy_score(fy_test, fy_pred))

# Mean accuracy across the ten folds.
acc = pd.Series(acc)
print(acc.mean())

In [None]:
# Fit an unconstrained gini tree on the full training frame and score it on
# the test frame.
clf = sklearn.tree.DecisionTreeClassifier(criterion='gini', random_state=1).fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

96% is pretty good. We have essentially doubled the correctness of our predictions simply by re-arranging the cards into Cards and Suits. Compared to random state = 0, we get a better result using random state 1.

In [None]:
# Confusion table: predicted class on the rows, true class on the columns.
pd.crosstab(
    index=y_pred,
    columns=y_test,
    rownames=['Predicted'],
    colnames=['True'],
    margins=True,
)

By analyzing the data shown in the table, we can observe that the True/Predicted values at the upper end show a prediction disparity. This means that the flushes and higher-reward hands are causing prediction errors.

In [None]:
# "Unique" = number of distinct suits in each hand. Sorting every row and
# counting the positions where the value changes is fully vectorised, which
# avoids calling a Python lambda once per row.
suit_cols = ["Suit 1", "Suit 2", "Suit 3", "Suit 4", "Suit 5"]
sorted_suits = np.sort(X_test[suit_cols].to_numpy(), axis=1)
X_test["Unique"] = (np.diff(sorted_suits, axis=1) != 0).sum(axis=1) + 1

In [None]:
# "Unique" = number of distinct suits in each hand. Sorting every row and
# counting the positions where the value changes is fully vectorised, which
# avoids calling a Python lambda once per row.
suit_cols = ["Suit 1", "Suit 2", "Suit 3", "Suit 4", "Suit 5"]
sorted_suits = np.sort(X_train[suit_cols].to_numpy(), axis=1)
X_train["Unique"] = (np.diff(sorted_suits, axis=1) != 0).sum(axis=1) + 1

In [None]:
# Refit the tree now that the frames carry the extra "Unique" column.
tree = sklearn.tree.DecisionTreeClassifier(criterion='gini', random_state=0).fit(X_train, y_train)
y_pred = tree.predict(X_test)
accuracy_score(y_test, y_pred)

Much better, we're now at 96%, slightly higher than previously, but now the data should be more spread out and the predictions should be less sparse.

In [None]:
# Confusion table: predicted class on the rows, true class on the columns.
pd.crosstab(
    index=y_pred,
    columns=y_test,
    rownames=['Predicted'],
    colnames=['True'],
    margins=True,
)

What we see in this crosstab: flushes have now been resolved, but the last two poker hands still cause slight mispredictions (12-5 and 230-155). These are very slight but can still be a cause for accuracy concerns. Straight flushes are being underrepresented.

In [None]:
# Gaps between neighbouring card columns, from the top pair downwards:
# Diff1 = Card 5 - Card 4, ..., Diff4 = Card 2 - Card 1.
for frame in (X_train, X_test):
    for k in range(1, 5):
        frame[f"Diff{k}"] = frame[f"Card {6 - k}"] - frame[f"Card {5 - k}"]

# Refit on the widened feature set and score on the test frame.
tree = sklearn.tree.DecisionTreeClassifier(criterion='gini', random_state=0).fit(X_train, y_train)
y_pred = tree.predict(X_test)
accuracy_score(y_test, y_pred)


Much better! Now we find our predictions to be in the upper 99% bound of prediction accuracy.


In [None]:
# Confusion table: predicted class on the rows, true class on the columns.
pd.crosstab(
    index=y_pred,
    columns=y_test,
    rownames=['Predicted'],
    colnames=['True'],
    margins=True,
)

We now see that the model is bad at predicting flushes and above (the stronger hands). We have concluded that
the Decision Tree Classifier is decent at predicting weaker hands but continuously struggles in predicting stronger ones.

In [None]:
# NOTE(review): this cell repeats the preceding Diff-feature cell; it assigns
# the same four columns again and refits the same tree.
diff_pairs = {
    "Diff1": ("Card 5", "Card 4"),
    "Diff2": ("Card 4", "Card 3"),
    "Diff3": ("Card 3", "Card 2"),
    "Diff4": ("Card 2", "Card 1"),
}

for diff_name, (upper, lower) in diff_pairs.items():
    X_train[diff_name] = X_train[upper] - X_train[lower]

for diff_name, (upper, lower) in diff_pairs.items():
    X_test[diff_name] = X_test[upper] - X_test[lower]

tree = sklearn.tree.DecisionTreeClassifier(criterion='gini', random_state=0)
tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)
accuracy_score(y_test, y_pred)


Much better! Now we find our predictions to be in the upper 99% bound of prediction accuracy.


In [None]:
# Confusion table: predicted class on the rows, true class on the columns.
pd.crosstab(
    index=y_pred,
    columns=y_test,
    rownames=['Predicted'],
    colnames=['True'],
    margins=True,
)

We now see that the model is bad at predicting flushes and above (the stronger hands). We have concluded that
the Decision Tree Classifier is decent at predicting weaker hands but continuously struggles in predicting stronger ones.