# Poker Hands UCI Classifications
The Poker Hands dataset is taken from here: https://archive.ics.uci.edu/ml/datasets/Poker+Hand

In [141]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold
import sklearn.tree   
from sklearn.metrics import accuracy_score

import sklearn.neural_network
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

In [78]:
# Load both UCI splits. The raw files carry no header row, so header=None keeps
# the first record as data instead of consuming it as column names.
TRAIN_CSV = 'poker-hand-training-true.data'
TEST_CSV = 'poker-hand-testing.data'

data_train = pd.read_csv(TRAIN_CSV, header=None)
print(data_train.shape)

data_test = pd.read_csv(TEST_CSV, header=None)
print(data_test.shape)


(25010, 11)
(1000000, 11)


In [79]:
# Columns alternate suit/rank for each of the five cards; the last one is the label.
train_card_fields = [f"{kind} {slot}" for slot in range(1, 6) for kind in ("Suit", "Card")]
data_train.columns = train_card_fields + ["Poker Hand"]
print(data_train.head())

   Suit 1  Card 1  Suit 2  Card 2  Suit 3  Card 3  Suit 4  Card 4  Suit 5  \
0       1      10       1      11       1      13       1      12       1   
1       2      11       2      13       2      10       2      12       2   
2       3      12       3      11       3      13       3      10       3   
3       4      10       4      11       4       1       4      13       4   
4       4       1       4      13       4      12       4      11       4   

   Card 5  Poker Hand  
0       1           9  
1       1           9  
2       1           9  
3      12           9  
4      10           9  


In [80]:
# Same layout as the training split: suit/rank pairs for five cards, then the label.
test_card_fields = [f"{kind} {slot}" for slot in range(1, 6) for kind in ("Suit", "Card")]
data_test.columns = test_card_fields + ["Poker Hand"]
print(data_test.head(10))

   Suit 1  Card 1  Suit 2  Card 2  Suit 3  Card 3  Suit 4  Card 4  Suit 5  \
0       1       1       1      13       2       4       2       3       1   
1       3      12       3       2       3      11       4       5       2   
2       1       9       4       6       1       4       3       2       3   
3       1       4       3      13       2      13       2       1       3   
4       3      10       2       7       1       2       2      11       4   
5       1       3       4       5       3       4       1      12       4   
6       2       6       4      11       2       3       4       9       1   
7       3       2       4       9       3       7       4       3       4   
8       4       4       3      13       1       8       3       9       3   
9       1       9       3       8       4       4       1       7       3   

   Card 5  Poker Hand  
0      12           0  
1       5           1  
2       9           1  
3       6           1  
4       9           0  
5       

Extracting Poker Hand type and features:

In [81]:
# Features are every column but the last; the label is the "Poker Hand" column.
X_train, y_train = data_train.iloc[:, :-1], data_train["Poker Hand"]
X_test, y_test = data_test.iloc[:, :-1], data_test["Poker Hand"]

In [82]:
# Show the size and the first ten labels of each split (train first, then test).
for labels in (y_train, y_test):
    print(labels.shape)
    print(labels.head(10))


(25010,)
0    9
1    9
2    9
3    9
4    9
5    8
6    8
7    8
8    8
9    8
Name: Poker Hand, dtype: int64
(1000000,)
0    0
1    1
2    1
3    1
4    0
5    0
6    0
7    0
8    0
9    0
Name: Poker Hand, dtype: int64


As we can see, the Poker Hand dataset has 11 columns: a suit and a rank for each of the five cards, plus the hand's class label in the last column ("Poker Hand").

In [83]:
# Baseline: a decision tree with default hyper-parameters on the raw suit/rank columns.
tree = sklearn.tree.DecisionTreeClassifier(criterion="gini", random_state=0).fit(X_train, y_train)
y_pred = tree.predict(X_test)
accuracy_score(y_test, y_pred, normalize=True)



0.479303

We will now preprocess the data to achieve a higher accuracy

In [84]:
CARD_COLS = ["Card 1", "Card 2", "Card 3", "Card 4", "Card 5"]
SUIT_COLS = ["Suit 1", "Suit 2", "Suit 3", "Suit 4", "Suit 5"]


def sort_card_ranks(hands):
    """Return a copy of ``hands`` with each row's card ranks in ascending order.

    Only the five ``Card N`` columns are sorted (row-wise). The ``Suit N``
    columns keep their original order, so after this step ``Suit i`` no longer
    pairs with ``Card i``. The columns are also regrouped as ranks, then suits,
    then the ``Poker Hand`` label.

    Args:
        hands: DataFrame holding the ``Card 1..5``, ``Suit 1..5`` and
            ``Poker Hand`` columns. It is not modified.

    Returns:
        A new DataFrame with sorted ranks and the regrouped column order.
    """
    processed = hands.copy()
    # np.sort returns a fresh sorted array. The previous in-place
    # ``cards.values.sort()`` only worked when ``.values`` happened to be a
    # writable view of the frame, which is not the case when pandas
    # Copy-on-Write is enabled (``.values`` is read-only there).
    processed[CARD_COLS] = np.sort(hands[CARD_COLS].to_numpy(), axis=1)
    return processed[CARD_COLS + SUIT_COLS + ["Poker Hand"]]


X_train_processed = sort_card_ranks(data_train)
X_test_processed = sort_card_ranks(data_test)

# Drop the label so only the ten feature columns are fed to the models.
X_train = X_train_processed.loc[:, X_train_processed.columns != "Poker Hand"]
X_test = X_test_processed.loc[:, X_test_processed.columns != "Poker Hand"]


Next, we run 10-fold cross-validation on the training set.

In [93]:
# using the same tree as before
# 10-fold cross-validation of the decision tree on the training split.
tree = sklearn.tree.DecisionTreeClassifier(random_state=1, criterion='gini')
# Seed the shuffle so the fold assignment (and the reported mean) is reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=0)

acc = []
for i, (train_index, test_index) in enumerate(kf.split(X_train, y_train), start=1):
    print('{}-Fold'.format(i))
    # KFold yields positional indices, so select rows with .iloc on both the
    # features and the labels instead of relying on a default RangeIndex.
    fX_train, fX_test = X_train.iloc[train_index], X_train.iloc[test_index]
    fy_train, fy_test = y_train.iloc[train_index], y_train.iloc[test_index]
    tree.fit(fX_train, fy_train)
    fy_pred = tree.predict(fX_test)
    acc.append(accuracy_score(fy_test, fy_pred, normalize=True))

# Mean held-out accuracy across the ten folds.
acc = pd.Series(acc)
print(acc.mean())

1-Fold
2-Fold
3-Fold
4-Fold
5-Fold
6-Fold
7-Fold
8-Fold
9-Fold
10-Fold
0.9566173530587765


In [94]:
# Refit on the whole training split and score once on the held-out test split.
tree = sklearn.tree.DecisionTreeClassifier(criterion='gini', random_state=1).fit(X_train, y_train)
y_pred = tree.predict(X_test)
accuracy_score(y_test, y_pred, normalize=True)

0.960249

96% is pretty good. We have essentially doubled the correctness of our predictions simply by sorting the card ranks within each hand and regrouping the columns into Cards and Suits.

In [98]:
# Confusion table: rows are predicted classes, columns are true classes, with totals.
pd.crosstab(index=y_pred, columns=y_test, rownames=['Predicted'], colnames=['True'], margins=True)

True,0,1,2,3,4,5,6,7,8,9,All
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,494264,2631,5,19,0,1864,0,0,0,0,498783
1,1556,408995,11216,1870,300,1,70,4,0,0,424012
2,0,9361,35858,490,153,0,440,23,0,0,46325
3,0,845,232,17664,0,0,619,144,0,0,19504
4,0,507,73,12,3004,0,0,0,12,0,3608
5,5389,24,0,0,0,131,0,0,0,0,5544
6,0,125,238,1012,0,0,285,14,0,0,1674
7,0,0,0,54,0,0,10,45,0,0,109
8,0,10,0,0,292,0,0,0,0,0,302
9,0,0,0,0,136,0,0,0,0,3,139


Looking at the table, we can see that the true and predicted counts diverge at the upper end of the class range. This means that flushes and the higher-reward hands are causing prediction errors.

In [101]:
# Number of distinct suits in each test hand, computed without a per-row Python
# callback: sort each row, then count the positions where the value changes.
# A row with k distinct values has k - 1 changes after sorting.
suits_test_sorted = np.sort(
    X_test[["Suit 1", "Suit 2", "Suit 3", "Suit 4", "Suit 5"]].to_numpy(), axis=1
)
X_test["Unique"] = (np.diff(suits_test_sorted, axis=1) != 0).sum(axis=1) + 1

In [102]:
# Number of distinct suits in each training hand, computed without a per-row
# Python callback: sort each row, then count the positions where the value
# changes. A row with k distinct values has k - 1 changes after sorting.
suits_train_sorted = np.sort(
    X_train[["Suit 1", "Suit 2", "Suit 3", "Suit 4", "Suit 5"]].to_numpy(), axis=1
)
X_train["Unique"] = (np.diff(suits_train_sorted, axis=1) != 0).sum(axis=1) + 1

In [104]:
# Same tree, now trained with the extra "Unique" suit-count feature.
tree = sklearn.tree.DecisionTreeClassifier(criterion='gini', random_state=0).fit(X_train, y_train)
y_pred = tree.predict(X_test)
accuracy_score(y_test, y_pred, normalize=True)

0.965091

Slightly better: we're now at 96.5% (up from 96.0%), and the predictions should now be more spread out and less sparse.

In [105]:
# Predicted-vs-true table (with row and column totals) after adding "Unique".
pd.crosstab(index=y_pred, columns=y_test, rownames=['Predicted'], colnames=['True'], margins=True)

True,0,1,2,3,4,5,6,7,8,9,All
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,499679,2802,0,15,0,875,0,0,0,0,503371
1,1530,407933,11807,1759,544,7,76,3,0,0,423659
2,0,10372,35024,618,141,0,427,28,0,0,46610
3,0,640,354,17801,0,0,634,144,0,0,19573
4,0,607,77,8,3200,0,0,0,7,0,3899
5,0,0,0,0,0,1114,0,0,0,0,1114
6,0,144,360,818,0,0,283,6,0,0,1611
7,0,0,0,102,0,0,4,49,0,0,155
8,0,0,0,0,0,0,0,0,5,0,5
9,0,0,0,0,0,0,0,0,0,3,3


What we see in this crosstab: flushes have now been resolved, but two of the strongest hands are still a slight cause of mispredictions (12 actual vs. 5 predicted, and 230 actual vs. 155 predicted). These gaps are small, but they can still be a concern for accuracy. Straight flushes are being underrepresented.

In [111]:
# Gaps between neighbouring rank columns (Diff1 is Card 5 - Card 4, down to
# Diff4 = Card 2 - Card 1), added identically to the training and test frames.
for split in (X_train, X_test):
    for k, (hi, lo) in enumerate(((5, 4), (4, 3), (3, 2), (2, 1)), start=1):
        split[f"Diff{k}"] = split[f"Card {hi}"] - split[f"Card {lo}"]

tree = sklearn.tree.DecisionTreeClassifier(criterion='gini', random_state=0).fit(X_train, y_train)
y_pred = tree.predict(X_test)
accuracy_score(y_test, y_pred, normalize=True)


0.999836

Much better! Our predictions are now above 99.9% accuracy.


In [112]:
# Predicted-vs-true table (with row and column totals) after adding the Diff features.
pd.crosstab(index=y_pred, columns=y_test, rownames=['Predicted'], colnames=['True'], margins=True)

True,0,1,2,3,4,5,6,7,8,9,All
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,501209,0,0,0,0,0,0,0,0,0,501209
1,0,422498,0,0,0,0,0,0,0,0,422498
2,0,0,47622,0,0,0,0,0,0,0,47622
3,0,0,0,21121,0,0,0,0,0,0,21121
4,0,0,0,0,3885,0,0,0,0,0,3885
5,0,0,0,0,0,1836,0,0,4,0,1840
6,0,0,0,0,0,0,1424,0,0,0,1424
7,0,0,0,0,0,0,0,230,0,0,230
8,0,0,0,0,0,152,0,0,8,0,160
9,0,0,0,0,0,8,0,0,0,3,11


We now see that the model is still weakest at predicting flushes and above (the stronger hands). We conclude that
the Decision Tree Classifier is decent at predicting weaker hands but continually struggles to predict the stronger ones.

In [147]:
# Start again from the raw frames for the neural network: every column except
# the last is a feature, and the last column is the class label.
# A DataFrame does not support NumPy-style [rows, cols] indexing, so slice by
# position with .iloc and hand the underlying arrays to torch.
X_train = torch.tensor(data_train.iloc[:, :-1].to_numpy(), dtype=torch.float32)
y_train = torch.tensor(data_train.iloc[:, -1].to_numpy(), dtype=torch.int64)

X_test = torch.tensor(data_test.iloc[:, :-1].to_numpy(), dtype=torch.float32)
y_test = torch.tensor(data_test.iloc[:, -1].to_numpy(), dtype=torch.int64)

class Net(nn.Module):
    """Small fully connected classifier: in_count -> 15 -> 10 -> out_count.

    Args:
        in_count: Number of input features per sample.
        out_count: Number of output classes. Defaults to 10, the width the
            final layer was previously hard-coded to.

    ``forward`` returns raw, unnormalised class scores (logits) of shape
    ``(batch, out_count)``. ``nn.CrossEntropyLoss`` applies log-softmax
    itself, so feeding it already-softmaxed outputs would normalise twice.
    Call ``self.softmax(logits)`` when probabilities are needed.
    """

    def __init__(self, in_count, out_count=10):
        super().__init__()
        self.fc1 = nn.Linear(in_count, 15)
        self.fc2 = nn.Linear(15, 10)
        # Size the output layer from out_count instead of a hard-coded 10.
        self.fc3 = nn.Linear(10, out_count)
        # Not used in forward(); kept for turning logits into probabilities.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Map a ``(batch, in_count)`` float tensor to ``(batch, out_count)`` logits."""
        x = F.relu(self.fc1(x))
        # Without a non-linearity here fc2 and fc3 collapse into one linear map.
        x = F.relu(self.fc2(x))
        return self.fc3(x)
    
# Autograd works on plain tensors, so the deprecated Variable wrapper is not
# needed. as_tensor accepts tensors or arrays and only casts the dtype.
X_train = torch.as_tensor(X_train, dtype=torch.float32)
X_test = torch.as_tensor(X_test, dtype=torch.float32)
y_train = torch.as_tensor(y_train, dtype=torch.int64)
y_test = torch.as_tensor(y_test, dtype=torch.int64)

torch.manual_seed(0)  # reproducible weight initialisation
# One input per feature column; ten outputs for the hand classes labelled 0-9.
model = Net(X_train.shape[1], 10)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Full-batch training: each epoch is a single gradient step over the whole training set.
for epoch in range(100):
    optimizer.zero_grad()
    out = model(X_train)
    loss = criterion(out, y_train)
    loss.backward()
    optimizer.step()
    # loss.item() extracts the Python float; the bare attribute would print a method repr.
    print(f"Epoch {epoch+1}, loss: {loss.item()}")