In [1]:
import sqlite3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats, display, HTML


DB = '../data/hodoku.db'

In [45]:
def load_sudokus(path):
    conn = sqlite3.connect(path)
    c = conn.cursor()
    #c.execute('SELECT * FROM "sudoku" LIMIT 0, 10;')
    c.execute('SELECT * FROM "sudoku" ORDER BY RANDOM();')
    data = c.fetchall()
    names = [member[0] for member in c.description]
    dic = {'Extreme': 4, 'Unfair': 3, 'Hard': 2, 'Medium': 1, 'Easy': 0}
    data_raw = np.array([row[3:] for row in data]) 
    dataset = {}
    dataset['data'] = data_raw[:,:10]
    dataset['data_cell'] = np.delete(data_raw, np.s_[1:10], 1)
    dataset['data_bool'] = np.clip(data_raw[:,:10], 0, 1)
    dataset['target'] = np.array([dic[name] for name in [row[2] for row in data]])
    dataset['target_names'] = np.array(['Easy', 'Medium', 'Hard', 'Unfair', 'Extreme'])
    dataset['feature_names'] = names[3:13]
    c.close()
    conn.close()
    return dataset

dataset = load_sudokus(DB)
#print the first 10 results
print(dataset['data'][:10])
print(dataset['data_cell'][:10])
print(dataset['data_bool'][:10])
print(dataset['target'][:10])
print(dataset['target_names'])
print(dataset['feature_names'])

[[ 25 169   7   2   0   0   0   2   0   3]
 [ 25 167  13   1   1   0   0   6   0   3]
 [ 23 163  13   4   2   0   0   2   0   4]
 [ 25 153  19   3   4   0   0   2   0   2]
 [ 24 160  22   1   2   0   0   4   0   3]
 [ 23 179  10   0   0   0   0   0   0   0]
 [ 25 147  23   5   1   1   1   2   1   2]
 [ 26 160  11   0   1   0   0   0   0   2]
 [ 24 157  25   0   0   0   0   0   0   0]
 [ 23 172  15   2   1   0   0   0   0   0]]
[[25 42  4  5  0  0  0  2  0  3]
 [25 33  4  2  2  0  0 12  0  3]
 [23 39  7  4  1  0  0  3  0  4]
 [25 27 11  5  6  0  0  5  0  2]
 [24 28 11  3  3  0  0  9  0  3]
 [23 51  7  0  0  0  0  0  0  0]
 [25 24 10  7  3  2  1  4  3  2]
 [26 44  7  0  2  0  0  0  0  2]
 [24 43 14  0  0  0  0  0  0  0]
 [23 40 10  4  4  0  0  0  0  0]]
[[1 1 1 1 0 0 0 1 0 1]
 [1 1 1 1 1 0 0 1 0 1]
 [1 1 1 1 1 0 0 1 0 1]
 [1 1 1 1 1 0 0 1 0 1]
 [1 1 1 1 1 0 0 1 0 1]
 [1 1 1 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 1 1 1]
 [1 1 1 0 1 0 0 0 0 1]
 [1 1 1 0 0 0 0 0 0 0]
 [1 1 1 1 1 0 0 0 0 0]]
[3 3 4 

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    dataset['data'], dataset['target'], random_state=0)

In [19]:
print("X_train shape: {}".format(X_train.shape))
print("y_train shape: {}".format(y_train.shape))

X_train shape: (7500, 10)
y_train shape: (7500,)


In [20]:
print("X_test shape: {}".format(X_test.shape))
print("y_test shape: {}".format(y_test.shape))

X_test shape: (2500, 10)
y_test shape: (2500,)


In [21]:
def avg_by_class(data, coef, intercept, target, labels):
    avgs = []
    for label in labels:
        avgs.append([0,0,label])
    for i in range(len(data)):
        dot = np.dot(data[i,:], coef)+intercept
        avgs[target[i]][0] += dot
        avgs[target[i]][1] += 1
    print("Average score by class:")
    for e in avgs:
        print("{}: {:.3f}".format(e[2], e[0]/e[1]))

def evaluate_regr(data, coef, intercept, target):
    count = 0;
    for i in range(len(data)):
        dot = np.dot(data[i,:], coef)+intercept
        if (int(round(dot)) == target[i]):
            count += 1
    return count/len(data)

In [36]:
from sklearn import linear_model

X_train, X_test, y_train, y_test = train_test_split(
    dataset['data'], dataset['target'], random_state=0)

regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

print('Coefficients: \n', regr.coef_)
print('Intercept: \n', regr.intercept_)
print("Mean squared error: %.2f"
      % np.mean((regr.predict(X_test) - y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(X_test, y_test))
print()
avg_by_class(dataset['data_bool'], regr.coef_, regr.intercept_, dataset['target'], dataset['target_names'])
print()
print("Training set score: {:.3f}".format(evaluate_regr(X_train, regr.coef_, regr.intercept_, y_train)))
print("Test set score: {:.3f}".format(evaluate_regr(X_test, regr.coef_, regr.intercept_, y_test)))

Coefficients: 
 [  4.79834889   0.31065453   0.27263194   5.17190582   6.13765323
   6.29289374   5.83322658   9.35814022  10.01409444  30.3905071 ]
Intercept: 
 -132.32540736
Mean squared error: 2145.53
Variance score: 0.57

Average score by class:
Easy: -126.957
Medium: -117.358
Hard: -82.612
Unfair: -80.885
Extreme: -78.358

Training set score: 0.010
Test set score: 0.013


In [23]:
X_train, X_test, y_train, y_test = train_test_split(
    dataset['data_bool'], dataset['target'], random_state=0)

regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

print('Coefficients: \n', regr.coef_)

print("Mean squared error: %.2f"
      % np.mean((regr.predict(X_test) - y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(X_test, y_test))

print('Coefficients: \n', regr.coef_)
print('Intercept: \n', regr.intercept_)
print("Mean squared error: %.2f"
      % np.mean((regr.predict(X_test) - y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(X_test, y_test))
print()
avg_by_class(dataset['data'], regr.coef_, regr.intercept_, dataset['target'], dataset['target_names'])
print()
print("Training set score: {:.3f}".format(evaluate_regr(X_train, regr.coef_, regr.intercept_, y_train)))
print("Test set score: {:.3f}".format(evaluate_regr(X_test, regr.coef_, regr.intercept_, y_test)))

Coefficients: 
 [ 0.          0.          0.25238778  0.35900033  0.12029326  0.15222765
  0.03373589  0.42262502  0.04422661  2.05144261]
Mean squared error: 0.43
Variance score: 0.78
Coefficients: 
 [ 0.          0.          0.25238778  0.35900033  0.12029326  0.15222765
  0.03373589  0.42262502  0.04422661  2.05144261]
Intercept: 
 0.033726501067
Mean squared error: 0.43
Variance score: 0.78

Average score by class:
Easy: 6.712
Medium: 8.013
Hard: 11.769
Unfair: 11.601
Extreme: 13.212

Training set score: 0.589
Test set score: 0.599


In [37]:
X_train, X_test, y_train, y_test = train_test_split(
    dataset['data_cell'], dataset['target'], random_state=0)

regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

print('Coefficients: \n', regr.coef_)

print("Mean squared error: %.2f"
      % np.mean((regr.predict(X_test) - y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(X_test, y_test))

print('Coefficients: \n', regr.coef_)
print('Intercept: \n', regr.intercept_)
print("Mean squared error: %.2f"
      % np.mean((regr.predict(X_test) - y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(X_test, y_test))
print()
avg_by_class(dataset['data_cell'], regr.coef_, regr.intercept_, dataset['target'], dataset['target_names'])
print()
print("Training set score: {:.3f}".format(evaluate_regr(X_train, regr.coef_, regr.intercept_, y_train)))
print("Test set score: {:.3f}".format(evaluate_regr(X_test, regr.coef_, regr.intercept_, y_test)))

Coefficients: 
 [-0.68289492 -1.32676719 -1.29558989 -0.80669268 -0.71374766 -0.45949668
  1.10406176 -0.33395404 -0.46289889  4.9779802 ]
Mean squared error: 85.07
Variance score: 0.57
Coefficients: 
 [-0.68289492 -1.32676719 -1.29558989 -0.80669268 -0.71374766 -0.45949668
  1.10406176 -0.33395404 -0.46289889  4.9779802 ]
Intercept: 
 99.2254064458
Mean squared error: 85.07
Variance score: 0.57

Average score by class:
Easy: 8.014
Medium: 13.101
Hard: 23.062
Unfair: 25.203
Extreme: 30.568

Training set score: 0.049
Test set score: 0.046


In [42]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)

X_train, X_test, y_train, y_test = train_test_split(
    dataset['data_cell'], dataset['target'], random_state=0)

knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)

print("Test set score: {:.2f}".format(knn.score(X_test, y_test)))

Test set score: 0.58


In [34]:
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(
    dataset['data_bool'], dataset['target'], random_state=0)

# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(X_test, y_test))
logreg = LogisticRegression().fit(X_train, y_train)
print('Coefficients: \n', logreg.coef_)
print("Training set score: {:.3f}".format(logreg.score(X_train, y_train)))
print("Test set score: {:.3f}".format(logreg.score(X_test, y_test)))

Variance score: -7.41
Coefficients: 
 [[  1.31265065e+00   1.31265065e+00  -2.23794894e-01  -8.27079448e+00
   -6.32020980e+00  -4.90863593e+00  -1.07389577e+00  -5.65724347e+00
   -1.31598207e+00  -7.24892326e+00]
 [ -1.19613711e+00  -1.19613711e+00   8.75583104e-01   5.78873862e+00
    3.35637266e+00   1.86805994e+00   3.76909850e-02   2.82975576e+00
   -3.09343182e+00  -1.32539376e+01]
 [ -1.68469985e+00  -1.68469985e+00   8.23010429e-02  -4.94799497e-03
   -1.07958069e-01  -3.14771001e-01  -7.87948751e-02  -9.50936127e-01
    5.35823429e-02   5.05967298e+00]
 [ -1.63111028e+00  -1.63111028e+00  -1.18526024e+00  -5.58776599e-02
   -2.94122501e-02  -1.78104561e-01  -3.44709199e-02   3.77469165e-02
   -1.99675591e-02   5.40268403e+00]
 [ -1.80500332e+00  -1.80500332e+00  -1.11931557e+00   1.02859068e-01
    1.57051075e-01   4.15884649e-01   1.24716072e-01   1.24427119e+00
    9.12866839e-02   4.61292497e+00]]
Training set score: 0.649
Test set score: 0.636


In [49]:
from sklearn.tree import DecisionTreeClassifier

X_train, X_test, y_train, y_test = train_test_split(
    dataset['data_bool'], dataset['target'], random_state=0)

#build the tree using the training data
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)

#Print accuracy of the model
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))

Accuracy on training set: 0.662
Accuracy on test set: 0.634


In [44]:
def load_sudokus(path):
    conn = sqlite3.connect(path)
    c = conn.cursor()
    #c.execute('SELECT * FROM "sudoku" LIMIT 0, 10;')
    c.execute('SELECT * FROM "sudoku";')
    data = c.fetchall()
    names = [member[0] for member in c.description]
    dic = {'Diabolical': 4, 'Hard': 3, 'Medium': 2, 'Easy': 1, 'Ultra Easy': 0}
    data_raw = np.array([row[3:] for row in data]) 
    dataset = {}
    dataset['data'] = data_raw[:,:10]
    dataset['data_cell'] = np.delete(data_raw, np.s_[1:10], 1)
    dataset['data_bool'] = np.clip(data_raw[:,:10], 0, 1)
    dataset['target'] = np.array([dic[name] for name in [row[2] for row in data]])
    dataset['target_names'] = np.array(['Ultra Easy', 'Easy', 'Medium', 'Hard', 'Diabolical'])
    dataset['feature_names'] = names[3:13]
    c.close()
    conn.close()
    return dataset

dataset2 = load_sudokus('../data/puzzler.db')

In [50]:
from sklearn.tree import DecisionTreeClassifier

X_train, X_test, y_train, y_test = train_test_split(
    dataset['data'], dataset['target'], random_state=0)

#build the tree using the training data
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)

#Print accuracy of the model
print("Accuracy on Mixed set: {:.3f}".format(tree.score(dataset2['data'], dataset2['target'])))


Accuracy on Mixed set: 0.144
