In [3]:
import sqlite3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats, display, HTML


DB = '../data/dailysudoku.db'

In [4]:
def load_sudokus(path):
    conn = sqlite3.connect(path)
    c = conn.cursor()
    #c.execute('SELECT * FROM "sudoku" LIMIT 0, 10;')
    c.execute('SELECT * FROM "sudoku";')
    data = c.fetchall()
    names = [member[0] for member in c.description]
    dic = {'very hard': 3, 'hard': 2, 'medium': 1, 'easy': 0}
    data_raw = np.array([row[3:] for row in data]) 
    dataset = {}
    dataset['data'] = data_raw[:,:10]
    dataset['data_cell'] = np.delete(data_raw, np.s_[1:10], 1)
    dataset['data_bool'] = np.clip(data_raw[:,:10], 0, 1)
    dataset['target'] = np.array([dic[name] for name in [row[2] for row in data]])
    dataset['target_names'] = np.array(['easy', 'medium', 'hard',  'very hard'])
    dataset['feature_names'] = names[3:13]
    c.close()
    conn.close()
    return dataset

dataset = load_sudokus(DB)
#print the first 10 results
print(dataset['data'][:10])
print(dataset['data_cell'][:10])
print(dataset['data_bool'][:10])
print(dataset['target'][:10])
print(dataset['target_names'])
print(dataset['feature_names'])


[[ 28 146   0   0   0   0   0   0   0   0]
 [ 30 121   0   0   0   0   0   0   0   0]
 [ 29 120   0   0   0   0   0   0   0   0]
 [ 31 122   0   0   0   0   0   0   0   0]
 [ 28 130  18   0   0   0   0   0   0   0]
 [ 28 126   0   0   0   0   0   0   0   0]
 [ 28 120   0   0   0   0   0   0   0   0]
 [ 30 127   0   0   0   0   0   0   0   0]
 [ 31 112  24   0   0   0   0   0   0   0]
 [ 31 113  18   0   0   0   0   0   0   0]]
[[28 53  0  0  0  0  0  0  0  0]
 [30 51  0  0  0  0  0  0  0  0]
 [29 52  0  0  0  0  0  0  0  0]
 [31 50  0  0  0  0  0  0  0  0]
 [28 41 12  0  0  0  0  0  0  0]
 [28 53  0  0  0  0  0  0  0  0]
 [28 53  0  0  0  0  0  0  0  0]
 [30 51  0  0  0  0  0  0  0  0]
 [31 37 13  0  0  0  0  0  0  0]
 [31 37 13  0  0  0  0  0  0  0]]
[[1 1 0 0 0 0 0 0 0 0]
 [1 1 0 0 0 0 0 0 0 0]
 [1 1 0 0 0 0 0 0 0 0]
 [1 1 0 0 0 0 0 0 0 0]
 [1 1 1 0 0 0 0 0 0 0]
 [1 1 0 0 0 0 0 0 0 0]
 [1 1 0 0 0 0 0 0 0 0]
 [1 1 0 0 0 0 0 0 0 0]
 [1 1 1 0 0 0 0 0 0 0]
 [1 1 1 0 0 0 0 0 0 0]]
[1 0 1 

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    dataset['data'], dataset['target'], random_state=0)

In [6]:
print("X_train shape: {}".format(X_train.shape))
print("y_train shape: {}".format(y_train.shape))

X_train shape: (3367, 10)
y_train shape: (3367,)


In [7]:
print("X_test shape: {}".format(X_test.shape))
print("y_test shape: {}".format(y_test.shape))

X_test shape: (1123, 10)
y_test shape: (1123,)


In [8]:
def avg_by_class(data, coef, intercept, target, labels):
    avgs = []
    for label in labels:
        avgs.append([0,0,label])
    for i in range(len(data)):
        dot = np.dot(data[i,:], coef)+intercept
        avgs[target[i]][0] += dot
        avgs[target[i]][1] += 1
    print("Average score by class:")
    for e in avgs:
        print("{}: {:.3f}".format(e[2], e[0]/e[1]))

def evaluate_regr(data, coef, intercept, target):
    count = 0;
    for i in range(len(data)):
        dot = np.dot(data[i,:], coef)+intercept
        if (int(round(dot)) == target[i]):
            count += 1
    return count/len(data)

In [9]:
from sklearn import linear_model

X_train, X_test, y_train, y_test = train_test_split(
    dataset['data'], dataset['target'], random_state=0)

regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

print('Coefficients: \n', regr.coef_)
print('Intercept: \n', regr.intercept_)
print("Mean squared error: %.2f"
      % np.mean((regr.predict(X_test) - y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(X_test, y_test))
print()
avg_by_class(dataset['data'], regr.coef_, regr.intercept_, dataset['target'], dataset['target_names'])
print()
print("Training set score: {:.3f}".format(evaluate_regr(X_train, regr.coef_, regr.intercept_, y_train)))
print("Test set score: {:.3f}".format(evaluate_regr(X_test, regr.coef_, regr.intercept_, y_test)))

Coefficients: 
 [-0.01752837  0.01224312  0.01551447  0.14861896  0.18039217  0.19327873
  0.0258513   0.09074743  0.40265058  0.69937519]
Intercept: 
 -0.342499802283
Mean squared error: 0.30
Variance score: 0.69

Average score by class:
easy: 0.353
medium: 1.358
hard: 1.847
very hard: 2.646

Training set score: 0.676
Test set score: 0.662


In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    dataset['data_cell'], dataset['target'], random_state=0)

regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

print('Coefficients: \n', regr.coef_)

print("Mean squared error: %.2f"
      % np.mean((regr.predict(X_test) - y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(X_test, y_test))

print('Coefficients: \n', regr.coef_)
print('Intercept: \n', regr.intercept_)
print("Mean squared error: %.2f"
      % np.mean((regr.predict(X_test) - y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(X_test, y_test))
print()
avg_by_class(dataset['data'], regr.coef_, regr.intercept_, dataset['target'], dataset['target_names'])
print()
print("Training set score: {:.3f}".format(evaluate_regr(X_train, regr.coef_, regr.intercept_, y_train)))
print("Test set score: {:.3f}".format(evaluate_regr(X_test, regr.coef_, regr.intercept_, y_test)))

Coefficients: 
 [-0.20795723 -0.11622239 -0.10091315 -0.04499197 -0.03084585 -0.01607224
 -0.11633234 -0.0593136   0.05142192  0.64122686]
Mean squared error: 0.30
Variance score: 0.69
Coefficients: 
 [-0.20795723 -0.11622239 -0.10091315 -0.04499197 -0.03084585 -0.01607224
 -0.11633234 -0.0593136   0.05142192  0.64122686]
Intercept: 
 12.9207918621
Mean squared error: 0.30
Variance score: 0.69

Average score by class:
easy: -6.340
medium: -11.196
hard: -11.259
very hard: -10.362

Training set score: 0.669
Test set score: 0.660


In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    dataset['data_bool'], dataset['target'], random_state=0)

regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

print('Coefficients: \n', regr.coef_)

print("Mean squared error: %.2f"
      % np.mean((regr.predict(X_test) - y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(X_test, y_test))

print('Coefficients: \n', regr.coef_)
print('Intercept: \n', regr.intercept_)
print("Mean squared error: %.2f"
      % np.mean((regr.predict(X_test) - y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(X_test, y_test))
print()
avg_by_class(dataset['data'], regr.coef_, regr.intercept_, dataset['target'], dataset['target_names'])
print()
print("Training set score: {:.3f}".format(evaluate_regr(X_train, regr.coef_, regr.intercept_, y_train)))
print("Test set score: {:.3f}".format(evaluate_regr(X_test, regr.coef_, regr.intercept_, y_test)))

Coefficients: 
 [ 0.          0.          0.92062852  0.52931625  0.34325368  0.18390713
 -0.02545044  0.23272697  0.31953516  1.10272073]
Mean squared error: 0.22
Variance score: 0.78
Coefficients: 
 [ 0.          0.          0.92062852  0.52931625  0.34325368  0.18390713
 -0.02545044  0.23272697  0.31953516  1.10272073]
Intercept: 
 0.199541284404
Mean squared error: 0.22
Variance score: 0.78

Average score by class:
easy: 6.943
medium: 24.401
hard: 26.200
very hard: 27.826

Training set score: 0.747
Test set score: 0.736


In [16]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)

X_train, X_test, y_train, y_test = train_test_split(
    dataset['data_bool'], dataset['target'], random_state=0)

knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)

print("Test set score: {:.2f}".format(knn.score(X_test, y_test)))

Test set score: 0.84


In [19]:
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(
    dataset['data_cell'], dataset['target'], random_state=0)

# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % regr.score(X_test, y_test))
logreg = LogisticRegression().fit(X_train, y_train)
print('Coefficients: \n', logreg.coef_)
print("Training set score: {:.3f}".format(logreg.score(X_train, y_train)))
print("Test set score: {:.3f}".format(logreg.score(X_test, y_test)))

Variance score: -233.44
Coefficients: 
 [[  1.39702020e+00  -8.04680028e-01  -8.12236785e-01  -3.01315656e+00
   -1.89988343e+00  -1.12332316e+00  -4.85072891e-04  -2.16629528e+00
   -2.44398960e+00  -4.25594366e+00]
 [ -3.48556323e-01   2.07553952e-01   2.06259606e-01  -2.43629730e-01
   -4.88155765e-01  -9.01044644e-01  -8.05128837e-01  -9.74661019e-03
   -8.34790474e-01  -3.72884487e+00]
 [ -1.18692965e-01   2.33498340e-02   4.64964812e-02   3.72252533e-01
    4.12774392e-01   4.67688174e-01   4.18368560e-02   1.81580784e-01
   -4.80010439e-01  -3.87351639e+00]
 [ -9.00111668e-02  -4.11186045e-02   8.84913623e-03   6.73008213e-02
    1.58240067e-01   2.50523228e-01   2.62047732e-02   9.19621197e-02
    9.55197902e-01   5.62733761e+00]]
Training set score: 0.868
Test set score: 0.862
