In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import scipy as sp
import pickle

from utils import *
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the raw Sudoku table from CSV into a DataFrame.
raw_dataset = pd.read_csv('data/sudoku-3m.csv')

# columns: id, puzzle, solution, clues, difficulty
# example row: 1,
#         1..5.37..6.3..8.9......98...1.......8761..........6...........7.8.9.76.47...6.312,
#         198543726643278591527619843914735268876192435235486179462351987381927654759864312,
#         27,
#         2.2

# One Hot Encoding 

## Preprocessing of input data **X**:

In [3]:
def f(x):
    """Parse one puzzle string into integers, mapping '.' (blank cell) to 0."""
    return np.array([int(s) if s != '.' else 0 for s in x])

# Parse every entry of the 'puzzle' column and stack the results row-wise.
vector_X = np.stack(raw_dataset['puzzle'].apply(f).to_numpy())
print('Vectorization done.')

# Flatten to a single int8 column: the encoder treats each grid cell as one sample.
vector_X = vector_X.astype(np.int8).reshape(-1, 1)
print('Reshaping done.')

# Only digits 1..9 are declared as categories; with handle_unknown='ignore'
# the blank marker 0 is encoded as an all-zero row.
one_hot = OneHotEncoder(categories=[np.arange(1, 10)], handle_unknown='ignore').fit(vector_X)
print('Fitting done.')

Vectorization done.
Reshaping done.
Fitting done.


In [4]:
# Persist the fitted encoder so it can be reloaded without refitting.
# The handle gets its own name: binding it to `f` would silently replace the
# puzzle-parsing function `f` with a closed file object in the kernel namespace.
with open("data/encoder.pkl", "wb") as encoder_file:
    pickle.dump(one_hot, encoder_file)

In [5]:
# Encode in fixed-size batches so each temporary dense (batch, n_classes)
# matrix stays small.
BATCH_SIZE = 1_000_000
n_classes = len(one_hot.categories_[0])

# Pre-allocate the flat int8 output once. Growing the result with np.append
# re-copies everything accumulated so far on every batch (quadratic copying).
one_hot_X_batches = np.empty(len(vector_X) * n_classes, dtype=np.int8)
for i in range(0, len(vector_X), BATCH_SIZE):
    print(f'Batch {i//BATCH_SIZE+1}...', end='\r', flush=True)
    tmp = one_hot.transform(vector_X[i:i+BATCH_SIZE]).toarray().astype(np.int8)
    # Row-major flattening keeps the element order np.append produced.
    start = i * n_classes
    one_hot_X_batches[start:start + tmp.size] = tmp.ravel()

print('Transforming done.')

Batch 1...

Transforming done.


In [6]:
# Regroup the flat one-hot stream into one row per puzzle
# (81 cells x 9 digit classes = 729 features).
array_X = np.reshape(one_hot_X_batches, (-1, 729))
print('DONE!')

DONE!


In [7]:
# Write the one-hot encoded puzzles (array_X) to disk in .npy format.
np.save('data/one_hot_X_3m.npy', array_X)

## Preprocessing of target data **y**:

In [3]:
def f(x):
    """Convert a solution string to zero-based digits ('1'..'9' -> 0..8)."""
    return np.array([int(s) - 1 for s in x])

# Apply the conversion to the 'solution' column and stack into one uint8 array.
vector_y = np.stack(raw_dataset['solution'].apply(f).to_numpy()).astype(np.uint8)
print('Vectorization done.')

Vectorization done.


In [7]:
# Write the solution array (vector_y) to disk in .npy format.
np.save('data/dense_y_3m.npy', vector_y)

# Dense Representation

## Preprocessing of input data **X**:

In [4]:
def f(x):
    """Parse one puzzle string into integers, mapping '.' (blank cell) to 0."""
    return np.array([int(s) if s != '.' else 0 for s in x])

# Parse every entry of the 'puzzle' column and stack the results row-wise.
vector_X = np.stack(raw_dataset['puzzle'].apply(f).to_numpy())
print('Vectorization done.')

# One row of 81 cells per puzzle; dividing by 9 scales the digits 0..9 into [0, 1].
dense_X = vector_X.astype(np.int8).reshape(-1, 81) / 9
print('Reshaping done.')

# Save the scaled puzzles.
# NOTE(review): the file name says "30k" while raw_dataset is read from
# 'data/sudoku-3m.csv' in this notebook - confirm the intended name.
np.save('data/dense_X_30k.npy', dense_X)

Vectorization done.
Reshaping done.


In [60]:
# Function to replace '.' with 0 and convert to integers
f = lambda x: np.array([np.int8(s) for s in x])

# Apply the function to the 'solution' column
vector_y = np.stack(raw_dataset.solution.apply(f).values)
print('Vectorization done.')

# Reshape for One-Hot Encoding
dense_y = vector_y.reshape(-1, 81)/9
print('Reshaping done.')

# save data to file
np.save('data/dense_y_30k.npy', dense_y)

Vectorization done.
Reshaping done.


# Custom Embedding

## Preprocessing of input data **X**:

In [4]:
# Function to replace '.' with 0 and convert to integers
f = lambda x: np.array([0 if s == '.' else int(s) for s in x])

# Apply the function to the 'puzzle' column
vector_X = np.stack(raw_dataset.puzzle.apply(f).values)
print('Vectorization done.')

Vectorization done.


In [24]:
# Work on the first puzzle as a flat, 81-element grid.
test_grid = vector_X[0]

# The index tables are explicit integer ndarrays. Indexing with a nested
# Python list is interpreted as a tuple of per-axis indices by NumPy < 1.23,
# which fails on a 1-D array; an ndarray index behaves the same everywhere.

# cell: row i holds the flat positions of the i-th 3x3 box.
#   i*3 + 18*(i//3) is the box's top-left corner (27 per box row, 3 per box column),
#   j + 6*(j//3)    is the offset of the j-th cell inside the box.
cell_idx = np.array([[i*3 + 18*(i//3) + j + 6*(j//3) for j in range(9)] for i in range(9)])
cell = test_grid[cell_idx]

# row: row i holds the flat positions i*9 + j.
row_idx = np.arange(81).reshape(9, 9)
row = test_grid[row_idx]

# column: row i holds the flat positions i + j*9, i.e. the transpose of row_idx.
column_idx = row_idx.T
column = test_grid[column_idx]

In [None]:
# make every combination of 9 numbers

# Making the Sudokus so that they are at different solving steps

In [3]:
def int_to_binary_list(number, num_bits=4):
    """Return the lowest `num_bits` bits of `number` as booleans, most significant first.

    Bits above `num_bits` are ignored, so values that do not fit are truncated.
    """
    bits = []
    for shift in reversed(range(num_bits)):
        bits.append(bool((number >> shift) & 1))
    return bits

In [4]:
# Function to replace '.' with 0 and convert to bools
f = lambda x: np.array([True if s == '.' else False for s in x])

# Apply the function to the 'puzzle' column
min_X = np.stack(raw_dataset.puzzle.apply(f).values).reshape(-1, 81)
print('Vectorization done.')

Vectorization done.


In [5]:
# Function to replace '.' with 0 and convert to integers
f = lambda x: np.array([int_to_binary_list(np.uint8(s)) for s in x])

# Apply the function to the 'puzzle' column
bin_y = np.stack(raw_dataset.solution.apply(f).values).reshape(-1, 81, 4)
print('Vectorization done.')

Vectorization done.


In [7]:
# Write the binary-encoded solutions (bin_y) to disk in .npy format.
np.save('data/bin_y_3m.npy', bin_y)

In [5]:
# Reload the binary-encoded solutions from the .npy file instead of recomputing them.
bin_y = np.load('data/bin_y_3m.npy')

In [6]:
# Remove the raw DataFrame name from the namespace; min_X and bin_y were
# already derived from it in earlier cells.
del raw_dataset

In [7]:
def clear_cell(arr, idx):
    """Blank out entry `idx` of `arr` and return a snapshot of the result.

    `arr` itself is modified in place (entry `idx` becomes four False values),
    so repeated calls on the same grid accumulate cleared cells. The returned
    object is a copy, independent of later changes to `arr`.
    """
    arr[idx] = [False] * 4
    snapshot = arr.copy()
    return snapshot

# min number of clues is 19 and max is 31, so every puzzle has at least
# 81 - 31 = 50 blank cells. Keeping exactly 50 per puzzle gives every row the
# same length, which np.stack requires.
N_REMOVED = 50

# Seeded generator so the same removal order can be regenerated on a re-run.
rng = np.random.default_rng(0)

# For each puzzle, shuffle the flat indices of its blank cells (min_X is True
# there) and keep the first N_REMOVED of them.
remove_idx = np.stack([rng.permutation(np.flatnonzero(blank_mask))[:N_REMOVED] for blank_mask in min_X])

np.save('data/remove_idx_3m.npy', remove_idx)

In [8]:
bin_steps_X = []
# NOTE(review): nothing in this cell appends to bin_steps_y - confirm what
# the matching targets are supposed to be.
bin_steps_y = []

# (A stray, incomplete `for rem` line that made this cell a syntax error was removed.)
for i, cells in enumerate(remove_idx):

    # clear_cell blanks the cell inside bin_y[i] itself (a view into bin_y)
    # and returns a copy, so each snapshot has one more cleared cell than the
    # previous one, and bin_y no longer holds the full solutions afterwards.
    # NOTE(review): one (81, 4) array is kept per removed cell, i.e. 50 per
    # puzzle - check that this fits in memory for the full dataset.
    bin_steps_X.extend(clear_cell(bin_y[i], cell) for cell in cells)

    if i % 10_000 == 0:
        print(f'{100 * i / len(min_X):.2f}%...', end='\r', flush=True)


0.33%...

65.67%...

: 

In [5]:
# save data to file
# NOTE(review): steps_X / steps_y are not assigned anywhere in the visible
# notebook and the recorded output of this cell is a NameError. The cell above
# builds bin_steps_X / bin_steps_y instead - confirm which arrays (and whether
# the /9 scaling still applies) are intended before re-running.
np.save('data/steps_X_30k.npy', np.array(steps_X)/9)
np.save('data/steps_y_30k.npy', np.array(steps_y)/9)

NameError: name 'steps_X' is not defined