# Import dataset

In [6]:
import git
import os

# Where the crossword dataset is cloned to, and where it is cloned from.
# dataset_name matches the repository name at the end of dataset_url.
dataset_dir = 'data'
dataset_name = 'nyt_crosswords'
dataset_url = 'https://github.com/doshea/nyt_crosswords'
dataset_path = os.path.join(dataset_dir, dataset_name)

# Output files for the extracted answers and their clues. Both live directly
# in dataset_dir, next to (not inside) the cloned repository.
words_file_path = os.path.join(dataset_dir, 'words')
clues_file_path = os.path.join(dataset_dir, 'clues')

# Inclusive bounds on the answer length kept during parsing.
max_word_length = 12
min_word_length = 4

# exist_ok makes this a no-op when the directory is already there and avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs(dataset_dir, exist_ok=True)

# Clone only when no checkout is present; an existing one is reused as-is.
if not os.path.exists(dataset_path):
    git.Git(dataset_dir).clone(dataset_url)

# Parsing

In [8]:
import json
import re

def read_file(path):
    """Load and decode the JSON document stored at ``path``.

    Args:
        path: Filesystem path of the JSON file to read.

    Returns:
        The decoded JSON value, or ``None`` when the file cannot be opened,
        is not valid UTF-8, or does not contain valid JSON.
    """
    try:
        # The context manager closes the handle even when decoding fails,
        # which a manual close() after json.loads() would skip.
        with open(path, 'r', encoding='utf-8') as json_file:
            return json.load(json_file)
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return None

def data_is_valid(data):
    """Tell whether a decoded puzzle can be paired up answer-by-clue.

    A puzzle is usable only when, for each direction separately, it has
    exactly as many answers as clues. Comparing just the combined totals is
    not enough: one extra "across" answer and one extra "down" clue cancel
    out in the totals but shift every pairing after them.

    Args:
        data: The decoded JSON value of one puzzle file, or ``None``.

    Returns:
        ``True`` when ``data`` is a dict whose ``answers`` and ``clues``
        entries both have ``across`` and ``down`` sequences of matching
        length; ``False`` otherwise, including for ``None`` and for JSON
        that lacks this structure.
    """
    if not isinstance(data, dict):
        return False

    try:
        answers = data['answers']
        clues = data['clues']
        return all(
            len(answers[direction]) == len(clues[direction])
            for direction in ('across', 'down')
        )
    except (KeyError, TypeError):
        # Missing keys or non-container values: not a puzzle we can parse.
        return False

def parse_data(data):
    """Append one puzzle's answers and clues to the output files.

    The across and down answers are concatenated, as are the clues, and the
    two lists are paired positionally. For every answer whose length lies
    within ``min_word_length``..``max_word_length`` (inclusive), the answer
    is appended as one line to ``words_file_path`` and its clue, with any
    leading ``"<number>. "`` prefix removed, as one line to
    ``clues_file_path``.

    Args:
        data: A decoded puzzle dict with ``answers`` and ``clues`` entries,
            each holding ``across`` and ``down`` lists. Callers are expected
            to have checked it with ``data_is_valid`` first.
    """
    answers = data['answers']['across'] + data['answers']['down']
    clues = data['clues']['across'] + data['clues']['down']

    # Context managers guarantee both files are closed even if a write or
    # the clue clean-up raises part-way through the puzzle.
    with open(words_file_path, 'a') as words_file, \
            open(clues_file_path, 'a') as clues_file:
        for word, raw_clue in zip(answers, clues):
            if not min_word_length <= len(word) <= max_word_length:
                continue

            # Strip the clue number, e.g. "12. Some clue" -> "Some clue".
            clue = re.sub(r'^[0-9]+\.\s', '', raw_clue)
            words_file.write(word + '\n')
            clues_file.write(clue + '\n')

# parse_data opens both output files in append mode, so discard anything left
# over from a previous run before walking the dataset.
for stale_path in (words_file_path, clues_file_path):
    if os.path.exists(stale_path):
        os.remove(stale_path)

# n counts every .json file encountered, whether or not it turns out usable.
n = 0
for root, dirs, files in os.walk(dataset_path, topdown=False):
    for name in files:
        if not name.endswith('.json'):
            continue

        path = os.path.join(root, name)
        data = read_file(path)

        # Progress report every 1000 files, printed before this one is counted.
        if not n % 1000:
            print(n)
        n += 1

        # Unreadable or inconsistent puzzles are skipped silently.
        if not data_is_valid(data):
            continue
        parse_data(data)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000


# Remove original data

In [3]:
import shutil

# Delete the cloned repository; the extracted words/clues files sit beside it
# in dataset_dir and are left alone. The guard makes re-running this cell a
# no-op instead of a FileNotFoundError once the checkout is already gone.
if os.path.isdir(dataset_path):
    shutil.rmtree(dataset_path)