# Clean Data

We will use this file to quickly clean up the raw data that we have downloaded.

## Smogon

First, we have the Smogon data: Pokedex & Formats.

In [None]:
from pathlib import Path
import json

# Raw Smogon Pokedex export. The path is relative: it goes up one directory
# (past repo/notebooks) and then down into data/raw/smogon.
SOURCE_FILE = Path("../data/raw/smogon/pokedex.ts")
# The cleaned output is written to the current directory for now; it is meant
# to be moved into the processed data folder afterwards.
DEST_FILE = Path("pokedex.json")

def check_for_number_id(line):
    '''Wrap the value of a ``"num"`` entry in double quotes.

    A line that does not contain the text ``"num":`` is returned untouched.
    Otherwise the line is cut at its first colon: the left side (stripped)
    is kept as the key, the right side loses surrounding whitespace and any
    trailing commas, and the pair is re-emitted as ``key: "value",``.
    '''
    if '"num":' not in line:
        return line

    label, _, raw_value = line.partition(':')
    number_text = raw_value.strip().rstrip(",")
    return f'{label.strip()}: "{number_text}",'

def rebuild_line_from_tokens(tokens, whitespace=''):
    '''For testing: reassemble a line from quote-annotated tokens.

    Every token is printed as it is visited. A token that is flagged as
    outside a quote and contains a colon is rewritten so that the stripped
    text before its first colon is wrapped in double quotes, followed by the
    colon and the stripped remainder. All other tokens are kept as they are.
    The pieces are joined with single spaces and prefixed with ``whitespace``.
    '''
    pieces = []
    for token in tokens:
        print(token)
        text = token['part']
        # Quoted text, or text without a colon, cannot be a bare key.
        if token['in_quote'] or ':' not in text:
            pieces.append(text)
            continue
        name, _, remainder = text.partition(':')
        pieces.append(f'"{name.strip()}":{remainder.strip()}')
    return whitespace + ' '.join(pieces)


def split_tokens_with_quote_state(line):
    '''Split a line on whitespace and flag which pieces belong to a quoted string.

    The line is split by whitespace into parts. While walking the parts we
    track whether we are inside a double-quoted string, so that a later step
    can tell a bare key such as ``name:`` apart from a colon that merely
    appears inside a string value such as ``"Type: Null"``.

    Brackets in front of an opening quote (``["Grass",``) and punctuation
    after a closing quote (``"Poison"],`` or ``"Chlorophyll"},``) are ignored
    when looking for the quote characters, so strings inside lists and
    objects open and close the quote state correctly.

    Args:
        line: A single line of text.

    Returns:
        A list with one dict per whitespace-separated part, in order. Each
        dict has ``'part'`` (the part, unchanged) and ``'in_quote'`` (True
        when the part is, or lies within, a double-quoted string).
    '''
    opening_brackets = '[{('
    trailing_punctuation = ',;]})'
    output = []
    in_quote = False

    for part in line.split():
        # Look at the part without surrounding list/object punctuation so
        # that '["x",' and '"x"],' are recognised as quoted text.
        core = part.lstrip(opening_brackets).rstrip(trailing_punctuation)
        starts_with = core.startswith('"')
        ends_with = core.endswith('"')

        if core == '"':
            # A lone quote character either opens or closes a string whose
            # text is separated from the quote by whitespace.
            output.append({'in_quote': True, 'part': part})
            in_quote = not in_quote
            continue

        if starts_with and ends_with:
            # A complete string in a single part; the state is unchanged.
            output.append({'in_quote': True, 'part': part})
            continue

        if starts_with:
            in_quote = True

        output.append({'in_quote': in_quote, 'part': part})

        if ends_with:
            in_quote = False

    return output

In [None]:
# Convert the TypeScript source into JSON-like text, one line at a time.
with open(SOURCE_FILE, 'r', encoding='utf-8') as f:
    original_lines = f.readlines()

# The opening brace stands in for the skipped first (export) line.
output_lines = ['{']

for source_line in original_lines[1:]:
    try:
        # Keep only the text before the first '//' to drop trailing comments.
        # NOTE(review): this also cuts a line whose quoted value contains
        # '//' - confirm the source has no such values.
        code_text = source_line.partition('//')[0]
        indent = code_text[:len(code_text) - len(code_text.lstrip())]
        stripped = code_text.strip()
        if not stripped:
            continue

        pieces = []
        for token in split_tokens_with_quote_state(stripped):
            text = token['part']
            # Only a colon outside of a quoted string marks a bare key.
            if not token['in_quote'] and ':' in text:
                name, _, remainder = text.partition(':')
                text = f'"{name.strip()}":{remainder}'
            pieces.append(text)

        # Re-apply the original indentation after the "num" fix-up.
        output_lines.append(indent + check_for_number_id(' '.join(pieces)))

    except Exception as e:
        # Report the problem and stop; lines converted so far are still written.
        print(f"Error: {e}")
        break

# NOTE(review): every non-empty source line after the first is copied through,
# including whatever line closes the exported object - confirm that the extra
# '}' appended here is intended.
output_lines.append('}')

# Write the converted lines to the destination file.
with open(DEST_FILE, 'w', encoding='utf-8') as f:
    f.write('\n'.join(output_lines))
