# In [None]:
import pandas as pd
import numpy as np
import spacy
import re
from typing import List, Dict, Tuple, Set, Union

# Load the spaCy "en_core_web_sm" pipeline once at import time; tokenize() reuses this object.
nlp = spacy.load("en_core_web_sm")
def tokenize(text:str)->List[str]:
    """
    Tokenize text using spacy.

    The text is lowercased, every character that is not an ASCII letter or
    digit is replaced by a space, and digits are then deleted outright
    (so "a1b" becomes "ab"). The cleaned string is split by the spaCy
    tokenizer.

    Args:
        text: Raw input string.

    Returns:
        The token strings in document order, with whitespace-only tokens
        omitted.
    """
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    # Digits are removed without inserting a space, so "a1b" collapses to "ab".
    text = re.sub(r"[0-9]", "", text)
    # Only token boundaries are needed here: make_doc runs the tokenizer
    # alone instead of every component of the loaded pipeline.
    doc = nlp.make_doc(text)
    # Replaced punctuation leaves runs of spaces, which spaCy emits as
    # whitespace tokens; they carry no content and would pollute the vocabulary.
    return [token.text for token in doc if not token.is_space]

def create_tokenmap(tokens:List[str])->Dict[str, int]:
    """
    Count how many times each token occurs in the given tokens.

    Args:
        tokens: Token strings, possibly with repeats.

    Returns:
        A dict mapping each distinct token to its number of occurrences,
        keyed in order of first appearance.
    """
    counts = {}
    for tok in tokens:
        # A token seen for the first time starts from zero.
        counts[tok] = counts.get(tok, 0) + 1
    return counts

def assemble_token_maps(
    token_maps: List[Dict[str, int]]
    ) -> Dict[str, Dict[Union[str, int], Union[str, int]]]:
    """
    Assemble all token maps into one.

    The per-document counts are summed into a single frequency table, from
    which three lookup tables are built. Indices 0 and 1 are reserved for
    the special tokens "<PAD>" and "<UNK>"; real tokens are numbered from 2
    in order of first appearance.

    Args:
        token_maps: Token -> count dicts, e.g. one per document.

    Returns:
        A dict with three entries:
          "tok_to_idx": token -> index
          "idx_to_tok": index -> token
          "idx_to_cnt": index -> total count over all token maps
    """
    totals: Dict[str, int] = {}
    for token_map in token_maps:
        for token, count in token_map.items():
            totals[token] = totals.get(token, 0) + count

    tok_to_idx = {"<PAD>": 0, "<UNK>": 1}
    idx_to_tok = {0: "<PAD>", 1: "<UNK>"}
    idx_to_cnt = {0: 0, 1: 0}
    for token, count in totals.items():
        idx = tok_to_idx.get(token)
        if idx is None:
            # Next free index; starts at 2 so the reserved slots for
            # <PAD>/<UNK> are never overwritten by real tokens.
            idx = len(tok_to_idx)
            tok_to_idx[token] = idx
            idx_to_tok[idx] = token
            idx_to_cnt[idx] = 0
        # A token spelled exactly like a special token keeps the reserved
        # index, so the token <-> index mapping stays one-to-one.
        idx_to_cnt[idx] += count

    return {
        "tok_to_idx": tok_to_idx,
        "idx_to_tok": idx_to_tok,
        "idx_to_cnt": idx_to_cnt,
    }
    
def preprocess_text(
    text: Union[str, List[str]],
    tokenmap: Dict[str, Dict[Union[str, int], Union[str, int]]]
    ) -> List[int]:
    """
    tokenize text and replace words with indices.

    Args:
        text: Either a raw string, which is first split with tokenize(), or
            an already tokenized sequence of token strings, which is used
            as given.
        tokenmap: Vocabulary dict whose "tok_to_idx" entry maps token
            strings to integer indices.

    Returns:
        One index per token. Tokens missing from the vocabulary map to the
        index stored for "<UNK>" (1 if the vocabulary has no such entry).
    """
    tok_to_idx = tokenmap["tok_to_idx"]
    unk_idx = tok_to_idx.get("<UNK>", 1)
    # Iterating a raw string would look up single characters, so split it
    # into tokens first; pre-tokenized input is passed through untouched.
    tokens = tokenize(text) if isinstance(text, str) else text
    return [tok_to_idx.get(token, unk_idx) for token in tokens]