# Trie class for searching strings

A Trie, also known as a prefix tree, is a specialized tree used to store a set of strings (or an associative array keyed by strings). In a Trie for strings, each node is reached through a single character, so the path from the root to a node spells out a prefix of one or more stored strings. This structure is particularly efficient for string problems such as autocomplete systems, spell checkers, and prefix searches, because strings with a common prefix share the same path and can therefore be retrieved quickly.

Here's how a Trie works for strings:

- The root of the Trie represents the empty string.
- Each node holds a set of children, each labeled with a character that can follow the string represented by that node.
- The end of a word is marked with a special flag on the node where the complete word terminates.

In [None]:
from graphviz import Digraph

class TrieNode:
    """One node of the trie: outgoing character edges plus an end-of-word flag."""

    def __init__(self):
        # Stays False until a word is recorded as terminating at this node.
        self.is_end_of_word = False
        # Maps each next character to the child node reached through it.
        self.children = {}

class Trie:
    """Prefix tree over strings with word, prefix, suffix and substring queries.

    Every edge is labelled with one character, so the path from the root to a
    node spells a prefix. A node whose ``is_end_of_word`` flag is set marks the
    end of a stored word.
    """

    def __init__(self):
        self.root = TrieNode()

    def insert(self, word):
        """Store *word*, creating any nodes missing along its path."""
        node = self.root
        for char in word:
            if char not in node.children:
                node.children[char] = TrieNode()
            node = node.children[char]
        node.is_end_of_word = True

    def _find_node(self, text):
        """Return the node reached by spelling *text* from the root, or None."""
        node = self.root
        for char in text:
            node = node.children.get(char)
            if node is None:
                return None
        return node

    def search(self, word):
        """Return True only if *word* was stored as a complete word."""
        node = self._find_node(word)
        return node is not None and node.is_end_of_word

    def starts_with(self, prefix):
        """Return True if a path spelling *prefix* exists from the root."""
        return self._find_node(prefix) is not None

    def ends_with(self, suffix):
        """Return True if at least one stored word ends with *suffix*.

        The trie is indexed by prefix, so this enumerates the stored words.
        """
        return any(word.endswith(suffix) for word in self.iterate_words())

    def search_substring(self, substring):
        """Return True if *substring* occurs inside at least one stored word.

        Like ``ends_with``, this enumerates the stored words.
        """
        return any(substring in word for word in self.iterate_words())

    def find_endings(self, prefix):
        """Return every continuation that completes *prefix* into a stored word.

        The empty string is included when *prefix* itself is a stored word.
        An empty list is returned when no path spells *prefix*.
        """
        node = self._find_node(prefix)
        if node is None:
            return []  # Prefix not found
        # Restarting the accumulated text at "" yields only the endings.
        return list(self.iterate_words(node, ""))

    def iterate_words(self, node=None, prefix=""):
        """Yield each word stored at or below *node*, prepended with *prefix*.

        *node* defaults to the root. Words come out in depth-first pre-order,
        visiting children in insertion order. The walk uses an explicit stack
        instead of recursion, so word length is not bounded by the
        interpreter's recursion limit.
        """
        if node is None:
            node = self.root
        stack = [(node, prefix)]
        while stack:
            current, word = stack.pop()
            if current.is_end_of_word:
                yield word
            # Push in reverse so the first-inserted child is popped first.
            for char, child in reversed(list(current.children.items())):
                stack.append((child, word + char))

    def display(self, node=None, level=0):
        """Print the subtree under *node* as a text outline.

        Each character is printed on its own line, indented by one space per
        level; a line holding '⟶' marks a node where a word ends. *node*
        defaults to the root.
        """
        if node is None:
            node = self.root
        if node.is_end_of_word:
            print((' ' * level) + '⟶')
        for char, child in node.children.items():
            print((' ' * level) + char)
            self.display(child, level + 1)

    def draw_tree(self):
        """Build and return a graphviz ``Digraph`` picturing the trie."""
        dot = Digraph(comment='The Trie Structure')
        self._add_nodes(dot, self.root, "")
        return dot

    def _add_nodes(self, dot, node, current_prefix):
        """Add the children of *node*, recursively, to the graph *dot*.

        *current_prefix* is the string spelled from the root to *node*.
        Character nodes get the id ``"n" + prefix`` and end-of-word markers
        get ``"e" + prefix``. The differing first character keeps a marker
        from sharing an id with the node of another stored prefix.
        """
        node_id = "n" + current_prefix

        # Invisible root node to align children
        if current_prefix == "":
            dot.node(node_id, label="", style='invis')

        for char, child in node.children.items():
            child_prefix = current_prefix + char
            child_id = "n" + child_prefix
            dot.node(child_id, label=char)
            dot.edge(node_id, child_id)
            if child.is_end_of_word:
                end_id = "e" + child_prefix
                dot.node(end_id, label="⟶", shape="plaintext")
                dot.edge(child_id, end_id)
            self._add_nodes(dot, child, child_prefix)



In [None]:
# Example usage: build a small trie from five words.
trie = Trie()
for word in ("help", "helium", "hamster", "ham", "hamilton"):
    trie.insert(word)

# Print the trie as a text outline: one character per line, indented by
# depth, with '⟶' marking the end of a word.
trie.display()

In [None]:
# search() matches complete words only: "help" and "helium" print True,
# while "hell" prints False because it is not a complete word in the Trie.
print(*(trie.search(word) for word in ("help", "helium", "hell")), sep="\n")



In [None]:
# "he" prints True (there are words that start with "he");
# "hi" prints False (no word starts with "hi").
print(*(trie.starts_with(stem) for stem in ("he", "hi")), sep="\n")



In [None]:
# Expected results for the example words (help, helium, hamster, ham, hamilton).
print(trie.ends_with("llo"))         # Returns False, no stored word ends with "llo" ("hello" was never inserted)
print(trie.ends_with("ium"))         # Returns True, "helium" ends with "ium"
print(trie.ends_with("help"))        # Returns True, the stored word "help" ends with itself



In [None]:
# Expected results for the example words (help, helium, hamster, ham, hamilton).
print(trie.search_substring("ell"))  # Returns False, no stored word contains "ell" ("hello" was never inserted)
print(trie.search_substring("help")) # Returns True, "help" is contained in the stored word "help"



In [None]:
# To visualize the Trie: build the Graphviz graph and leave it as the cell's
# last expression (presumably so the notebook renders it inline).
trie_dot = trie.draw_tree()
trie_dot

# Optional alternative: render the graph to a PNG file and open it in a viewer.
# trie_dot.render('trie_graph', view=True, format='png')


In [None]:
# construct Trie from a text corpus
# https://www.kaggle.com/code/dmitryyemelyanov/sherlock-holmes-word-cloud-eda

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import glob
# from wordcloud import WordCloud,STOPWORDS
from PIL import Image

In [None]:
# Open the silhouette image with PIL; the path is relative, so the file must
# be in the working directory.
Image.open('sherlock-holmes-silhouette.png')

In [None]:
# Read the corpus into a list of lines. The context manager closes the file
# handle as soon as the lines are loaded, instead of leaving it open.
with open("fina.txt") as corpus_file:
    lines = corpus_file.readlines()
# Preview the first 30 lines.
lines[0:30]

In [None]:
import re

def extract_words(corpus):
    """Return the distinct lowercase words found in *corpus* as a list.

    *corpus* is an iterable of strings. Each string is lowercased and split
    into runs of word characters (letters, digits and underscore). The order
    of the returned list is unspecified because it comes from a set.
    """
    vocabulary = {
        token
        for text in corpus
        for token in re.findall(r'\b\w+\b', text.lower())
    }
    return list(vocabulary)

# Collect the vocabulary, then print its size followed by the words in
# sorted order, each on its own line of output.
unique_words = extract_words(lines)
print(len(unique_words), sorted(unique_words), sep="\n")


In [None]:
# Rebuild the trie from a fixed slice of the sorted vocabulary (indices 7
# up to, but not including, 108).
# NOTE(review): presumably the slice keeps the drawn graph small; confirm the bounds.
trie = Trie()
alphabetical = sorted(unique_words)
for word in alphabetical[7:108]:
    trie.insert(word)

In [None]:
# Build the Graphviz graph for the corpus trie and leave it as the cell's
# last expression (presumably so the notebook renders it inline).
trie_dot = trie.draw_tree()
trie_dot


In [None]:
# For each probe, print True if any word stored in the trie contains it
# and False otherwise; the results depend on the loaded corpus.
print(
    *(trie.search_substring(probe) for probe in ("alas", "about", "allowed", "allowing")),
    sep="\n",
)


Finding continuations for a word prefix

In [None]:
# List every continuation that turns the prefix into a word stored in the
# trie. An empty list means no stored word starts with the prefix.
prefix = "ab"
endings = trie.find_endings(prefix)
print(f"Possible endings for the prefix '{prefix}': {endings}")

