In [1]:
class TrieNode:
    """A single node of the trie.

    Attributes:
        children: dict mapping one character to the child ``TrieNode``
            reached by following that character.
        is_end_of_word: True when the path from the root to this node
            spells a complete inserted word.
    """

    def __init__(self):
        self.children = {}
        self.is_end_of_word = False


class Trie:
    """Character trie supporting insertion, exact lookup and prefix completion."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, word):
        """Add ``word`` to the trie, creating any missing nodes along its path."""
        node = self.root
        for char in word:
            if char not in node.children:
                node.children[char] = TrieNode()
            node = node.children[char]
        node.is_end_of_word = True

    def _find_node(self, prefix):
        """Return the node reached by following ``prefix``, or None if absent."""
        node = self.root
        for char in prefix:
            node = node.children.get(char)
            if node is None:
                return None
        return node

    def search(self, word):
        """Return True only if ``word`` itself was inserted (not merely a prefix)."""
        node = self._find_node(word)
        return node is not None and node.is_end_of_word

    def suggestions(self, prefix):
        """Return every inserted word that starts with ``prefix``.

        Returns an empty list when no inserted word has that prefix. Words
        are listed depth-first, siblings in the order they were first inserted.
        """
        node = self._find_node(prefix)
        if node is None:
            return []
        return self._collect(node, prefix)

    def _collect(self, node, prefix):
        """Gather all words in the subtree rooted at ``node``.

        ``prefix`` is the string spelled by the path from the root to ``node``.
        An explicit stack is used instead of recursion so that very long words
        cannot exceed the interpreter's recursion limit.
        """
        results = []
        stack = [(node, prefix)]
        while stack:
            current, word = stack.pop()
            if current.is_end_of_word:
                results.append(word)
            # Push children in reverse so they are popped (visited) in
            # insertion order, matching a recursive pre-order traversal.
            for char, child in reversed(list(current.children.items())):
                stack.append((child, word + char))
        return results

In [4]:
# Sample queries used to demonstrate prefix completion
example_queries = ["apple", "apple pie", "banana", "cherry", "cherry pie"]

# Build a trie containing every sample query
example_trie = Trie()
for example_query in example_queries:
    example_trie.insert(example_query)

# Ask the trie for all queries starting with "a" and show them
example_prefix = "a"
example_suggestions = example_trie.suggestions(example_prefix)
print(f"Suggestions for '{example_prefix}': {example_suggestions}")

Suggestions for 'a': ['apple', 'apple pie']


In [45]:
import pandas as pd
from gensim.utils import tokenize

# Load the merged genre dataset into a DataFrame.
# NOTE(review): the path is absolute and hard-coded (looks like a Colab
# `/content` location — confirm); the notebook will not run elsewhere unchanged.
df = pd.read_csv('/content/merged_genre.csv')

# Define the tokenizer function
def tokenize_text(text):
    """Lower-case ``text`` and split it into a list of tokens.

    Tokenization is delegated to ``gensim.utils.tokenize``. Values that are
    not text (for example the float NaN pandas uses for a missing cell, or
    None) yield an empty list instead of raising ``AttributeError``.
    """
    if not isinstance(text, (str, bytes)):
        # Missing / non-text cell: nothing to tokenize.
        return []
    return list(tokenize(text.lower()))

# Tokenize both free-text columns: 'description' -> 'tokens', 'title' -> 'titletokens'
for source_column, token_column in (('description', 'tokens'), ('title', 'titletokens')):
    df[token_column] = df[source_column].apply(tokenize_text)

# Preview the first rows of the dataframe, now including the token columns
print(df.head())

   Unnamed: 0     type                                    title  \
0           0    Movie  Norm of the North: King Sized Adventure   
1           1    Movie               Jandino: Whatever it Takes   
2           2  TV Show                       Transformers Prime   
3           3  TV Show         Transformers: Robots in Disguise   
4           4    Movie                             #realityhigh   

                   director  \
0  Richard Finn, Tim Maltby   
1                       NaN   
2                       NaN   
3                       NaN   
4          Fernando Lebrija   

                                                cast  \
0  Alan Marriott, Andrew Toth, Brian Dobson, Cole...   
1                                   Jandino Asporaat   
2  Peter Cullen, Sumalee Montano, Frank Welker, J...   
3  Will Friedle, Darren Criss, Constance Zimmer, ...   
4  Nesta Cooper, Kate Walsh, John Michael Higgins...   

                                    country         date_added  release_y

In [46]:
# Maps each token to its weighted occurrence count; filled in by the loop
# over the dataframe rows in the following cell.
word_freq = {}

In [47]:
# Weight added to a word's score per occurrence. Title occurrences count far
# more than description occurrences, so title words rank first when
# suggestions are sorted by score.
DESCRIPTION_WEIGHT = 1
TITLE_WEIGHT = 1000

# Start from empty containers so that re-running this cell rebuilds the
# scores from scratch instead of adding to those of a previous run.
word_freq = {}
words_for_trie = set()

# Iterate the two token columns directly; this avoids the per-row Series
# construction of DataFrame.iterrows() while visiting rows in the same order.
for description_tokens, title_tokens in zip(df['tokens'], df['titletokens']):
  for tokens, weight in ((description_tokens, DESCRIPTION_WEIGHT), (title_tokens, TITLE_WEIGHT)):
    for w in tokens:
      # add in frequency map
      word_freq[w] = word_freq.get(w, 0) + weight
      # add in trie
      words_for_trie.add(w)

In [49]:
# Number of distinct tokens collected for insertion into the trie
print(len(words_for_trie))

22448


In [56]:
# Index every collected word in a fresh trie
trie = Trie()
for w in words_for_trie:
    trie.insert(w)

# Look up completions for the prefix; it is lower-cased because the
# tokens were lower-cased before being collected.
prefix = "Trans"
suggestions = trie.suggestions(prefix.lower())

# Rank by weighted frequency, highest first, and keep at most the top 5.
sorted_suggestions = sorted(suggestions, key=word_freq.__getitem__, reverse=True)[:5]

print(f"Suggestions for '{prefix}': {sorted_suggestions}")

Suggestions for 'Trans': ['transformers', 'transylvania', 'transformed', 'transfer', 'transfers']


In [58]:
# Show the weighted scores behind the five suggestions printed above
print(word_freq['transformers'], word_freq['transylvania'], word_freq['transformed'], word_freq['transfer'], word_freq['transfers'])

11002 3003 1015 1011 1006
