In [1]:
# Import libraries
import pickle
import random
from pathlib import Path

import pandas as pd
from faker import Faker

# Initialize Faker
fake = Faker()

# Seed every source of randomness used below (Faker's generators and the
# stdlib shuffle) so that Restart & Run All reproduces the same dataset.
SEED = 42
Faker.seed(SEED)
random.seed(SEED)

# Make sure the data directory exists before reading from / writing to it.
# Without this, the fallback path (CSV not downloaded yet) would fail again
# at to_csv() because the directory itself is missing.
DATA_DIR = Path('data')
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Load US Baby Names
try:
    names_df = pd.read_csv(DATA_DIR / 'NationalNames.csv')
    # Drop missing values so every entry is a string; downstream code calls
    # len() and .lower() on each item.
    names = names_df['Name'].dropna().unique().tolist()[:5000]  # Limit for simplicity
except FileNotFoundError:
    print("Download 'NationalNames.csv' from https://www.kaggle.com/datasets/kaggle/us-baby-names")
    names = [fake.name() for _ in range(5000)]  # Fallback

# Generate emails
emails = [fake.email() for _ in range(5000)]

# Combine and save dataset
dataset = names + emails
random.shuffle(dataset)
df = pd.DataFrame(dataset, columns=['text'])
df.to_csv(DATA_DIR / 'autocomplete_dataset.csv', index=False)
print(f"Dataset size: {len(dataset)}")
df.head()


Dataset size: 10000


Unnamed: 0,text
0,Ovie
1,Anabel
2,reyesmario@example.com
3,Suzie
4,suzanne80@example.org


In [2]:
# EDA: Text length distribution
import matplotlib.pyplot as plt

# Character count of every entry, computed with the vectorized string accessor.
df['length'] = df['text'].str.len()
print(df.describe())

# Save length distribution plot
# savefig() does not create missing directories, so create the target first.
DOCS_DIR = Path('docs')
DOCS_DIR.mkdir(parents=True, exist_ok=True)

fig, ax = plt.subplots()
ax.hist(df['length'], bins=30)
ax.set_title('Text Length Distribution')
ax.set_xlabel('Length')
ax.set_ylabel('Frequency')
fig.savefig(DOCS_DIR / 'length_distribution.png')
# Close the figure explicitly so it is not rendered inline and its memory is released.
plt.close(fig)


             length
count  10000.000000
mean      13.844800
std        8.317802
min        2.000000
25%        6.000000
50%       13.500000
75%       21.000000
max       33.000000


In [3]:
# Train trie model
class Trie:
    """Case-insensitive prefix tree built from nested dicts.

    Every node is a dict that maps a single character to a child node. A node
    that terminates an inserted word additionally carries the key ``self.end``.
    All text is lower-cased on insert and on lookup, so results are returned
    in lower case.
    """

    def __init__(self):
        self.root = {}
        # The end-of-word marker must never be equal to a real character key.
        # Character keys are always exactly one character long, so any
        # multi-character string is collision-free. (A single '*' would clash
        # with inputs that contain a literal asterisk.)
        self.end = '__end__'

    def insert(self, word):
        """Add ``word`` (lower-cased) to the trie.

        Args:
            word: The string to store.
        """
        node = self.root
        for char in word.lower():
            # setdefault creates the child only when it does not exist yet.
            node = node.setdefault(char, {})
        node[self.end] = True

    def search_prefix(self, prefix, limit=None):
        """Return the stored words that start with ``prefix``.

        Args:
            prefix: Prefix to look up; matching is case-insensitive.
            limit: Optional maximum number of words to return. ``None``
                (the default) returns every match.

        Returns:
            A list of lower-cased words in depth-first order, where children
            are visited in the order they were first inserted. Empty when no
            stored word starts with ``prefix``.
        """
        lowered = prefix.lower()
        node = self.root
        for char in lowered:
            if char not in node:
                return []
            node = node[char]
        return self._collect_words(node, lowered, limit)

    def _collect_words(self, node, prefix, limit=None):
        """Collect all words in the subtree rooted at ``node``.

        Args:
            node: Subtree root (a node dict).
            prefix: The text that leads from the trie root to ``node``.
            limit: Optional cap on the number of collected words.

        Returns:
            List of complete words found below ``node`` (pre-order).
        """
        words = []
        if limit is not None and limit <= 0:
            return words

        # Explicit stack instead of recursion so that very long entries
        # cannot exhaust the interpreter's recursion limit.
        stack = [(node, prefix)]
        while stack:
            current, text = stack.pop()
            if self.end in current:
                words.append(text)
                if limit is not None and len(words) >= limit:
                    break
            children = [
                (child, text + char)
                for char, child in current.items()
                if char != self.end
            ]
            # Push in reverse so the first-inserted child is popped first,
            # which keeps the same ordering a recursive pre-order walk gives.
            stack.extend(reversed(children))
        return words

# Train and save model
trie = Trie()
for text in dataset:
    trie.insert(text)

# open(..., 'wb') does not create missing parent directories, so make sure
# the output folder exists before writing.
MODEL_DIR = Path('models')
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# NOTE(review): pickle stores instances by reference to their class, so
# whoever loads this file needs a Trie class importable under the same
# module path it had here. Only unpickle files from a trusted source.
with open(MODEL_DIR / 'trie_model.pkl', 'wb') as f:
    pickle.dump(trie, f)

# Test model
print(trie.search_prefix('jo')[:5])

['jo', 'joan', 'joann', 'joann99@example.org', 'joanne']
