In [None]:
# NGRAM
from collections import Counter
from nltk.util import ngrams
from nltk.tokenize import word_tokenize

class NGram:
    """Accumulate counts of word-level n-grams across one or more texts.

    Both attributes are public: ``n`` is the n-gram size and ``ngrams`` is the
    ``Counter`` mapping each n-gram tuple to the number of times it was seen.
    """

    def __init__(self, n):
        """Create an empty model for n-grams of size ``n``.

        Args:
            n: Number of tokens per n-gram; must be at least 1.

        Raises:
            ValueError: If ``n`` is less than 1.
        """
        # Fail fast: a window of fewer than one token has no meaningful
        # n-grams, and the mistake would otherwise only show up later, when
        # n-grams are actually generated.
        if n < 1:
            raise ValueError(f"n must be a positive integer, got {n!r}")
        self.n = n
        self.ngrams = Counter()

    def generate_ngrams(self, text):
        """Tokenize ``text`` and add its n-grams to the running counts.

        Counts accumulate: calling this repeatedly merges the n-grams of every
        text passed so far instead of replacing the earlier ones.

        Args:
            text: The string to tokenize with ``word_tokenize``.
        """
        tokens = word_tokenize(text)
        self.ngrams.update(ngrams(tokens, self.n))

    def get_ngrams(self):
        """Return the ``Counter`` mapping n-gram tuples to their counts.

        The live internal counter is returned, not a copy.
        """
        return self.ngrams

# Example usage:
if __name__ == "__main__":
    import nltk

    # word_tokenize() needs the Punkt tokenizer data. Older NLTK releases load
    # it from the 'punkt' package, while newer ones (3.9+) look for
    # 'punkt_tab' instead, so fetch both to keep the example runnable on
    # either.
    for resource in ('punkt', 'punkt_tab'):
        nltk.download(resource)

    ngram_model = NGram(3)  # Create a trigram model
    text = "this is a simple test sentence for n-grams this is a test"
    ngram_model.generate_ngrams(text)

    # Print every distinct trigram together with how often it occurred.
    print("Generated N-Grams with Counts:")
    for ngram, count in ngram_model.get_ngrams().items():
        print(f"{ngram}: {count}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Generated N-Grams with Counts:
('this', 'is', 'a'): 2
('is', 'a', 'simple'): 1
('a', 'simple', 'test'): 1
('simple', 'test', 'sentence'): 1
('test', 'sentence', 'for'): 1
('sentence', 'for', 'n-grams'): 1
('for', 'n-grams', 'this'): 1
('n-grams', 'this', 'is'): 1
('is', 'a', 'test'): 1


In [None]:
# PAT
!pip install datrie
import datrie
import string

# Build a trie whose keys may only contain lowercase ASCII letters.
# NOTE: datrie wraps libdatrie, which is a double-array trie rather than a
# true PATRICIA (PAT) tree; it is used here to demonstrate the same
# prefix-search operations.
trie = datrie.Trie(string.ascii_lowercase)

# Populate the trie with example words, numbering them 1..5 in this order.
for value, word in enumerate(('hello', 'hell', 'he', 'hero', 'her'), start=1):
    trie[word] = value

# Prefix query: list every stored key that begins with 'he'.
prefix_matches = trie.keys('he')
print("Words starting with 'he':", prefix_matches)

# Exact-key lookup.
hello_value = trie['hello']
print("Value associated with 'hello':", hello_value)

# Delete one key, then repeat the prefix query to show it is gone.
del trie['hero']
remaining_matches = trie.keys('he')
print("After deleting 'hero', words starting with 'he':", remaining_matches)





# Inverted file Structure
from collections import defaultdict
from nltk.tokenize import word_tokenize

class InvertedIndex:
    """Map each lowercased token to the ids of the documents containing it."""

    def __init__(self):
        """Create an empty index."""
        # token -> list of document ids, in the order documents were indexed.
        self.index = defaultdict(list)

    def build_index(self, documents):
        """Add ``documents`` to the index, using each position as its id.

        Every document is lowercased and tokenized with ``word_tokenize``.
        A document id is recorded at most once per token, even when the token
        occurs several times in that document.

        Ids come from ``enumerate`` and therefore start at 0 on every call;
        calling this again with another collection reuses the same ids.

        Args:
            documents: Iterable of document strings.
        """
        for doc_id, text in enumerate(documents):
            tokens = word_tokenize(text.lower())
            # dict.fromkeys() drops repeated tokens while keeping first-seen
            # order, so each postings list gets this doc_id only once.
            for word in dict.fromkeys(tokens):
                self.index[word].append(doc_id)

    def get_index(self):
        """Return the token -> document-id-list mapping.

        The live internal ``defaultdict`` is returned, not a copy.
        """
        return self.index

# Example usage:
if __name__ == "__main__":
    import nltk

    # word_tokenize() needs the Punkt tokenizer data. Older NLTK releases load
    # it from the 'punkt' package, while newer ones (3.9+) look for
    # 'punkt_tab' instead, so fetch both to keep the example runnable on
    # either.
    for resource in ('punkt', 'punkt_tab'):
        nltk.download(resource)

    documents = [
        "Inverted index is useful in search engines",
        "Search engines use inverted index",
        "An index maps words to documents"
    ]

    inverted_index = InvertedIndex()
    inverted_index.build_index(documents)

    # Print each token followed by the ids of the documents that contain it.
    for word, doc_ids in inverted_index.get_index().items():
        print(f"{word}: {doc_ids}")


Collecting datrie
  Downloading datrie-0.8.2.tar.gz (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━[0m [32m51.2/63.3 kB[0m [31m1.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m61.4/63.3 kB[0m [31m870.8 kB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.3/63.3 kB[0m [31m665.8 kB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: datrie
  Building wheel for datrie (pyproject.toml) ... [?25l[?25hdone
  Created wheel for datrie: filename=datrie-0.8.2-cp310-cp310-linux_x86_64.whl si

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
