In [1]:
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from collections import Counter

In [None]:
# Download the tokenizer data that word_tokenize() depends on.
# Older NLTK releases load the pickled 'punkt' models, while newer releases
# look up 'punkt_tab' instead; fetching both keeps this notebook runnable on
# a fresh kernel regardless of the installed NLTK version. On releases whose
# index has no 'punkt_tab' entry, that call just reports the miss and
# returns False.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
def generate_ngrams(text=None, top_n=10):
    """Tokenize text, build 1-, 2- and 3-grams, and print frequency summaries.

    The text is lowercased and tokenized with ``word_tokenize``; only purely
    alphabetic tokens are kept. For each n-gram size the total count and the
    ``top_n`` most frequent n-grams are printed to stdout.

    Args:
        text: String to analyse. When ``None`` (the default), a built-in
            three-sentence sample about natural language processing is used.
        top_n: Number of most frequent n-grams to print in each section.

    Returns:
        A tuple ``(unigrams, bigrams, trigrams)``. Each element is a list of
        token tuples in order of occurrence (not de-duplicated).
    """
    if text is None:
        text = """
    Natural language processing is a subfield of linguistics and artificial intelligence. 
    It focuses on the interaction between computers and human language.
    NLP techniques are used in many applications like chatbots and translation.
    """
    # Lowercase before tokenizing so counts are case-insensitive, then keep
    # only alphabetic tokens, which drops punctuation and numbers.
    tokens = [token for token in word_tokenize(text.lower()) if token.isalpha()]
    print("Tokens:", tokens)

    results = []
    for label, size in (("unigrams", 1), ("bigrams", 2), ("trigrams", 3)):
        # Materialise the generator: the list is both counted and returned.
        grams = list(ngrams(tokens, size))
        results.append(grams)

        print(f"\n--- {label.capitalize()} ---")
        print(f"Total {label}: {len(grams)}")
        print(f"Top {top_n} {label}:")
        # ' '.join also handles 1-tuples, so every size shares one format.
        for gram, count in Counter(grams).most_common(top_n):
            print(f"  {' '.join(gram)}: {count}")

    unigrams, bigrams, trigrams = results
    return unigrams, bigrams, trigrams

# Run the demo only when this code is executed directly (as a script or a
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    generate_ngrams()

Tokens: ['natural', 'language', 'processing', 'is', 'a', 'subfield', 'of', 'linguistics', 'and', 'artificial', 'intelligence', 'it', 'focuses', 'on', 'the', 'interaction', 'between', 'computers', 'and', 'human', 'language', 'nlp', 'techniques', 'are', 'used', 'in', 'many', 'applications', 'like', 'chatbots', 'and', 'translation']

--- Unigrams ---
Total unigrams: 32
Top 10 unigrams:
  and: 3
  language: 2
  natural: 1
  processing: 1
  is: 1
  a: 1
  subfield: 1
  of: 1
  linguistics: 1
  artificial: 1

--- Bigrams ---
Total bigrams: 31
Top 10 bigrams:
  natural language: 1
  language processing: 1
  processing is: 1
  is a: 1
  a subfield: 1
  subfield of: 1
  of linguistics: 1
  linguistics and: 1
  and artificial: 1
  artificial intelligence: 1

--- Trigrams ---
Total trigrams: 30
Top 10 trigrams:
  natural language processing: 1
  language processing is: 1
  processing is a: 1
  is a subfield: 1
  a subfield of: 1
  subfield of linguistics: 1
  of linguistics and: 1
  linguistics a