In [1]:
import nltk
from nltk.corpus import brown
from nltk.tokenize import RegexpTokenizer
from collections import Counter
import pandas as pd

# Fetch the NLTK data packages used by this notebook
nltk.download('brown')
nltk.download('punkt')

# Prepare the token stream:
#   1. join the Brown corpus words into one space-separated string,
#   2. re-tokenize it keeping only runs of word characters (punctuation is dropped),
#   3. lowercase every resulting token.
tokenizer = RegexpTokenizer(r'\w+')
text = brown.words()
cleaned_text = list(map(str.lower, tokenizer.tokenize(" ".join(text))))

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\administrateur\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\administrateur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def get_ngrams_freq(tokens, n):
    """Count every contiguous n-gram in ``tokens`` and return the counts as a table.

    Parameters
    ----------
    tokens : list of str
        Token sequence. It is sliced (``tokens[i:]``), so it must support slicing.
    n : int
        Number of tokens per n-gram. If ``n`` is below 1 or larger than
        ``len(tokens)``, no n-gram exists and the returned frame is empty.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``'ngram'`` (the tokens joined by a single space) and
        ``'frequency'``. Rows are ordered by descending frequency with a fresh
        0..N-1 index; n-grams with equal frequency keep the order in which
        they first occur in ``tokens``.
    """
    # Zipping n progressively shifted slices yields one tuple per n-gram;
    # zip stops at the shortest slice, so incomplete trailing windows are dropped.
    windows = zip(*[tokens[i:] for i in range(n)])
    freq = Counter(' '.join(window) for window in windows)
    # most_common() sorts stably (ties keep first-seen order), which makes the
    # row order deterministic and removes the need for a separate DataFrame sort.
    return pd.DataFrame(freq.most_common(), columns=['ngram', 'frequency'])

In [5]:
# Trigram frequency table for the cleaned corpus; print the five most frequent rows
ngrams_df = get_ngrams_freq(tokens=cleaned_text, n=3)
print(ngrams_df.head(5))


               ngram  frequency
0         one of the        404
1  the united states        340
2         as well as        238
3        some of the        179
4         out of the        176


In [3]:
def predict_next_words(tokens, input_seq, n=3, k=5):
    """Suggest the most frequent continuations of ``input_seq`` using n-gram counts.

    The last ``n - 1`` whitespace-separated words of ``input_seq`` (lowercased)
    form the context; every n-gram in ``tokens`` that starts with this context
    contributes its final token as a candidate next word.

    Parameters
    ----------
    tokens : list of str
        Token sequence to draw n-grams from. It is sliced (``tokens[i:]``),
        so it must support slicing. The context is lowercased before
        matching, so ``tokens`` is expected to be lowercase already.
    input_seq : str
        Text typed so far. Only ``str.lower`` and ``str.split`` are applied to
        it; punctuation is NOT stripped.
    n : int, default 3
        N-gram order. ``n == 1`` uses an empty context, i.e. it returns the
        most frequent tokens overall.
    k : int, default 5
        Maximum number of suggestions to return.

    Returns
    -------
    list of (str, int)
        Up to ``k`` ``(next_word, count)`` pairs, most frequent first. Empty
        when ``n < 1``, when ``input_seq`` has fewer than ``n - 1`` words, or
        when the context never occurs in ``tokens``.
    """
    if n < 1:
        return []

    context_len = n - 1
    words = input_seq.lower().split()
    if len(words) < context_len:
        # Not enough words to form an (n-1)-word context: nothing can match.
        return []

    # words[-0:] would be the WHOLE list, so the unigram case needs an explicit
    # empty context instead of the negative slice.
    context = tuple(words[-context_len:]) if context_len else ()

    # zip yields tuples, so the context can be compared directly without
    # joining each n-gram into a string and splitting it again afterwards.
    ngrams = zip(*[tokens[i:] for i in range(n)])
    followers = Counter(ng[-1] for ng in ngrams if ng[:-1] == context)
    return followers.most_common(k)


In [4]:
# Top five trigram-based continuations of "the united"
predict_next_words(tokens=cleaned_text, input_seq="the united", n=3, k=5)

[('states', 340), ('nations', 44), ('irish', 1), ('arab', 1), ('steel', 1)]