In [3]:
#!pip install nltk
# corpus "brown"
import nltk
nltk.download('brown')
nltk.download('punkt')  #  tokenisation
from nltk.corpus import brown
from nltk import word_tokenize
import string

# Récupération des mots du corpus
tokenized_text = brown.words()

# On convertit en minuscules et on enlève la ponctuation
cleaned_tokens = [word.lower() for word in tokenized_text if word.isalpha()]

# Affichage des 20 premiers mots nettoyés
print(cleaned_tokens[:20])


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\admin-27619\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin-27619\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', 'recent', 'primary', 'election', 'produced', 'no', 'evidence', 'that', 'any', 'irregularities', 'took']


In [5]:
import pandas as pd
from collections import Counter
from nltk.util import ngrams

def get_ngrams_frequencies(tokens, n):
   
    # DataFrame avec les n-grams et leur fréquence d'apparition.
 
    n_grams = list(ngrams(tokens, n))
    freq_dist = Counter(n_grams)
    df = pd.DataFrame(freq_dist.items(), columns=['ngram', 'frequency'])
    df = df.sort_values(by='frequency', ascending=False).reset_index(drop=True)
    return df


In [6]:
# Exemple avec des bigrams
bigrams_df = get_ngrams_frequencies(cleaned_tokens, n=2)
print(bigrams_df.head(10))

         ngram  frequency
0    (of, the)       9774
1    (in, the)       6156
2    (to, the)       3525
3    (on, the)       2491
4   (and, the)       2307
5   (for, the)       1862
6     (to, be)       1718
7    (at, the)       1677
8  (with, the)       1545
9      (of, a)       1502


In [7]:
def predict_next_words(ngram_df, input_seq, n=2, k=5):
    #Prédit les k mots les plus probables suivant une séquence de (n-1) mots.
    # Filtrer les n-grams qui commencent par la séquence donnée
    candidates = ngram_df[ngram_df['ngram'].apply(lambda x: x[:n-1] == input_seq)]
    # Trier par fréquence
    candidates = candidates.sort_values(by='frequency', ascending=False)
    # Retourner les k premiers mots suivants
    return [ngram[-1] for ngram in candidates['ngram'].head(k)]


In [8]:
predict_next_words(bigrams_df, input_seq=("in",), n=2, k=5)

['the', 'a', 'his', 'this', 'which']

In [9]:
predict_next_words(bigrams_df, input_seq=("to",), n=2, k=3)

['the', 'be', 'a']

In [10]:
trigrams_df = get_ngrams_frequencies(cleaned_tokens, n=3)
print(trigrams_df.head(10))

                   ngram  frequency
0         (one, of, the)        404
1  (the, united, states)        337
2         (as, well, as)        238
3        (some, of, the)        179
4         (out, of, the)        174
5      (the, fact, that)        167
6         (the, end, of)        149
7        (part, of, the)        146
8           (it, was, a)        144
9        (there, was, a)        142


In [11]:
predict_next_words(trigrams_df, input_seq=("one", "of"), n=3, k=5)
predict_next_words(trigrams_df, input_seq=("the", "united"), n=3, k=5)
predict_next_words(trigrams_df, input_seq=("it", "was"), n=3, k=5)
predict_next_words(trigrams_df, input_seq=("as", "well"), n=3, k=5)

['as', 'for', 'to', 'the', 'that']