# Course 2 - N-grams

## Import librairies 

In [8]:
from collections import Counter, defaultdict
import math
from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd
import numpy as np

## Import dataset

In [37]:
df = pd.read_parquet("hf://datasets/data-is-better-together/10k_prompts_ranked/data/train-00000-of-00001.parquet")
train_test = df.prompt

percentage_train_test = int(train_test.shape[0]//2)
train_list = train_test[percentage_train_test:]
test_list = train_test[:percentage_train_test]
print(f"train_list shape : {train_list.shape}")
print(f"test_list shape : {test_list.shape}")

train_list shape : (5166,)
test_list shape : (5165,)


In [38]:
train_strings = " ".join(train_list)
train_strings = train_strings.lower()
train_tokens = word_tokenize(train_strings)
print("Number of tokens in the training set:",len(train_tokens))

vocab = set(train_tokens)
print("Vocabulary size:",len(vocab))

test_strings = " ".join(test_list)
test_strings = test_strings.lower()
test_tokens = word_tokenize(test_strings)
print("Number of tokens in the test set:",len(test_tokens))

Number of tokens in the training set: 537073
Vocabulary size: 33925
Number of tokens in the test set: 546946


## Train n-grams

In [39]:
def tokenize(text):
    """Tokenize the input text."""
    
    return word_tokenize(text)

def count_ngrams(tokens, n):
    """Counts n-grams."""
    
    ngrams = [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]
    
    return Counter(ngrams)

def calculate_ngram_probabilities(train_tokens, n, test_tokens, k=0.00001):
    """Calculates n-gram probabilities."""
    
    vocab = set(train_tokens)
    V = len(vocab)
    ngram_counts = count_ngrams(train_tokens, n)
    n_minus_one_gram_counts = count_ngrams(train_tokens, n-1)
    ngram_probabilities = defaultdict(float)
    
    for ngram in ngram_counts:
        prefix = ngram[:-1]
        ngram_counts[ngram] += k
        n_minus_one_gram_counts[prefix] += k
        ngram_probabilities[ngram] = (ngram_counts[ngram] + k) / (n_minus_one_gram_counts[prefix] + k*V)

    for i in range(len(test_tokens)-n+1):
        ngram = tuple(test_tokens[i:i+n])
        if ngram not in ngram_counts:
            ngram_counts[ngram] = k
            prefix = ngram[:-1]
            if prefix not in n_minus_one_gram_counts:
                n_minus_one_gram_counts[prefix] = k
            ngram_probabilities[ngram] = (ngram_counts[ngram] + k) / (n_minus_one_gram_counts[prefix] + k*V)
    
    return ngram_probabilities

## Example

In [40]:
n = 5

ngram_probabilities = calculate_ngram_probabilities(train_tokens, n, test_tokens)
print(f"Number of {n}-grams:",len(ngram_probabilities))

Number of 5-grams: 950410


In [41]:
import random

def predict_next_word(ngram_probabilities, context, vocab):
    """
    Prédit le mot suivant en fonction du contexte (n-1 mots).
    """
    context = tuple(context)
    candidates = {
        ngram[-1]: prob
        for ngram, prob in ngram_probabilities.items()
        if ngram[:-1] == context
    }
    
    if candidates:
        # Trie les candidats par probabilité décroissante et retourne le mot avec la plus haute probabilité
        predicted_word = max(candidates.items(), key=lambda x: x[1])[0]
        return predicted_word
    
    # Si le vocabulaire est vide, déclenche une erreur.
    if not vocab:
        raise ValueError("Le vocabulaire est vide.")
    
    # Retourne un mot aléatoire si aucun contexte correspondant n'est trouvé
    return random.choice(list(vocab))

In [42]:
context = ["I", "want", "to", "understand"]  # suppose que tu veux prédire après ton test
predicted = predict_next_word(ngram_probabilities, context, vocab=set(train_tokens))
print("Mot prédit :", predicted)


Mot prédit : b-yern
