# Course 2 - N-grams

## Import librairies 

In [1]:
from datasets import load_dataset
from collections import Counter, defaultdict
import math
from nltk.tokenize import word_tokenize, sent_tokenize

## Import dataset

In [2]:
import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords

nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
dataset = load_dataset("yaful/DeepfakeTextDetect")
df = dataset["train"].to_pandas()
df.head()

Downloading data: 100%|██████████| 233M/233M [00:01<00:00, 185MB/s]  
Downloading data: 100%|██████████| 42.3M/42.3M [00:00<00:00, 150MB/s] 
Downloading data: 100%|██████████| 41.9M/41.9M [00:00<00:00, 84.0MB/s]
Downloading data: 100%|██████████| 1.29M/1.29M [00:00<00:00, 9.35MB/s]
Downloading data: 100%|██████████| 2.13M/2.13M [00:00<00:00, 23.8MB/s]


Generating train split:   0%|          | 0/319071 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/56792 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/56819 [00:00<?, ? examples/s]

Generating test_ood_gpt split:   0%|          | 0/1562 [00:00<?, ? examples/s]

Generating test_ood_gpt_para split:   0%|          | 0/2362 [00:00<?, ? examples/s]

Unnamed: 0,text,label,src
0,White girls very rarely date Asian men. Even i...,1,cmv_human
1,I am a 23 year old male Indian American male. ...,1,cmv_human
2,"Take three people, Persons A, B, and C. They l...",1,cmv_human
3,(A) Work part-time in high school; Then go to ...,1,cmv_human
4,When police introduce a new form of speed prev...,1,cmv_human


In [4]:
df["text"] = df["text"].astype(str)

#X_train, X_test, y_train, y_test = train_test_split(df["text"], df["label"], test_size=0.2, random_state=69)

train_df = df.query("src != 'cmv_machine_continuation_opt_iml_30b'")
train_list = list(train_df["text"])
print("Number of dream reports in the training set:",len(train_list))

test_df = df.query("src == 'cmv_machine_continuation_opt_iml_30b'")
test_list = list(test_df["text"])
print("Number of dream reports in the test set:",len(test_list))

Number of dream reports in the training set: 318457
Number of dream reports in the test set: 614


In [5]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
train_strings = " ".join(train_list)
train_strings = train_strings.lower()
train_tokens = word_tokenize(train_strings)
print("Number of tokens in the training set:",len(train_tokens))

vocab = set(train_tokens)
print("Vocabulary size:",len(vocab))

test_strings = " ".join(test_list)
test_strings = test_strings.lower()
test_tokens = word_tokenize(test_strings)
print("Number of tokens in the test set:",len(test_tokens))

Number of tokens in the training set: 77450096
Vocabulary size: 494023
Number of tokens in the test set: 119560


## Train n-grams

In [7]:
def tokenize(text):
    """Tokenize the input text."""
    
    return word_tokenize(text)

def count_ngrams(tokens, n):
    """Counts n-grams."""
    
    ngrams = [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]
    
    return Counter(ngrams)

def calculate_ngram_probabilities(train_tokens, n, test_tokens, k=0.00001):
    """Calculates n-gram probabilities."""
    
    vocab = set(train_tokens)
    V = len(vocab)
    ngram_counts = count_ngrams(train_tokens, n)
    n_minus_one_gram_counts = count_ngrams(train_tokens, n-1)
    ngram_probabilities = defaultdict(float)
    
    for ngram in ngram_counts:
        prefix = ngram[:-1]
        ngram_counts[ngram] += k
        n_minus_one_gram_counts[prefix] += k
        ngram_probabilities[ngram] = (ngram_counts[ngram] + k) / (n_minus_one_gram_counts[prefix] + k*V)

    for i in range(len(test_tokens)-n+1):
        ngram = tuple(test_tokens[i:i+n])
        if ngram not in ngram_counts:
            ngram_counts[ngram] = k
            prefix = ngram[:-1]
            if prefix not in n_minus_one_gram_counts:
                n_minus_one_gram_counts[prefix] = k
            ngram_probabilities[ngram] = (ngram_counts[ngram] + k) / (n_minus_one_gram_counts[prefix] + k*V)
    
    return ngram_probabilities

## Example

In [8]:
n = 5

ngram_probabilities = calculate_ngram_probabilities(train_tokens, n, test_tokens)
print(f"Number of {n}-grams:",len(ngram_probabilities))

Number of 5-grams: 59220569


In [9]:
ngram_probabilities

defaultdict(float,
            {('white', 'girls', 'very', 'rarely', 'date'): 0.8196667662409259,
             ('girls', 'very', 'rarely', 'date', 'asian'): 0.8453292774255923,
             ('very', 'rarely', 'date', 'asian', 'men'): 0.8453292774255923,
             ('rarely', 'date', 'asian', 'men', '.'): 0.8453292774255923,
             ('date', 'asian', 'men', '.', 'even'): 0.7660529212521773,
             ('asian', 'men', '.', 'even', 'in'): 0.4566194311790747,
             ('men', '.', 'even', 'in', 'asia'): 0.8403302624672595,
             ('.', 'even', 'in', 'asia', 'white'): 0.8453292774255923,
             ('even', 'in', 'asia', 'white', 'men'): 0.8140208088605471,
             ('in', 'asia', 'white', 'men', 'are'): 0.8403302624672595,
             ('asia', 'white', 'men', 'are', 'much'): 0.8403302624672595,
             ('white', 'men', 'are', 'much', 'more'): 0.8403302624672595,
             ('men', 'are', 'much', 'more', 'popular'): 0.7234236610551292,
             ('are', 

## Evaluate perplexity

In [10]:
def calculate_perplexity(test_tokens, ngram_probabilities, n):
    """Calculates the perplexity of a test corpus given n-gram probabilities."""
    log_probability_sum = 0
    ngram_count = 0
    
    for i in range(len(test_tokens)-n+1):
        ngram = tuple(test_tokens[i:i+n])
        log_probability_sum += math.log2(ngram_probabilities[ngram])
        ngram_count += 1
    
    average_log_probability = -log_probability_sum / ngram_count
    perplexity = math.pow(2, average_log_probability)
    
    return perplexity

In [11]:
calculate_perplexity(train_tokens, ngram_probabilities, n)

5.870205936220359

In [12]:
calculate_perplexity(test_tokens, ngram_probabilities, n)

15608.159787834793

In [13]:
def greedy_sampling(context, vocab, ngram_probabilities, n, max_length = 50):
    
    sentence = []

    if len(context) < (n-1):
        print("len(context) < n")
        return sentence

    context = context[-(n-1):]
    
    for i in range(max_length):

        probs = dict()
        
        for v in vocab:

            ngram = list(context)
            ngram.append(v)
            ngram = tuple(ngram)
            probs[v] = ngram_probabilities[ngram]

        best_token = max(probs, key=probs.get) # greedy 
        #print(best_v)
        #print(probs[best_v])
        
        if probs[best_token] == 0:
            print("prob = 0")
            return sentence
            
        sentence.append(best_token)
        context = list(context)[1:]
        context.append(best_token)
        context = tuple(context)
            
    return sentence  

In [14]:
context = ['i', "'m", 'in', 'honduras']
context = ['the', 'one', 'at', 'the']
sentence = greedy_sampling(context, vocab, ngram_probabilities, n, max_length = 200)
print(" ".join(context) + " " +  " ".join(sentence))

the one at the top of the page for who ) . congratulations to -2 for each side who gave me the most ( see the top of the page for who ) . congratulations to -2 for each side who gave me the most ( see the top of the page for who ) . congratulations to -2 for each side who gave me the most ( see the top of the page for who ) . congratulations to -2 for each side who gave me the most ( see the top of the page for who ) . congratulations to -2 for each side who gave me the most ( see the top of the page for who ) . congratulations to -2 for each side who gave me the most ( see the top of the page for who ) . congratulations to -2 for each side who gave me the most ( see the top of the page for who ) . congratulations to -2 for each side who gave me the most ( see the top of the page for who ) . congratulations to -2 for each side who gave me the most ( see the top of


In [15]:
train_strings[:250]

"white girls very rarely date asian men. even in asia white men are much more popular than asian men. even though an asian guy may have a preference for white girls he doesn't have much of a chance with one so he should instead pursue a non-white girl"

# Using NLTK.lm

## Import dataset and prepare training and test sets

In [16]:
!pip install -U nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: nltk
  Attempting uninstall: nltk
    Found existing installation: nltk 3.2.4
    Uninstalling nltk-3.2.4:
      Successfully uninstalled nltk-3.2.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
preprocessing 0.1.13 requires nltk==3.2.4, but you have nltk 3.8.1 which is incompatible.[0m[31m
[0mSuccessfully installed nltk-3.8.1


In [17]:
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE, StupidBackoff, Laplace

In [18]:
train_sents = sent_tokenize(train_strings)
print(len(train_sents))
train_sents_tokens = [word_tokenize(s) for s in train_sents]

test_sents = sent_tokenize(test_strings)
print(len(test_sents))
test_sents_tokens = [word_tokenize(s) for s in test_sents]

3942769
4637


In [19]:
test_sents_tokens[:2]

[['white', 'girls', 'very', 'rarely', 'date', 'asian', 'men', '.'],
 ['even',
  'in',
  'asia',
  'white',
  'men',
  'are',
  'much',
  'more',
  'popular',
  'than',
  'asian',
  'men',
  '.']]

In [20]:
n = 3

train, vocab = padded_everygram_pipeline(n, train_sents_tokens)

#lm = MLE(n) # Maximum Likelihood Estimate
#lm = StupidBackoff(order = n) # Stupid Backoff
lm = Laplace(n) # Laplace smoothing

In [21]:
lm.fit(train, vocab)
print(lm.vocab)
print(len(lm.vocab))

<Vocabulary with cutoff=1 unk_label='<UNK>' and 493886 items>
493886


In [22]:
lm.vocab.lookup(["I", "am"])

('<UNK>', 'am')

In [23]:
print(lm.counts)

<NgramCounter with 3 ngram orders and 267839025 ngrams>


In [24]:
lm.counts['people']

140218

In [25]:
lm.score("people")

0.0014962066909790428

In [None]:
lm.perplexity(train_sents_tokens)

In [None]:
lm.perplexity(test_sents_tokens)

In [None]:
lm.generate(20, random_seed=42)

In [None]:
lm.generate(20, text_seed=['i'], random_seed=42)