In [25]:
import nltk
import numpy as np
import pandas as pd
from nltk import ngrams
from nltk.probability import FreqDist
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ali18\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [26]:
# Step 1: Load the data
data = pd.read_csv('data/google_play_store_apps_reviews.csv')

# Step 2: Split the data
train_data, test_data = train_test_split(data, test_size = 0.2, random_state = 42)

In [27]:
# Step 3: Build the n-gram Language Model
def get_ngrams(text, n):
    tokens = nltk.word_tokenize(text)
    return list(ngrams(tokens, n))

def train_ngram(data, n):
    positive_ngrams = []
    negative_ngrams = []

    for index, row in data.iterrows():
        grams = get_ngrams(row['review'], n)
        if row['polarity'] == 1:
            positive_ngrams.extend(grams)
        elif row['polarity'] == 0:
            negative_ngrams.extend(grams)

    positive_freq = FreqDist(positive_ngrams)
    negative_freq = FreqDist(negative_ngrams)

    return positive_freq, negative_freq

# Step 4: Train the Model
n = 2  # Change to the desired n-gram size
positive_freq, negative_freq = train_ngram(train_data, n)

In [55]:
from sklearn.metrics import precision_score, recall_score

# Step 5: test the n-gram
def test_ngram(data, positive_freq, negative_freq, n):
    pred_labels = []

    for index, row in data.iterrows():
        grams = get_ngrams(row['review'], n)
        positive_prob = 1
        negative_prob = 1

        for gram in grams:
            positive_prob *= positive_freq[gram] + 1
            negative_prob *= negative_freq[gram] + 1

        positive_prob = positive_prob * sum(negative_freq.values()) / sum(positive_freq.values())

        if positive_prob > negative_prob:
            pred_labels.append(1)
        else:
            pred_labels.append(0)

    return pred_labels


In [56]:
# Step 6: Evaluate the model on the test set
n = 2  # Change to the desired n-gram size
test_labels = test_ngram(test_data, positive_freq, negative_freq, n)

# True labels from the test set
true_labels = test_data['polarity'].tolist()

# Calculate metrics
accuracy = accuracy_score(true_labels, test_labels)
precision = precision_score(true_labels, test_labels)
recall = recall_score(true_labels, test_labels)

# Print the metrics
print("Accuracy Score:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy Score: 0.7988826815642458
Precision: 0.9473684210526315
Recall: 0.33962264150943394


In [63]:
import os
from tokenizers import models as token_models
from tokenizers import normalizers as token_normalizers
from tokenizers import pre_tokenizers as token_pre_tokenizers
from tokenizers import processors as token_processors
from tokenizers import trainers as token_trainers
from tokenizers import Tokenizer
from collections import Counter
from typing import List, Dict, Tuple
from random import choices

def read_file(file_path: str) -> str:
    with open(file_path, encoding="utf8") as file:
        return file.read()

def train_tokenizer(file_path: str, tokenizer: Tokenizer, special_tokens: List[str]) -> None:
    tokenizer.normalizer = token_normalizers.Sequence(
        [token_normalizers.NFKC(), token_normalizers.Lowercase(), token_normalizers.StripAccents()]
    )
    tokenizer.pre_tokenizer = token_pre_tokenizers.BertPreTokenizer()
    token_trainer = token_trainers.BpeTrainer(special_tokens=special_tokens)
    tokenizer.train([file_path], trainer=token_trainer)

def initialize_tokenizer() -> Tokenizer:
    tokenizer = Tokenizer(token_models.BPE(unk_token="[UNK]"))
    return tokenizer

def configure_tokenizer_post_processor(tokenizer: Tokenizer, cls_id: int, sep_id: int) -> None:
    tokenizer.post_processor = token_processors.TemplateProcessing(
        single="[CLS]:0 $A:0 [SEP]:1",
        pair="[CLS]:0 $A:0 [SEP]:1 $B:1 [SEP]:2",
        special_tokens=[("[CLS]", cls_id), ("[SEP]", sep_id)],
    )

def calculate_n_gram_probability(tokens: List[str], n_gram: Tuple[str], result_n_grams: Counter) -> float:
    n_gram_count = result_n_grams[n_gram]
    if len(n_gram) > 1:
        prefix = tuple(n_gram[:-1])
        prefix_count = result_n_grams[prefix] if prefix in result_n_grams else 0
        return n_gram_count / prefix_count if prefix_count > 0 else 0
    else:
        return n_gram_count

def generate_n_gram_probabilities(text: str, n: int, tk: Tokenizer) -> Dict[Tuple[str], float]:
    tokens = tk.encode(text).tokens
    result_n_grams = Counter(tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1))

    return {n_gram: calculate_n_gram_probability(tokens, n_gram, result_n_grams) for n_gram in result_n_grams}

def create_n_grams_set(text: str, n: int, tk: Tokenizer) -> List[Dict[Tuple[str], int]]:
    return [generate_n_gram_probabilities(text, i, tk) for i in range(1, n + 1)]

def predict_next_word(prev_seq: List[str], n_gram: Dict[Tuple[str], int]) -> str | None:
    matched_seqs = [words_seq for words_seq in n_gram if prev_seq == words_seq[:-1]]
    if not matched_seqs:
        return None

    return choices(matched_seqs, [n_gram[seq] for seq in matched_seqs])[0][-1]

def generate_text(init_seq: str, n_tokens: int, n: int, trained_n_grams: List[Dict[Tuple[str], int]], tk: Tokenizer) -> List[str]:
    result = tk.encode(init_seq).tokens[:-1]
    current_n = n

    for _ in range(n_tokens):
        next_token = None

        while next_token is None and current_n > 0:
            next_token = predict_next_word(result[-(current_n - 1):], trained_n_grams[current_n - 1])
            current_n -= 1

        if next_token is None:
            break

        result.append(next_token)
        current_n = n

    return result

file_path = "./data/Tarzan.txt"
file_content = read_file(file_path)

tokenizer = initialize_tokenizer()
train_tokenizer(file_path, tokenizer, ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"])

cls_id = tokenizer.token_to_id("[CLS]")
sep_id = tokenizer.token_to_id("[SEP]")

configure_tokenizer_post_processor(tokenizer, cls_id, sep_id)

trained_n_grams_2 = create_n_grams_set(file_content, 2, tokenizer)

init_seq_1 = "Knowing well the windings of the trail he"
print(generate_text(init_seq_1, 10, 2, trained_n_grams_2, tokenizer))
init_seq_2 = "For half a day he lolled on the huge back and"
print(generate_text(init_seq_2, 10, 2, trained_n_grams_2, tokenizer))

trained_n_grams_3 = create_n_grams_set(file_content, 3, tokenizer)

print(generate_text(init_seq_1, 10, 3, trained_n_grams_3, tokenizer))
print(generate_text(init_seq_2, 10, 3, trained_n_grams_3, tokenizer))

trained_n_grams_5 = create_n_grams_set(file_content, 5, tokenizer)

print(generate_text(init_seq_1, 10, 5, trained_n_grams_5, tokenizer))
print(generate_text(init_seq_2, 10, 5, trained_n_grams_5, tokenizer))


['[CLS]', 'knowing', 'well', 'the', 'windings', 'of', 'the', 'trail', 'he']
['[CLS]', 'for', 'half', 'a', 'day', 'he', 'lolled', 'on', 'the', 'huge', 'back', 'and']
['[CLS]', 'knowing', 'well', 'the', 'windings', 'of', 'the', 'trail', 'he']
['[CLS]', 'for', 'half', 'a', 'day', 'he', 'lolled', 'on', 'the', 'huge', 'back', 'and']
['[CLS]', 'knowing', 'well', 'the', 'windings', 'of', 'the', 'trail', 'he']
['[CLS]', 'for', 'half', 'a', 'day', 'he', 'lolled', 'on', 'the', 'huge', 'back', 'and']


In [None]:
corpus_file = open("./data/Tarzan.txt", encoding="utf8")
corpus = corpus_file.read()

from tokenizers import (
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents()]
)

tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)

tokenizer.train(["./data/Tarzan.txt"], trainer=trainer)

cls_token_id = tokenizer.token_to_id("[CLS]")
sep_token_id = tokenizer.token_to_id("[SEP]")

tokenizer.post_processor = processors.TemplateProcessing(
    single=f"[CLS]:0 $A:0 [SEP]:0",
    pair=f"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)

def get_n_gram(text: str, n: int, tokenizer: Tokenizer) -> list[tuple[str]]:
  """
  This method will first tokenize the `text` using the provided `tokenizer`.
  After doing that it will create n-grams with respect to the given `n`.
  """

  tokens = tokenizer.encode(text).tokens
  result_n_grams = []
  idx_range = range(len(tokens) - n + 1) if n > 0 else range(len(tokens) - n) 
  for i in idx_range:
    result_n_grams = result_n_grams + [tuple(tokens[i:i+n])]
  return result_n_grams

from collections import Counter

def train_n_gram(text: str, n: int, tokenizer: Tokenizer) -> dict[tuple[str], int]:
  """
  This method calculate the probability of seeing the nth word after seeing
  (n-1) words before it. To do it counts the number of times we've seen the
  sentence with n words (`big_sentence_count`) and the number of times it's seen
  the sentence with (n-1) words (`small_sentence_count`). the result will be =
  `big_sentence_count` \ `small_sentence_count`.
  """

  big_sentences = Counter(get_n_gram(text, n, tokenizer))
  small_sentences = Counter(get_n_gram(text, n - 1, tokenizer))

  result = {}
  for big_sentence, big_sentence_count in big_sentences.items():
    small_sentence_count = small_sentences[big_sentence[:-1]]
    result[big_sentence] = big_sentence_count / small_sentence_count
  
  return result


def train_n_grams(text: str, n: int, tokenizer: Tokenizer) -> list[dict[tuple[str], int]]:
  """
  This method will create n-grams for n from 1 to the designated `n`. Th result
  will be a list of these trained n-grams where the index 0 of the list will
  correspond to a uni-gram.
  """

  result = [None] * n
  for i in range(1, n + 1):
    result[i - 1] = train_n_gram(text, i, tokenizer)
  return result


from random import choices

def predict_next_word(previous_text: list[str], n_gram: dict[tuple[str], int]) -> str | None:
    """
    This method simply searches for every combination of words in the n_gram
    that matches the input text. After finding every matched combination, it
    will make a random choice with the probabilities found in n_gram.
    """
    matched_combs: list[tuple[str]] = []
    combs_probabilities: list[int] = []
    previous_text = tuple(previous_text)

    for words_comb, probability in n_gram.items():
       if previous_text == words_comb[:-1]:
         matched_combs += [words_comb]
         combs_probabilities += [probability]
    
    if not matched_combs:
      return None

    return choices(matched_combs, combs_probabilities)[0][-1] # Select the last word of the chosen n-gram


def predict_text(
    init_sentence: str,
    n_tokens: int,
    n: int,
    trained_n_grams: list[dict[list[str], int]],
    tokenizer: Tokenizer) -> list[str]:
  """
  This method will continue the given initial sentence until `n_tokens` using
  the trained n-grams. it will also backoff to a lower n-gram when ever it
  doesn't find the sequence in the initial n-gram.
  """

  result = tokenizer.encode(init_sentence).tokens[:-1] # Tokenize and remove the end of sentence special token
  for i in range(n_tokens):
    next_token = None
    current_n = n
    while next_token is None:
      next_token = predict_next_word(result[-(current_n - 1):], trained_n_grams[current_n - 1])
      current_n -= 1
    
    result += [next_token]
  
  return result


trained_n_grams = train_n_grams(corpus, 2, tokenizer)


init_sentence_1 = "Knowing well the windings of the trail he"
print(predict_text(init_sentence_1, 10, 2, trained_n_grams, tokenizer))
init_sentence_2 = "For half a day he lolled on the huge back and"
print(predict_text(init_sentence_2, 10, 2, trained_n_grams, tokenizer))


trained_n_grams_3 = train_n_grams(corpus, 3, tokenizer)

print(predict_text(init_sentence_1, 10, 3, trained_n_grams_3, tokenizer))
print(predict_text(init_sentence_2, 10, 3, trained_n_grams_3, tokenizer))

trained_n_grams_5 = train_n_grams(corpus, 5, tokenizer)

print(predict_text(init_sentence_1, 10, 5, trained_n_grams_5, tokenizer))
print(predict_text(init_sentence_2, 10, 5, trained_n_grams_5, tokenizer))