In [None]:
import re, os, string, json
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm import tqdm
import nltk

In [None]:
# Load the review dataset; the CSV is expected in the notebook's working directory.
dataname = 'IMDB Dataset.csv'
raw = pd.read_csv(filepath_or_buffer=dataname)
# Show the first five rows as a quick sanity check of the columns.
raw.head(5)

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


## **N-gramų taikymas teksto generavimui**

⚠️***Negalima importuoti jokių kitų paketų, be tų, kurie jau yra importuojami šio Notebook'o pradžioje!***

Užduotys:
1. Duomenys yra tie patys, kaip ir praeitos užduoties, todėl galite juos pašvarinti tuo pačiu būdu, kaip darėte anksčiau (pvz., išmesti keistus simbolius). Pvz., galima pilnai pašalinti teksto eilutes, turinčias simbolius, kurių nėra `string.printable` rinkinyje.
  - Šiuo atveju nedirbsime su klasėmis, todėl galima ignoruoti klasių stulpelį.
  - STOP žodžių išmesti negalime - jie sudaro didelę dalį sakinio struktūros.
1. Siūloma išskleisti trumpinius naudojant `"contractions.json"` žodyną.
1. Išsirinkite ir panaudokite tokenizatorių iš NLTK (peržiūrėkite šį [demo](https://text-processing.com/demo/tokenize/) arba [dokumentaciją](https://www.nltk.org/api/nltk.tokenize.html)). Arba galite parašyti savo tokenizatorių.
1. Gaukite 2-gramas (paskui išbandykite didesnes N-gramas). Tai gali būti žodynas, kur raktai yra N-gramos `tuple` formatu, ir reikšmės yra N-gramų kiekiai.
1. Sukurkite algoritmą, kuris randa žodį (ar žodžius, jei yra >=3-gramos) N-gramų žodyne ir grąžina sekantį žodį su didžiausia tikimybe.
1. Galiausiai, parašykite sekančio žodžio pasiūlymo funkciją, kurios įvestis būtų:
  - `ngrams: dict[tuple, int]` - N-gramų žodynas
  - `start: list[str]` - pradiniai sakinio žodžiai
  - `steps: int` - reguliuoja galutinio sakinio ilgį

Išvestis turi būti sakinys (`list[str]` arba `str`).

In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("yetman/english-contractions")

print("Path to dataset files:", path)

# Build the file location from the directory kagglehub actually returned, so the
# cell also works outside the environment where the hard-coded path happened to exist.
contractions_path = os.path.join(path, 'contractions.json')

# The context manager closes the file handle as soon as the JSON is parsed.
with open(contractions_path, encoding='utf-8') as contractions_file:
    contractions = json.load(contractions_file)

Using Colab cache for faster access to the 'english-contractions' dataset.
Path to dataset files: /kaggle/input/english-contractions


In [None]:
# Clean every review: drop HTML line-break tags and expand contractions.

# Entries of `contractions` map a contraction to either a string or a list of
# alternatives; for a list, the first alternative is used.
_expansions = {
    word: (expanded[0] if isinstance(expanded, list) else expanded)
    for word, expanded in contractions.items()
}

# A single alternation of all contraction keys. Longer keys come first so that a
# longer key is preferred over a shorter key that is its prefix. The lookarounds
# require that the match is not glued to other word characters, so a contraction
# is never replaced inside a longer word. Matching stays case-sensitive.
_contraction_re = None
if _expansions:
    _alternatives = "|".join(
        re.escape(word) for word in sorted(_expansions, key=len, reverse=True)
    )
    _contraction_re = re.compile(r"(?<!\w)(?:" + _alternatives + r")(?!\w)")

# Matches the tag itself (<br>, <br/>, <br />) rather than the bare letters "br",
# which also occur inside ordinary words such as "brother" or "library".
_br_tag_re = re.compile(r"<br\s*/?>", flags=re.IGNORECASE)


def _clean_review(text):
    """Return `text` with <br> tags replaced by a space and contractions expanded."""
    text = _br_tag_re.sub(" ", text)
    if _contraction_re is not None:
        text = _contraction_re.sub(lambda match: _expansions[match.group(0)], text)
    return text


# Assigning a list writes the values back by position, so the result does not
# depend on what the DataFrame's index labels look like.
raw['text'] = [_clean_review(text) for text in tqdm(raw['text'], total=len(raw))]

100%|██████████| 40000/40000 [00:09<00:00, 4217.63it/s]


In [125]:
# Count adjacent word pairs (bigrams) over all reviews.
tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()

ngrams = {}

for review in tqdm(raw['text'], total=len(raw)):
    # Keep purely alphabetic tokens only, lower-cased.
    words = [token.lower() for token in tokenizer.tokenize(review) if token.isalpha()]
    # Pair each word with its successor; zip stops one short of the end.
    for pair in zip(words, words[1:]):
        ngrams[pair] = ngrams.get(pair, 0) + 1

100%|██████████| 40000/40000 [00:26<00:00, 1512.64it/s]


In [127]:
def create_ngram(raw_data, n):
    """
    Count the word n-grams found in the 'text' column of the provided data.

    Each text is tokenized with NLTK's WordPunctTokenizer; only purely
    alphabetic tokens are kept, and they are lower-cased before counting.
    Texts with fewer than `n` remaining tokens contribute nothing.

    Args:
        raw_data (pd.DataFrame): The input DataFrame with a 'text' column.
        n (int): The desired length of the n-grams; must be at least 1.

    Returns:
        dict: A dictionary where keys are n-grams (tuples) and values are their counts,
              sorted by count in descending order.

    Raises:
        ValueError: If `n` is smaller than 1.
    """
    # n == 0 would count empty tuples and a negative n would slice out
    # grams of inconsistent length, so reject both up front.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()
    counts = {}

    # Iterate over the column itself: iterrows() constructs a Series for every
    # row, which is unnecessary overhead when only one column is read.
    for text in tqdm(raw_data['text'], total=len(raw_data)):
        tokens = [token.lower() for token in tokenizer.tokenize(text) if token.isalpha()]

        # The range is empty when the text has fewer than n tokens,
        # so no separate length check is needed.
        for i in range(len(tokens) - n + 1):
            gram = tuple(tokens[i:i + n])
            counts[gram] = counts.get(gram, 0) + 1

    # sorted() is stable, so n-grams with equal counts keep first-seen order.
    return dict(sorted(counts.items(), key=lambda item: item[1], reverse=True))

In [126]:
# Build the 2-, 3- and 4-gram count tables from the reviews (evaluated in that order).
ngrams_2, ngrams_3, ngrams_4 = (create_ngram(raw, size) for size in (2, 3, 4))

100%|██████████| 40000/40000 [00:26<00:00, 1515.37it/s]
100%|██████████| 40000/40000 [00:23<00:00, 1667.54it/s]
100%|██████████| 40000/40000 [00:25<00:00, 1592.31it/s]


In [132]:
# Preview the first five entries of each table; the tables are sorted by count,
# so these are the most frequent n-grams.
for order, table in ((2, ngrams_2), (3, ngrams_3), (4, ngrams_4)):
    print(f"{order}-gram: \n {list(table.items())[:5]}")

2-gram: 
 [(('of', 'the'), 61943), (('in', 'the'), 40264), (('it', 'is'), 33988), (('this', 'movie'), 25217), (('is', 'a'), 24285)]
3-gram: 
 [(('one', 'of', 'the'), 7865), (('i', 'do', 'not'), 4720), (('this', 'movie', 'is'), 4385), (('it', 'is', 'a'), 4245), (('of', 'the', 'film'), 4135)]
4-gram: 
 [(('is', 'one', 'of', 'the'), 1941), (('i', 'have', 'ever', 'seen'), 1819), (('the', 'rest', 'of', 'the'), 1616), (('one', 'of', 'the', 'most'), 1326), (('one', 'of', 'the', 'best'), 1265)]


In [134]:
def predict_next_word_2gram(ngrams, current_word):
    """
    Return the word that most frequently follows `current_word`.

    Args:
        ngrams (dict): Bigram table mapping (word, next_word) tuples to counts.
        current_word (str): The word whose successor is wanted.

    Returns:
        The successor with the highest count (the first one encountered wins a
        tie), or None when no bigram starts with `current_word`.
    """
    best_word = None
    best_count = None

    for (first, second), freq in ngrams.items():
        if first == current_word:
            # Strict comparison keeps the earliest entry among equal counts.
            if best_count is None or freq > best_count:
                best_word, best_count = second, freq

    return best_word

In [135]:
# Sanity check: the most frequent word following "i" in the bigram table.
next_after_i = predict_next_word_2gram(ngrams_2, 'i')
print(next_after_i)

have


In [136]:
def predict_next_word_3gram(ngrams, context):
    """
    Return the word that most frequently follows a two-word context.

    Args:
        ngrams (dict): Trigram table mapping (w1, w2, w3) tuples to counts.
        context: Exactly two words (w1, w2) that the trigram must start with.

    Returns:
        The third word with the highest count (the first one encountered wins
        a tie), or None when no trigram starts with the given context.
    """
    first, second = context

    # Collect (next_word, count) pairs for trigrams that begin with the context.
    matches = [
        (following, freq)
        for (a, b, following), freq in ngrams.items()
        if a == first and b == second
    ]

    if len(matches) == 0:
        return None

    # max() returns the earliest pair among those sharing the highest count.
    winner = max(matches, key=lambda pair: pair[1])
    return winner[0]

In [137]:
# Sanity check: the most frequent word following "i have" in the trigram table.
next_after_i_have = predict_next_word_3gram(ngrams_3, ('i', 'have'))
print(next_after_i_have)

seen


In [138]:
def suggest_next_words(ngrams: dict[tuple, int], start: list[str], steps: int) -> list[str]:
    """
    Greedily extend a sentence with the most frequent next word.

    The n-gram order is taken from the length of the first key in `ngrams`.
    At each step the last n-1 words of the sentence form the context, and the
    highest-count n-gram starting with that context supplies the next word
    (the first one encountered wins a tie).

    Args:
        ngrams: N-gram table mapping word tuples to their counts.
        start: Initial words of the sentence; it is not modified.
        steps: Maximum number of words to append.

    Returns:
        A new list with the starting words followed by the generated words.
        Generation stops early when no n-gram matches the current context,
        which includes the case where the sentence is shorter than n-1 words.
        An empty `ngrams` table returns a copy of `start`.
    """
    sentence = list(start)

    # Without any n-gram there is no order to infer and nothing to suggest.
    if not ngrams:
        return sentence

    context_len = len(next(iter(ngrams))) - 1

    for _ in range(steps):
        if context_len > len(sentence):
            break

        # Slice from an explicit start index: for unigrams (context_len == 0)
        # a negative-index slice sentence[-0:] would return the whole sentence
        # instead of the empty context.
        context = tuple(sentence[len(sentence) - context_len:])

        best_word = None
        best_count = None
        for gram, count in ngrams.items():
            # Strict comparison keeps the earliest entry among equal counts.
            if gram[:-1] == context and (best_count is None or count > best_count):
                best_word, best_count = gram[-1], count

        if best_count is None:
            break

        sentence.append(best_word)

    return sentence

In [139]:
# Generate up to seven more words after "this" using the bigram table.
bigram_sentence = suggest_next_words(ngrams=ngrams_2, start=['this'], steps=7)
print(bigram_sentence)

['this', 'movie', 'is', 'a', 'lot', 'of', 'the', 'film']


In [140]:
# Generate up to ten more words after "the movie" using the trigram table.
trigram_sentence = suggest_next_words(ngrams=ngrams_3, start=['the', 'movie'], steps=10)
print(trigram_sentence)

['the', 'movie', 'is', 'a', 'very', 'good', 'and', 'the', 'film', 'is', 'a', 'very']


## **BONUS**
Pasirinkite ir atlikite vieną iš užduočių:
1. Vietoje aukščiausios tikimybės žodžio, pritaikykite top-k arba top-p algoritmą.
1. Sukurkite telefono siūlomų žodžių juostelės imituojančią programą, kur naudotojui duodama pasirinkti iš trijų sekančių žodžių (su `input()`).

In [143]:
def suggest_next_word_topk(ngrams, start, steps=5, k=3):
    """
    Interactively extend a sentence, phone-keyboard style.

    At each step the `k` most frequent continuations of the current context
    are printed as a numbered list and the user picks one via `input()`.
    The n-gram order is taken from the length of the first key in `ngrams`;
    the context is the last n-1 words of the sentence.

    Args:
        ngrams (dict): N-gram table mapping word tuples to their counts.
        start (list[str]): Initial words of the sentence; it is not modified.
        steps (int): Maximum number of words to append.
        k (int): Number of suggestions offered per step; must be at least 1.

    Returns:
        str: The sentence joined with single spaces. Generation ends early
        when the user submits an empty answer, when no n-gram matches the
        current context, or when `ngrams` is empty.

    Raises:
        ValueError: If `k` is smaller than 1.
    """
    # With no suggestions on offer the prompt could never be answered validly.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    sentence = list(start)

    # Without any n-gram there is no order to infer and nothing to suggest.
    if not ngrams:
        return ' '.join(sentence)

    context_len = len(next(iter(ngrams))) - 1

    for _ in range(steps):
        if context_len > len(sentence):
            break

        # Slice from an explicit start index: for unigrams (context_len == 0)
        # sentence[-0:] would return the whole sentence, not the empty context.
        context = tuple(sentence[len(sentence) - context_len:])

        candidates = [(gram[-1], count) for gram, count in ngrams.items()
                      if gram[:-1] == context]
        if not candidates:
            break

        # Stable sort: continuations with equal counts keep table order.
        candidates.sort(key=lambda pair: pair[1], reverse=True)
        words = [pair[0] for pair in candidates[:k]]

        print(f"\n{' '.join(sentence)} ...")
        for position, word in enumerate(words, start=1):
            print(f"{position}. {word}")

        while True:
            choice = input(f"Enter number (1-{len(words)}): ").strip()
            if choice == "":
                print("Stopping generation")
                return ' '.join(sentence)
            # isdecimal() rather than isdigit(): characters such as a superscript
            # two pass isdigit() but make int() raise ValueError.
            if choice.isdecimal() and 1 <= int(choice) <= len(words):
                next_word = words[int(choice) - 1]
                break
            print("Wrong input.")

        sentence.append(next_word)

    return ' '.join(sentence)

In [144]:
# Interactive demo on the 4-gram table; an empty answer at the prompt stops early.
generated_sentence = suggest_next_word_topk(ngrams_4, ['i', 'love', 'this'], steps=10, k=3)
generated_sentence


i love this ...
1. movie
2. film
3. show
Enter number (1-3): 3

i love this show ...
1. i
2. it
3. and
Enter number (1-3): 1

i love this show i ...
1. think
2. do
3. was
Enter number (1-3): 3

i love this show i was ...
1. very
2. in
3. able
Enter number (1-3): 2

i love this show i was in ...
1. the
2. for
3. a
Enter number (1-3): 3

i love this show i was in a ...
1. movie
2. word
3. state
Enter number (1-3): 3

i love this show i was in a state ...
1. of
2. ken
3. championship
Enter number (1-3): 1

i love this show i was in a state of ...
1. the
2. shock
3. mind
Enter number (1-3): 2

i love this show i was in a state of shock ...
1. what
2. in
3. before
Enter number (1-3): 
Stopping generation


'i love this show i was in a state of shock'