<a href="https://colab.research.google.com/github/AsadiAhmad/Word-Predictor/blob/main/Code/Word_Predictor_N_gram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Import Libraries

In [1]:
import requests

import polars as pl

from collections import Counter

# Step 2: Get Corpus

In [2]:
corpus_url = "https://raw.githubusercontent.com/AsadiAhmad/Word-Predictor/main/Corpus/Tarzan.txt"

response = requests.get(corpus_url)

if response.status_code == 200:
    raw_text = response.text
    print("Text fetched successfully!")
else:
    print(f"Failed to fetch the text. Status code: {response.status_code}")

Text fetched successfully!


# Step 3: Convert Corpus to Polars Data frame

## Extract paragraphs

In [3]:
paragraphs = raw_text.split("\r\n\r\n")

paragraphs_df = pl.DataFrame({
    "index": range(0, len(paragraphs)),
    "text": paragraphs
})

## Set Confige

In [4]:
pl.Config.set_tbl_rows(-1)
pl.Config.set_tbl_cols(-1)
pl.Config.set_fmt_str_lengths(100)
pl.Config.set_fmt_table_cell_list_len(100)

## Show parts of corpus

In [5]:
paragraphs_df.head(10)

index,text
i64,str
0,"""﻿The Project Gutenberg eBook of Tarzan, lord of the jungle  This ebook is for the use of anyon…"
1,"""Title: Tarzan, lord of the jungle"""
2,""" Author: Edgar Rice Burroughs"""
3,"""Release date: February 12, 2024 [eBook #72938]"""
4,"""Language: English"""
5,"""Original publication: New York, NY: Grosset & Dunlap, Publishers, 1928"""
6,"""Credits: Greg Weeks, Mary Meehan and the Online Distributed Proofreading Team at http://www.pgdp.net"""
7,""" *** START OF THE PROJECT GUTENBERG EBOOK TARZAN, LORD OF THE JUNGLE ***"""
8,""""""
9,""""""


# Step 4: Preprocessing

## Filter some rows

In [6]:
indices_to_remove = list(range(0, 50))

paragraphs_filtered_df = paragraphs_df.filter(
    ~pl.col("index").is_in(indices_to_remove)
)

paragraphs_filtered_df = paragraphs_filtered_df.with_row_index("new_index").drop("index")

In [7]:
paragraphs_filtered_df.head(10)

new_index,text
u32,str
0,""" His great bulk swaying to and fro as he threw his weight first upon one side and then upon the ot…"
1,"""In peace he had lived with Dango the hyena, Sheeta the leopard and Numa the lion. Man alone had mad…"
2,"""Always during the long hundred years of his life, Tantor had known man. There had been black men, a…"
3,"""Tantor shared this caution with his fellows and avoided men--especially white men; and so had there…"
4,"""In the forest Fahd and Motlog, of the tribe el-Harb, hunted north from the menzil of Sheik Ibn Jad …"
5,"""Fejjuan, as his comrades, thought of fresh meat, but also he thought of el-Habash, the land from wh…"
6,"""Two marches to the north, in the southern extremity of Abyssinia, stood the round dwelling of the f…"
7,"""The leaves of the forest drowsed in the heat above the heads of the hunters. Beneath the drowsing l…"
8,"""Fejjuan, the Galla slave, halted in his tracks, stopping those behind him by the silent mandate of …"
9,"""As Tantor surged forward at the sound of the report Tarzan started to spring to an upright position…"


## Clear garbage charachters

In [8]:
def clean_text(df):
    df = df.with_columns(
        pl.col("text")
        .str.replace_all(r"<br\s*/?>", "")
        .str.replace_all(r"[.,(){}\[\]*&^%$#@!?_\-+=/\\`~'\";:]", " ")
        .str.replace_all(r"\d", " ")
    )
    return df

In [9]:
paragraphs_clean_df = clean_text(paragraphs_filtered_df)

In [10]:
paragraphs_clean_df.head(10)

new_index,text
u32,str
0,""" His great bulk swaying to and fro as he threw his weight first upon one side and then upon the ot…"
1,"""In peace he had lived with Dango the hyena Sheeta the leopard and Numa the lion Man alone had mad…"
2,"""Always during the long hundred years of his life Tantor had known man There had been black men a…"
3,"""Tantor shared this caution with his fellows and avoided men especially white men and so had there…"
4,"""In the forest Fahd and Motlog of the tribe el Harb hunted north from the menzil of Sheik Ibn Jad …"
5,"""Fejjuan as his comrades thought of fresh meat but also he thought of el Habash the land from wh…"
6,"""Two marches to the north in the southern extremity of Abyssinia stood the round dwelling of the f…"
7,"""The leaves of the forest drowsed in the heat above the heads of the hunters Beneath the drowsing l…"
8,"""Fejjuan the Galla slave halted in his tracks stopping those behind him by the silent mandate of …"
9,"""As Tantor surged forward at the sound of the report Tarzan started to spring to an upright position…"


## Lower all alphabet charachters

In [11]:
def lower_text(df):
    df = df.with_columns(
        pl.col("text")
        .str.to_lowercase()
    )
    return df

In [12]:
paragraphs_lower_df = lower_text(paragraphs_clean_df)

In [13]:
paragraphs_lower_df.head(10)

new_index,text
u32,str
0,""" his great bulk swaying to and fro as he threw his weight first upon one side and then upon the ot…"
1,"""in peace he had lived with dango the hyena sheeta the leopard and numa the lion man alone had mad…"
2,"""always during the long hundred years of his life tantor had known man there had been black men a…"
3,"""tantor shared this caution with his fellows and avoided men especially white men and so had there…"
4,"""in the forest fahd and motlog of the tribe el harb hunted north from the menzil of sheik ibn jad …"
5,"""fejjuan as his comrades thought of fresh meat but also he thought of el habash the land from wh…"
6,"""two marches to the north in the southern extremity of abyssinia stood the round dwelling of the f…"
7,"""the leaves of the forest drowsed in the heat above the heads of the hunters beneath the drowsing l…"
8,"""fejjuan the galla slave halted in his tracks stopping those behind him by the silent mandate of …"
9,"""as tantor surged forward at the sound of the report tarzan started to spring to an upright position…"


# Step 5: N-gram Calculation

In [14]:
def N_gram(df: pl.DataFrame, n: int) -> pl.DataFrame:
    def tokenize(text):
        return text.split()

    def generate_ngrams(tokens, n):
        return [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

    ngram_counter = Counter()
    for row in df.iter_rows(named=True):
        text = row["text"]
        tokens = tokenize(text)
        ngrams = generate_ngrams(tokens, n)
        ngram_counter.update(ngrams)

    ngram_data = [
        {"index": i, "words": list(ngram), "count": count}
        for i, (ngram, count) in enumerate(ngram_counter.items())
    ]

    return pl.DataFrame(ngram_data)

In [15]:
unigram_df = N_gram(paragraphs_lower_df, n=1)
bigram_df = N_gram(paragraphs_lower_df, n=2)
trigram_df = N_gram(paragraphs_lower_df, n=3)
tetragram_df = N_gram(paragraphs_lower_df, n=4)
pentagram_df = N_gram(paragraphs_lower_df, n=5)

In [16]:
dfs = [df.drop("index") for df in [unigram_df.head(10), bigram_df.head(10), trigram_df.head(10), tetragram_df.head(10), pentagram_df.head(10)]]
dfs_renamed = [df.rename({"words": f"words{i+1}", "count": f"count{i+1}"}) for i, df in enumerate(dfs)]

combined_df = pl.concat(dfs_renamed, how="horizontal")

In [17]:
combined_df

words1,count1,words2,count2,words3,count3,words4,count4,words5,count5
list[str],i64,list[str],i64,list[str],i64,list[str],i64,list[str],i64
"[""his""]",1072,"[""his"", ""great""]",11,"[""his"", ""great"", ""bulk""]",1,"[""his"", ""great"", ""bulk"", ""swaying""]",1,"[""his"", ""great"", ""bulk"", ""swaying"", ""to""]",1
"[""great""]",208,"[""great"", ""bulk""]",1,"[""great"", ""bulk"", ""swaying""]",1,"[""great"", ""bulk"", ""swaying"", ""to""]",1,"[""great"", ""bulk"", ""swaying"", ""to"", ""and""]",1
"[""bulk""]",4,"[""bulk"", ""swaying""]",1,"[""bulk"", ""swaying"", ""to""]",1,"[""bulk"", ""swaying"", ""to"", ""and""]",1,"[""bulk"", ""swaying"", ""to"", ""and"", ""fro""]",1
"[""swaying""]",6,"[""swaying"", ""to""]",1,"[""swaying"", ""to"", ""and""]",1,"[""swaying"", ""to"", ""and"", ""fro""]",1,"[""swaying"", ""to"", ""and"", ""fro"", ""as""]",1
"[""to""]",1620,"[""to"", ""and""]",4,"[""to"", ""and"", ""fro""]",2,"[""to"", ""and"", ""fro"", ""as""]",1,"[""to"", ""and"", ""fro"", ""as"", ""he""]",1
"[""and""]",2450,"[""and"", ""fro""]",2,"[""and"", ""fro"", ""as""]",1,"[""and"", ""fro"", ""as"", ""he""]",1,"[""and"", ""fro"", ""as"", ""he"", ""threw""]",1
"[""fro""]",2,"[""fro"", ""as""]",1,"[""fro"", ""as"", ""he""]",1,"[""fro"", ""as"", ""he"", ""threw""]",1,"[""fro"", ""as"", ""he"", ""threw"", ""his""]",1
"[""as""]",490,"[""as"", ""he""]",85,"[""as"", ""he"", ""threw""]",1,"[""as"", ""he"", ""threw"", ""his""]",1,"[""as"", ""he"", ""threw"", ""his"", ""weight""]",1
"[""he""]",1525,"[""he"", ""threw""]",2,"[""he"", ""threw"", ""his""]",1,"[""he"", ""threw"", ""his"", ""weight""]",1,"[""he"", ""threw"", ""his"", ""weight"", ""first""]",1
"[""threw""]",10,"[""threw"", ""his""]",2,"[""threw"", ""his"", ""weight""]",1,"[""threw"", ""his"", ""weight"", ""first""]",1,"[""threw"", ""his"", ""weight"", ""first"", ""upon""]",1


# Step 6: N-gram Model

In [18]:
def n_gram_model(paragraphs: pl.DataFrame, n: int, word_list: list, predicted_num: int) -> list:
    ngrams = []
    for i in range(1, n + 1):
        ngrams.append(N_gram(paragraphs, i))

    predicted_words = word_list.copy()
    for index in range(predicted_num):
        context_size = min(n - 1, len(predicted_words))
        context = tuple(predicted_words[-context_size:]) if context_size > 0 else tuple()

        next_word = None
        for i in range(context_size, -1, -1):
            if i == 0:
                ngram_df = ngrams[0]
                next_word = ngram_df.sort("count", descending=True).row(0)[1][0]
                break
            else:
                ngram_df = ngrams[i]
                filtered_df = ngram_df.filter(
                    pl.col("words").list.slice(0, i) == list(context[-i:])
                )
                if len(filtered_df) > 0:
                    next_word = filtered_df.sort("count", descending=True).row(0)[1][-1]
                    break

        if next_word is None:
            break

        predicted_words.append(next_word)

    return predicted_words

In [19]:
word_list = ["Knowing", "weⅼⅼ", "the", "winⅾings", "of", "the", "traiⅼ", "he"]
word_list2 = ["For", "haⅼf", "a", "ⅾay", "he", "ⅼoⅼⅼeⅾ", "on", "the", "huge", "baⅽk", "and"]

predicted_num = 10

for index in range(1, 6):
    predicted_sentence1 = n_gram_model(paragraphs_lower_df, index, word_list, predicted_num)
    print(str (index) + " gram Predicted Sentence:", " ".join(predicted_sentence1))

print("--------------------------------------------------------------------------------------------------------")

for index in range(1, 6):
    predicted_sentence2 = n_gram_model(paragraphs_lower_df, index, word_list2, predicted_num)
    print(str (index) + " gram Predicted Sentence:", " ".join(predicted_sentence2))

1 gram Predicted Sentence: Knowing weⅼⅼ the winⅾings of the traiⅼ he the the the the the the the the the the
2 gram Predicted Sentence: Knowing weⅼⅼ the winⅾings of the traiⅼ he had been a great tourney and the great tourney and
3 gram Predicted Sentence: Knowing weⅼⅼ the winⅾings of the traiⅼ he had been the last event the score for the first
4 gram Predicted Sentence: Knowing weⅼⅼ the winⅾings of the traiⅼ he had been at the head of the column in the
5 gram Predicted Sentence: Knowing weⅼⅼ the winⅾings of the traiⅼ he had been at the head of a faction that was
--------------------------------------------------------------------------------------------------------
1 gram Predicted Sentence: For haⅼf a ⅾay he ⅼoⅼⅼeⅾ on the huge baⅽk and the the the the the the the the the the
2 gram Predicted Sentence: For haⅼf a ⅾay he ⅼoⅼⅼeⅾ on the huge baⅽk and the great tourney and the great tourney and the great
3 gram Predicted Sentence: For haⅼf a ⅾay he ⅼoⅼⅼeⅾ on the huge baⅽk and the two men a