<a href="https://colab.research.google.com/github/AsadiAhmad/Word-Predictor/blob/main/Code/Word_Predictor_N_gram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Import Libraries

In [52]:
import requests

import polars as pl

from collections import Counter

# Step 2: Get Corpus

In [53]:
corpus_url = "https://raw.githubusercontent.com/AsadiAhmad/Word-Predictor/main/Corpus/Tarzan.txt"

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(corpus_url, timeout=30)

# Stop here on a non-200 reply: otherwise `raw_text` is never assigned and the
# cells that use it fail later with an unrelated NameError.
if response.status_code != 200:
    raise RuntimeError(f"Failed to fetch the text. Status code: {response.status_code}")

raw_text = response.text
print("Text fetched successfully!")

Text fetched successfully!


# Step 3: Convert Corpus to a Polars DataFrame

## Extract paragraphs

In [54]:
# A paragraph boundary is two consecutive CRLF sequences (i.e. one blank line).
paragraphs = raw_text.split("\r\n\r\n")

# One row per paragraph, numbered from 0 in order of appearance.
paragraphs_df = pl.DataFrame(
    {
        "index": range(len(paragraphs)),
        "text": paragraphs,
    }
)

## Set Config

In [71]:
# Display settings for how Polars renders frames in this notebook.
# NOTE(review): per the Polars Config docs a negative value removes the
# row/column limit — confirm against the installed Polars version.
pl.Config.set_tbl_rows(-1)  # no cap on the number of rows printed
pl.Config.set_tbl_cols(-1)  # no cap on the number of columns printed
pl.Config.set_fmt_str_lengths(100)  # string cells are shown up to 100 characters
pl.Config.set_fmt_table_cell_list_len(100)  # list cells are shown up to 100 elements

## Show parts of corpus

In [56]:
# Preview the first 10 paragraphs of the raw corpus.
paragraphs_df.head(10)

index,text
i64,str
0,"""﻿The Project Gutenberg eBook of Tarzan, lord of the jungle  This ebook is for the use of anyon…"
1,"""Title: Tarzan, lord of the jungle"""
2,""" Author: Edgar Rice Burroughs"""
3,"""Release date: February 12, 2024 [eBook #72938]"""
4,"""Language: English"""
5,"""Original publication: New York, NY: Grosset & Dunlap, Publishers, 1928"""
6,"""Credits: Greg Weeks, Mary Meehan and the Online Distributed Proofreading Team at http://www.pgdp.net"""
7,""" *** START OF THE PROJECT GUTENBERG EBOOK TARZAN, LORD OF THE JUNGLE ***"""
8,""""""
9,""""""


# Step 4: Preprocessing

## Remove the first 50 rows

In [57]:
# Original row numbers 0..49 are discarded.
# NOTE(review): presumably these rows are front matter ahead of the story text — confirm.
indices_to_remove = list(range(50))

# Drop those rows, then replace the old numbering with a fresh 0-based
# "new_index" column.
paragraphs_filtered_df = (
    paragraphs_df
    .filter(~pl.col("index").is_in(indices_to_remove))
    .with_row_index("new_index")
    .drop("index")
)

In [58]:
# Inspect the first 100 paragraphs of the filtered, re-indexed frame.
paragraphs_filtered_df.head(100)

new_index,text
u32,str
0,""" His great bulk swaying to and fro as he threw his weight first upon one side and then upon the ot…"
1,"""In peace he had lived with Dango the hyena, Sheeta the leopard and Numa the lion. Man alone had mad…"
2,"""Always during the long hundred years of his life, Tantor had known man. There had been black men, a…"
3,"""Tantor shared this caution with his fellows and avoided men--especially white men; and so had there…"
4,"""In the forest Fahd and Motlog, of the tribe el-Harb, hunted north from the menzil of Sheik Ibn Jad …"
5,"""Fejjuan, as his comrades, thought of fresh meat, but also he thought of el-Habash, the land from wh…"
6,"""Two marches to the north, in the southern extremity of Abyssinia, stood the round dwelling of the f…"
7,"""The leaves of the forest drowsed in the heat above the heads of the hunters. Beneath the drowsing l…"
8,"""Fejjuan, the Galla slave, halted in his tracks, stopping those behind him by the silent mandate of …"
9,"""As Tantor surged forward at the sound of the report Tarzan started to spring to an upright position…"


# Step 5: N-gram Calculation

In [59]:
def N_gram(df: pl.DataFrame, n: int) -> pl.DataFrame:
    """Count the n-grams found in the ``text`` column of ``df``.

    Each row's text is lower-cased and split on whitespace; punctuation stays
    attached to its token. N-grams are built per row, so no n-gram spans two
    rows. Rows with fewer than ``n`` tokens contribute nothing, and null texts
    are skipped.

    Args:
        df: Frame with a string column named ``"text"``.
        n: Number of tokens per n-gram; must be at least 1.

    Returns:
        A frame with one row per distinct n-gram and the columns
        ``index`` (0-based position in first-seen order), ``words`` (the
        n-gram as a list of tokens) and ``count`` (number of occurrences).
        The columns and dtypes are the same even when no n-gram is found.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")

    ngram_counter = Counter()
    # Iterate the column directly instead of building a dict for every row.
    for text in df.get_column("text"):
        if text is None:
            continue
        tokens = text.lower().split()
        # The range is empty when the row has fewer than n tokens.
        ngram_counter.update(
            tuple(tokens[start:start + n]) for start in range(len(tokens) - n + 1)
        )

    # An explicit schema keeps the columns present and typed for an empty result,
    # so downstream code such as `.drop("index")` still works.
    return pl.DataFrame(
        {
            "index": list(range(len(ngram_counter))),
            "words": [list(ngram) for ngram in ngram_counter],
            "count": list(ngram_counter.values()),
        },
        schema={"index": pl.Int64, "words": pl.List(pl.Utf8), "count": pl.Int64},
    )

In [85]:
# Build the n-gram tables for orders 1 through 5 from the filtered corpus.
unigram_df, bigram_df, trigram_df, tetragram_df, pentagram_df = (
    N_gram(paragraphs_filtered_df, n=order) for order in range(1, 6)
)

In [86]:
# First 10 rows of every n-gram table, without the per-table "index" column.
dfs = [
    frame.head(10).drop("index")
    for frame in (unigram_df, bigram_df, trigram_df, tetragram_df, pentagram_df)
]

# Suffix the column names with the n-gram order so they stay unique side by side.
dfs_renamed = [
    frame.rename({"words": f"words{order}", "count": f"count{order}"})
    for order, frame in enumerate(dfs, start=1)
]

combined_df = pl.concat(dfs_renamed, how="horizontal")

In [87]:
# Side-by-side view of the first 10 n-grams of each order (1 through 5) with their counts.
combined_df

words1,count1,words2,count2,words3,count3,words4,count4,words5,count5
list[str],i64,list[str],i64,list[str],i64,list[str],i64,list[str],i64
"[""his""]",1058,"[""his"", ""great""]",11,"[""his"", ""great"", ""bulk""]",1,"[""his"", ""great"", ""bulk"", ""swaying""]",1,"[""his"", ""great"", ""bulk"", ""swaying"", ""to""]",1
"[""great""]",194,"[""great"", ""bulk""]",1,"[""great"", ""bulk"", ""swaying""]",1,"[""great"", ""bulk"", ""swaying"", ""to""]",1,"[""great"", ""bulk"", ""swaying"", ""to"", ""and""]",1
"[""bulk""]",4,"[""bulk"", ""swaying""]",1,"[""bulk"", ""swaying"", ""to""]",1,"[""bulk"", ""swaying"", ""to"", ""and""]",1,"[""bulk"", ""swaying"", ""to"", ""and"", ""fro""]",1
"[""swaying""]",6,"[""swaying"", ""to""]",1,"[""swaying"", ""to"", ""and""]",1,"[""swaying"", ""to"", ""and"", ""fro""]",1,"[""swaying"", ""to"", ""and"", ""fro"", ""as""]",1
"[""to""]",1604,"[""to"", ""and""]",4,"[""to"", ""and"", ""fro""]",1,"[""to"", ""and"", ""fro"", ""as""]",1,"[""to"", ""and"", ""fro"", ""as"", ""he""]",1
"[""and""]",2331,"[""and"", ""fro""]",1,"[""and"", ""fro"", ""as""]",1,"[""and"", ""fro"", ""as"", ""he""]",1,"[""and"", ""fro"", ""as"", ""he"", ""threw""]",1
"[""fro""]",1,"[""fro"", ""as""]",1,"[""fro"", ""as"", ""he""]",1,"[""fro"", ""as"", ""he"", ""threw""]",1,"[""fro"", ""as"", ""he"", ""threw"", ""his""]",1
"[""as""]",474,"[""as"", ""he""]",85,"[""as"", ""he"", ""threw""]",1,"[""as"", ""he"", ""threw"", ""his""]",1,"[""as"", ""he"", ""threw"", ""his"", ""weight""]",1
"[""he""]",1432,"[""he"", ""threw""]",2,"[""he"", ""threw"", ""his""]",1,"[""he"", ""threw"", ""his"", ""weight""]",1,"[""he"", ""threw"", ""his"", ""weight"", ""first""]",1
"[""threw""]",10,"[""threw"", ""his""]",2,"[""threw"", ""his"", ""weight""]",1,"[""threw"", ""his"", ""weight"", ""first""]",1,"[""threw"", ""his"", ""weight"", ""first"", ""upon""]",1


# Step 6: N-gram Model

Generated Sentence: knowing well the windings of the trail
