In [5]:
import math

In [2]:
def computeTF(token: str, doc: str):
  """
  Compute the Term Frequency (TF) of a token in a document.

  Parameters:
      token (str): The token for which to compute the term frequency.
      doc (str): The document in which to compute the term frequency.

  Returns:
      float: The term frequency of the token in the document.
  """
  # Split the document into words (tokens) and normalize to lower case
  tokens = doc.lower().split()

  # Count occurrences of the token in the document
  count_t = tokens.count(token.lower())

  # Total number of tokens in the document
  size_d = len(tokens)

  # Avoid division by zero
  if size_d == 0:
      return 0.0

  # Compute the term frequency
  return count_t / size_d

In [6]:
def computeDF(token: str, corpus: list):
  """
  Compute the Document Frequency (DF) of a token in a corpus.

  Parameters:
      token (str): The token for which to compute the document frequency.
      corpus (list): A list of documents (each document is a string).

  Returns:
      int: The document frequency of the token in the corpus.
  """
  df = 0
  for doc in corpus:
      # Check if the token is present in the document
      if token.lower() in doc.lower().split():
          df += 1
  return df


In [7]:
def computeIDF(token: str, corpus: list):
  """
  Compute the Inverse Document Frequency (IDF) of a token in a corpus.

  Parameters:
      token (str): The token for which to compute the IDF.
      corpus (list): A list of documents (each document is a string).

  Returns:
      float: The IDF value of the token in the corpus.
  """
  # Total number of documents in the corpus
  size_c = len(corpus)

  # Compute the Document Frequency (DF)
  df = computeDF(token, corpus)

  # Compute IDF with smoothing
  idf = math.log(size_c / (df + 1))

  return idf

In [13]:
def computeTFIDF(token: str, doc: str, corpus: list):
  """
  Compute the TF-IDF of a token in a document given a corpus.

  Parameters:
      token (str): The token for which to compute the TF-IDF.
      doc (str): The document in which to compute the term frequency (TF).
      corpus (list): A list of documents (each document is a string).

  Returns:
      float: The TF-IDF value of the token for the document in the corpus.
  """
  # Compute Term Frequency (TF)
  tf = computeTF(token, doc)

  # Compute Inverse Document Frequency (IDF)
  idf = computeIDF(token, corpus)

  # Compute TF-IDF
  tf_idf = tf * idf

  return tf_idf

In [9]:
def score_sentence(query, sentence, corpus):
  """
  Compute the cumulative TF-IDF score of a query for a single sentence.
  """
  query_tokens = query.lower().replace('?', '').split()
  return sum(computeTFIDF(token, sentence, corpus) for token in query_tokens)

In [10]:
pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [11]:
pip install apache_beam

Collecting apache_beam
  Downloading apache_beam-2.61.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.4 kB)
Collecting crcmod<2.0,>=1.7 (from apache_beam)
  Downloading crcmod-1.7.tar.gz (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.7/89.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.2,>=0.3.1.1 (from apache_beam)
  Downloading dill-0.3.1.1.tar.gz (151 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m152.0/152.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting cloudpickle~=2.2.1 (from apache_beam)
  Downloading cloudpickle-2.2.1-py3-none-any.whl.metadata (6.9 kB)
Collecting fastavro<2,>=0.23.6 (from apache_beam)
  Downloading fastavro-1.9.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)
Collecting fasteners<1.0,>=0.3 (from apache_beam)
  Do

In [14]:
# constructing the corpus

from datasets import load_dataset

# loading wikipedia using HuggingFace
wikipedia = load_dataset('wikipedia', '20220301.en', trust_remote_code=True)

# Searching for the New Zealand's wikipedia passage
for passage in wikipedia['train']:
  if passage['title'] == 'New Zealand':
    new_zealand_passage = passage['text']
    break

# Breaking the passage into a corpus of sentences.
corpus = new_zealand_passage.split('.')
print(corpus)

Downloading data:   0%|          | 0/41 [00:00<?, ?files/s]

train-00000-of-00041.parquet:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

train-00001-of-00041.parquet:   0%|          | 0.00/705M [00:00<?, ?B/s]

train-00002-of-00041.parquet:   0%|          | 0.00/558M [00:00<?, ?B/s]

train-00003-of-00041.parquet:   0%|          | 0.00/491M [00:00<?, ?B/s]

train-00004-of-00041.parquet:   0%|          | 0.00/431M [00:00<?, ?B/s]

train-00005-of-00041.parquet:   0%|          | 0.00/391M [00:00<?, ?B/s]

train-00006-of-00041.parquet:   0%|          | 0.00/366M [00:00<?, ?B/s]

train-00007-of-00041.parquet:   0%|          | 0.00/326M [00:00<?, ?B/s]

train-00008-of-00041.parquet:   0%|          | 0.00/329M [00:00<?, ?B/s]

train-00009-of-00041.parquet:   0%|          | 0.00/312M [00:00<?, ?B/s]

train-00010-of-00041.parquet:   0%|          | 0.00/267M [00:00<?, ?B/s]

train-00011-of-00041.parquet:   0%|          | 0.00/247M [00:00<?, ?B/s]

train-00012-of-00041.parquet:   0%|          | 0.00/229M [00:00<?, ?B/s]

train-00013-of-00041.parquet:   0%|          | 0.00/248M [00:00<?, ?B/s]

train-00014-of-00041.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

train-00015-of-00041.parquet:   0%|          | 0.00/236M [00:00<?, ?B/s]

train-00016-of-00041.parquet:   0%|          | 0.00/215M [00:00<?, ?B/s]

train-00017-of-00041.parquet:   0%|          | 0.00/229M [00:00<?, ?B/s]

train-00018-of-00041.parquet:   0%|          | 0.00/241M [00:00<?, ?B/s]

train-00019-of-00041.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

train-00020-of-00041.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

train-00021-of-00041.parquet:   0%|          | 0.00/255M [00:00<?, ?B/s]

train-00022-of-00041.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

train-00023-of-00041.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

train-00024-of-00041.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00025-of-00041.parquet:   0%|          | 0.00/218M [00:00<?, ?B/s]

train-00026-of-00041.parquet:   0%|          | 0.00/212M [00:00<?, ?B/s]

train-00027-of-00041.parquet:   0%|          | 0.00/206M [00:00<?, ?B/s]

train-00028-of-00041.parquet:   0%|          | 0.00/199M [00:00<?, ?B/s]

train-00029-of-00041.parquet:   0%|          | 0.00/219M [00:00<?, ?B/s]

train-00030-of-00041.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

train-00031-of-00041.parquet:   0%|          | 0.00/216M [00:00<?, ?B/s]

train-00032-of-00041.parquet:   0%|          | 0.00/200M [00:00<?, ?B/s]

train-00033-of-00041.parquet:   0%|          | 0.00/203M [00:00<?, ?B/s]

train-00034-of-00041.parquet:   0%|          | 0.00/201M [00:00<?, ?B/s]

train-00035-of-00041.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00036-of-00041.parquet:   0%|          | 0.00/199M [00:00<?, ?B/s]

train-00037-of-00041.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00038-of-00041.parquet:   0%|          | 0.00/203M [00:00<?, ?B/s]

train-00039-of-00041.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00040-of-00041.parquet:   0%|          | 0.00/185M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6458670 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/41 [00:00<?, ?it/s]

['New Zealand ( ) is an island country in the southwestern Pacific Ocean', ' It consists of two main landmasses—the North Island () and the South Island ()—and over 700 smaller islands, covering a total area of ', ' New Zealand is about  east of Australia across the Tasman Sea and  south of the islands of New Caledonia, Fiji, and Tonga', " The country's varied topography and sharp mountain peaks, including the Southern Alps, owe much to tectonic uplift and volcanic eruptions", " New Zealand's capital city is Wellington, and its most populous city is Auckland", '\n\nOwing to their remoteness, the islands of New Zealand were the last large habitable landmass to be settled by humans', ' Between about 1280 and 1350, Polynesians began to settle in the islands and then developed a distinctive Māori culture', ' In 1642, the Dutch explorer Abel Tasman became the first European to sight and record New Zealand', ' In 1840, representatives of the United Kingdom and Māori chiefs signed the Treaty 

In [16]:
# defining the query

query = 'What is the capital of New Zealand?'

In [17]:
# Scoring and ranking sentences
scored_sentences = [(sentence.strip(), score_sentence(query, sentence, corpus)) for sentence in corpus]
ranked_sentences = sorted(scored_sentences, key=lambda x: x[1], reverse=True)

# Printing the top 3 highest scored sentences
print("Top 3 relevant sentences:")
for sentence, score in ranked_sentences[:3]:
    print(f"Score: {score:.4f}, Sentence: {sentence.strip()}")

Top 3 relevant sentences:
Score: 0.7053, Sentence: New Zealand's capital city is Wellington, and its most populous city is Auckland
Score: 0.5166, Sentence: The South Island is the largest landmass of New Zealand
Score: 0.4207, Sentence: Elizabeth II is the queen of New Zealand and thus the head of state
