<a href="https://colab.research.google.com/github/AsadiAhmad/TF-IDF-Model/blob/main/Code/TF_IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Import Libraries

In [7]:
import requests

import polars as pl

# Step 2: Download Documents and Queries

In [8]:
def download_text(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Failed to fetch {url}. Status code: {response.status_code}")
        return None

In [9]:
urls = {
    "Documents": "https://raw.githubusercontent.com/AsadiAhmad/TF-IDF-Model/main/Dataset/Documents.txt",
    "Queries": "https://raw.githubusercontent.com/AsadiAhmad/TF-IDF-Model/main/Dataset/Queries.txt",
    "Result": "https://raw.githubusercontent.com/AsadiAhmad/TF-IDF-Model/main/Dataset/Result.txt"
}

In [10]:
data = {}
for key, url in urls.items():
    data[key] = download_text(url)

print("Datasets downloaded successfully!")

Datasets downloaded successfully!


# Step 3: Convert Dataset into Polars Data frame

## Convert into polars

In [11]:
documents_text = data["Documents"]
queries_text = data["Queries"]
result_text = data["Result"]

In [12]:
documents_splited = documents_text.split('********************************************')

documents = pl.DataFrame({
    "index": range(1, len(documents_splited) + 1),
    "document": documents_splited
})

documents = documents.with_columns(
    pl.col("document")
    .str.split("\n")
    .list.slice(1)
    .list.join(" ")
    .str.replace_all(r'\s+', ' ')
    .str.strip_chars()
)

In [13]:
queries_splited = queries_text.split('#')

queries = pl.DataFrame({
    "index": range(1, len(queries_splited) + 1),
    "query": queries_splited
})

queries = queries.with_columns(
    pl.col("query")
    .str.split("\n")
    .list.slice(1)
    .list.join(" ")
    .str.replace_all(r'\s+', ' ')
    .str.strip_chars()
)

In [14]:
reference_lines = []
for line in result_text.split('\n'):
    line = line.strip()
    if line.endswith('-1'):
        refs = line[:-2].strip().split()
        reference_lines.append([int(ref) for ref in refs])

result = pl.DataFrame({
    "index": range(1, len(reference_lines) + 1),
    "references": reference_lines
})

## Set Config for polar showing

In [15]:
pl.Config.set_tbl_rows(-1)
pl.Config.set_tbl_cols(-1)
pl.Config.set_fmt_str_lengths(70)
pl.Config.set_fmt_table_cell_list_len(10)

## Showing all frames

In [16]:
documents.head(10)

index,document
i64,str
1,"""THE INDIAN COUNCIL OF LIBRARY AND INFORMATION SERVICES RESEARCH AND TR…"
2,"""Document 2 THE LINGERING FRAGRANCE: PROCEEDINGS OF THE XXIV ALL INDIA …"
3,"""Document 3 XXV ALL INDIA LIBRARY CONFERENCE TRIVANDRUM 14-18 MAY 1979.…"
4,"""Document 4 MALAWI LIBRARY ASSOCIATION SECOND ANNUAL GENERAL MEETING, C…"
5,"""Document 5 A PERSONAL VIEW OF THE ZAMBIA LIBRARY SERVICE. PERSONAL VIE…"
6,"""Document 6 CLA 80: TURNING ON THE POWER. REPORT OF PROCEEDINGS OF THE …"
7,"""Document 7 UNCERTAIN BEGINNINGS. REPORT OF THE AMERICAN LIBRARY ASSOCI…"
8,"""Document 8 MIDWINTER IN REAGAN'S WASHINGTON: AN ALA CONFERENCE REPORT.…"
9,"""Document 9 THE WHITE HOUSE CONFERENCE ON LIBRARY AND INFORMATION SERVI…"
10,"""Document 10 INFORMATION: BOOKS ARE JUST THE BEGINNING. THE MICHIGAN WH…"


In [17]:
queries.head(10)

index,query
i64,str
1,"""I AM INTERESTED IN THE IDENTIFICATION AND EVALUATION OF NOVEL COMPUTER…"
2,"""2 I WOULD BE INTERESTED TO RECEIVE INFORMATION ON NON-USERS OF LIBRARI…"
3,"""3 I AM INTERESTED IN COMPUTER DOCUMENTATION SYSTEMS FOR CHEMICAL PATEN…"
4,"""4 I WOULD BE PLEASED TO RECEIVE ANY INFORMATION ON THE USE OF JOURNALS…"
5,"""5 MY DISSERTATION IS AN OVERVIEW OF NEWS DATABASES, LOOKING AT THE AVA…"
6,"""6 MY DISSERTATION IS ON LIBRARY OUTREACH FOR THE CHINESE COMMUNITY IN …"
7,"""7 I WOULD BE PLEASED TO RECEIVE PAPERS ON THE SECURITY OF INFORMATION …"
8,"""8 I AM INTERESTED IN THE PROFESSIONAL EDUCATION OF LIBRARIANS, IN PART…"
9,"""9 I AM INTERESTED IN THE EXTENT TO WHICH LIBRARIES ARE CHARGING FOR SE…"
10,"""10 I AM INTERESTED IN INFORMATION ON THE PROVISION OF CURRENT AWARENES…"


In [18]:
result.head(10)

index,references
i64,list[i64]
1,"[3392, 3396]"
2,"[2623, 4291]"
3,"[1407, 1431, 3794, 3795, 3796]"
4,"[604, 3527, 4644, 5087, 5112, 5113, 5295]"
5,[3401]
6,"[5626, 5627, 5628, 5629, 5631]"
7,"[277, 278, 279, 1277, 1278, 1769, 2279, 3265, 3267, … 5778]"
8,"[4485, 4486, 4499, 4717, 5011, 5170, 5533, 5534, 5723, 5816]"
9,"[178, 680, 681, 1412, 3178, 3689, 3922, 4374, 4692, … 5859]"
10,"[769, 770, 1309, 1310, 1807, 2318, 2319, 2321, 2407, … 5801]"
