In [None]:
!pip install PyPDF2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 KB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
# import os
# import PyPDF2  
# import string

# # Define the path to the input directory
# input_dir = '/content/'

# # Define the preprocess function
# def preprocess(text):
#     # Remove punctuation
#     text = text.translate(str.maketrans('', '', string.punctuation))
#     # Lowercase the text
#     text = text.lower()
#     return text

# # Loop over each file in the input directory
# for filename in os.listdir(input_dir):
#     if filename.endswith('.pdf'):
#         # Define the path to the PDF file
#         pdf_file_path = os.path.join(input_dir, filename)

#         # Open the PDF file and create a PDF reader object
#         with open(pdf_file_path, 'rb') as pdf_file:
#             pdf_reader = PyPDF2.PdfReader(pdf_file)
#             num_pages = len(pdf_reader.pages)

#             # Extract the text from each page of the PDF file
#             text_list = []
#             for i in range(num_pages):
#                 page = pdf_reader.pages[i]
#                 text_list.append(page.extract_text())

#             # Preprocess the text and write it to the output file
#             output_file_path = os.path.splitext(pdf_file_path)[0] + '.txt'
#             with open(output_file_path, 'w') as output_file:
#                 for text in text_list:
#                     # Preprocess the text (e.g., remove punctuation, lowercasing, etc.)
#                     preprocessed_text = preprocess(text)
#                     # Write the preprocessed text to the output file
#                     output_file.write(preprocessed_text + '\n')
import os
import PyPDF2  
import string

# Define the path to the input directory
input_dir = '/content/'

# Define the preprocess function
def preprocess(text):
    """Normalize raw text by stripping ASCII punctuation and lowercasing.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` with every character listed in
        ``string.punctuation`` deleted and the rest converted to lower case.
    """
    # The third argument of maketrans lists the characters to delete.
    punctuation_remover = str.maketrans('', '', string.punctuation)
    return text.translate(punctuation_remover).lower()

# Convert every PDF in the input directory into a cleaned-up .txt file that
# sits next to it.
for filename in os.listdir(input_dir):
    # Anything that is not a PDF is ignored
    if not filename.endswith('.pdf'):
        continue

    pdf_file_path = os.path.join(input_dir, filename)
    # The text file shares the PDF's base name, with a .txt extension
    output_file_path = os.path.splitext(pdf_file_path)[0] + '.txt'

    with open(pdf_file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        # Pull the raw text out of each page, in page order
        text_list = [page.extract_text() for page in pdf_reader.pages]

        # Write each page's preprocessed text followed by a newline
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            for text in text_list:
                output_file.write(preprocess(text) + '\n')


In [None]:
import math
import nltk
import os
import PyPDF2
import string
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from collections import defaultdict

# Step 1: Define how documents are preprocessed
# This definition comes before the corpus loop on purpose: the loop calls
# preprocess(), and it must pick up this tokenizing version rather than any
# earlier function of the same name that may still be bound in the kernel.

# Built once instead of once per token; a set gives constant-time lookups.
_STOP_WORDS = frozenset(stopwords.words('english'))
_STEMMER = nltk.stem.PorterStemmer()


def preprocess(text):
    """Turn raw text into a list of stemmed, lower-cased content words.

    Args:
        text: The string to tokenize.

    Returns:
        A list of Porter-stemmed tokens. Tokens that are not purely
        alphanumeric, or that are English stop words, are dropped.
    """
    # Tokenize the lower-cased text into words
    words = nltk.word_tokenize(text.lower())

    # Remove stop words and punctuation
    words = [word for word in words if word.isalnum() and word not in _STOP_WORDS]

    # Stem the words
    return [_STEMMER.stem(word) for word in words]


# Step 2: Collect the corpus
corpus = []
input_dir = '/content/'

# Loop over each file in the input directory
for filename in os.listdir(input_dir):
    if filename.endswith('.txt'):
        # Define the path to the text file
        text_file_path = os.path.join(input_dir, filename)

        # Read with an explicit encoding so the result does not depend on the
        # platform default.
        with open(text_file_path, 'r', encoding='utf-8') as text_file:
            text = text_file.read()

        # Append the preprocessed (tokenized) text to the corpus
        corpus.append(preprocess(text))

# Step 3: Calculate term frequencies
def calculate_tf(document):
    """Count how many times each item occurs in ``document``.

    Args:
        document: An iterable of hashable items (for example a token list).

    Returns:
        A ``defaultdict(int)`` mapping each distinct item to its number of
        occurrences, keyed in order of first appearance.
    """
    counts = defaultdict(int)
    for term in document:
        counts[term] = counts[term] + 1
    return counts

# One term-frequency table per document, in the same order as `corpus`
tf_corpus = [calculate_tf(document) for document in corpus]

# Step 4: Calculate inverse document frequency (IDF)
def calculate_idf(corpus):
    """Compute ``log(N / df)`` for every term in ``corpus``.

    ``N`` is the number of documents and ``df`` is the number of documents
    that contain the term at least once. Each document contributes at most
    one count per term, so ``df <= N`` and the result is never negative.

    Args:
        corpus: A sequence of documents, each an iterable of hashable terms.

    Returns:
        A ``defaultdict(float)`` mapping each term to its IDF. Terms that do
        not occur in the corpus read as ``0.0``.
    """
    N = len(corpus)

    # Document frequency: de-duplicate the terms of each document before
    # counting. dict.fromkeys keeps first-appearance order, so the key order
    # of the result is deterministic.
    doc_counts = defaultdict(int)
    for document in corpus:
        for word in dict.fromkeys(document):
            doc_counts[word] += 1

    idf = defaultdict(float)
    for word, count in doc_counts.items():
        idf[word] = math.log(N / count)

    return idf

idf = calculate_idf(corpus)

# Step 5: Calculate document length
def calculate_document_length(document):
    """Return the square root of the summed ``tf * idf**2`` over all tokens.

    The sum runs over every token occurrence in ``document``, not over the
    distinct terms. Reads the module-level ``corpus``, ``tf_corpus`` and
    ``idf``.

    Args:
        document: A document that is an element of ``corpus``.

    Returns:
        The length as a float; ``0.0`` for an empty document.

    Raises:
        ValueError: If a non-empty ``document`` is not found in ``corpus``.
    """
    # An empty document needs no lookup and has length zero.
    if not document:
        return 0.0

    # The position lookup is a linear scan of the corpus, so do it once per
    # document instead of once per token.
    doc_tf = tf_corpus[corpus.index(document)]

    length = 0
    for word in document:
        # ** binds tighter than *, so this is tf * (idf squared)
        length += doc_tf[word] * idf[word] ** 2
    return math.sqrt(length)

document_lengths = [calculate_document_length(document) for document in corpus]

# Step 6: Build the index
# Inverted index: term -> list of (document number, term frequency, idf)
# postings, with one posting per document that contains the term.
index = defaultdict(list)
for i, document in enumerate(corpus):
    doc_tf = tf_corpus[i]
    for word in set(document):
        posting = (i, doc_tf[word], idf[word])
        index[word].append(posting)

# Step 7: Perform the query with BM25+RD scoring
k1 = 1.2   # scales the term-frequency component of the score
b = 0.75   # weight given to the document-length ratio in the score

# An empty corpus has no lengths to average; fall back to 0.0 instead of
# raising ZeroDivisionError.
avg_doc_len = sum(document_lengths) / len(document_lengths) if document_lengths else 0.0
num_docs = len(corpus)

# Document frequency of a term = number of postings recorded for it.
doc_freqs = defaultdict(int)
for word, postings in index.items():
    doc_freqs[word] = len(postings)
    
def perform_query(query, idf, input_dir):
    """Score the indexed documents against ``query`` and print the top ten.

    Reads the module-level ``index``, ``document_lengths``, ``doc_freqs``,
    ``k1``, ``b``, ``avg_doc_len`` and ``num_docs``.

    Args:
        query: Free-text query; it is passed through ``preprocess``.
        idf: Mapping from term to IDF weight. Terms missing from it score
            with a weight of ``0.0``.
        input_dir: Directory the corpus ``.txt`` files were read from. Used
            only to turn a document number back into a file path.

    Returns:
        A list of at most ten ``(document_number, score)`` tuples, highest
        score first.
    """
    query_terms = preprocess(query)
    scores = defaultdict(float)

    for word in query_terms:
        # .get() keeps unknown query terms from being inserted into the
        # shared defaultdicts as a side effect of looking them up.
        word_idf = idf.get(word, 0.0)
        doc_freq = doc_freqs.get(word, 0)

        for document, tf, _ in index.get(word, ()):
            doc_len = document_lengths[document]
            # When the average length is zero every length is zero too, so
            # treat the document as exactly average instead of dividing by 0.
            length_ratio = doc_len / avg_doc_len if avg_doc_len else 1.0
            score = word_idf * ((tf * (k1 + 1)) / (tf + k1 * (1 - b + b * length_ratio))) * ((num_docs/2) / doc_freq)
            scores[document] += score

    ranked_documents = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    top_results = ranked_documents[:10]

    # Document numbers count only the .txt files, in os.listdir order, so the
    # same filter must be applied before indexing into the listing. This
    # assumes the directory has not changed since the corpus was collected.
    corpus_files = (
        [name for name in os.listdir(input_dir) if name.endswith('.txt')]
        if top_results else []
    )

    for rank, (document_index, score) in enumerate(top_results, start=1):
        if document_index < len(corpus_files):
            file_path = os.path.join(input_dir, corpus_files[document_index])
        else:
            # The directory no longer matches the corpus; do not guess a path.
            file_path = '<unknown>'
        print(f"Rank {rank}: Document {document_index} with score {score} and path {file_path}")
    return top_results

query = "scheduling"
results = perform_query(query, idf, input_dir)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Rank 1: Document 0 with score -6.764895511760308 and path /content/.config
