In [12]:
import pandas as pd
import numpy as np
import math
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the NLTK data packages this notebook asks for.
# 'stopwords' provides the list read by stopwords.words('english') in process_text.
# NOTE(review): 'punkt' is downloaded here, but no tokenizer call is visible in
# this notebook (process_text splits on whitespace) — confirm it is still needed.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/macbookair/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/macbookair/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
# Translation table that deletes every ASCII punctuation character in one pass.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)

# Lazily-built caches: the stop-word list and the stemmer are created on the
# first real call and reused afterwards, instead of being rebuilt for every
# article and every query.
_STOP_WORDS = None
_STEMMER = None


def process_text(text):
    """Turn raw text into a list of stemmed, stop-word-free tokens.

    Steps, in order: cast to ``str`` and lowercase, delete all characters in
    ``string.punctuation``, split on whitespace, drop English stop words, and
    apply Porter stemming to what remains.

    Args:
        text: Any value; it is converted with ``str()``. Missing values
            (anything ``pd.isna`` reports as NA, e.g. ``None`` or ``NaN``)
            are accepted.

    Returns:
        list[str]: The stemmed tokens, in their original order. An empty list
        for missing input or when nothing survives filtering.
    """
    global _STOP_WORDS, _STEMMER

    if pd.isna(text):
        return []

    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    if _STEMMER is None:
        _STEMMER = PorterStemmer()

    # Punctuation is deleted (not replaced by a space), so "well-known"
    # becomes the single token "wellknown".
    cleaned = str(text).lower().translate(_PUNCT_TABLE)

    return [
        _STEMMER.stem(word)
        for word in cleaned.split()
        if word not in _STOP_WORDS
    ]

In [14]:
class BM25_Model:
    """BM25 ranking model over a corpus of pre-tokenized documents.

    For a query term ``t`` and document ``d`` the contribution is::

        idf(t) * tf * (k1 + 1) / (tf + k1 * (1 - b + b * len(d) / avg_len))

    with ``idf(t) = log((N - n + 0.5) / (n + 0.5) + 1)``, where ``N`` is the
    number of documents and ``n`` the number of documents containing ``t``.

    Attributes:
        k1: Controls how much term frequency matters.
        b: Controls how much document length matters.
        data: The corpus passed to the constructor (list of token lists).
        doc_lengths: Token count of each document, in corpus order.
        avg_len: Mean document length (0 for an empty corpus).
        N: Number of documents.
        idf: Mapping term -> IDF value.
        doc_freqs: Per-document mapping term -> count, in corpus order.
    """

    def __init__(self, data_list, k1=1.5, b=0.75):
        """Store the corpus and parameters, then build all statistics.

        Args:
            data_list: Sequence of documents, each a sequence of tokens.
            k1: Term-frequency saturation parameter (default 1.5).
            b: Length-normalisation parameter (default 0.75).
        """
        self.k1 = k1
        self.b = b

        self.data = data_list
        self.doc_lengths = []
        self.avg_len = 0
        self.N = 0
        self.idf = {}
        self.doc_freqs = []
        self.train()

    def train(self):
        """(Re)compute lengths, term counts and IDF values from ``self.data``.

        Safe to call more than once: all derived state is reset first, so a
        repeated call does not append duplicate per-document entries.
        """
        self.N = len(self.data)
        self.doc_lengths = []
        self.doc_freqs = []
        self.idf = {}

        print("Training model ...")

        total_len = 0
        word_doc_counts = {}  # term -> number of documents containing it

        for doc in self.data:
            length = len(doc)
            self.doc_lengths.append(length)
            total_len += length

            # Term frequencies within this document.
            counts = {}
            for word in doc:
                counts[word] = counts.get(word, 0) + 1
            self.doc_freqs.append(counts)

            # Each distinct term counts once per document.
            for word in counts:
                word_doc_counts[word] = word_doc_counts.get(word, 0) + 1

        # Guard against an empty corpus, which would otherwise divide by zero.
        self.avg_len = total_len / self.N if self.N else 0

        # Formula: log( (N - n + 0.5) / (n + 0.5) + 1 )
        for word, n in word_doc_counts.items():
            self.idf[word] = math.log(((self.N - n + 0.5) / (n + 0.5)) + 1)

    def get_scores(self, query):
        """Score every document against a tokenized query.

        Args:
            query: Iterable of query tokens. Tokens not seen during training
                are ignored; a token repeated in the query contributes once
                per occurrence.

        Returns:
            numpy.ndarray: Float array of length ``N``; entry ``i`` is the
            summed BM25 score of document ``i``.
        """
        scores = np.zeros(self.N)

        # avg_len == 0 means there are no tokens at all, hence nothing to match
        # (and nothing safe to divide by).
        if self.N == 0 or self.avg_len == 0:
            return scores

        # The length-normalisation part of the denominator depends only on the
        # document, so compute it once per call rather than once per term.
        length_norms = [
            self.k1 * (1 - self.b + self.b * (doc_len / self.avg_len))
            for doc_len in self.doc_lengths
        ]

        for q_word in query:
            idf_val = self.idf.get(q_word)
            if idf_val is None:
                continue

            for i, counts in enumerate(self.doc_freqs):
                tf = counts.get(q_word, 0)
                if not tf:
                    # A document without the term contributes exactly zero.
                    continue

                numerator = idf_val * tf * (self.k1 + 1)
                denominator = tf + length_norms[i]
                scores[i] += numerator / denominator

        return scores

In [15]:
print("Reading CSV file...")
# Using latin-1 because utf-8 gave errors on the special characters in the CSV
df = pd.read_csv("Articles.csv", encoding="latin-1")
print(f"Total articles found: {len(df)}")

# Combine title and text into one searchable string per article.
# Missing values are replaced with "" first: astype(str) alone would turn NaN
# into the literal text "nan", which would then be indexed as a real token.
df['combined_text'] = (
    df['Heading'].fillna("").astype(str)
    + " "
    + df['Article'].fillna("").astype(str)
)

print("Processing text... (This takes a few seconds)")
# One token list per article, in the same order as the rows of df, so that a
# document index returned by the model can be used directly with df.iloc.
all_tokens = [process_text(text) for text in df['combined_text']]
print("Text processing finished")

print("Building BM25 Model ...")
model = BM25_Model(all_tokens)
print("Model is ready!")

Reading CSV file...
Total articles found: 2692
Processing text... (This takes a few seconds)
Text processing finished
Building BM25 Model ...
Training model ...
Model is ready!


#### Now that the model is ready, we just have to implement the interactive search

In [16]:
TOP_K = 10          # maximum number of hits shown per query
SNIPPET_CHARS = 100  # number of article characters shown per hit

# Read queries until the user types 'exit'; each query is tokenized with the
# same process_text pipeline used for the articles, then ranked by the model.
while True:
    user_input = input("\nEnter search query (type 'exit' to quit): ")

    # Tolerate surrounding whitespace and any capitalisation of the command.
    if user_input.strip().lower() == "exit":
        print("Exiting search...")
        break

    query_tokens = process_text(user_input)
    print(f"Looking for: {query_tokens}")

    # Nothing left after stop-word/punctuation removal -> nothing to search.
    if not query_tokens:
        print("Please type a valid word.")
        continue

    scores = model.get_scores(query_tokens)

    # argsort is ascending, so reverse it for best-first order; then keep only
    # documents that actually matched (score > 0) and cut to the top TOP_K.
    sorted_indices = np.argsort(scores)[::-1]
    top_hits = sorted_indices[scores[sorted_indices] > 0][:TOP_K]

    print("\nResults:")
    for count, idx in enumerate(top_hits, start=1):
        row = df.iloc[idx]  # look the row up once per hit
        print(f"{count}. Score: {scores[idx]:.4f}")
        print(f"   Date: {row['Date']}")
        print(f"   Heading: {row['Heading']}")
        # Just show the first SNIPPET_CHARS characters
        print(f"   Snippet: {str(row['Article'])[:SNIPPET_CHARS]}...")
        print("")

    if len(top_hits) == 0:
        print("No results found.")

Looking for: ['petrol', 'price']

Results:
1. Score: 12.1742
   Date: 11/30/2016
   Heading: Fuel prices up petrol by Rs 2 diesel by Rs 270
   Snippet: strong>ISLAMABAD: Federal Government has increased the prices of petrol and diesel from December 1.<...

2. Score: 12.0686
   Date: 2/7/2015
   Heading: january 2015 saw record sale of petrol in paki
   Snippet: ISLAMABAD: The consistent fall in the price of petrol saw a record sale in the month of January, wit...

3. Score: 11.8895
   Date: 3/31/2016
   Heading: Govt hikes prices of petrol di
   Snippet: strong>ISLAMABAD: Government has ratcheted up prices of petrol and high speed diesel by Rs1.50 and R...

4. Score: 11.7410
   Date: 3/31/2015
   Heading: petrol price goes up by rs 4 for apri
   Snippet: ISLAMABAD: The new price of petrol effective from midnight Tuesday, October 31, will be Rs 74.29 per...

5. Score: 11.7284
   Date: 3/31/2015
   Heading: petrol price increased by rs 4 for the month of apri
   Snippet: ISLAMABAD: The n