In [1]:
import json
import time

In [3]:
def tokenization(text):
    """Split text into lowercase tokens.

    The text is lowercased, every character in a fixed punctuation set is
    turned into a space, and the result is split on runs of whitespace.
    Characters outside that set (apostrophes, hyphens, ...) stay inside
    their tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings; empty when the text has no tokens.
    """
    punctuation = '.,!?;:"()[]{}'
    # One translation pass swaps each punctuation mark for a blank.
    blank_out = str.maketrans(punctuation, ' ' * len(punctuation))
    return text.lower().translate(blank_out).split()

In [10]:
def Index_Creation(corpus):
    """Build an inverted index from a collection of documents.

    Each document is identified by its position in ``corpus`` and is split
    into tokens with ``tokenization``.

    Args:
        corpus: An iterable of document strings.

    Returns:
        A dict mapping each token to the list of ids of the documents that
        contain it. Every id appears at most once per token and the ids are
        in ascending order.
    """
    inverted_index = {}
    for doc_id, document in enumerate(corpus):
        for word in tokenization(document):
            postings = inverted_index.setdefault(word, [])
            # Doc ids are visited in increasing order, so a repeat of the
            # current id can only sit at the end of the posting list. Checking
            # the last entry avoids scanning the whole list for every token.
            if not postings or postings[-1] != doc_id:
                postings.append(doc_id)
    return inverted_index

In [11]:
def Boolean_Search(query, index):
    """Evaluate a simple boolean query against an inverted index.

    Query syntax:
        * Whitespace separates alternatives (OR).
        * ``&`` joins terms that must all occur in the same document (AND).
          Spaces around ``&`` are allowed, so ``"king & salman"`` is read the
          same way as ``"king&salman"``.
        * Empty operands left by a stray ``&`` (``"king&"``, ``"a&&b"``) are
          ignored instead of forcing the whole group to match nothing.

    The query is lowercased before lookup. Terms missing from the index match
    no documents.

    Args:
        query: The query string.
        index: A mapping from term to an iterable of document ids.

    Returns:
        A sorted list of the ids of all matching documents, without
        duplicates. Ids must be mutually comparable (the integer ids produced
        by enumerating a corpus are).
    """
    # Attach every '&' directly to its operands so that the whitespace split
    # below cannot break an AND group apart.
    normalized = '&'.join(part.strip() for part in query.lower().split('&'))
    matches = set()

    for keyword in normalized.split():  # Whitespace means OR
        # A plain keyword is simply an AND group with a single term.
        terms = [term for term in keyword.split('&') if term]
        if not terms:
            continue  # the group was nothing but '&' characters

        and_matches = set(index.get(terms[0], []))
        for term in terms[1:]:
            and_matches &= set(index.get(term, []))

        matches |= and_matches

    # Sort so callers get a reproducible order rather than set order.
    return sorted(matches)


In [14]:
def load_dataset(file_path):
    """Load the ``short_description`` field from a JSON-lines file.

    The file is read line by line and each line is parsed as its own JSON
    document. A line contributes to the result only when it parses to a JSON
    object that has a ``"short_description"`` key. Lines that are not valid
    JSON, or that hold some other JSON value (number, string, array, ...),
    are skipped.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with the ``short_description`` value of every accepted line,
        in file order. If the file does not exist, a message is printed and
        an empty list is returned.
    """
    corpus = []
    try:
        # JSON text is UTF-8; do not depend on the platform default encoding.
        # 'utf-8-sig' additionally drops a leading byte-order mark, which
        # would otherwise make the first line fail to parse.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            for line in file:
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # Only JSON objects can carry the field; a bare number or an
                # array would raise TypeError on the key lookup.
                if isinstance(data, dict) and "short_description" in data:
                    corpus.append(data["short_description"])
    except FileNotFoundError:
        print("File not found:", file_path)
    return corpus

In [18]:
def experiment(queries, index):
    """Run every query through ``Boolean_Search`` and time each call.

    Args:
        queries: An iterable of query strings.
        index: The inverted index handed to ``Boolean_Search`` unchanged.

    Returns:
        A list with one dict per query, in input order, holding the keys
        ``"query"`` (the query as given), ``"num_results"`` (number of
        results returned) and ``"time_microseconds"`` (elapsed time of the
        search call, measured with ``time.perf_counter``).
    """
    def _measure(query):
        # Only the search call itself sits between the two clock readings.
        began = time.perf_counter()
        hits = Boolean_Search(query, index)
        finished = time.perf_counter()
        return {
            "query": query,
            "num_results": len(hits),
            "time_microseconds": (finished - began) * 10**6,
        }

    return [_measure(query) for query in queries]

# Driver: load the corpus, build the inverted index, then time a fixed set of
# sample queries and print one summary line per query.
if __name__ == "__main__":
    # "news.json" is resolved relative to the current working directory.
    # load_dataset returns an empty list if the file is missing, in which
    # case the index is empty and every query reports 0 results.
    data_collection = load_dataset("news.json")
    term_index = Index_Creation(data_collection)
    
    # Query syntax understood by Boolean_Search: '&' means AND, a space means OR.
    sample_queries = [
        "king&salman",
        "football basketball",
        "iphone ipad",
        "artificial&intelligence",
        "covid vaccine"
    ]
    
    experiment_results = experiment(sample_queries, term_index)

    # Each timing is a single measurement of one search call, printed in
    # microseconds with two decimals.
    for result in experiment_results:
        print("Query: {}, Results: {}, Time: {:.2f} µs".format(
            result["query"], result["num_results"], result["time_microseconds"]))


Query: king&salman, Results: 8, Time: 161.80 µs
Query: football basketball, Results: 444, Time: 185.80 µs
Query: iphone ipad, Results: 179, Time: 89.10 µs
Query: artificial&intelligence, Results: 14, Time: 65.30 µs
Query: covid vaccine, Results: 121, Time: 50.60 µs
