# Required Imports

In [1]:
import os
import warnings
import json
import re
import nltk
from collections import Counter
from nltk.corpus import stopwords
import validators
from nltk.stem import WordNetLemmatizer
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download('stopwords')
# Initialize Word_Net_Lemmatizer
lemmatizer = WordNetLemmatizer()

warnings.filterwarnings('ignore')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\athar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\athar\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\athar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load the files

In [2]:
def load_data_from_json(file_path):
    """Read a JSON document from disk and return the decoded Python object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` decodes from the file (dict, list, ...).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file is not valid UTF-8 encoded JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    # Decode explicitly as UTF-8 rather than with the platform default text
    # encoding, so files containing non-ASCII text load the same everywhere.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

In [3]:
num_barrels = 1000
barrels = [None] * num_barrels
barrel_files = os.listdir(r"Files\Barrels")
# Load every barrel that currently exists into the slot given by the number in
# its file name (barrel_00042.json -> barrels[42]). The position of a file in
# the directory listing must not be used as the barrel number: listing order
# is arbitrary and some barrels may not have been created yet.
for barrel in barrel_files:
    name_match = re.fullmatch(r"barrel_(\d+)\.json", barrel)
    if name_match is None:
        # Not a barrel file; leave it alone
        continue
    barrel_no = int(name_match.group(1))
    if barrel_no < num_barrels:
        barrels[barrel_no] = load_data_from_json(os.path.join(r"Files\Barrels", barrel))
# Load lexicon
lexicon = load_data_from_json(r"Files\lexicon.json")
# Load the documents
documents = load_data_from_json(r"Files\documents.json")


KeyboardInterrupt: 

In [3]:
# Number of barrels the inverted index is split into; the functions below
# place a word with id `w` in barrel `w % num_barrels`.
num_barrels = 1000
# Load lexicon (a list of words; a word's position in the list is its word id)
lexicon = load_data_from_json(r"Files\lexicon.json")
# Load the documents (document id -> {'title', 'url'}, as written by build_forward_index)
documents = load_data_from_json(r"Files\documents.json")
# Names of the barrel files currently present on disk
barrel_files = os.listdir(r"Files\Barrels")

# Implementing Lexicon and Forward Index

In [4]:
# Function that takes in content, preprocesses it,
# and converts it to a list of words
def pre_process_string(content):
    """Tokenize raw text into lowercase words, dropping noise and stop words.

    Every character that is not an ASCII letter (digits, punctuation,
    newlines, tabs, non-ASCII characters) is treated as a separator. Words of
    one or two characters and English stop words are removed.

    Args:
        content: The raw text (e.g. an article title or body).

    Returns:
        list[str]: The remaining lowercase words, in their original order.
    """
    # Replace every non-letter with a space, lowercase, and split on whitespace
    tokens = re.sub(r'[^a-zA-Z]', ' ', content).lower().split()
    # Fetch the stop-word list once per call and keep it in a set; evaluating
    # stopwords.words() inside the filter would rebuild the list for every
    # token and scan it linearly.
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if len(word) > 2 and word not in stop_words]
    
# Function that takes in a list of words and adds them to the lexicon
def build_lexicon(words):
    """Append the lemmas of ``words`` that are not yet known to the lexicon.

    Each word is lemmatized with the module-level ``lemmatizer``; lemmas that
    are not already in the module-level ``lexicon`` list are appended to its
    end in order of first appearance, so existing word ids (list positions)
    never change.

    Args:
        words: Iterable of already pre-processed words.

    Returns:
        None. The global ``lexicon`` list is extended in place.
    """
    # One pass to build a set gives O(1) membership tests, instead of scanning
    # the whole lexicon list once per incoming word.
    known = set(lexicon)
    new_words = []
    for word in words:
        lemma = lemmatizer.lemmatize(word)
        if lemma not in known:
            known.add(lemma)
            new_words.append(lemma)
    lexicon.extend(new_words)
    return

# Function to build forward index from raw articles
def build_forward_index(articles):
    """Index new articles into the forward index, lexicon and document store.

    For every article whose URL is not already present in the module-level
    ``documents`` mapping, the title and content are pre-processed and
    lemmatized, unseen lemmas are appended to the module-level ``lexicon``
    (via ``build_lexicon``), and a ``{word_id: count}`` entry is recorded
    under a new document id. Title words are counted 10 times each.

    Side effects: extends ``lexicon``, updates ``documents``, prints the newly
    added documents, and rewrites the lexicon, forward-index and documents
    JSON files under ``Files``.

    Args:
        articles: Iterable of dicts with ``'title'``, ``'content'`` and
            ``'url'`` keys.

    Returns:
        None.
    """
    forward_index_path = r"Files\forward_index.json"

    # Load the already existing forward index; start from an empty one when the
    # file is missing or not valid JSON (it is rewritten at the end anyway).
    try:
        data = load_data_from_json(forward_index_path)
    except (OSError, ValueError):
        data = dict()

    num_articles = len(documents)

    # URLs currently indexed, kept in a set for O(1) duplicate checks
    try:
        indexed_urls = {doc['url'] for doc in documents.values()}
    except (AttributeError, KeyError, TypeError):
        indexed_urls = set()

    # word -> word id lookup. setdefault keeps the first position of a word,
    # which is the id lexicon.index() would report.
    word_to_id = dict()
    for position, known_word in enumerate(lexicon):
        word_to_id.setdefault(known_word, position)

    forward_index = dict()
    docs = dict()

    for article in articles:
        # Skip articles that are already forward indexed
        if article['url'] in indexed_urls:
            continue

        # Pre-process the title and content
        title_words = pre_process_string(article['title'])
        content_words = pre_process_string(article['content'])

        # Update the lexicon; new lemmas are appended to its end, so only the
        # new tail has to be added to the lookup table.
        first_new_id = len(lexicon)
        build_lexicon(title_words + content_words)
        for position in range(first_new_id, len(lexicon)):
            word_to_id.setdefault(lexicon[position], position)

        # Convert the lemmatized words to their word ids
        title_ids = [word_to_id[lemmatizer.lemmatize(word)] for word in title_words]
        content_ids = [word_to_id[lemmatizer.lemmatize(word)] for word in content_words]

        # Count word frequencies, weighting each title word 10x
        frequency = Counter((title_ids * 10) + content_ids)

        # Document ids are strings everywhere, matching the keys JSON gives
        # back when these files are reloaded.
        doc_id = str(num_articles)
        forward_index[doc_id] = frequency
        docs[doc_id] = {'title': article['title'], 'url': article['url']}
        indexed_urls.add(article['url'])
        num_articles += 1

    data.update(forward_index)
    print(docs)
    documents.update(docs)
    # Update the lexicon json file
    with open(r"Files\lexicon.json", "w") as file:
        json.dump(lexicon, file)
    # Update the forward_index json file
    with open(forward_index_path, "w") as file:
        json.dump(data, file)
    # Update the documents json file
    with open(r"Files\documents.json", "w") as file:
        json.dump(documents, file)

In [None]:
%%time
# NOTE(review): `json_data` is not defined in any cell visible in this
# notebook, so this cell raises NameError on a fresh kernel (Restart & Run
# All). Presumably it was a list of article dicts loaded in a since-removed
# cell - confirm and add the loading cell back.
build_forward_index(json_data)

CPU times: total: 1.23 s
Wall time: 1.25 s


## Implementing Barrels

In [5]:
def build_inverted_index_with_barrels():
    """Merge the pending forward index into the on-disk inverted-index barrels.

    Reads the forward index (``{doc_id: {word_id: frequency}}``) and, for
    every word, records ``doc_id -> frequency`` under that word in barrel
    number ``int(word_id) % num_barrels``. Entries already present in a barrel
    are left unchanged. Each touched barrel is written back to
    ``barrel_<5-digit number>.json`` in the barrel directory, and the forward
    index file is then reset to an empty dict.

    Side effects: creates/rewrites barrel files, appends the names of newly
    created barrels to the module-level ``barrel_files`` list, and clears the
    forward index file.

    Returns:
        None. Returns early, changing nothing, when the forward index file is
        missing or not valid JSON.
    """
    barrel_dir = r"Files\Barrels"
    forward_index_path = r"Files\forward_index.json"

    # Load the forward index
    try:
        forward_index = load_data_from_json(forward_index_path)
    except (OSError, ValueError):
        return

    os.makedirs(barrel_dir, exist_ok=True)

    def barrel_file_name(barrel_no):
        # File name is derived from the barrel number, zero-padded to 5 digits
        return f"barrel_{str(barrel_no).zfill(5)}.json"

    # Barrels touched during this run, keyed by barrel number
    loaded_barrels = dict()

    # Iterate through all articles in the forward_index
    for doc_id, data in forward_index.items():
        # Look at all words in an article
        for word_id, frequency in data.items():
            barrel_no = int(word_id) % num_barrels
            barrel = loaded_barrels.get(barrel_no)
            if barrel is None:
                # First use of this barrel in this run: load it from disk, or
                # start a new one. The file system is consulted once per
                # barrel rather than once per word.
                file_name = barrel_file_name(barrel_no)
                barrel_path = os.path.join(barrel_dir, file_name)
                if os.path.exists(barrel_path):
                    barrel = load_data_from_json(barrel_path)
                else:
                    barrel = dict()
                    barrel_files.append(file_name)
                loaded_barrels[barrel_no] = barrel
            # Add the doc_id with its frequency unless it is already there
            barrel.setdefault(word_id, dict()).setdefault(doc_id, frequency)

    # Write every touched barrel back to the file named after its own number.
    # Indexing the directory listing by barrel number would pick the wrong
    # file (or raise IndexError) whenever some barrels do not exist yet.
    for barrel_no, barrel in loaded_barrels.items():
        with open(os.path.join(barrel_dir, barrel_file_name(barrel_no)), "w") as file:
            json.dump(barrel, file)

    # Clear the forward_index
    with open(forward_index_path, "w") as file:
        json.dump(dict(), file)

In [6]:
%%time 
build_inverted_index_with_barrels()

CPU times: total: 17min 12s
Wall time: 17min 16s


### Testing Single- and Multi-Word Search on the Stored Inverted Index (Barrels)

In [6]:
def rank_results(search_result):
    """Return the keys of ``search_result`` ordered from highest to lowest value.

    Args:
        search_result: Mapping of article id to its score.

    Returns:
        list: The article ids sorted by descending score. Ids with equal
        scores keep the relative order they have in ``search_result``.
    """
    # Sorting the keys by their mapped value is stable, so ties stay in
    # insertion order even with reverse=True.
    return sorted(search_result, key=search_result.get, reverse=True)

In [7]:
# Function for single word queries
def single_word_search(word):
    """Return the titles of the documents containing ``word``, best match first.

    The word is lowercased and lemmatized, looked up in the module-level
    ``lexicon`` to get its word id, and its postings are read from barrel
    ``word_id % num_barrels``. Documents are ordered by ``rank_results``.

    Args:
        word: A single query word.

    Returns:
        list[str]: Document titles in ranked order; an empty list when the
        word is not in the lexicon, its barrel file is missing or unreadable,
        or the barrel has no entry for the word.
    """
    # Lowercase and lemmatize the word
    lemma = lemmatizer.lemmatize(word.lower())

    try:
        # Find the id of the word in lexicon
        word_id = lexicon.index(lemma)
        # Locate and load the barrel that holds this word
        barrel_no = word_id % num_barrels
        barrel_filename = f"barrel_{str(barrel_no).zfill(5)}.json"
        barrel_path = os.path.join(r"Files\Barrels", barrel_filename)
        # Find out in which documents the word appears
        search_result = load_data_from_json(barrel_path)[str(word_id)]
    except (ValueError, KeyError, OSError):
        # ValueError: word not in lexicon (or barrel is not valid JSON)
        # KeyError:   barrel has no entry for this word
        # OSError:    barrel file does not exist / cannot be read
        return []

    if search_result is None:
        return []

    return [documents[article]['title'] for article in rank_results(search_result)]

# Function for multi-word queries
def multi_word_search(query):
    """Return the titles of documents matching a multi-word query, best first.

    The query is pre-processed and lemmatized (the lexicon stores lemmas),
    words missing from the module-level ``lexicon`` are dropped, and the
    postings of each remaining word are read from its barrel. The candidate
    set is the documents containing the first known word; every later word
    adds its frequency to the score of candidates that also contain it.

    Args:
        query: Free-text query string.

    Returns:
        list[str]: Document titles in ranked order; an empty list when no
        query word is known or the first known word has no postings.
    """
    # Preprocess the query and lemmatize it like the indexed text
    words = [lemmatizer.lemmatize(word) for word in pre_process_string(query)]

    # Convert each known word to its word_id, dropping words not in the lexicon
    word_ids = []
    for word in words:
        try:
            word_ids.append(lexicon.index(word))
        except ValueError:
            continue

    # Nothing searchable (empty query, only stop words, or only unknown words)
    if not word_ids:
        return []

    # Barrels loaded for this query, keyed by barrel number
    loaded_barrels = dict()

    def postings_for(word_id):
        # Return {doc_id: frequency} for a word, or {} when unavailable
        barrel_no = word_id % num_barrels
        if barrel_no not in loaded_barrels:
            barrel_filename = f"barrel_{str(barrel_no).zfill(5)}.json"
            barrel_path = os.path.join(r"Files\Barrels", barrel_filename)
            try:
                loaded_barrels[barrel_no] = load_data_from_json(barrel_path)
            except (OSError, ValueError):
                # Missing or unreadable barrel: treat the word as having no postings
                loaded_barrels[barrel_no] = dict()
        return loaded_barrels[barrel_no].get(str(word_id), dict())

    # Start from a copy of the first word's postings so the loaded barrel is
    # not modified while scores are accumulated.
    # NOTE(review): documents that lack the first known word are never
    # returned, even if they contain every other query word - confirm this
    # asymmetry is intended.
    result = dict(postings_for(word_ids[0]))
    for word_id in word_ids[1:]:
        current_result = postings_for(word_id)
        # Boost candidates that also contain the current word
        for doc_id in result:
            if doc_id in current_result:
                result[doc_id] += current_result[doc_id]

    # Rank the results and map article ids to titles
    return [documents[article]['title'] for article in rank_results(result)]

In [9]:
%%time
single_word_search("pretty")

CPU times: total: 15.6 ms
Wall time: 25.5 ms


['Biden speaks on guns, gets pretty much everything wrong',
 "Wimbledon: Cameron Norrie says reaching semi-finals is 'pretty sick' after Novak Djokovic loss",
 "Brooks: Cruz 'Has Been Pretty Much Right' on Russia, GOP Hasn't Generally Been Soft on Russia",
 'Elon Musk Is Foolishly Supporting the Nazis in Ukraine – Pretty Dumb for a Smart Guy – or Is It That Even Billionaires Can Be Bought or Made From Pentagon Contracts and Taxpayer Subsidies?',
 'Australian National Review Founder says,“It’s Pretty Obvious These Cyber Attacks are Done by the Same Western Elite Controlled Deep State, as a Predecessor to Role Out Their Solution of Digital ID Enslavement',
 '7 Actors Who Were Considered Too “Ugly” or Too “Pretty” for a Role',
 "LIV Golf: Rory McIlroy calls players who switch 'pretty duplicitous'",
 'WATCH OUT! Fauci Says ‘Pretty Bad Flu Season’ May Be Imminent',
 "Dr. Scott Gottlieb: Rising Monkeypox Cases Suggest It's Spread 'Pretty Wide'",
 "Trump Jr. Warns of Possible Blackmailing of 

In [13]:
%%time
multi_word_search("Writing your first django app")

CPU times: total: 78.1 ms
Wall time: 86.3 ms


['Writing your first Django app',
 'The History of Money, Warlord Banksters, and the Worship of Mammon',
 'The Killing Fields of Samoa',
 '53 Years of Knowledge: Vaccination and Chronic Disease',
 'Jill Biden: I didn’t expect ‘healing role’ as first lady',
 'England in Pakistan: Andy Zaltzman on an astonishing and record-breaking first Test',
 "University writing instructors are no longer grading students' writing",
 'PROF. ELLWANGER: Chatbot’s essay doesn’t make the grade: Artificial intelligence and the art of writing',
 'Exclusive — Kevin McCarthy Lays Out GOP ‘Commitment to America’: ‘A Clear Contrast’ Between Republicans, Democrats for Midterms',
 'University writing center prefers applicants have experience in anti-racism',
 'Final goodbye: Recalling influential people who died in 2021',
 'I Tried The Peloton App For Two Weeks Without A Bike Or Treadmill — Here Are My Honest Thoughts',
 'Writing on war and living in a world from Hell',
 'Former CIA officer and satirist Alex Finle

## Implementing Add Files

In [11]:
def add_content(file, url, title, content):
    """Add new articles to the search index from a JSON file or a single entry.

    When ``file`` is given it is loaded with ``load_data_from_json`` and
    passed on as the list of articles; otherwise ``url``, ``title`` and
    ``content`` must all be provided and form a single article. The articles
    are forward indexed and then merged into the inverted-index barrels.
    Progress and problems are reported with ``print``.

    Args:
        file: Path of a JSON file holding the articles, or a falsy value.
        url: URL of the single article (used when ``file`` is falsy).
        title: Title of the single article.
        content: Body text of the single article.

    Returns:
        None.
    """
    if file:
        # Load the uploaded file
        data = load_data_from_json(file)
    elif url and title and content:
        # Validate the url before accepting the entry
        if not validators.url(url):
            print("Please provide a valid url")
            return
        data = [{"title": title, "content": content, "url": url}]
    else:
        print("Please provide a file or url, title, and content")
        return

    # Build forward and inverted index on it
    try:
        build_forward_index(data)
    except Exception as exc:
        # Report the cause instead of hiding it. KeyboardInterrupt and
        # SystemExit are not Exception subclasses and still propagate.
        print(f"Error building forward index: {exc!r}")
        return
    build_inverted_index_with_barrels()

    print("Successfully added content")


In [12]:
add_content(None, "https://docs.djangoproject.com/en/4.2/intro/tutorial01/", "Writing your first Django app", "This tutorial is written for Django 4.2, which supports Python 3.8 and later. If the Django version doesn’t match, you can refer to the tutorial for your version of Django by using the version switcher at the bottom right corner of this page, or update Django to the newest version. If you’re using an older version of Python, check What Python version can I use with Django? to find a compatible version of Django.")

{'158506': {'title': 'Writing your first Django app', 'url': 'https://docs.djangoproject.com/en/4.2/intro/tutorial01/'}}
Successfully added content


In [26]:
url = "https://docs.djangoproject.com/en/4.2/intro/tutorial01/"
title =  "Writing your first Django app"
content = "This tutorial is written for Django 4.2, which supports Python 3.8 and later. If the Django version doesn’t match, you can refer to the tutorial for your version of Django by using the version switcher at the bottom right corner of this page, or update Django to the newest version. If you’re using an older version of Python, check What Python version can I use with Django? to find a compatible version of Django."
data = [{"title":title, "content":content, "url":url}]

In [27]:
data

[{'title': 'Writing your first Django app',
  'content': 'This tutorial is written for Django 4.2, which supports Python 3.8 and later. If the Django version doesn’t match, you can refer to the tutorial for your version of Django by using the version switcher at the bottom right corner of this page, or update Django to the newest version. If you’re using an older version of Python, check What Python version can I use with Django? to find a compatible version of Django.',
  'url': 'https://docs.djangoproject.com/en/4.2/intro/tutorial01/'}]

In [16]:
lexicon.index('django')

20368

In [8]:
len(documents)

158507