# Required Imports

In [1]:
import os
import warnings
import json
import re
import nltk
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download('stopwords')
# Initialize Word_Net_Lemmatizer
lemmatizer = WordNetLemmatizer()

warnings.filterwarnings('ignore')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\haris\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\haris\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\haris\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Implementing Lexicon

In [2]:
def load_data_from_json(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

In [78]:
json_file_path = r"Files\articles_sampled_10000.json"
json_data = load_data_from_json(json_file_path)

In [3]:
# Function that takes in content, preprocesses it, 
# and converts it to a list of words
def pre_process_string(content):
    # Remove \n and \t
    content = content.replace('\n', ' ')
    content = content.replace('\t', ' ')
    # Remove all non-characters
    content = re.sub(r'[^a-zA-Z\s]', ' ', content)
    # Remove multiple spaces
    content = re.sub(r'\s+', ' ', content)
    # Convert all characters to lowercase
    content = content.lower()
    # Convert the title into a list of words
    content = content.split()
    # Remove one and two character words
    content = [word for word in content if len(word) > 2]
    # Remove stop_words using nltk
    content = [word for word in content if not word in stopwords.words('english')]
    return content
    
# Function that takes in a list of words and adds them to the lexicon
def build_lexicon(words, lexicon):
    # Build the lexicon
    new_words = []
    # Look through the words
    for word in words:
        # Lemmatize the word
        word = lemmatizer.lemmatize(word)
        # if that word is not already in lexicon
        if word not in lexicon and word not in new_words:
            # Then add it
            new_words.append(word)
    lexicon.extend(new_words)
    return lexicon

def sort_lexicon():
    # Load the lexicon
    data = load_data_from_json(r"Files\lexicon.json")
    with open(r"Files\lexicon.json", "w") as file:
        # sort the data
        data = sorted(data)
        json.dump(data, file)

# Function to build forward index from raw articles
def build_forward_index(articles):
    # initialize forward_index
    forward_index = dict()

    #initialize documents
    docs = dict()

    # Load the already existing forward_index
    try:
        data = load_data_from_json(r"Files\forward_index.json")
    except:
        with open(r"Files\forward_index.json", "w") as file:
            json.dump(dict(), file)
        data = load_data_from_json(r"Files\forward_index.json")
        
    # Load the lexicon
    try:
        lexicon = load_data_from_json(r"Files\lexicon.json")
    except:
        with open(r"Files\lexicon.json", "w") as file:
            json.dump(list(), file)
        lexicon = load_data_from_json(r"Files\lexicon.json")

    # Load the documents
    try:
        documents = load_data_from_json(r"Files\documents.json")
    except:
        with open(r"Files\documents.json", "w") as file:
            json.dump(dict(), file)
        documents = load_data_from_json(r"Files\documents.json")
        
    num_articles = len(documents)
    
    # Extract all urls currently indexed
    try:
        article_urls = [article['url'] for article in documents.values()]
    except:
        article_urls = []
        
    # For each article
    for article in articles:
        # if article is not already forward indexed
        if article['url'] not in article_urls:
            # Pre-process the title and content
            title_words = pre_process_string(article['title'])
            content_words = pre_process_string(article['content'])
            # Update the lexicon
            lexicon = build_lexicon(title_words + content_words, lexicon)
            # Lemmatize the words in content and title
            content_words = [lemmatizer.lemmatize(word) for word in content_words]
            title_words = [lemmatizer.lemmatize(word) for word in title_words]
            # Convert the words in title and content to their respective indexes
            content_ids = [lexicon.index(word) for word in content_words]
            title_ids = [lexicon.index(word) for word in title_words]
            # Count the frequencies of words
            frequency = Counter((title_ids * 10) + content_ids)
            forward_index[num_articles] = frequency
            docs[num_articles] = {'title': article['title'], 'url': article['url']}
            # Add the url to the article
            article_urls.append(article['url'])
            num_articles += 1
    data.update(forward_index)
    documents.update(docs)
    # Update the lexicon json file
    with open(r"Files\lexicon.json", "w") as file:
        json.dump(lexicon, file)
    # Update the forward_index json file
    with open(r"Files\forward_index.json", "w") as file:
        json.dump(data, file)
    # Update the documents json file
    with open(r"Files\documents.json", "w") as file:
        json.dump(documents, file)

In [80]:
%%time
build_forward_index(json_data)

CPU times: total: 20min 46s
Wall time: 20min 46s


In [6]:
sort_lexicon()

In [31]:
%%time
lemmatizer = WordNetLemmatizer()
words = ['Hello', 'tests', 'forms', 'testers', 'indexes']
for i in range(len(words)):
    words[i] = lemmatizer.lemmatize(words[i])

CPU times: total: 0 ns
Wall time: 0 ns


In [32]:
words

['Hello', 'test', 'form', 'tester', 'index']

In [4]:
# Load the lexicon
forward_index = load_data_from_json(r"Files\forward_index.json")

In [14]:
article_urls = [article['url'] for article in forward_index.values()]
article_urls

['https://www.upi.com/Top_News/US/2022/05/18/primaries-pennsylvania-idaho-oregon-kentucky-carolina/3671652801442/',
 'https://www.thesun.co.uk/fabulous/19443934/sister-uninvited-wedding-day-white-sequinned-gown/',
 'https://therussophile.org/russia-defences-in-kherson-evacuating-civilians-advancing-kharkov-bakhmut-uk-defence-chief-to-dc-3.html/',
 'https://www.lawenforcementtoday.com/randi-weingarten-states-teachers-are-social-justice-warriors/',
 'https://jacobinmag.com/2022/04/canadian-left-freedom-convoy-populism/',
 'https://www.mediaite.com/election-2022/five-more-herschel-walker-exes-come-forward-to-accuse-him-of-terrifying-violent-behavior-i-saw-a-fist-flying-toward-me/',
 'https://feeds.feedblitz.com/~/686267874/0/usatoday-newstopstories~Historians-draw-parallels-between-Dont-Say-Gay-legislation-and-Floridas-purge-of-gay-teachers-decades-ago/',
 'https://www.charlotteobserver.com/news/politics-government/article268384902.html#storylink=rss',
 'https://nypost.com/2022/04/29/cali

## Implementing inverted index

In [40]:
def build_inverted_index():
    # Load the forward index
    try:
        forward_index = load_data_from_json(r"Files\forward_index.json")
    except:
        with open(r"Files\forward_index.json", "w") as file:
            json.dump(dict(), file)
        forward_index = load_data_from_json(r"Files\forward_index.json")
        
    # Load the inverted index
    try:
        inverted_index = load_data_from_json(r"Files\inverted_index.json")
    except:
        with open(r"Files\inverted_index.json", "w") as file:
            json.dump(dict(), file)
        inverted_index = load_data_from_json(r"Files\inverted_index.json")

    # Iterate through all articles in the forward_index
    for doc_id, data in forward_index.items():
        # Look at all words in an article
        for word_id in data:
            # If that word is not already in inverted index
            if word_id not in inverted_index:
                # Then create a dict at that word_id
                inverted_index[word_id] = dict()
            # And add the doc_id for that word along with frequency if it is not already there
            if doc_id not in inverted_index[word_id]:
                inverted_index[word_id].update({doc_id: data[str(word_id)]})

    # Update the inverted index
    with open(r"Files\inverted_index.json", "w") as file:
        json.dump(inverted_index, file)

In [41]:
%%time 
build_inverted_index()

CPU times: total: 78.1 ms
Wall time: 89.4 ms


### Testing out Single and Multi-Word Search on stored forward and reverse index

In [9]:
# Function for single word queries
def single_word_search(word):

    # Lemmatize the word
    word = lemmatizer.lemmatize(word)
        
    # Find the id of the word in lexicon
    try:
        word_id = lexicon.index(word)
        # Calculate the barrel of the word
        barrel_no = word_id // 10000
        # Update the word_id
        word_id = word_id % 10000
        # Find out in which documents does the word appear
        search_result = barrels[barrel_no][str(word_id)]
    except:
        search_result = None
    
    if search_result is None: 
        return []
        
    # Rank these documents
    # Sort the dictionary by values (descending order)
    sorted_tuples = sorted(search_result.items(), key=lambda x: x[1], reverse=True)
    
    # Convert the sorted list of tuples back to a dictionary
    ranked_result = dict(sorted_tuples)
    # Extract the article ids
    ranked_articles = ranked_result.keys()
    ranked_articles = list(ranked_articles)

    article_ids = ranked_articles
    titles = [documents[article]['title'] for article in article_ids]
    urls = [documents[article]['url'] for article in article_ids]

    return titles

# Function for multi-word queries
def multi_word_search(query): 
    result = []

    # Preprocess the query
    words = pre_process_string(query)

    # Remove those words that are not in lexicon
    words = [word for word in words if word in lexicon]
    # Convert each word to its word_id
    word_ids = [lexicon.index(word) for word in words]
    # Calculate barrel_no of each word and its index in that barrel
    barrel_nos = [word_id // 10000 for word_id in word_ids]
    word_ids = [word_id % 10000 for word_id in word_ids]

    # Check the first word
    if word_ids:
        result = loaded_barrels[barrel_nos[0]][str(word_ids[0])]
        # Check the rest of the words
        for i, word_id in enumerate(word_ids[1:], start = 1):
            # Produce the result for current word
            current_result = loaded_barrels[barrel_nos[i]][str(word_id)]
            # Include those articles that are also in the result of current word
            result.update({d:result[d]+current_result[d] for d in result.keys() if d in current_result.keys()})

    # rank the results
    result = rank_results(result)

    article_ids = result
    titles = [documents[article]['title'] for article in article_ids]
    urls = [documents[article]['url'] for article in article_ids]
                
    return titles

def multi_word_search(): 
    query = request.args.get('word')
    result = []
    words = query.split()

    if words:
        result = inverted_index.search(words[0]) if inverted_index.search(words[0]) else []
        for word in words[1:]:
            current_result = inverted_index.search(word)
            if current_result:
                result = [d for d in result if d in current_result]

    ranked_results = rank_results(remove_duplicates(result, key="article_id"))

    article_ids = [result['article_id'] for result in ranked_results]
    titles = [result['title'] for result in ranked_results]
    urls = [result['url'] for result in ranked_results]

    json_response = jsonify(article_ids=article_ids, titles=titles, urls=urls)

    return json_response 

In [4]:
barrels = []
barrel_files = os.listdir(r"Files\Barrels")
# Load all barrels that currently exist
for barrel in barrel_files:
    barrels.append(load_data_from_json(os.path.join(r"Files\Barrels", barrel)))
# Load lexicon
lexicon = load_data_from_json(r"Files\lexicon.json")
# Load the documents
documents = load_data_from_json(r"Files\documents.json")


In [10]:
single_word_search("car")

["'Extreme anxiousness and high excitement' - Secret Aerodynamicist on car changes that might be F1's biggest ever",
 'Surveillance Detection – The Professional Course Of Real Life Experience – Part Twelve, Preparing The Murder Of Irah Sok',
 'Car Sick: The Toxic Soup Inside Your Car',
 'New Tesla Hack:  Car Will “accept entirely new keys—with no authentication required and zero indication given by the in-car display” (Video)',
 'Equity is goal, not mandate, in California electric car rule',
 'With one final auction, NCDOT is now rid of former Ringling Bros. circus train cars',
 'Edmunds: Should you fix your car — or buy a new one?',
 'Two of NCDOT’s former circus train cars have found new homes. Others still available.',
 'How transit-friendly is Charlotte? I (mostly) left my car parked for a week to find out',
 'On the road in San Francisco, riding in a driverless taxi',
 'NC Transportation Museum’s ‘Jim Crow’ railroad car gets special historic designation',
 '$29,000 for a used car?

## Implementing Barrels

In [83]:
def build_inverted_index():
    # Load the forward index
    try:
        forward_index = load_data_from_json(r"Files\forward_index.json")
    except:
        with open(r"Files\forward_index.json", "w") as file:
            json.dump(dict(), file)
        forward_index = load_data_from_json(r"Files\forward_index.json")

    barrels = []
    barrel_files = os.listdir(r"Files\Barrels")
    # Load all barrels that currently exist
    for barrel in barrel_files:
        barrels.append(load_data_from_json(os.path.join(r"Files\Barrels", barrel)))

    # Iterate through all articles in the forward_index
    for doc_id, data in forward_index.items():
        # Look at all words in an article
        for word_id in data:
            # Calculate the barrel number for that word
            barrel_no = int(word_id) // 10000
            barrel_filename = f"barrel_{barrel_no}.json"
            
            # Check if that barrel exists, if not then create it
            barrel_path = os.path.join(r"Files\Barrels", barrel_filename)
            if not os.path.exists(barrel_path):
                with open(barrel_path, "w") as file:
                    json.dump(dict(), file)
                # Load the newly created barrel
                barrels.append(load_data_from_json(barrel_path))
            # If that word is not already in that barrel
            if word_id not in barrels[barrel_no]:
                # Then create a dict at that word_id
                barrels[barrel_no][word_id] = dict()
            # And add the doc_id for that word along with frequency if it is not already there
            if doc_id not in barrels[barrel_no][word_id]:
                barrels[barrel_no][word_id].update({doc_id: data[word_id]})

    # Update all barrels
    i = 0
    for barrel in barrel_files:
        with open(os.path.join(r"Files\Barrels", barrel), "w") as file:
            json.dump(barrels[i], file)
            i += 1

In [84]:
%%time 
build_inverted_index()

CPU times: total: 57.6 s
Wall time: 57.7 s
