# Required Imports

In [1]:
import os
import warnings
import json
import re
import nltk
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources this notebook relies on: WordNet and its
# multilingual extension (omw-1.4), plus the stop-word lists.
for _nltk_resource in ("wordnet", "omw-1.4", "stopwords"):
    nltk.download(_nltk_resource)

# Single shared lemmatizer, read as a global by the indexing functions below
lemmatizer = WordNetLemmatizer()

# Suppress every Python warning for the remainder of the session
warnings.filterwarnings('ignore')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\athar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\athar\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\athar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Implementing the Lexicon and Forward Index

In [2]:
def load_data_from_json(file_path):
    """Read a JSON document from disk and return the decoded Python object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` (in this notebook: a list for
        the lexicon, a dict for the indexes).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # JSON text is UTF-8. Without an explicit encoding, open() uses the
    # platform default (e.g. cp1252 on Windows), which fails on or garbles
    # non-ASCII characters in article titles and content.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

In [3]:
# Location of the sampled-articles dump, relative to the working directory.
# NOTE(review): the backslash only acts as a directory separator on Windows.
json_file_path = r"Files\articles_sampled_100.json"
# Parsed articles; build_forward_index iterates this and reads each item's
# 'title', 'url' and 'content' keys.
json_data = load_data_from_json(json_file_path)

In [4]:
# Function that takes in content, preprocesses it,
# and converts it to a list of words
def pre_process_string(content, stop_words=None):
    """Tokenize raw text into lowercase words, dropping short and stop words.

    A token is a maximal run of ASCII letters; digits, punctuation,
    whitespace and non-ASCII characters all act as separators. Tokens are
    lowercased, then tokens of one or two letters and stop words are
    removed. Order and duplicates are preserved.

    Args:
        content: Text to tokenize. ``None`` or an empty string yields ``[]``.
        stop_words: Optional iterable of lowercase words to discard.
            Defaults to NLTK's English stop-word list.

    Returns:
        list[str]: The surviving tokens in their original order.
    """
    if not content:
        return []
    if stop_words is None:
        # Fetch the list once per call. The previous version evaluated
        # stopwords.words('english') again for every single token.
        stop_words = stopwords.words('english')
    # Set membership is constant-time; the stop-word list was scanned linearly.
    stop_words = frozenset(stop_words)
    # Extract letter runs BEFORE lowercasing: lowercasing first could turn
    # some non-ASCII characters into ASCII letters and change the tokens.
    tokens = (token.lower() for token in re.findall(r'[a-zA-Z]+', content))
    return [token for token in tokens
            if len(token) > 2 and token not in stop_words]
    
# Function that takes in a list of words and adds them to the lexicon
def build_lexicon(words, lexicon):
    """Append the lemmas of ``words`` that ``lexicon`` does not contain yet.

    Each word is lemmatized with the module-level ``lemmatizer``. New lemmas
    are appended in first-seen order, so the positions of existing entries
    never change.

    Args:
        words: Iterable of word strings.
        lexicon: List of known lemmas; extended in place.

    Returns:
        The same ``lexicon`` list object, after extension.
    """
    # A set gives constant-time membership checks; testing against the two
    # lists made this quadratic in the lexicon size.
    known = set(lexicon)
    new_words = []
    for word in words:
        lemma = lemmatizer.lemmatize(word)
        if lemma not in known:
            known.add(lemma)
            new_words.append(lemma)
    # Extend only at the end so a lemmatizer failure leaves lexicon untouched
    lexicon.extend(new_words)
    return lexicon

def sort_lexicon():
    r"""Rewrite ``Files\lexicon.json`` with its entries in ascending order.

    WARNING: ``build_forward_index`` stores word IDs as positions in the
    lexicon list. Sorting reorders those positions, so any forward or
    inverted index built before this call refers to different words
    afterwards.

    Raises:
        FileNotFoundError: If the lexicon file does not exist.
        TypeError: If the entries cannot be compared with each other; the
            file on disk is left unchanged in that case.
    """
    lexicon_path = r"Files\lexicon.json"
    # Sort in memory BEFORE opening for writing: mode "w" truncates the file
    # immediately, so a failing sort used to leave an empty lexicon behind.
    ordered = sorted(load_data_from_json(lexicon_path))
    with open(lexicon_path, "w") as file:
        json.dump(ordered, file)

# Function to build forward index from raw articles
def build_forward_index(articles, title_weight=10):
    r"""Add new articles to ``Files\forward_index.json`` and grow the lexicon.

    Articles whose URL is already stored are skipped. Every new article gets
    the next document ID (continuing from the number of stored entries) and
    an entry ``{'title', 'url', 'hitlist'}``. The hitlist maps word ID
    (position of the lemma in the lexicon) to a count in which each title
    occurrence counts ``title_weight`` times and each content occurrence
    counts once. Both ``Files\lexicon.json`` and the forward index are
    rewritten at the end.

    Args:
        articles: Iterable of dicts with 'title', 'url' and 'content' keys.
        title_weight: Integer weight applied to words found in the title.

    Raises:
        json.JSONDecodeError: If an existing index or lexicon file is
            corrupt. It is no longer silently replaced by an empty one.
        FileNotFoundError: If the ``Files`` location cannot be written.
    """
    forward_index_path = r"Files\forward_index.json"
    lexicon_path = r"Files\lexicon.json"

    # Only a missing file means "start from scratch". A bare except also hid
    # corrupt files (and Ctrl-C) and then overwrote the existing index.
    try:
        data = load_data_from_json(forward_index_path)
    except FileNotFoundError:
        data = dict()
        # Create the file right away so an unwritable location fails early
        with open(forward_index_path, "w") as file:
            json.dump(data, file)
    try:
        lexicon = load_data_from_json(lexicon_path)
    except FileNotFoundError:
        lexicon = list()
        with open(lexicon_path, "w") as file:
            json.dump(lexicon, file)

    # lemma -> first position in the lexicon; replaces a linear
    # lexicon.index() scan for every token.
    word_ids = dict()
    for position, lemma in enumerate(lexicon):
        word_ids.setdefault(lemma, position)

    num_articles = len(data)
    # URLs already indexed; a set makes the duplicate check constant-time
    try:
        indexed_urls = {article['url'] for article in data.values()}
    except (AttributeError, KeyError, TypeError):
        indexed_urls = set()

    new_entries = dict()
    for article in articles:
        url = article['url']
        if url in indexed_urls:
            continue
        # Pre-process the title and content
        title_words = pre_process_string(article['title'])
        content_words = pre_process_string(article['content'])
        # Update the lexicon, then register the lemmas it appended
        known_count = len(lexicon)
        lexicon = build_lexicon(title_words + content_words, lexicon)
        for position in range(known_count, len(lexicon)):
            word_ids.setdefault(lexicon[position], position)
        # Convert the words to the IDs of their lemmas
        title_ids = [word_ids[lemmatizer.lemmatize(word)] for word in title_words]
        content_ids = [word_ids[lemmatizer.lemmatize(word)] for word in content_words]
        # Count frequencies, title words weighted by title_weight
        frequency = Counter((title_ids * title_weight) + content_ids)
        # JSON object keys are strings; use str here so new keys have the
        # same type as the keys loaded from the existing file.
        new_entries[str(num_articles)] = {
            'title': article['title'],
            'url': url,
            'hitlist': frequency,
        }
        indexed_urls.add(url)
        num_articles += 1
    data.update(new_entries)

    # Update the lexicon json file
    with open(lexicon_path, "w") as file:
        json.dump(lexicon, file)
    # Update the forward_index json file
    with open(forward_index_path, "w") as file:
        json.dump(data, file)

In [5]:
%%time
# Index the sampled articles; URLs already present in the stored forward
# index are skipped, so re-running this cell adds nothing new.
build_forward_index(json_data)

CPU times: total: 46.9 ms
Wall time: 81.8 ms


In [22]:
# NOTE(review): build_forward_index stores word IDs as positions in the
# lexicon list, so sorting the lexicon after indexing changes which word each
# stored ID points to. Confirm the indexes are rebuilt after this call.
sort_lexicon()

In [31]:
%%time
# Quick sanity check: lemmatize a handful of sample tokens
lemmatizer = WordNetLemmatizer()
words = [
    lemmatizer.lemmatize(token)
    for token in ['Hello', 'tests', 'forms', 'testers', 'indexes']
]

CPU times: total: 0 ns
Wall time: 0 ns


In [32]:
# Display the lemmatized sample tokens
words

['Hello', 'test', 'form', 'tester', 'index']

In [4]:
# Load the forward index from disk (the file written by build_forward_index)
forward_index = load_data_from_json(r"Files\forward_index.json")

In [14]:
# Collect the URL stored with every entry of the forward index
article_urls = [article['url'] for article in forward_index.values()]
# Display the whole list (one URL per indexed article)
article_urls

['https://www.upi.com/Top_News/US/2022/05/18/primaries-pennsylvania-idaho-oregon-kentucky-carolina/3671652801442/',
 'https://www.thesun.co.uk/fabulous/19443934/sister-uninvited-wedding-day-white-sequinned-gown/',
 'https://therussophile.org/russia-defences-in-kherson-evacuating-civilians-advancing-kharkov-bakhmut-uk-defence-chief-to-dc-3.html/',
 'https://www.lawenforcementtoday.com/randi-weingarten-states-teachers-are-social-justice-warriors/',
 'https://jacobinmag.com/2022/04/canadian-left-freedom-convoy-populism/',
 'https://www.mediaite.com/election-2022/five-more-herschel-walker-exes-come-forward-to-accuse-him-of-terrifying-violent-behavior-i-saw-a-fist-flying-toward-me/',
 'https://feeds.feedblitz.com/~/686267874/0/usatoday-newstopstories~Historians-draw-parallels-between-Dont-Say-Gay-legislation-and-Floridas-purge-of-gay-teachers-decades-ago/',
 'https://www.charlotteobserver.com/news/politics-government/article268384902.html#storylink=rss',
 'https://nypost.com/2022/04/29/cali

## Implementing the Inverted Index

In [15]:
def build_inverted_index():
    r"""Update ``Files\inverted_index.json`` from ``Files\forward_index.json``.

    For every document in the forward index, the document ID is appended to
    the posting list of each word ID in that document's hitlist, unless it
    is already listed. Existing postings are kept in their current order.
    IDs are used as loaded from JSON, i.e. as string keys.

    A missing forward or inverted index file is created as an empty JSON
    object.

    Raises:
        json.JSONDecodeError: If an existing index file is corrupt. It is no
            longer silently replaced by an empty one.
    """
    forward_index_path = r"Files\forward_index.json"
    inverted_index_path = r"Files\inverted_index.json"

    # Only a missing file means "start empty"; a bare except also swallowed
    # corrupt files and then overwrote them.
    try:
        forward_index = load_data_from_json(forward_index_path)
    except FileNotFoundError:
        forward_index = dict()
        with open(forward_index_path, "w") as file:
            json.dump(forward_index, file)
    try:
        inverted_index = load_data_from_json(inverted_index_path)
    except FileNotFoundError:
        inverted_index = dict()
        with open(inverted_index_path, "w") as file:
            json.dump(inverted_index, file)

    # Per-word set of already-listed documents: constant-time duplicate
    # checks instead of scanning each posting list.
    posted = {word_id: set(doc_ids) for word_id, doc_ids in inverted_index.items()}

    # Iterate through all articles in the forward_index
    for doc_id, data in forward_index.items():
        # Look at all words in an article
        for word_id in data['hitlist']:
            if word_id not in inverted_index:
                inverted_index[word_id] = list()
                posted[word_id] = set()
            # Append the doc_id for that word if it is not already there
            if doc_id not in posted[word_id]:
                posted[word_id].add(doc_id)
                inverted_index[word_id].append(doc_id)

    # Update the inverted index
    with open(inverted_index_path, "w") as file:
        json.dump(inverted_index, file)

In [21]:
%%time 
# Build or refresh the inverted index from the forward index stored on disk
build_inverted_index()

CPU times: total: 62.5 ms
Wall time: 86.8 ms
