# Required Imports

In [8]:
import os
import sys
import warnings
import json
import re
import nltk
import time
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK data packages this notebook relies on: the WordNet data
# (plus the omw-1.4 package) and the stop-word lists.
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download('stopwords')
# Create the module-level WordNetLemmatizer used by the functions defined later
lemmatizer = WordNetLemmatizer()

# Suppress all warnings for the rest of the session
warnings.filterwarnings('ignore')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\athar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\athar\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\athar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Implementing the Lexicon and Forward Index

In [2]:
def load_data_from_json(file_path):
    """Read a JSON file and return the decoded Python object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's top-level value
        (for example a ``dict`` or a ``list``).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # locale encoding (e.g. cp1252 on Windows), which mis-decodes or rejects
    # non-ASCII text.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

In [3]:
# Load the sampled articles that are indexed further down.
# NOTE(review): the backslash in the path is a Windows path separator, so this
# relative path only resolves as intended on Windows.
json_file_path = r"Files\articles_sampled.json"
json_data = load_data_from_json(json_file_path)

In [10]:
def pre_process_string(content):
    """Clean a raw string and split it into a list of candidate index terms.

    Processing steps, in order: every character that is not an ASCII letter
    or whitespace is replaced by a space, the text is lower-cased and split
    on whitespace, words of one or two characters are dropped, and NLTK's
    English stop words are removed.

    Args:
        content: Raw text, e.g. an article title or body.

    Returns:
        list[str]: The remaining lower-case words in their original order
        (duplicates are kept).
    """
    # Newlines and tabs are whitespace: the pattern leaves them alone and
    # split() treats them as separators. Deleting them outright (as this
    # function used to) glued the last word of a line to the first word of
    # the next one.
    content = re.sub(r'[^a-zA-Z\s]', ' ', content)
    words = content.lower().split()
    # Fetch the stop-word list once per call and keep it in a set, instead of
    # re-reading the list and scanning it linearly for every single word.
    stop_words = set(stopwords.words('english'))
    return [word for word in words if len(word) > 2 and word not in stop_words]
    
def build_lexicon(words, lexicon):
    """Append the lemmas of ``words`` that the lexicon does not yet contain.

    Args:
        words: Iterable of words; each one is lemmatized before the lookup.
        lexicon: List of lemmas. It is extended in place.

    Returns:
        The same ``lexicon`` list, with every previously unseen lemma
        appended once, in order of first appearance in ``words``.
    """
    # Track known lemmas in a set so each membership test is O(1) rather than
    # a linear scan over the whole lexicon list for every word.
    known = set(lexicon)
    new_words = []
    for word in words:
        lemma = lemmatizer.lemmatize(word)
        if lemma not in known:
            known.add(lemma)
            new_words.append(lemma)
    lexicon.extend(new_words)
    return lexicon

def sort_lexicon(lexicon_path=r"Files\lexicon.json"):
    """Rewrite the lexicon JSON file with its entries in sorted order.

    NOTE(review): build_forward_index stores positions in the lexicon list as
    word ids, so re-sorting the lexicon after indexing changes which word
    each stored id refers to -- confirm that is intended.

    Args:
        lexicon_path: Path of the lexicon JSON file. Defaults to the location
            used throughout this notebook.

    Returns:
        None. The file at ``lexicon_path`` is overwritten.
    """
    # Sort BEFORE opening the file for writing: mode "w" truncates the file
    # immediately, so a failure while sorting would otherwise leave an empty
    # lexicon on disk.
    data = sorted(load_data_from_json(lexicon_path))
    with open(lexicon_path, "w") as file:
        json.dump(data, file, indent=2)

def build_forward_index(articles, lexicon_path=r"Files\lexicon.json",
                        forward_index_path=r"Files\forward_index.json"):
    """Add ``articles`` to the on-disk forward index and extend the lexicon.

    For every article the title and content are pre-processed and lemmatized,
    unseen lemmas are appended to the lexicon, and each word is replaced by
    its position in the lexicon list. The article's ``hitlist`` counts how
    often each word id occurs, with every title occurrence counted 10 times.

    Args:
        articles: Iterable of dicts providing the keys ``'title'``,
            ``'content'`` and ``'url'``.
        lexicon_path: Path of the lexicon JSON file (a list of lemmas).
        forward_index_path: Path of the forward-index JSON file (an object
            keyed by document id).

    Returns:
        None. Both JSON files are rewritten with the updated data.
    """
    # Load the already existing forward index and lexicon
    data = load_data_from_json(forward_index_path)
    lexicon = load_data_from_json(lexicon_path)
    # Word -> lexicon position lookup, replacing a linear list.index() scan
    # per word. setdefault keeps the first position should a word occur
    # twice, which is what list.index() would have returned.
    word_ids = {}
    for position, word in enumerate(lexicon):
        word_ids.setdefault(word, position)
    # New documents continue numbering after the ones already stored
    num_articles = len(data)
    forward_index = dict()
    for article in articles:
        # Pre-process the title and content
        title_words = pre_process_string(article['title'])
        content_words = pre_process_string(article['content'])
        # Extend the in-memory lexicon with this article's unseen lemmas. The
        # lexicon must NOT be reloaded from disk afterwards: doing so threw
        # the new words away, so looking them up failed and the lexicon file
        # never grew.
        known_count = len(lexicon)
        lexicon = build_lexicon(title_words + content_words, lexicon)
        for position in range(known_count, len(lexicon)):
            word_ids.setdefault(lexicon[position], position)
        # Convert the lemmatized words to their lexicon positions
        title_ids = [word_ids[lemmatizer.lemmatize(word)] for word in title_words]
        content_ids = [word_ids[lemmatizer.lemmatize(word)] for word in content_words]
        # Count word frequencies; a title hit weighs 10 content hits
        frequency = Counter((title_ids * 10) + content_ids)
        # Keys loaded from JSON are strings, so use string ids for new entries
        # too rather than mixing int and str keys in the same dict.
        forward_index[str(num_articles)] = {
            'title': article['title'],
            'url': article['url'],
            'hitlist': frequency,
        }
        num_articles += 1
    data.update(forward_index)
    # Update the lexicon json file
    with open(lexicon_path, "w") as file:
        json.dump(lexicon, file, indent=2)
    # Update the forward_index json file
    with open(forward_index_path, "w") as file:
        json.dump(data, file, indent=2)

In [11]:
%%time
# Index the loaded articles; this adds them to the stored forward index and
# rewrites the lexicon and forward-index JSON files on disk.
build_forward_index(json_data)

1000 articles forward_indexed in time 169.01993250846863
2000 articles forward_indexed in time 166.65394949913025


KeyboardInterrupt: 

In [22]:
# Re-save the lexicon file with its entries sorted.
# NOTE(review): build_forward_index records positions in the lexicon list as
# word ids, so sorting after indexing changes what those ids point to --
# confirm this ordering of steps is intended.
sort_lexicon()

In [31]:
%%time
# Sanity check: lemmatize a few sample tokens in place
lemmatizer = WordNetLemmatizer()
words = ['Hello', 'tests', 'forms', 'testers', 'indexes']
for i, token in enumerate(words):
    words[i] = lemmatizer.lemmatize(token)

CPU times: total: 0 ns
Wall time: 0 ns


In [32]:
# Display the sample words after lemmatization
words

['Hello', 'test', 'form', 'tester', 'index']

In [36]:
# Load the lexicon currently stored on disk into memory
lexicon = load_data_from_json(r"Files\lexicon.json")