# Required Imports

In [1]:
import os
import sys
import warnings
import json
import re
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus (presumably what stopwords.words() reads
# from); NLTK reports the package as up to date when it is already present
nltk.download('stopwords')

# Suppress every warning raised from this point on
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\athar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Implementing Lexicon

In [4]:
# Relative, Windows-style path to the input JSON file
# (presumably the raw news articles -- confirm against the data set)
json_file_path = r"Files\369news.json"
# load_data_from_json is defined outside this section; its result is kept
# in json_data for later cells
json_data = load_data_from_json(json_file_path)

In [18]:
# Function that takes in content, preprocesses it,
# and converts it to a list of words
def pre_process_string(content):
    """Tokenize raw text into lowercase words suitable for indexing.

    Every character that is not an ASCII letter or whitespace is replaced
    by a space, the text is lowercased and split on whitespace, and then
    words of one or two characters and NLTK English stop words are dropped.

    Newlines and tabs are treated as word separators. They are deliberately
    not deleted before splitting: deleting them glues the last word of one
    line onto the first word of the next line, producing a bogus token.

    Args:
        content: The text to tokenize (for example an article title or body).

    Returns:
        A list of the remaining words in their original order; duplicates
        are kept.
    """
    # Replace everything except ASCII letters and whitespace with a space
    content = re.sub(r'[^a-zA-Z\s]', ' ', content)
    # Lowercase, then split on any run of whitespace (spaces, \n, \t, ...)
    words = content.lower().split()
    # Load the stop words once per call instead of once per word; a set
    # makes each membership test O(1)
    stop_words = set(stopwords.words('english'))
    # Drop one and two character words as well as stop words
    return [word for word in words if len(word) > 2 and word not in stop_words]
    
# Function that takes in a list of words and adds them to the lexicon
def build_lexicon(words):
    """Append the words that are not yet in the lexicon to the lexicon file.

    The lexicon is loaded with load_data_from_json, extended with every
    unseen word in order of first appearance, and written back as JSON
    with an indent of 2.

    Args:
        words: Iterable of words to add. Words are assumed to be hashable
            (strings); duplicates and words already present are skipped.

    Returns:
        None. The lexicon file is rewritten in place.
    """
    lexicon_path = r"Files\lexicon.json"
    # Load the lexicon
    data = load_data_from_json(lexicon_path)
    # Track known words in a set so each membership test is O(1) rather
    # than a scan of the whole lexicon list
    known_words = set(data)
    new_words = []
    # Look through the words
    for word in words:
        # if that word is not already in lexicon
        if word not in known_words:
            # Then add it
            known_words.add(word)
            new_words.append(word)
    data.extend(new_words)
    # Open the file for writing only after the merge has succeeded: "w"
    # truncates the file, so opening it earlier would leave an empty
    # lexicon behind if the loop above were interrupted or raised
    with open(lexicon_path, "w") as file:
        # Update the lexicon json file
        json.dump(data, file, indent=2)

def sort_lexicon():
    """Rewrite the lexicon file with its entries in ascending sorted order.

    The lexicon is read with load_data_from_json, sorted, and dumped back
    to the same file as JSON with an indent of 2.
    """
    lexicon_path = r"Files\lexicon.json"
    # Read the current lexicon entries
    entries = load_data_from_json(lexicon_path)
    with open(lexicon_path, "w") as out_file:
        # Order a copy of the entries, then persist it
        ordered = list(entries)
        ordered.sort()
        json.dump(ordered, out_file, indent=2)

# Function to build forward index from raw articles
def build_forward_index(articles):
    """Add the given articles to the forward index file and the lexicon.

    The existing forward index is loaded with load_data_from_json. Each
    article gets the next numeric id, counting up from the number of
    entries already in the index, and is stored with its title, url and
    the pre-processed words of its title followed by its content. The
    lexicon is then updated once with the words of the whole batch, and
    the merged index is written back as JSON with an indent of 2.

    Args:
        articles: Iterable of mappings, each providing the keys 'title',
            'content' and 'url'.

    Returns:
        None. The forward index file (and the lexicon, via build_lexicon)
        are rewritten.

    Raises:
        KeyError: If an article lacks one of the required keys. Neither
            file has been modified by this function at that point.
    """
    # Use a single path for both the read and the write. Forward slashes
    # work on Windows as well as POSIX, whereas a backslash is an ordinary
    # file-name character on POSIX and would address a different file.
    index_path = r"Files/forward_index.json"
    # Load the already existing forward_index
    data = load_data_from_json(index_path)
    num_articles = len(data)
    # initialize forward_index
    forward_index = {}
    batch_words = []
    # For each article
    for article in articles:
        # Pre-process the title and content
        words = pre_process_string(article['title']) + pre_process_string(article['content'])
        # Keys loaded from JSON are strings, so new ids are stored as
        # strings too; the serialized output is the same as for int keys
        forward_index[str(num_articles)] = {
            'title': article['title'],
            'url': article['url'],
            'words': words,
        }
        num_articles += 1
        batch_words.extend(words)
    # Update the lexicon once for the whole batch instead of reloading and
    # rewriting the lexicon file for every article; words are still added
    # in order of first appearance, so the resulting lexicon is the same
    if batch_words:
        build_lexicon(batch_words)
    data.update(forward_index)
    with open(index_path, "w") as file:
        json.dump(data, file, indent=2)