# Required Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import warnings
import csv
import json
from threading import Thread
import multiprocessing

warnings.filterwarnings('ignore')

# Reading Cleaned JSON Sample

In [2]:
def display_json(file_path):
    """Pretty-print the JSON document stored at ``file_path``.

    The file is parsed and re-serialised with a four-space indent, then
    written to standard output. A missing file or malformed JSON is reported
    with a printed message instead of an exception.

    Args:
        file_path: Path of the JSON file to show.

    Returns:
        None. All output goes to stdout.
    """
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default text encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            json_data = json.load(file)
    except FileNotFoundError:
        print(f"File '{file_path}' not found.")
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
    else:
        # Use indent parameter to make the JSON data readable
        print(json.dumps(json_data, indent=4))

In [3]:
# Forward slashes work on every platform and avoid the invalid "\c" escape
# sequence that the backslash form contained.
display_json('Files/cleaned.json')

{
    "index": {
        "0": 0,
        "1": 1,
        "2": 2,
        "3": 3,
        "4": 4,
        "5": 5,
        "6": 6,
        "7": 7,
        "8": 8,
        "9": 9
    },
    "source": {
        "0": "themanchestereveningnews",
        "1": "bipartisanreport",
        "2": "thesun",
        "3": "newyorkpost",
        "4": "pbs",
        "5": "westernjournal",
        "6": "thepoliticalinsider",
        "7": "newsweek",
        "8": "thetelegraph",
        "9": "shtfplan"
    },
    "title": {
        "0": [
            "cristiano",
            "ronaldo",
            "given",
            "deadline",
            "respond",
            "charge"
        ],
        "1": [
            "more",
            "maga",
            "rioters",
            "found",
            "guilty",
            "over",
            "jan",
            "attack"
        ],
        "2": [
            "emotional",
            "louise",
            "thompson",
            "fights",
            "back",
      

In [4]:
def build_forward_index(data):
    """Map each document id to the set of distinct words in its title.

    Args:
        data: Mapping with a "title" entry shaped {doc_id: [word, ...]}.

    Returns:
        dict: {doc_id: set of that document's title words}.
    """
    return {doc_id: set(title_words) for doc_id, title_words in data["title"].items()}

def build_inverted_index(data):
    """Map each title word to the set of document ids whose title contains it.

    Args:
        data: Mapping with a "title" entry shaped {doc_id: [word, ...]}.

    Returns:
        dict: {word: set of doc ids}, with words in first-seen order.
    """
    postings = {}
    for doc_id, title_words in data["title"].items():
        for token in title_words:
            # setdefault creates the empty set the first time a word is seen.
            postings.setdefault(token, set()).add(doc_id)
    return postings

def load_data_from_json(file_path):
    """Read a JSON file and return the parsed Python object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's top-level value.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 rather than relying on the platform's
    # default text encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

In [5]:
# Forward slashes work on every platform and avoid the invalid "\c" escape
# sequence that the backslash form contained.
json_file_path = "Files/cleaned.json"
sample_data = load_data_from_json(json_file_path)

# Build both title-based indexes from the loaded sample.
forward_index = build_forward_index(sample_data)
inverted_index = build_inverted_index(sample_data)

print("Forward Index:")
print(forward_index)
print("\nInverted Index:")
print(inverted_index)

Forward Index:
{'0': {'cristiano', 'respond', 'deadline', 'ronaldo', 'given', 'charge'}, '1': {'guilty', 'found', 'over', 'more', 'maga', 'rioters', 'jan', 'attack'}, '2': {'thompson', 'been', 'condition', 'shes', 'fights', 'she', 'incurable', 'tears', 'louise', 'diagnosed', 'reveals', 'health', 'back', 'emotional', 'with'}, '3': {'taking', 'applications', 'program', 'has', 'stopped', 'despite', 'cooling', 'new', 'heatwave'}, '4': {'savage', 'pope', 'considers', 'ukraine', 'war', 'russia', 'kyiv', 'rebukes', 'visiting', 'for'}, '5': {'solve', 'mystery', 'cascade', 'messages', 'disaster', 'good', 'reveal', 'decisions', 'bad', 'led', 'familys', 'pulled', 'dead', 'phone', 'that', 'for', 'from'}, '6': {'twitter', 'state', 'deep', 'amendment', 'involvement', 'violations', 'revelations', 'first', 'illustrate'}, '7': {'fact', 'russian', 'wiped', 'real', 'ukrainian', 'column', 'mini', 'check', 'out', 'video'}, '8': {'todays', 'tips', 'bets', 'races', 'best', 'for', 'and', 'racing', 'marlboroug

# Next Steps: Hash Table Implementation

In [6]:
class Node:
    """A single key/value entry of a singly linked chain."""

    def __init__(self, key, value):
        # A fresh node starts unlinked; the owning list sets ``next``.
        self.key, self.value, self.next = key, value, None

class LinkedList:
    """Singly linked list of key/value nodes; new entries go at the head."""

    def __init__(self):
        self.head = None

    def _walk(self):
        """Yield each node in turn, starting from the head."""
        node = self.head
        while node:
            yield node
            node = node.next

    def insert(self, key, value):
        """Put a new (key, value) node at the front of the list."""
        fresh = Node(key, value)
        # The right-hand side is evaluated first, so the old head becomes
        # the new node's successor before ``head`` is re-pointed.
        fresh.next, self.head = self.head, fresh

    def search(self, key):
        """Return the value of the first node whose key equals ``key``, else None."""
        for node in self._walk():
            if node.key == key:
                return node.value
        return None

    def display(self):
        """Return every entry as a list of (key, value) tuples, head first."""
        return [(node.key, node.value) for node in self._walk()]

class HashTable:
    """Fixed-size hash table whose occupied slots each hold a LinkedList bucket."""

    def __init__(self, size):
        self.size = size
        # Slots stay None until the first key hashes to them.
        self.table = [None] * size

    def hash_function(self, key):
        """Return the slot number for ``key``: its built-in hash modulo the size."""
        return hash(key) % self.size

    def insert(self, key, value):
        """Store (key, value) in the bucket for ``key``, creating the bucket if needed."""
        slot = self.hash_function(key)
        bucket = self.table[slot]
        if bucket is None:
            bucket = LinkedList()
            self.table[slot] = bucket
        bucket.insert(key, value)

    def search(self, key):
        """Return the value stored for ``key``, or None when it is absent."""
        bucket = self.table[self.hash_function(key)]
        return None if bucket is None else bucket.search(key)

    def display(self):
        """Return all entries as (slot, key, value) tuples in slot order."""
        return [
            (slot, key, value)
            for slot, bucket in enumerate(self.table)
            if bucket is not None
            for key, value in bucket.display()
        ]

In [7]:
# Forward slashes work on every platform and avoid the invalid "\c" escape
# sequence that the backslash form contained.
json_file_path = "Files/cleaned.json"
json_data = load_data_from_json(json_file_path)

# Code for Multi-Word Search Queries

In [33]:
def build_forward_index(data):
    """Build a HashTable mapping each article id to its title + content words.

    Articles that have a title but no entry under ``data["content"]`` are
    left out of the index.

    Args:
        data: Mapping with "title" and "content" entries, each keyed by
            article id and holding a list of words.

    Returns:
        HashTable: article id -> title words followed by content words.
    """
    index = HashTable(size=100)
    for article_id, title_words in data["title"].items():
        if article_id not in data["content"]:
            continue
        index.insert(article_id, title_words + data["content"][article_id])
    return index
 

def build_inverted_index(data):
    """Build a HashTable mapping each content word to a list of postings.

    One posting ``{"article_id", "title", "url"}`` is appended per occurrence
    of the word, so a word that appears several times in an article yields
    several identical postings for that article.

    Args:
        data: Mapping with a "content" entry ({article_id: [word, ...]}) and,
            optionally, "title" and "url" entries keyed by article id.

    Returns:
        HashTable: word -> list of posting dicts.
    """
    inverted_index = HashTable(size=100)
    # Articles added without a title or url get None for that field instead
    # of aborting the whole build with a KeyError.
    titles = data.get("title", {})
    urls = data.get("url", {})

    for article_id, words in data["content"].items():
        topic = titles.get(article_id)
        url = urls.get(article_id)

        for word in words:
            # Look the word up once and reuse the posting list.
            postings = inverted_index.search(word)
            if postings:
                postings.append({"article_id": article_id, "title": topic, "url": url})
            else:
                inverted_index.insert(word, [{"article_id": article_id, "title": topic, "url": url}])

    return inverted_index

def single_word_search(inverted_index, word):
    """Return whatever the index stores for ``word``.

    Args:
        inverted_index: Object exposing ``search(word)``.
        word: The term to look up.

    Returns:
        The result of ``inverted_index.search(word)``, unchanged.
    """
    postings = inverted_index.search(word)
    return postings

def multi_word_search_2(inverted_index, query):
    """Return the postings shared by every word in ``query`` (strict AND).

    Postings are compared by their "article_id" when they are dicts, because
    dicts cannot be put in a set, and by value otherwise. A word that is not
    in the index matches nothing, so the whole query then returns ``[]``.

    Args:
        inverted_index: Object exposing ``search(word)`` that returns a list
            of postings or None.
        query: Whitespace-separated search words.

    Returns:
        list: One posting per matching article, in the order the articles
        appear in the first word's posting list.
    """
    def _identity(posting):
        # Dict postings are unhashable; their article id stands in for them.
        return posting["article_id"] if isinstance(posting, dict) else posting

    words = query.split()
    if not words:
        return []

    # Treat "word not indexed" (None) as an empty posting list.
    per_word = [inverted_index.search(word) or [] for word in words]

    common = {_identity(posting) for posting in per_word[0]}
    for postings in per_word[1:]:
        common.intersection_update(_identity(posting) for posting in postings)

    result = []
    emitted = set()
    for posting in per_word[0]:
        ident = _identity(posting)
        if ident in common and ident not in emitted:
            emitted.add(ident)
            result.append(posting)
    return result

def multi_word_search(inverted_index, query):
    """Return postings that match every indexed word of ``query``.

    Words with no postings in the index are ignored wherever they appear in
    the query, so "ronaldo is" and "is ronaldo" give the same matches when
    "is" is not indexed. Duplicate postings from the first usable word are
    kept, since the index stores one posting per occurrence.

    Args:
        inverted_index: Object exposing ``search(word)`` that returns a list
            of postings or a falsy value when the word is unknown.
        query: Whitespace-separated search words.

    Returns:
        list: A new list of postings; ``[]`` when the query is empty or none
        of its words are indexed.
    """
    result = None

    for word in query.split():
        current_result = inverted_index.search(word)
        if not current_result:
            # Unknown word: skip it instead of emptying the result.
            continue
        if result is None:
            # Copy so callers cannot mutate the index's own posting list.
            result = list(current_result)
        else:
            result = [d for d in result if d in current_result]

    return result if result is not None else []

from collections import Counter

def rank_results(input_list):
    """Order postings by how often their article occurs, one posting per article.

    Args:
        input_list: Iterable of posting dicts, each with an "article_id" key.
            ``None`` is treated as an empty list.

    Returns:
        list: The first posting of each article, most frequent article first.
        Articles with equal counts keep their original relative order.
    """
    postings = list(input_list or [])
    counts = Counter(item['article_id'] for item in postings)
    # sorted() is stable, so ties keep their input order.
    sorted_list = sorted(postings, key=lambda item: counts[item['article_id']], reverse=True)

    # Track article ids already emitted; the check and the recorded value
    # must use the same field for the de-duplication to take effect.
    seen_ids = set()
    output = []
    for item in sorted_list:
        article_id = item['article_id']
        if article_id not in seen_ids:
            seen_ids.add(article_id)
            output.append(item)

    return output

def display_results(results, data=None):
    """Print the content of every article referenced by ``results``.

    Args:
        results: Iterable of article ids or of posting dicts that carry an
            "article_id" key. ``None`` prints nothing.
        data: Mapping with a "content" entry keyed by article id. When
            omitted, a module-level variable named ``data`` is used.

    The two arguments may also be given as ``(data, results)``: a dict in the
    first position is recognised as the data mapping and the two are swapped.

    Raises:
        ValueError: If no data mapping is passed and no module-level ``data``
            exists.
    """
    # Accept the (data, results) order as well as (results, data).
    if isinstance(results, dict) and not isinstance(data, dict):
        results, data = data, results

    if data is None:
        data = globals().get("data")
        if data is None:
            raise ValueError("display_results needs the article data mapping")

    for entry in results or []:
        # Search functions return posting dicts; plain ids are accepted too.
        article_id = entry["article_id"] if isinstance(entry, dict) else entry
        print(f"Article {article_id}: {data['content'][article_id]}")

def add_content(data, new_article):
    """Append one article to ``data`` in place and return ``data``.

    Args:
        data: Mapping with "index", "source", "title" and "content" entries
            (and optionally "url"), each keyed by article id.
        new_article: Sequence ``[source, title_words, content_words]`` with
            an optional fourth element holding the article's url.

    Returns:
        The same ``data`` object, now containing the new article.
    """
    # Start from the current article count and step past any id already in
    # use so an existing article is never overwritten.
    position = len(data["index"])
    while str(position) in data["index"]:
        position += 1
    article_id = str(position)

    data["index"][article_id] = position
    data["source"][article_id] = new_article[0]
    data["title"][article_id] = new_article[1]
    data["content"][article_id] = new_article[2]

    # Keep the "url" column aligned with the others so code that reads
    # data["url"][article_id] finds an entry for the new article.
    url = new_article[3] if len(new_article) > 3 else None
    if "url" in data or url is not None:
        data.setdefault("url", {})[article_id] = url

    return data

def _freeze_for_hash(value):
    """Return a hashable stand-in for ``value``, converting containers recursively."""
    if isinstance(value, dict):
        return frozenset((k, _freeze_for_hash(v)) for k, v in value.items())
    if isinstance(value, (list, tuple)):
        return tuple(_freeze_for_hash(v) for v in value)
    if isinstance(value, set):
        return frozenset(_freeze_for_hash(v) for v in value)
    return value


def remove_duplicates(input_list, key = None):
    """Drop repeated dicts from ``input_list``, keeping the first of each.

    Args:
        input_list: Iterable of dicts. ``None`` (for example the result of
            looking up a word that is not indexed) is treated as empty.
        key: When given, two items are duplicates if ``item.get(key)`` is
            equal; when None, the whole item is compared.

    Returns:
        list: The unique items in their original order.
    """
    seen = set()
    result = []

    for item in input_list or []:
        # Items can hold lists (a posting's "title"), which are unhashable,
        # so compare a frozen copy rather than the raw value.
        hashable_item = _freeze_for_hash(item if key is None else item.get(key))

        if hashable_item not in seen:
            seen.add(hashable_item)
            result.append(item)

    return result

### Building Index

In [9]:
# Build both indexes from the data loaded into json_data.
forward_index = build_forward_index(json_data)
inverted_index = build_inverted_index(json_data)

In [15]:
# Show what the forward index stores under article id "7".
print(forward_index.search("7"))

['fact', 'check', 'video', 'russian', 'column', 'wiped', 'out', 'ukrainian', 'mini', 'real', 'viral', 'video', 'seemingly', 'filmed', 'ukrainian', 'soldier', 'behind', 'enemy', 'lines', 'appears', 'show', 'ukrainian', 'zaz', 'zaporozhetsa', 'modified', 'version', 'sovietera', 'supermini', 'with', 'rocket', 'launchers', 'and', 'machine', 'guns', 'mounted', 'its', 'roofdestroying', 'whole', 'column', 'russian', 'tanks', 'and', 'vehicles', 'southern', 'ukraine', 'amid', 'ukraine', 'recent', 'tactical', 'triumphs', 'including', 'pushing', 'russian', 'forces', 'off', 'the', 'longcontested', 'snake', 'island', 'many', 'were', 'prepared', 'believe', 'that', 'the', 'scenes', 'depicted', 'the', 'clip', 'are', 'real', 'but', 'while', 'russia', 'invasion', 'ukraine', 'continues', 'generate', 'copious', 'amounts', 'striking', 'and', 'often', 'horrifying', 'modern', 'combat', 'the', 'authenticity', 'the', 'clip', 'saying', 'too', 'cinematic', 'true', 'several', 'telegram', 'channels', 'and', 'twitt

In [11]:
# Postings for the word "for", keeping the first posting seen per article_id.
results = remove_duplicates(inverted_index.search("for"), key="article_id")
results

[{'article_id': '0',
  'title': ['cristiano', 'ronaldo', 'given', 'deadline', 'respond', 'charge'],
  'url': 'https://www.manchestereveningnews.co.uk/sport/football/football-news/breaking-ronaldo-news-fa-charge-25145277'},
 {'article_id': '1',
  'title': ['more',
   'maga',
   'rioters',
   'found',
   'guilty',
   'over',
   'jan',
   'attack'],
  'url': 'https://bipartisanreport.com/2022/10/16/2-more-maga-rioters-found-guilty-over-jan-6-attack/'},
 {'article_id': '2',
  'title': ['emotional',
   'louise',
   'thompson',
   'fights',
   'back',
   'tears',
   'she',
   'reveals',
   'shes',
   'been',
   'diagnosed',
   'with',
   'incurable',
   'health',
   'condition'],
  'url': 'https://www.thesun.co.uk/tvandshowbiz/20762466/louise-thompson-diagnosed-lupus-made-chelsea-instagram-ptsd/'},
 {'article_id': '3',
  'title': ['cooling',
   'program',
   'has',
   'stopped',
   'taking',
   'new',
   'applications',
   'despite',
   'heatwave'],
  'url': 'https://nypost.com/2022/07/24/ch

### Single Word Search

In [12]:
# Look up one word, then collapse the postings to one per article_id.
single_word_query = "for"
single_word_results = single_word_search(inverted_index, single_word_query)
single_word_results = remove_duplicates(single_word_results, key="article_id")

print(f"Results for single word query '{single_word_query}': {single_word_results}")

Results for single word query 'for': [{'article_id': '0', 'title': ['cristiano', 'ronaldo', 'given', 'deadline', 'respond', 'charge'], 'url': 'https://www.manchestereveningnews.co.uk/sport/football/football-news/breaking-ronaldo-news-fa-charge-25145277'}, {'article_id': '1', 'title': ['more', 'maga', 'rioters', 'found', 'guilty', 'over', 'jan', 'attack'], 'url': 'https://bipartisanreport.com/2022/10/16/2-more-maga-rioters-found-guilty-over-jan-6-attack/'}, {'article_id': '2', 'title': ['emotional', 'louise', 'thompson', 'fights', 'back', 'tears', 'she', 'reveals', 'shes', 'been', 'diagnosed', 'with', 'incurable', 'health', 'condition'], 'url': 'https://www.thesun.co.uk/tvandshowbiz/20762466/louise-thompson-diagnosed-lupus-made-chelsea-instagram-ptsd/'}, {'article_id': '3', 'title': ['cooling', 'program', 'has', 'stopped', 'taking', 'new', 'applications', 'despite', 'heatwave'], 'url': 'https://nypost.com/2022/07/24/chuck-schumer-says-ny-cooling-program-has-stopped-taking-new-applicatio

### Multi Word Search

In [34]:
# Run a two-word query; the postings are printed as returned, without
# de-duplication.
multi_word_query = "ronaldo is"
multi_word_results = multi_word_search(inverted_index, multi_word_query)
print(f"Results for multi word query '{multi_word_query}': {multi_word_results}")

Results for multi word query 'ronaldo is': [{'article_id': '0', 'title': ['cristiano', 'ronaldo', 'given', 'deadline', 'respond', 'charge'], 'url': 'https://www.manchestereveningnews.co.uk/sport/football/football-news/breaking-ronaldo-news-fa-charge-25145277'}, {'article_id': '0', 'title': ['cristiano', 'ronaldo', 'given', 'deadline', 'respond', 'charge'], 'url': 'https://www.manchestereveningnews.co.uk/sport/football/football-news/breaking-ronaldo-news-fa-charge-25145277'}, {'article_id': '0', 'title': ['cristiano', 'ronaldo', 'given', 'deadline', 'respond', 'charge'], 'url': 'https://www.manchestereveningnews.co.uk/sport/football/football-news/breaking-ronaldo-news-fa-charge-25145277'}, {'article_id': '0', 'title': ['cristiano', 'ronaldo', 'given', 'deadline', 'respond', 'charge'], 'url': 'https://www.manchestereveningnews.co.uk/sport/football/football-news/breaking-ronaldo-news-fa-charge-25145277'}]


### Ranking Results

In [68]:
# `results` is the de-duplicated posting list for "for" built in an earlier cell.
ranked_results = rank_results(results)
print("Ranked Results:")
ranked_results

Ranked Results:


[{'article_id': '0',
  'title': ['cristiano', 'ronaldo', 'given', 'deadline', 'respond', 'charge'],
  'url': 'https://www.manchestereveningnews.co.uk/sport/football/football-news/breaking-ronaldo-news-fa-charge-25145277'},
 {'article_id': '1',
  'title': ['more',
   'maga',
   'rioters',
   'found',
   'guilty',
   'over',
   'jan',
   'attack'],
  'url': 'https://bipartisanreport.com/2022/10/16/2-more-maga-rioters-found-guilty-over-jan-6-attack/'},
 {'article_id': '2',
  'title': ['emotional',
   'louise',
   'thompson',
   'fights',
   'back',
   'tears',
   'she',
   'reveals',
   'shes',
   'been',
   'diagnosed',
   'with',
   'incurable',
   'health',
   'condition'],
  'url': 'https://www.thesun.co.uk/tvandshowbiz/20762466/louise-thompson-diagnosed-lupus-made-chelsea-instagram-ptsd/'},
 {'article_id': '3',
  'title': ['cooling',
   'program',
   'has',
   'stopped',
   'taking',
   'new',
   'applications',
   'despite',
   'heatwave'],
  'url': 'https://nypost.com/2022/07/24/ch

### Adding New Content

In [45]:
# new_article is laid out as [source, title words, content words], the order
# in which add_content reads its elements.
new_article = ["courseralearningacademy", ["Machine", "Learning", "AI", "Python"], [      
      "andor",
      "violent",
      "ronaldo",
      "was",
      "filmed",
      "knocking",
      "fan",
      "phone",
      "the",
      "floor",
      "united",
      "players",
      "approached",
      "the",
      "away",
      "dressing",
      "room",
      "following",
      "the",
      "loss",
      "also",
      "read",
      "waiting",
      "for",
      "the",]
]
# add_content mutates json_data in place and returns the same dict.
# NOTE(review): re-running this cell appends the article again under a new id.
json_data = add_content(json_data, new_article)

# Rebuild both indexes from the updated data.
forward_index = build_forward_index(json_data)
inverted_index = build_inverted_index(json_data)

new_results = multi_word_search(inverted_index, "machine learning AI python")
print("\nResults after adding new content:")
# Arguments are passed in the order (data, results).
display_results(json_data, new_results)


Results after adding new content:
Article 7: ['viral', 'video', 'seemingly', 'filmed', 'ukrainian', 'soldier', 'behind', 'enemy', 'lines', 'appears', 'show', 'ukrainian', 'zaz', 'zaporozhetsa', 'modified', 'version', 'sovietera', 'supermini', 'with', 'rocket', 'launchers', 'and', 'machine', 'guns', 'mounted', 'its', 'roofdestroying', 'whole', 'column', 'russian', 'tanks', 'and', 'vehicles', 'southern', 'ukraine', 'amid', 'ukraine', 'recent', 'tactical', 'triumphs', 'including', 'pushing', 'russian', 'forces', 'off', 'the', 'longcontested', 'snake', 'island', 'many', 'were', 'prepared', 'believe', 'that', 'the', 'scenes', 'depicted', 'the', 'clip', 'are', 'real', 'but', 'while', 'russia', 'invasion', 'ukraine', 'continues', 'generate', 'copious', 'amounts', 'striking', 'and', 'often', 'horrifying', 'modern', 'combat', 'the', 'authenticity', 'the', 'clip', 'saying', 'too', 'cinematic', 'true', 'several', 'telegram', 'channels', 'and', 'twitter', 'accounts', 'shared', 'video', 'unidentif