In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import warnings
import csv
import json

warnings.filterwarnings('ignore')

In [4]:
def read_and_display_tsv(tsv_filename, num_rows=100000): 
    df = pd.read_csv(tsv_filename, sep='\t', nrows=num_rows)
    print(df.head()) 
    return df

In [6]:
tsv_filename = 'data.tsv'
data = read_and_display_tsv(tsv_filename)

   Unnamed: 0.1  Unnamed: 0        PAPER_ID  SEC_INDEX  POS_INDEX  \
0             0           0  astroph0001519  body_text          1   
1             1         946  astroph0004164  body_text         46   
2             2         956  astroph0004245  body_text          4   
3             3        1141  astroph0005284  body_text         15   
4             4        1457  astroph0006088  body_text          3   

                                      SECTION  IsFOOTNOTE  \
0                                Introduction           0   
1  Discussion::Implications for Nova Searches           0   
2                  Discovery and Observations           0   
3                         Alerts and Archives           1   
4                                Observations           1   

                                   FOOTNOTE  \
0                                       NaN   
1                                       NaN   
2                                       NaN   
3  http://www.kusastro.kyoto-u

In [7]:
data.tail()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,PAPER_ID,SEC_INDEX,POS_INDEX,SECTION,IsFOOTNOTE,FOOTNOTE,CONTEXT,CONTEXT_PAR,...,SECTION_INDEX,SECTION_TOTAL,SECTION_POS_PROPORTION,PAR_TOTAL_LENGTH,PAR_INDEX,PAR_POS_PROPORTION,PAR_POS_PROPORTION_bin,SECTION_POS_PROPORTION_bin,title,n_citations
99995,99995,36654,astroph0603250,body_text,46,Identification of early SUSY-CDM micro-subhalo...,0,,"First, the local densities are calculated usin...",SKID (publicly available at: www-hpcc.astro.wa...,...,13.0,14.0,0.928571,62.0,51.0,0.822581,90%,100%,Early Supersymmetric Cold Dark Matter Substruc...,122.0
99996,99996,36657,astroph0603313,abstract,6,Abstract,0,,A full–resolution manuscript can be obtained a...,[This manuscript uses low–resolution figures. ...,...,1.0,18.0,0.055556,280.0,6.0,0.021429,10%,10%,Why X-ray--Selected AGN Appear Optically Dull,61.0
99997,99997,36659,astroph0603313,body_text,10,Sample selection and data,1,http://www.eso.org/science/eis/surveys/strateg...,We add the following optical and near–infrared...,We add the following optical and near–infrared...,...,4.0,18.0,0.222222,280.0,18.0,0.064286,10%,30%,Why X-ray--Selected AGN Appear Optically Dull,61.0
99998,99998,36660,astroph0603431,body_text,6,Data reduction and analysis::X-ray data::Data ...,1,see http://swift.gsfc.nasa.gov/docs/swift/arch...,We obtained the XRT data from the SWIFT archiv...,We obtained the XRT data from the SWIFT archiv...,...,4.0,14.0,0.285714,78.0,19.0,0.24359,30%,30%,The Gamma-ray burst 050904 : evidence for a te...,38.0
99999,99999,47742,704.2486,body_text,14,The GRB sample,1,http://swift.gsfc.nasa.gov/docs/swift/archive/,The BAT event files were retrieved from the Sw...,The BAT event files were retrieved from the Sw...,...,3.0,14.0,0.214286,77.0,22.0,0.285714,30%,30%,Testing the gamma-ray burst variability/peak l...,28.0


# Reading Cleaned JSON Sample

In [5]:
def display_json(file_path):
    try:
        with open(file_path, 'r') as file:
            json_data = json.load(file)
            # Use indent parameter to make the JSON data readable
            readable_json = json.dumps(json_data, indent=4)
            print(readable_json)
    except FileNotFoundError:
        print(f"File '{file_path}' not found.")
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")

In [17]:
display_json('Files\cleaned.json')

{
    "index": {
        "0": 0,
        "1": 1,
        "2": 2,
        "3": 3,
        "4": 4,
        "5": 5,
        "6": 6,
        "7": 7,
        "8": 8,
        "9": 9
    },
    "source": {
        "0": "themanchestereveningnews",
        "1": "bipartisanreport",
        "2": "thesun",
        "3": "newyorkpost",
        "4": "pbs",
        "5": "westernjournal",
        "6": "thepoliticalinsider",
        "7": "newsweek",
        "8": "thetelegraph",
        "9": "shtfplan"
    },
    "title": {
        "0": [
            "cristiano",
            "ronaldo",
            "given",
            "deadline",
            "respond",
            "charge"
        ],
        "1": [
            "more",
            "maga",
            "rioters",
            "found",
            "guilty",
            "over",
            "jan",
            "attack"
        ],
        "2": [
            "emotional",
            "louise",
            "thompson",
            "fights",
            "back",
      

In [7]:
def build_forward_index(data):
    forward_index = {}
    
    for doc_id, words in data["title"].items():
        forward_index[doc_id] = set(words)
        
    return forward_index

def build_inverted_index(data):
    inverted_index = {}
    
    for doc_id, words in data["title"].items():
        for word in words:
            if word not in inverted_index:
                inverted_index[word] = set()
            inverted_index[word].add(doc_id)
    
    return inverted_index

def load_data_from_json(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

In [8]:
json_file_path = "Files\cleaned.json"
sample_data = load_data_from_json(json_file_path)

forward_index = build_forward_index(sample_data)
inverted_index = build_inverted_index(sample_data)

print("Forward Index:")
print(forward_index)
print("\nInverted Index:")
print(inverted_index)

Forward Index:
{'0': {'ronaldo', 'respond', 'charge', 'cristiano', 'given', 'deadline'}, '1': {'attack', 'guilty', 'found', 'jan', 'maga', 'over', 'rioters', 'more'}, '2': {'she', 'emotional', 'health', 'incurable', 'tears', 'been', 'with', 'condition', 'louise', 'shes', 'thompson', 'back', 'diagnosed', 'reveals', 'fights'}, '3': {'cooling', 'has', 'taking', 'new', 'despite', 'stopped', 'applications', 'program', 'heatwave'}, '4': {'considers', 'kyiv', 'for', 'russia', 'savage', 'visiting', 'war', 'pope', 'rebukes', 'ukraine'}, '5': {'phone', 'dead', 'solve', 'for', 'cascade', 'messages', 'good', 'led', 'disaster', 'reveal', 'bad', 'mystery', 'that', 'decisions', 'familys', 'pulled', 'from'}, '6': {'revelations', 'state', 'violations', 'amendment', 'deep', 'illustrate', 'first', 'twitter', 'involvement'}, '7': {'fact', 'column', 'wiped', 'ukrainian', 'mini', 'russian', 'video', 'out', 'real', 'check'}, '8': {'races', 'racing', 'for', 'and', 'todays', 'tips', 'bets', 'best', 'marlboroug

# Next Steps

In [13]:
class Node:
    def __init__(self, key, value):
        self.key = key
        self.value = value
        self.next = None

class LinkedList:
    def __init__(self):
        self.head = None

    def insert(self, key, value):
        new_node = Node(key, value)
        new_node.next = self.head
        self.head = new_node

    def search(self, key):
        current = self.head
        while current:
            if current.key == key:
                return current.value
            current = current.next
        return None

class HashTable:
    def __init__(self, size):
        self.size = size
        self.table = [None] * size

    def hash_function(self, key):
        return hash(key) % self.size

    def insert(self, key, value):
        index = self.hash_function(key)
        if self.table[index] is None:
            self.table[index] = LinkedList()
        self.table[index].insert(key, value)

    def search(self, key):
        index = self.hash_function(key)
        if self.table[index] is not None:
            return self.table[index].search(key)
        return None

In [14]:
json_file_path = "Files\cleaned.json"
json_data = load_data_from_json(json_file_path)

In [19]:
forward_index = HashTable(size=100)

for article_id, title_words in json_data["title"].items():
    forward_index.insert(article_id, title_words)

inverted_index = HashTable(size=100)

for article_id, content_words in json_data["content"].items():
    for word in content_words:
        if inverted_index.search(word):
            inverted_index.search(word).append(article_id)
        else:
            inverted_index.insert(word, [article_id])

article_id_to_search = "0"
title_result = forward_index.search(article_id_to_search)
word_to_search = "for"
documents_containing_word = inverted_index.search(word_to_search)

print(f"Title for Article {article_id_to_search}: {title_result}")
print(f"Documents containing the word '{word_to_search}': {documents_containing_word}")

Title for Article 0: ['cristiano', 'ronaldo', 'given', 'deadline', 'respond', 'charge']
Documents containing the word 'for': ['0', '0', '1', '1', '1', '1', '2', '2', '3', '3', '3', '4', '4', '4', '4', '4', '4', '4', '4', '4', '4', '4', '4', '4', '4', '4', '5', '5', '6', '6', '6', '6', '6', '6', '6', '7', '7', '8', '8', '9', '9']
