## Trying to find a way to reduce the time spent in the slowest step

In [8]:
import os
from engine.invertedindex import invDS

In [9]:
# Peek at the first ten entries of the on-disk inverted-index data directory.
os.listdir("engine/invertedindex/data")[:10]

['invindex42.bin',
 'invindex517.bin',
 'invindex623.bin',
 'invindex386.bin',
 'invindex630.bin',
 'invindex287.bin',
 'invindex38.bin',
 'invindex610.bin',
 'invindex430.bin',
 'invindex66.bin']

In [16]:
%%time
# Load a single index file through invDS; the enclosing %%time reports the cost.
invindex = invDS.load_from_binary_file("engine/invertedindex/data/invindex42.bin")

CPU times: user 87.5 ms, sys: 7.95 ms, total: 95.5 ms
Wall time: 93.6 ms


In [11]:
# Directory containing the invindex*.bin files; also used for the pickle experiment.
storagedir = "engine/invertedindex/data"

In [24]:
import pickle
# Write the loaded index back out as a pickle (protocol 4) so its load time
# can be compared against the binary loader.
with open(os.path.join(storagedir, "check.pkl"), "wb") as f:
    pickle.dump(invindex, f, protocol=4)


In [25]:
%%time
# Read the pickled copy back; the enclosing %%time gives the number to compare
# with the binary load. pickle.load can execute arbitrary code, which is
# acceptable here only because check.pkl was written by this notebook.
with open(os.path.join(storagedir, "check.pkl"), "rb") as f:
    invindex2 = pickle.load(f)

CPU times: user 139 ms, sys: 3.51 ms, total: 143 ms
Wall time: 141 ms


Loading the pickled index (about 141 ms wall time) turned out slower than loading the binary file (about 94 ms), so switching to the pickle format is neither feasible nor worth the effort.

## We will use this notebook to add one document to the inverted index

The few steps to take:

- Check whether the document has already been parsed (matched by URL); if so, just report that it already exists
- Assign a doc_id to it
- Make a single forward index row for it
- Make multiple inverted index rows for it
- For each word_id, add the doc_id to its inverted index list (the tedious part)
- close it
- Update metadata (add 1 doc)
- Give confirmation

In [14]:
# Checking to see if the url has already been parsed
def check_if_exists(url):
    """Report whether ``url`` is already recorded as parsed.

    Returns True when ``url`` is a member of the collection returned by
    ``get_parsed_set()``, and False otherwise.
    """
    already_parsed = get_parsed_set()
    return url in already_parsed

In [15]:
# NOTE(review): `flush` is imported from engine.invertedindex.gettingWordLists
# in a cell further down this notebook, so on a fresh kernel this cell only
# works after that import cell has been run.
flush()

Now that we have the inverted index, we have to add it to the main inverted indexes

In [1]:
from forwardindex.parsed_items.parsed import get_parsed_set, set_parsed_set
import forwardindex.protobufs.forwardindex_file_pb2 as fpb
from metadata.metadatafuncs import get_metadata, save_metadata # Getting the metadata
import wordlexicon.wordlexicon as wl # Importing wordlexicon
import os # Importing os to get directory data of each os file
import sys
sys.path.append("../")
from engine.forwardindex.parse_forward import parse
from engine.invertedindex.parsing.parse import parse as invParse# Getting each document element from here
from engine.invertedindex import invDS
from engine.invertedindex.gettingWordLists import get_invIndex, open_invertedindex, save_flush, flush
from engine.docdata.getdocdata.getjson import get_raw_docdata, save_flush as doc_flush
from engine.docdata import save_docs



In [2]:
from forwardindex.getjson import get_jsondata

# Load yahoonews.json and keep a single entry (index 19) as the document to insert.
jsondata = get_jsondata("yahoonews.json")

onearticle = jsondata[19]

19285


In [3]:
def addtoinvIndex(invertedindex):
    """Push every posting of a freshly built in-memory index into the stored index.

    Walks ``invertedindex.wordelems`` by position. For each non-empty entry,
    every element is handed to ``get_invIndex(position).insert`` together with
    ``position % 500``.
    """
    position = 0
    for postings in invertedindex.wordelems:
        if len(postings) != 0:
            for posting in postings:
                # NOTE(review): 500 looks like the number of slots per stored
                # index file; confirm against get_invIndex.
                get_invIndex(position).insert(posting, position % 500)
        position += 1
        

Combining all of these now

In [4]:
def appending_invIndex(article):
    """Add one article to the forward index, inverted index, doc data and metadata.

    Parameters
    ----------
    article : mapping
        Must provide a "url" key. The whole object is passed on to ``parse``
        and ``save_docs.parse``.

    Returns
    -------
    str
        A message saying the article was added, or that its URL was already
        in the parsed set (in which case nothing else is touched).
    """
    currset = get_parsed_set()
    if article["url"] in currset:
        return "The article already exists, URL has been matched"

    total_words, total_docs, lexheight = get_metadata()

    # Parse the article that was passed in, not the notebook-level
    # `onearticle`. The current document count goes to parse() with it
    # (presumably as the new doc_id; confirm against parse_forward).
    forwardindex = fpb.ForwardIndex()
    forwardindex.docelement.append(parse(article, total_docs))
    print("forward index created")

    invertedindex = invDS.InvertedIndex(total_words)
    print("inverted index empty")
    invParse(forwardindex, invertedindex)
    print("inverted index parsed and now adding to storage")
    addtoinvIndex(invertedindex)

    print("Working on doc data now")
    docdata = get_raw_docdata(total_docs)
    docdata.eachdoc.append(save_docs.parse(article))

    # Persist the index and doc-data changes, then bump the document count.
    save_flush()
    doc_flush()
    save_metadata(total_words, total_docs + 1, lexheight)

    # NOTE(review): this only mutates the set object returned by
    # get_parsed_set(); whether set_parsed_set() must also be called to
    # persist it is not visible here. Confirm.
    currset.add(article["url"])
    return "The article has been successfully added"


We missed the docdata part earlier; doing that now.

In [5]:
%%time 
# Run the full insert for the sample article; the enclosing %%time reports the cost.
appending_invIndex(onearticle)

Current size of set:  4194520  bytes
forward index created
inverted index empty
inverted index parsed and now adding to storage
Working on doc data now
CPU times: user 1min 18s, sys: 3.34 s, total: 1min 22s
Wall time: 1min 22s


'The article has been successfully added'

In [6]:
# Re-read the metadata after the insert; appending_invIndex unpacks this tuple
# as (total_words, total_docs, lexheight).
get_metadata()

(341310, 125008, 5000)

In [10]:
import os
# List the JSON files under forwardindex/JSONFILES/dir.
os.listdir("forwardindex/JSONFILES/dir")

['thesun.json',
 'newsbusters.json',
 'usnews.json',
 'cnn.json',
 'yahoonews.json',
 'theindependent.json',
 'theguardianuk.json',
 'thenewyorktimes.json']

In [8]:
# Display the article that was passed to appending_invIndex.
onearticle

{'id': 'yahoonews--2022-01-02--Jan. 6 committee studying whether it can subpoena U.S. Republican lawmakers - chairman',
 'date': '2022-01-02',
 'source': 'yahoonews',
 'title': 'Jan. 6 committee studying whether it can subpoena U.S. Republican lawmakers - chairman',
 'content': "( Reuters ) -The congressional committee investigating last year 's Jan. 6 attack on the U.S. Capitol is looking at issuing subpoenas to Republican members of Congress to force their cooperation , the panel 's chairman said on Sunday .\nRepresentative Bennie Thompson , a Democrat , said on NBC 's `` Meet The Press '' that the committee is examining whether it can lawfully issue subpoenas to sitting members of Congress .\n`` I think there are some questions of whether we have the authority to do it , '' Thompson said .\n`` We 're looking at it @ @ @ @ @ @ @ there 'll be no reluctance on our part . ''\nThompson chairs the House of Representatives Select Committee on Jan. 6 , which is expected to hold public heari