In [12]:
import time
import re
import sys
import glob
import os
import gzip
import json
import math
import ray
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from pathlib import Path
sys.path.append(os.path.abspath("/home/arxiv/doc_intel_etl"))
os.environ['PYTHONPATH'] = os.path.dirname(os.getcwd())
import config
import src.blob_data_transfer as blob_pull
from regex_arxiv import REGEX_ARXIV_FLEXIBLE, clean

In [13]:
file_type = 'txt'
year_del = 3
prefix = 'arxiv_training_data/pdfplumber/text'

In [14]:
RE_FLEX = re.compile(REGEX_ARXIV_FLEXIBLE)
RE_OLDNAME_SPLIT = re.compile(r"([a-z\-]+)(\d+)")

In [4]:
'''
Get list of text files within our blob container. They should be in the following paths:

Contianer:
    arxiv:
        arxiv_dl:
        arxiv_pdf:
        arxiv_training_data:
            images:
            pdfplumber:
                chars:
                words:
                text:
                    year:
                        *.txt
We then only want the list of blobs within /arxiv_training_data/pdfplumber/text/*.txt
'''

'\nGet list of text files within our blob container. They should be in the following paths:\n\nContianer:\n    arxiv:\n        arxiv_dl:\n        arxiv_pdf:\n        arxiv_training_data:\n            images:\n            pdfplumber:\n                chars:\n                words:\n                text:\n                    year:\n                        *.txt\nWe then only want the list of blobs within /arxiv_training_data/pdfplumber/text/*.txt\n'

In [15]:
full_blob_list = blob_pull.get_blob_list(prefix)
blob_list, year_list = blob_pull.get_blob_file_list(file_type, full_blob_list, year_del)

In [16]:
ray.init()

2020-08-04 02:47:52,786	INFO resource_spec.py:212 -- Starting Ray with 37.11 GiB memory available for workers and up to 18.56 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-08-04 02:47:53,194	INFO services.py:1170 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m


{'node_ip_address': '172.17.0.2',
 'raylet_ip_address': '172.17.0.2',
 'redis_address': '172.17.0.2:64980',
 'object_store_address': '/tmp/ray/session_2020-08-04_02-47-52_784029_157/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2020-08-04_02-47-52_784029_157/sockets/raylet',
 'webui_url': 'localhost:8265',
 'session_dir': '/tmp/ray/session_2020-08-04_02-47-52_784029_157'}

Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/ray/dashboard/dashboard.py", line 1220, in <module>
    dashboard.run()
  File "/opt/conda/lib/python3.7/site-packages/ray/dashboard/dashboard.py", line 594, in run
    aiohttp.web.run_app(self.app, host=self.host, port=self.port)
  File "/opt/conda/lib/python3.7/site-packages/aiohttp/web.py", line 433, in run_app
    reuse_port=reuse_port))
  File "/opt/conda/lib/python3.7/asyncio/base_events.py", line 587, in run_until_complete
    return future.result()
  File "/opt/conda/lib/python3.7/site-packages/aiohttp/web.py", line 359, in _run_app
    await site.start()
  File "/opt/conda/lib/python3.7/site-packages/aiohttp/web_runner.py", line 104, in start
    reuse_port=self._reuse_port)
  File "/opt/conda/lib/python3.7/asyncio/base_events.py", line 1389, in create_server
    % (sa, err.strerror.lower())) from None
OSError: [Errno 99] error while attempting to bind on address ('::1', 8265, 0, 0): cannot assig

1. Get blob list
2. send individual blob in list to path_to_id to get the arxiv id
3. get the text from the blob by streaming it from get_text_stream
4. 

In [165]:
def path_to_id(blob):
    """
    Convert filepath name of ArXiv file to ArXiv ID.
    Need to remove the ".txt" from file names first if they have it
    """
    name = os.path.splitext(os.path.basename(blob))[0]
    name.replace('.txt','')
    if '.' in name:  # new  ID
        return name 
    split = [a for a in RE_OLDNAME_SPLIT.split(name) if a]
    return "/".join(split)

def get_text_stream(blob):
        return blob_pull.stream_blob(blob).decode()

def extract_references(txt, pattern=RE_FLEX):
    """
    Parameters
    ----------
        filename : str
            name of file to search for pattern
        pattern : re pattern object
            compiled regex pattern

    Returns
    -------
        citations : list
            list of found arXiv IDs
    """
    out = []
    for matches in pattern.findall(txt):
        out.extend([clean(a) for a in matches if a])
    return list(set(out))

@ray.remote
def citation_list_inner(article):
    """ Find references in all the input articles
    Parameters
    ----------
        article : str
            path to article blob
    Returns
    -------
        citations : dict[arXiv ID] = list of arXiv IDs
            dictionary of articles and their references
    """
    cites = {}
    try:
        article_text = get_text_stream(article)
        refs = extract_references(article_text)
        cites[path_to_id(article)] = refs
        return cites
    except Exception as e:
        print("Error in {}".format(article))
        print(e)
        #log.error("Error in {}".format(article))

def default_filename():
    return os.path.join(os.getcwd(), 'test.json.gz')

def save_to_default_location(citations):
    filename = default_filename()
    
#     if not os.path.isfile(filename):
#         with gzip.open(filename, 'w') as fn:
#             json.dump(json.dumps(citations), fn)
#     else:
#         with gzip.open(filename, 'r+') as fn:
#         # appending json data
#             data = json.load(fn)
#             data.update(json.dumps(citations).encode('utf-8'))
#             fn.seek(0)
#             json.dump(data, fn)
#             fn.close()
    with gzip.open(filename, 'a+') as fn:
        json_data = json.dumps(citations).encode('utf-8')
        fn.write(json_data + '\n'.encode('utf-8'))
''' 
Here what we're doing is creating a json line format file where
each line is essentially a json document. In our case each line is the
json document of the articles and their citations. Need to seperate each
line in the jsonl with the '\n'.
source: 
https://medium.com/@galea/how-to-love-jsonl-using-json-line-format-in-your-workflow-b6884f65175b
'''

" \nHere what we're doing is creating a json line format file where\neach line is essentially a json document. In our case each line is the\njson document of the articles and their citations. Need to seperate each\nline in the jsonl with the '\n'.\nsource: \nhttps://medium.com/@galea/how-to-love-jsonl-using-json-line-format-in-your-workflow-b6884f65175b\n"

In [18]:
# text_path = blob_pull.copy_blob(blob_list[-100:])
# # get all text files
# articles = []
# articles.extend(glob.glob(text_path+'/*.txt'))
cites = ray.get([citation_list_inner.remote(article) for article in blob_list[-10:]])

In [113]:
json.loads(json.dump(cites))

TypeError: dump() missing 1 required positional argument: 'fp'

In [181]:
save_to_default_location(cites)

In [182]:
file = '/home/arxiv/doc_intel_etl/notebooks/test.json.gz'
cite = []
with gzip.open(file, 'r') as f:
    for line in f:
        cite.extend(json.loads(line))#.rstrip('\n').decode('utf-8')))

In [99]:
test = json.loads(cite[0])

In [183]:
cite

[{'1401.8154': ['math/0408008', '1308.1172']},
 {'1401.8156': []},
 {'1401.8181': []},
 {'1401.8182': []},
 {'1401.8201': ['1406.0349']},
 {'1401.8202': []},
 {'1401.8203': ['gr-qc/9808028']},
 {'1401.8208': ['1202.4317',
   '1404.4255',
   'hep-th/0412030',
   'astro-ph/0611816',
   '1303.5076',
   '1406.2417']},
 {'1401.8219': ['1307.6272', '1309.0386', '1312.2986']},
 {'1401.8230': []},
 {'1401.8154': ['math/0408008', '1308.1172']},
 {'1401.8156': []},
 {'1401.8181': []},
 {'1401.8182': []},
 {'1401.8201': ['1406.0349']},
 {'1401.8202': []},
 {'1401.8203': ['gr-qc/9808028']},
 {'1401.8208': ['1202.4317',
   '1404.4255',
   'hep-th/0412030',
   'astro-ph/0611816',
   '1303.5076',
   '1406.2417']},
 {'1401.8219': ['1307.6272', '1309.0386', '1312.2986']},
 {'1401.8230': []},
 {'1401.8154': ['math/0408008', '1308.1172']},
 {'1401.8156': []},
 {'1401.8181': []},
 {'1401.8182': []},
 {'1401.8201': ['1406.0349']},
 {'1401.8202': []},
 {'1401.8203': ['gr-qc/9808028']},
 {'1401.8208': ['1202

In [72]:
cite

['[{"1401.8154": ["math/0408008", "1308.1172"]}, {"1401.8156": []}, {"1401.8181": []}, {"1401.8182": []}, {"1401.8201": ["1406.0349"]}, {"1401.8202": []}, {"1401.8203": ["gr-qc/9808028"]}, {"1401.8208": ["1202.4317", "1404.4255", "hep-th/0412030", "astro-ph/0611816", "1303.5076", "1406.2417"]}, {"1401.8219": ["1307.6272", "1309.0386", "1312.2986"]}, {"1401.8230": []}][{"1401.8154": ["math/0408008", "1308.1172"]}, {"1401.8156": []}, {"1401.8181": []}, {"1401.8182": []}, {"1401.8201": ["1406.0349"]}, {"1401.8202": []}, {"1401.8203": ["gr-qc/9808028"]}, {"1401.8208": ["1202.4317", "1404.4255", "hep-th/0412030", "astro-ph/0611816", "1303.5076", "1406.2417"]}, {"1401.8219": ["1307.6272", "1309.0386", "1312.2986"]}, {"1401.8230": []}]']