# Obtain arxiv ids via RSS

In [1]:
import feedparser
import re
import tarfile

import os, requests, shutil

In [2]:
feed = feedparser.parse('http://arxiv.org/rss/cond-mat.str-el')

In [3]:
ids = [entry['id'].rsplit('/',1)[-1] 
       for entry in feed['entries']]

In [4]:
ids[0]

u'1712.04942'

# Download source & extract tex

In [5]:
# Source files can be downloaded like so:
# https://arxiv.org/e-print/1712.04906
# while they don't come with extensions
# it seems to be .tar.gz
# we will work under this assumption
# and hope it doesn't break

In [6]:
def id_to_url(id):
    "URL to download the source file for a paper"
    return "http://arxiv.org/e-print/" + id

In [7]:
def download_source(id):
    url = id_to_url(id)
    path = os.path.join('data', id)
    
    r = requests.get(url, stream=True)
    
    if not os.path.exists(path):
        os.makedirs(path)
    
    filename = os.path.join(path, id+'.tar.gz')

    with open(filename, 'wb') as f:
        f.write(r.content)
    
    del r

In [8]:
def extract_tex(id):
    path = os.path.join('data', id)
    tar = tarfile.open(os.path.join(path,id + '.tar.gz'))
    tar.extractall(path)
    tar.close()
    
    files = os.listdir(path)
    latex_files = [os.path.join(path,fn) for fn in files if fn.endswith(".tex")]
    
    return latex_files

In [94]:
download_source(ids[4])

In [139]:
latex_files = extract_tex(ids[4])

In [140]:
latex_files

[u'data/1712.05026/mtp2.tex']

## Scrape comments from tex files

In [9]:
long_comment_regexp = "^\s*(%.*)$"
short_comment_regexp = '.*?(%.*)$'

In [10]:
def long_comments_from_lines(lines):

    # State variable
    comment_started = False 
    # Contains current comment
    comment = []
    # Contains list of all comments -- overall output
    result = []

    for line in lines:
        line_is_comment = re.search(long_comment_regexp, line)
        if not comment_started and line_is_comment:
            # beginning of comment
            comment = [line]
            comment_started = True
        elif comment_started and line_is_comment:
            # continuation of comment
            comment.append(line_is_comment.group(1))
        elif comment_started and not line_is_comment:
            # end of comment
            result.append(comment)
            comment_started = False            
        elif not comment_started and not line_is_comment:
            # continuation of non-comment
            pass

    return result

In [11]:
def short_comments_from_lines(lines):

    result = []
    for line in lines:
        if not re.search(long_comment_regexp, line):
            match = re.search(short_comment_regexp, line)
            if match:
                result.append(match.group(1))
    return result

In [12]:
for l in latex_files:
    with open(l) as f:
        lines = f.readlines()

NameError: name 'latex_files' is not defined

In [153]:
short_comments_from_lines(lines)

['% show bookmarks bar?',
 '% non-Latin characters ',
 '% show Acrobat',
 '% show Acrobat ',
 '% window fit to page when opened',
 '% fits the width of the page to the window',
 '% title',
 '% author',
 '% subject of the document',
 '% creator of the document',
 '% producer of the document',
 '% list of keywords',
 '% links in new window',
 '% false: boxed links; true: colored links',
 '%red,          % color of internal links (change box color with linkbordercolor)',
 '% color of links to bibliography',
 '% color of file links',
 '% color of external links']

In [146]:
long_comments_from_lines(lines)

[['%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n']]

In [15]:
def write_comments(ids, long_fn, short_fn,):
    
    with open(long_fn, 'w') as l_outf:
        with open(short_fn, 'w') as s_outf:
            
            for aid in ids:
                try:
                    download_source(aid)
                    latex_files = extract_tex(aid)
        
                    for l in latex_files:
                        with open(l) as f:
                            lines = f.readlines()
        
                        s_comments = short_comments_from_lines(lines)
                        l_comments = long_comments_from_lines(lines)
        
                        for comment in l_comments:
                            l_outf.writelines(comment)
                            l_outf.write('\n')

                        for comment in s_comments:
                            s_outf.write(comment)
                            s_outf.write('\n')
    
                except:
                    continue

In [17]:
write_comments(ids,
               'comments/long_comments.txt',
               'comments/short_comments.txt')