# Corpus Readers

In [1]:
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

In [2]:
# Fileid pattern for the plaintext corpus: "<directory>/<name>.txt".
# The leading negative lookahead rejects paths that begin with a dot; the
# directory part allows word characters and whitespace, the file name
# additionally allows hyphens.
DOC_PATTERN = r'(?!\.)[\w_\s]+\/[\w\s\d\-]+\.txt'
# Category pattern: the capture group is the directory segment (word
# characters and whitespace) preceding the slash, e.g. 'Star Wars'.
CAT_PATTERN = r'([\w_\s]+)\/.*'

In [3]:
# Build a categorized plaintext reader rooted at 'data/corpus'; files are
# selected with DOC_PATTERN and categories are derived with CAT_PATTERN.
corpus = CategorizedPlaintextCorpusReader('data/corpus', DOC_PATTERN, cat_pattern=CAT_PATTERN)

In [4]:
corpus.categories()[1]

'Star Wars'

In [6]:
corpus.fileids()[3:6]

['Star Wars/SW_EpisodeIV.txt',
 'Star Wars/SW_EpisodeV.txt',
 'Star Wars/SW_EpisodeVI.txt']

## Reading an HTML Corpus

In [7]:
import os
import nltk
import codecs
import sqlite3

In [8]:
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.api import CategorizedCorpusReader

In [9]:
# NOTE: these rebind CAT_PATTERN / DOC_PATTERN from the plaintext section;
# from here on the names refer to the patterns below.
# Category pattern: the capture group is a directory name made of lowercase
# letters, underscores and whitespace preceding the slash.
CAT_PATTERN = r'([a-z_\s]+)\/.*'
# Fileid pattern: "<directory>/<name>.json", where <name> consists only of
# the characters a-f and 0-9 (presumably a hex digest -- TODO confirm).
# NOTE(review): the reader below is described as an HTML reader, yet this
# pattern selects '.json' files -- confirm this is intended.
DOC_PATTERN = r'(?!\.)[a-z_\s]+\/[a-f0-9]+\.json'
# Default list of tag names stored on HTMLCorpusReader instances as ``tags``.
TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li']

In [10]:
class HTMLCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    A corpus reader for raw HTML documents to enable preprocessing.

    Files are selected beneath ``root`` with the ``fileids`` pattern and
    grouped into categories using the ``cat_*`` keyword arguments.  The
    reader only hands back raw file contents and file sizes; it does not
    parse the documents itself.
    """

    def __init__(self, root, fileids=DOC_PATTERN, encoding='utf8',
                 tags=TAGS, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining
        arguments are passed to the ``CorpusReader`` constructor.

        :param root: root directory of the corpus.
        :param fileids: regular expression (or list) selecting the files.
        :param encoding: encoding used to decode the files.
        :param tags: tag names to keep on the instance as ``self.tags``.
        :param kwargs: categorization arguments; when no ``cat_*`` key is
            given, ``cat_pattern`` defaults to ``CAT_PATTERN``.
        """
        # Add the default category pattern if not passed into the class.
        if not any(key.startswith('cat_') for key in kwargs):
            kwargs['cat_pattern'] = CAT_PATTERN

        # Initialize the NLTK corpus reader objects.  The kwargs dict is
        # passed positionally (not unpacked) on purpose: that is the
        # signature NLTK's CategorizedCorpusReader.__init__ expects.
        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids, encoding)

        # Save the tags that we specifically want to extract.
        self.tags = tags

    def resolve(self, fileids, categories):
        """
        Returns the fileids to operate on, given either explicit fileids
        or a set of categories.  Implemented similarly to the NLTK
        ``CategorizedPlaintextCorpusReader``.

        :param fileids: fileids to use as-is, or None.
        :param categories: categories to expand into fileids, or None.
        :returns: the fileids for ``categories`` when given, otherwise
            ``fileids`` unchanged (None when neither is supplied).
        :raises ValueError: if both ``fileids`` and ``categories`` are given.
        """
        if fileids is not None and categories is not None:
            raise ValueError("Specify fileids or categories, not both")

        if categories is not None:
            return self.fileids(categories)
        return fileids

    def docs(self, fileids=None, categories=None):
        """
        Generator yielding the complete decoded text of each selected
        document.  Each file is opened, read in full and closed before the
        next one is touched, so only one document is held by the reader at
        a time.

        :param fileids: restrict to these fileids (exclusive with categories).
        :param categories: restrict to these categories.
        """
        # Resolve the fileids and the categories
        fileids = self.resolve(fileids, categories)

        # Create a generator, loading one document into memory at a time.
        for path, encoding in self.abspaths(fileids, include_encoding=True):
            with codecs.open(path, 'r', encoding=encoding) as f:
                yield f.read()

    def sizes(self, fileids=None, categories=None):
        """
        Generator yielding the size on disk, in bytes, of each selected
        file.  Only the size is yielded (no fileid); sizes come out in the
        order in which ``abspaths`` lists the files.  This function is used
        to detect oddly large files in the corpus.

        :param fileids: restrict to these fileids (exclusive with categories).
        :param categories: restrict to these categories.
        """
        # Resolve the fileids and the categories
        fileids = self.resolve(fileids, categories)

        # Create a generator, getting every path and computing filesize
        for path in self.abspaths(fileids):
            yield os.path.getsize(path)

In [11]:
# Instantiate the reader with its defaults: DOC_PATTERN for fileids, 'utf8'
# encoding, TAGS for tags and CAT_PATTERN for categories.
html = HTMLCorpusReader('data/corpus')

In [12]:
res = list(html.tags)

In [14]:
doc = list(html.docs())

In [15]:
len(doc)

1

In [16]:
size = list(html.sizes())

In [17]:
print(size)

[53027490]


In [83]:
class SqliteCorpusReader(object):
    """
    A minimal corpus reader over a SQLite database of reviews.

    It reads the ``score`` column of the ``reviews`` table and the
    ``content`` and ``reviewid`` columns of the ``content`` table.  Every
    accessor is a lazy generator that yields rows exactly as sqlite3
    returns them, i.e. as one-element tuples such as ``(7.4,)``.

    Each generator runs on its own cursor, so several of them can be
    consumed at the same time (e.g. ``zip(reader.ids(), reader.scores())``)
    without clobbering each other's result set.
    """

    def __init__(self, path):
        """
        Open a connection to the SQLite database located at ``path``.
        """
        # Keep the connection (not just a cursor) so that it can be closed
        # and so that each query can get a cursor of its own.
        self._conn = sqlite3.connect(path)

    def _rows(self, query):
        """
        Run ``query`` on a dedicated cursor and yield its rows one by one.
        """
        cursor = self._conn.cursor()
        try:
            cursor.execute(query)
            # fetchone() returns None once the result set is exhausted.
            for row in iter(cursor.fetchone, None):
                yield row
        finally:
            # Runs on exhaustion and also when the generator is discarded.
            cursor.close()

    def close(self):
        """
        Close the underlying database connection.
        """
        self._conn.close()

    def scores(self):
        """
        Yields the review scores, each as a one-element tuple.
        """
        return self._rows("SELECT score FROM reviews")

    def texts(self):
        """
        Yields the full review texts, each as a one-element tuple.
        """
        return self._rows("SELECT content FROM content")

    def ids(self):
        """
        Yields the review ids, each as a one-element tuple.
        """
        return self._rows("SELECT reviewid FROM content")

In [298]:
# Open the reviews database stored under 'data/pitchfork-data'.
sqlite = SqliteCorpusReader('data/pitchfork-data/database.sqlite')

In [299]:
scores = list(sqlite.scores())

In [300]:
scores[5]

(7.4,)

In [268]:
texts = list(sqlite.texts())

In [269]:
texts[5]

('In the pilot episode of “Insecure,” the critically lauded HBO comedy series created by Issa Rae and Larry Wilmore, Rae’s eponymous character Issa is at a crossroads. She’s in a stable but stale relationship, and the occasion of her 29th birthday has her wondering if she’s wasting time on a romance that’s heading nowhere. Issa decides to take her best friend Molly, who’s also feeling unlucky in love after a streak of failed flings, to an open mic night in hopes of setting her up with someone new—but secretly to reconnect with an ex-boyfriend. Before long, egged on by said ex, Issa winds up on the stage rapping about “Broken Pussy,” a term that she coined to explain Molly’s recent poor run of form (“Maybe it’s really rough, maybe it’s had enough.”)The resulting cheesy freestyle, set to the tune of Kelis’ twinkly 2006 hit “Bossy,” makes the cut as the second song on Insecure (Music From the HBO Original Series). Its placement injects a welcome dose of Issa’s personality (both fictional 

In [270]:
ids = list(sqlite.ids())

In [271]:
ids[5]

(22722,)