# Machine Learning on Text/Language 

In [1]:
import os 
import nltk
import math
import gensim 
import pickle 
import random 

from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.api import CategorizedCorpusReader

In [2]:
# Module variables: filesystem locations, resolved against the directory the
# notebook is running in.
ROOT = os.getcwd()
CORPUS = os.path.join(ROOT, "fixtures", "tagged")

## Simple Corpus Reader 

In [3]:
# Default file-id pattern: a directory name made of lowercase letters,
# underscores or whitespace, a slash, then a lowercase-hexadecimal file name
# ending in ".pickle". The leading negative lookahead rejects ids that start
# with a dot.
PKL_PATTERN = r'(?!\.)[a-z_\s]+/[a-f0-9]+\.pickle'
# Default category pattern: the capture group is the leading directory name of
# a file id, which is used as that file's category.
CAT_PATTERN = r'([a-z_\s]+)/.*'

class BaleenCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    Corpus reader for the pickled corpus, whose documents were already
    tokenized and tagged during preprocessing.
    """

    def __init__(self, root, fileids=PKL_PATTERN, categoryids=CAT_PATTERN):
        """
        Set up both parent readers. ``categoryids`` is handed to the
        ``CategorizedCorpusReader`` constructor as its ``cat_pattern``
        option; ``root`` and ``fileids`` go to the ``CorpusReader``
        constructor.
        """
        category_options = {"cat_pattern": categoryids}
        CategorizedCorpusReader.__init__(self, category_options)
        CorpusReader.__init__(self, root, fileids)

    def _resolve(self, fileids, categories):
        """
        Collapse the ``fileids``/``categories`` pair into a single fileids
        value. Supplying both raises ``ValueError``; when categories are
        given they are translated into their fileids, otherwise ``fileids``
        is handed back unchanged (possibly ``None``).
        """
        if categories is None:
            return fileids

        if fileids is not None:
            raise ValueError("Specify fileids or categories, not both")

        return self.fileids(categories)

    def docs(self, fileids=None, categories=None):
        """
        Yields one unpickled document per selected corpus file. Each file is
        opened and loaded only when the generator reaches it, so documents
        are produced one at a time rather than all loaded up front.
        """
        selected = self._resolve(fileids, categories)

        # NOTE: unpickling can execute arbitrary code, so this reader should
        # only be pointed at corpus files from a trusted source.
        for path, _encoding, _fileid in self.abspaths(selected, True, True):
            with open(path, 'rb') as handle:
                yield pickle.load(handle)

    def paras(self, fileids=None, categories=None):
        """
        Yields every paragraph of every selected document. A paragraph is a
        list of sentences and a sentence is a list of (token, tag) tuples.
        """
        for document in self.docs(fileids, categories):
            yield from document

    def sents(self, fileids=None, categories=None):
        """
        Yields every sentence of every selected paragraph; each sentence is
        a list of (token, tag) tuples.
        """
        for para in self.paras(fileids, categories):
            yield from para

    def words(self, fileids=None, categories=None):
        """
        Yields every (token, tag) tuple of every selected sentence.
        """
        for sent in self.sents(fileids, categories):
            yield from sent

In [4]:
# Open the tagged corpus stored under the fixtures directory.
corpus = BaleenCorpusReader(CORPUS)

# Count how often each sample occurs within each category. corpus.words()
# yields (token, tag) tuples, so the samples counted here are tagged tokens.
# A generator expression is passed instead of a list so the pairs are streamed
# into the distribution rather than all being held in memory first.
words = nltk.ConditionalFreqDist(
    (category, word)
    for category in corpus.categories()
    for word in corpus.words(categories=category)
)

In [5]:
# Report, for each category, the number of distinct samples ("vocab") and the
# total number of samples counted ("words").
for category in words:
    dist = words[category]
    vb, wc = len(dist), sum(dist.values())
    print("{} has {:>,} vocab and {:>,} words".format(category, vb, wc))

data_science has 23,614 vocab and 261,494 words
cinema has 45,956 vocab and 688,302 words
business has 84,434 vocab and 2,258,435 words
books has 44,302 vocab and 517,482 words
cooking has 22,787 vocab and 258,994 words
tech has 52,305 vocab and 870,464 words
sports has 32,468 vocab and 579,188 words
design has 21,972 vocab and 178,851 words
do_it_yourself has 29,077 vocab and 322,049 words
politics has 44,782 vocab and 1,031,593 words
gaming has 37,759 vocab and 579,124 words
news has 162,010 vocab and 8,441,547 words


## Classifiers 

### Build Datasets 

In [16]:
def train_test_split(corpus, categories=None, test=0.2):
    """
    Build shuffled training and testing sets of labeled documents.

    Every file in the selected categories becomes one ``(words, label)``
    pair, where ``words`` is the list produced by ``corpus.words`` for that
    file and ``label`` is the first category reported for it. The pairs are
    shuffled and then divided according to ``test``.

    corpus:     reader providing ``categories``, ``fileids`` and ``words``
    categories: categories to draw documents from; when omitted (or empty)
                every category in the corpus is used
    test:       fraction of the documents, between 0 and 1 inclusive, that
                goes into the testing set (rounded down)

    Returns a ``(train, test)`` tuple of lists.
    Raises ``ValueError`` if ``test`` is outside the range [0, 1].

    Note: this loads every selected document into memory!
    """
    # Reject fractions that would otherwise silently produce a wrong split:
    # a negative value swaps the sizes of the two sets and a value above 1
    # leaves the training set empty.
    if not 0 <= test <= 1:
        raise ValueError("test must be a fraction between 0 and 1")

    # Fall back to every category in the corpus when none are specified
    categories = categories or corpus.categories()

    # Build a list of the documents with their associated label
    docs = [
        (
            list(corpus.words(fileids=fileid)),
            corpus.categories(fileids=fileid)[0],
        )
        for fileid in corpus.fileids(categories=categories)
    ]

    # Shuffle the documents in place so the split is random
    random.shuffle(docs)

    # Find the split index; int() guarantees a value usable as a slice index
    split = int(math.floor(len(docs) * test))

    # The first ``split`` documents are the test set, the remainder is train
    return docs[split:], docs[:split]

In [17]:
# Split the whole corpus: no categories are passed, so every category is
# used, together with the function's default test fraction.
train, test = train_test_split(corpus)

### Feature Extraction 

In [None]:
def normalize(documents):
    """
    Removes stopwords, lowercases, lemmatizes 
    """