In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to project files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import re
import os
from itertools import izip
import itertools
from collections import defaultdict
from gensim.parsing import preprocessing as gprocessing
from spacy.en import English as nlp
from bs4 import BeautifulSoup

In [3]:
# Later cells open 'candidate_classifier/...' relative to the working
# directory, so move up one level when that package folder is not visible from
# here (presumably the kernel starts in a sub-folder of the project -- confirm).
# The guard makes the cell safe to re-run: an unconditional chdir("..") would
# climb one more directory on every execution.
if not os.path.isdir('candidate_classifier'):
    os.chdir("..")
os.getcwd()

'/mnt/Storage/Coding_Projects/Candidate_Classifier'

In [3]:
# Read the whole 2015-08-06 debate transcript into `test` as a single string.
# The path is relative to the current working directory.
with open('candidate_classifier/data/R_Debate_2015-08-06.txt', 'r') as _f:
    test = _f.read()

In [4]:
# Speaker label at the start of a line: an optional '[', a lazy run of capital
# letters/spaces, an optional ']', then ':' (e.g. 'KELLY:').  re.M lets '^'
# match at every line start; the outer group makes re.split() keep the label.
p = re.compile(r"(^\[?[A-Z ]*?\]?:)", re.U|re.M)

In [5]:
# maxsplit=1 cuts only at the first speaker label; because the pattern has a
# capturing group the result is [text before label, label, everything after].
chunks = re.split(p, test, maxsplit=1)

In [6]:
# Show the first label and the text following it, skipping whatever preceded the label.
chunks[1:]

['KELLY:',
 ' Welcome to the first debate night of the 2016 presidential campaign, live from Quicken Loans Arena in Cleveland, Ohio.\n\nI\'m Megyn Kelly... [applause]... along with my co-moderators, Brett Baier and Chris Wallace.\n\nTonight... [applause] Nice.\n\nTonight, thousands of people here in the Q, along with millions of voters at home will get their very first chance to see the candidates face off in a debate, answering the questions you want answered.\n\nBAIER: Less than a year from now, in this very arena, one of these 10 candidates or one of the seven on the previous debate tonight will accept the Republican party\'s nomination. [applause]\n\nTonight\'s candidates were selected based on an average of five national polls. Just a few hours ago, you heard from the candidates ranked 11th through 17. And now, the prime-time event, the top 10.\n\nWALLACE: Also of note, Fox News is partnering for tonight\'s debate with Facebook. For the past several weeks, we\'ve been asking you f

## String Processing Pipeline

In [7]:
from candidate_classifier.string_processing import *
from candidate_classifier import utils

In [38]:
# Transcript clean-up rules:
#   * replace [*] with ''
#   * replace '. . .' with '...'
#   * replace multiple ellipses with a single '...'
#   * remove all sentences that end with '...'

# Bracketed annotation made only of letters and spaces, e.g. "[applause]".
BRACKET_PATTERN = re.compile(r"\[[a-zA-Z ]*\]", re.U)
# Three dots separated by single whitespace characters.  The whitespace after
# the last dot is optional so that a spaced ellipsis with nothing after it
# (e.g. at the end of a stripped string) is matched too; requiring it let a
# trailing '. . .' slip through un-normalised.
SPACED_ELLIPSIS_PATTERN = re.compile(r"((?:\.\s){2}\.\s?)")
# One or more '...' groups, each optionally followed by a single space.
MULTI_ELLIPSIS_PATTERN = re.compile(r"(?:(?:\.){3} ?)+")
# Predicate: True when the last three characters are '...'.
ENDS_WITH_ELLIPSIS = lambda s: s[-3:] == '...'
# Predicate: True when the first non-whitespace character is '-'.  Slicing
# ([:1]) instead of indexing ([0]) returns False for an empty or
# whitespace-only string rather than raising IndexError.
STARTS_WITH_DASH = lambda s: s.strip()[:1] == '-'

In [15]:
# String-level step.  Substitutions are listed in the order given to
# StringTransformer: the bracket pattern, the named 'whitespace' and 'strip'
# steps (defined by the string_processing module), then the two ellipsis
# patterns, each replaced by '...'.
str_transformers = [
    StringTransformer(
        prefilter_substitutions=[
            BRACKET_PATTERN,
            'whitespace',
            'strip',
            (SPACED_ELLIPSIS_PATTERN, '...'),
            (MULTI_ELLIPSIS_PATTERN, '...'),
        ]
    )
]

# Sentence-level step using the ENDS_WITH_ELLIPSIS predicate as its only filter.
sent_transformers = [
    SentenceTransformer(filters=[ENDS_WITH_ELLIPSIS]),
]

In [16]:
# Concatenate the string-level and sentence-level transformer lists and hand
# the combined list to StringProcessor.
s_processor = StringProcessor(str_transformers + sent_transformers)

## Corpora

In [9]:
from candidate_classifier.nltk_model import NgramModel
from candidate_classifier import utils
from nltk.probability import LaplaceProbDist, LidstoneProbDist
from nltk.corpus import PlaintextCorpusReader
import os
from candidate_classifier.debate_corpus_reader import DebateCorpusReader

In [10]:
# Absolute path of the transcript directory, resolved against the current working directory.
corpus_root = os.path.abspath('candidate_classifier/data')

In [48]:
# Corpus reader over the transcript directory, selecting file ids with the
# regular expression '.*'.
# NOTE(review): DummyTokenizer is not imported by name in this notebook;
# presumably it comes from the star import of
# candidate_classifier.string_processing -- confirm.
debates = DebateCorpusReader('candidate_classifier/data', '.*', word_tokenizer=DummyTokenizer())

In [49]:
# List the transcript files the reader picked up.
debates.fileids()

['R_Debate_2015-08-06.txt',
 'R_Debate_2015-09-16.txt',
 'R_Debate_2015-10-28.txt',
 'R_Debate_2015-11-10.txt',
 'R_Debate_2015-12-15.txt',
 'R_Debate_2016-01-14.txt']

## Model
To get more usable text, all paragraphs from a speaker are joined and munged, and then tokenized afterwards.

In [7]:
from spacy.en import English
from candidate_classifier.string_processing import *

In [13]:
# Build the spaCy English pipeline with the `entity` and `load_vectors` options
# switched off.  Note: this rebinds `nlp`, which the import cell at the top had
# bound to the English class itself (`from spacy.en import English as nlp`).
nlp = English(entity=False, load_vectors=False)

In [27]:
class TransformerWrapper(object):
    """Adapter that exposes a plain callable through a ``tokenize`` method.

    The wrapped ``transformer`` is stored unchanged and is invoked with the
    argument passed to :meth:`tokenize`.
    """

    def __init__(self, transformer):
        # Keep a reference to the callable; it is only invoked from tokenize().
        self.transformer = transformer

    def tokenize(self, s):
        """Return whatever ``self.transformer`` returns for ``s``."""
        transform = self.transformer
        return transform(s)

    
def sent_tokenizer(s):
    """Split ``s`` into sentences and return them as a list of strings.

    ``s`` is run through the module-level ``nlp`` pipeline; every sentence in
    ``doc.sents`` is rebuilt by concatenating ``text_with_ws`` of its tokens.
    """
    sentences = []
    for sent in nlp(s).sents:
        pieces = [tok.text_with_ws for tok in sent]
        sentences.append(u''.join(pieces))
    return sentences

# Quick smoke call of the sentence splitter.  Its return value is discarded:
# it is not the last expression of the cell, so nothing is displayed.
sent_tokenizer(u"This is a sentence. Here's another...")

def word_tokenizer(s):
    """Tokenize ``s`` with ``nlp`` and return the lower-cased token strings,
    wrapped in the sentence markers '<S>' and '</S>'."""
    tokens = ['<S>']
    tokens.extend(tok.lower_ for tok in nlp(s))
    tokens.append('</S>')
    return tokens

In [28]:
# Transcript clean-up rules:
#   * replace [*] with ''
#   * replace '. . .' with '...'
#   * replace multiple ellipses with a single '...'
#   * remove all sentences that end with '...'

# Bracketed annotation made only of letters and spaces, e.g. "[applause]".
BRACKET_PATTERN = re.compile(r"\[[a-zA-Z ]*\]", re.U)
# Three dots separated by single whitespace characters.  The whitespace after
# the last dot is optional so that a spaced ellipsis with nothing after it
# (e.g. at the end of a stripped string) is matched too; requiring it let a
# trailing '. . .' slip through un-normalised.
SPACED_ELLIPSIS_PATTERN = re.compile(r"((?:\.\s){2}\.\s?)")
# One or more '...' groups, each optionally followed by a single space.
MULTI_ELLIPSIS_PATTERN = re.compile(r"(?:(?:\.){3} ?)+")
# Sentence predicates.  All of them slice rather than index, so an empty
# string yields False instead of raising IndexError (s[0] / s[-1] would raise).
ENDS_WITH_ELLIPSIS = lambda s: s[-3:] == '...'
STARTS_WITH_ELLIPSIS = lambda s: s[:3] == '...'
STARTS_WITH_DASH = lambda s: s[:1] == '-'
ENDS_WITH_DASH = lambda s: s[-1:] == '-'


def add_delimiters(s):
    """Wrap ``s`` in sentence boundary markers, i.e. '<S> ' + s + ' </S>'."""
    opening, closing = '<S> ', ' </S>'
    return opening + s + closing

In [32]:
# Document-level transformer: the substitutions below are listed in the order
# given (ellipsis normalisation ahead of the named 'whitespace'/'strip'/
# 'deaccent' steps), and sent_tokenizer is supplied as the tokenizer.
# Presumably the substitutions run before tokenizing, as the keyword name
# suggests -- confirm in candidate_classifier.string_processing.
doc_transformer = TransformerABC(
    tokenizer=sent_tokenizer,
    prefilter_substitutions=[
        BRACKET_PATTERN,
        (SPACED_ELLIPSIS_PATTERN, '...'),
        (MULTI_ELLIPSIS_PATTERN, '...'),
        'whitespace',
        'strip',
        'deaccent',
    ],
)


# Sentence-level transformer: the 'strip' step, the four ellipsis/dash
# predicates as filters, and word_tokenizer as the tokenizer.  No
# postfilter_substitutions are passed; word_tokenizer already adds the
# '<S>' / '</S>' markers itself.
sent_transformer = TransformerABC(
    tokenizer=word_tokenizer,
    prefilter_substitutions=['strip'],
    filters=[
        STARTS_WITH_ELLIPSIS,
        ENDS_WITH_ELLIPSIS,
        STARTS_WITH_DASH,
        ENDS_WITH_DASH,
    ],
)

In [33]:
# Rebuild the corpus reader (replacing the earlier `debates`) so that sentence
# splitting and word tokenizing go through the two transformers; each one is
# wrapped so it can be called through a `.tokenize()` method.
debates = DebateCorpusReader('candidate_classifier/data', '.*', 
                             sent_tokenizer=TransformerWrapper(doc_transformer), 
                             word_tokenizer=TransformerWrapper(sent_transformer))

In [42]:
# Materialise every tokenized sentence grouped under the speaker key 'TRUMP'.
list(debates.grouped_sents(speakers='TRUMP')['TRUMP'])

[['<S>', u'i', u'fully', u'understand', u'.', '</S>'],
 ['<S>', u'i', u'fully', u'understand', u'.', '</S>'],
 ['<S>', u'i', u'can', u'not', u'say', u'.', '</S>'],
 ['<S>',
  u'i',
  u'have',
  u'to',
  u'respect',
  u'the',
  u'person',
  u'that',
  u',',
  u'if',
  u'it',
  u"'s",
  u'not',
  u'me',
  u',',
  u'the',
  u'person',
  u'that',
  u'wins',
  u',',
  u'if',
  u'i',
  u'do',
  u'win',
  u',',
  u'and',
  u'i',
  u"'m",
  u'leading',
  u'by',
  u'quite',
  u'a',
  u'bit',
  u',',
  u'that',
  u"'s",
  u'what',
  u'i',
  u'want',
  u'to',
  u'do',
  u'.',
  '</S>'],
 ['<S>', u'i', u'can', u'totally', u'make', u'that', u'pledge', u'.', '</S>'],
 ['<S>',
  u'if',
  u'i',
  u"'m",
  u'the',
  u'nominee',
  u',',
  u'i',
  u'will',
  u'pledge',
  u'i',
  u'will',
  u'not',
  u'run',
  u'as',
  u'an',
  u'independent',
  u'.',
  '</S>'],
 ['<S>',
  u'but',
  u'--',
  u'and',
  u'i',
  u'am',
  u'discussing',
  u'it',
  u'with',
  u'everybody',
  u',',
  u'but',
  u'i',
  u"'m",
  

In [5]:
import string

In [19]:
# spaCy English pipeline with the entity, tagger, parser and load_vectors
# options all switched off; in this cell it is only used to split text into tokens.
NLP = English(entity=False, tagger=False, parser=False, load_vectors=False)
# ASCII punctuation characters, as a set for constant-time membership tests.
PUNCT = frozenset(string.punctuation)


def add_punct(s):
    """Return ``s`` with a trailing period unless it already ends in punctuation.

    An empty string is returned unchanged: there is no last character to
    inspect (``s[-1]`` would raise IndexError) and nothing to terminate.

    :param s: text to terminate
    :return: ``s`` itself, or ``s + u'.'`` when its last character is not in PUNCT
    """
    if not s or s[-1] in PUNCT:
        return s
    return s + u'.'

def tokenizer(s):
    """Return the lower-cased token strings that ``NLP`` produces for ``s``."""
    lowered = []
    for tok in NLP(s):
        lowered.append(tok.lower_)
    return lowered

# Single-string pipeline: the named 'html', 'whitespace', 'strip' and
# 'deaccent' steps followed by add_punct are passed as substitutions, and
# `tokenizer` as the tokenizer.
STRING_PROCESSOR = TransformerABC(
    tokenizer=tokenizer,
    prefilter_substitutions=[
        'html',
        'whitespace',
        'strip',
        'deaccent',
        add_punct,
    ],
)

In [20]:
# Tokenize a sample sentence.  print(...) with a single argument prints the
# same text under the Python 2 print statement and the Python 3 print
# function, so the cell no longer depends on Python-2-only syntax.
print(STRING_PROCESSOR("I feel pretty."))

[u'i', u'feel', u'pretty', u'.']


In [21]:
# Second sample: the contraction comes out as separate tokens (u'i', u"'m").
STRING_PROCESSOR("I'm the greatest.")

[u'i', u"'m", u'the', u'greatest', u'.']