In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import spacy 
# import en_core_web_sm
from sklearn.metrics.pairwise import linear_kernel
import numpy as np


In [2]:
df = pd.read_csv('../data/src_law/first_merged_case_law.csv', sep='\t', encoding='utf-8')

In [76]:
df.head()

Unnamed: 0.1,Unnamed: 0,case_text,case_title,court,date,docket,tags,type_of_law,url,web_source
0,0,"United States Court of Appeals,Seventh Circuit...",Ward v. Holder,United States Seventh Circuit,01/21/2011,10-2063,"Government Law, Immigration Law",case,http://caselaw.findlaw.com/us-7th-circuit/1553...,findlaw.com
1,1,"United States Court of Appeals,Seventh Circuit...",Alvarado-Fonseca v. Holder,United States Seventh Circuit,01/06/2011,10-1917,"Constitutional Law, Criminal Law & Procedure, ...",case,http://caselaw.findlaw.com/us-7th-circuit/1551...,findlaw.com
2,2,"United States Court of Appeals,Seventh Circuit...",Lin v. Holder,United States Seventh Circuit,12/23/2010,10-1401,"Government Law, Health Law, Immigration Law",case,http://caselaw.findlaw.com/us-7th-circuit/1549...,findlaw.com
3,3,,Samirah v. Holder,United States Seventh Circuit,12/03/2010,08-1889,"Civil Procedure, Government Law, Immigration Law",case,http://www.ca7.uscourts.gov/fdocs/docs.fwx?sub...,findlaw.com
4,4,,US v. Perez-Molina,United States Seventh Circuit,12/03/2010,10-2427,"Criminal Law & Procedure, Sentencing, Immigrat...",case,http://www.ca7.uscourts.gov/fdocs/docs.fwx?sub...,findlaw.com


In [4]:
data = df.case_text[df.case_text.notnull()]


## Do Count Vectorizer

In [5]:

count_vectorizer = CountVectorizer()
counts= count_vectorizer.fit_transform(data)
features= count_vectorizer.vocabulary_


In [7]:
features


{'united': 77543,
 'states': 71901,
 'court': 25256,
 'of': 56140,
 'appeals': 13633,
 'seventh': 69228,
 'circuit': 22382,
 'elizabeth': 31108,
 'ward': 79843,
 'individually': 42614,
 'and': 13079,
 'as': 14479,
 'parent': 58548,
 'natural': 54244,
 'guardian': 37888,
 'to': 75377,
 'estefanie': 32395,
 'ebrada': 30493,
 'cainto': 20245,
 'minor': 52301,
 'child': 22026,
 'petitioners': 59680,
 'eric': 32079,
 'holder': 40122,
 'jr': 45176,
 'attorney': 15070,
 'general': 36501,
 'the': 74796,
 'respondent': 65545,
 'no': 55018,
 '10': 369,
 '2063': 3152,
 'decided': 26765,
 'january': 44654,
 '21': 3226,
 '2011': 3029,
 'before': 17421,
 'bauer': 17183,
 'manion': 50160,
 'hamilton': 38504,
 'judges': 45226,
 'daniel': 26404,
 'cory': 25037,
 'plews': 60356,
 'shadley': 69326,
 'racher': 62974,
 'braun': 19217,
 'south': 71212,
 'bend': 17679,
 'in': 42187,
 'for': 35085,
 'timothy': 75254,
 'hayes': 39052,
 'department': 27528,
 'justice': 45398,
 'washington': 79936,
 'dc': 26605,

## Apply TF-IDF Weights

In [6]:
vectorizer = TfidfVectorizer(stop_words='english')
vectors = vectorizer.fit_transform(data).toarray()
words = vectorizer.get_feature_names()

In [9]:
vectors.shape

(4527, 81833)

## Initial Ranking Result Using Cosine Similarity

In [7]:
## Helper function for getting top 10 values:
def get_top_values(lst, n):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the highest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["cat", "pig"]
    
    [-1:-n-1:-1]
    '''
    return [i for i in np.argsort(lst)[-1:-n-1:-1]]

# vectorizer = TfidfVectorizer(stop_words='english')
# vectors = vectorizer.fit_transform(data).toarray()

queries = ['Deportation proceedings in Texas?', 
           'Illegal Immigrantion in California',  
          '8 U.S.C. § 1154(a)(1)(A)(iii)']

tokenized_queries = vectorizer.transform(queries)
cosine_similarities = linear_kernel(tokenized_queries, vectors)
titles = ['Deportation in Texas', 'Illegal Immigrantion in California',
          '8 U.S.C. § 1154(a)(1)(A)(iii)']

for i, query in enumerate(queries):
    print (query)
    print (get_top_values(cosine_similarities[i], 5), titles[i])
    print ("")

    
# cosine_similarities.shape

Deportation proceedings in Texas?
[2788, 4507, 1580, 1409, 1596] Deportation in Texas

Illegal Immigrantion in California
[2933, 2822, 2820, 2925, 2924] Illegal Immigrantion in California

8 U.S.C. § 1154(a)(1)(A)(iii)
[1140, 706, 547, 2347, 2601] 8 U.S.C. § 1154(a)(1)(A)(iii)



In [8]:
# cosine_similarities[0][0]

# print len(data)
# print cosine_similarities.shape

print (data[706])


United States Court of Appeals, Third Circuit.
Wilson Emilio Peguero MATEO, Petitioner v. ATTORNEY GENERAL UNITED STATES of America, Respondent
No. 15-1160
Decided: September 06, 2017

Before: McKEE, JORDAN, and VANASKIE, Circuit Judges.Tracey M. Hubbard, Esq. (ARGUED), Bank Towers Building, 321 Spruce Street, Suite 509, 1500 Liberty Center, Scranton, PA 18503, Counsel for Petitioner Matthew A. Connelly, Esq. (ARGUED), Thomas. W. Hussey, Esq., United States Department of Justice, Office of Immigration Litigation, P.O. Box 878, Ben Franklin Station, Washington, DC 20044, Counsel for Respondent
OPINIONThis appeal requires us to determine whether Wilson Emilio Peguero Mateo's conspiracy plea for Robbery of a Motor Vehicle under Pennsylvania law qualifies as a “crime of violence” under 18 U.S.C. § 16(b), as incorporated into 8 U.S.C. § 1101(a)(43)(F) of the Immigration and Nationality Act (“INA”). In light of the Supreme Court's decision in Johnson v. United States, ––– U.S. ––––, 135 S.Ct

In [12]:
a = np.array([1,2,3,4,5,6,5,4,3,2,1])

for i in np.argsort(a)[-3:][::-1]: 
    print i


5
6
4


In [9]:
nlp = spacy.load('en')

# nlp = en_core_web_sm.load()
document = data[0]
document = nlp(document)


In [17]:
print (dir(document))



['__bytes__', '__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__ne__', '__new__', '__pyx_vtable__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__unicode__', '_py_tokens', '_realloc', '_vector', '_vector_norm', 'count_by', 'doc', 'ents', 'from_array', 'from_bytes', 'has_vector', 'is_parsed', 'is_tagged', 'mem', 'merge', 'noun_chunks', 'noun_chunks_iterator', 'print_tree', 'read_bytes', 'sentiment', 'sents', 'similarity', 'string', 'tensor', 'text', 'text_with_ws', 'to_array', 'to_bytes', 'user_data', 'user_hooks', 'user_span_hooks', 'user_token_hooks', 'vector', 'vector_norm', 'vocab']


In [18]:
# list(document.sents)
all_tags = {w.pos: w.pos_ for w in document}
all_tags
# all tags of first sentence of our document 
# for word in list(document.sents)[0]:  
#     print word, word.tag_

{82: 'ADJ',
 83: 'ADP',
 84: 'ADV',
 87: 'CCONJ',
 88: 'DET',
 89: 'INTJ',
 90: 'NOUN',
 91: 'NUM',
 92: 'PART',
 93: 'PRON',
 94: 'PROPN',
 95: 'PUNCT',
 97: 'SYM',
 98: 'VERB',
 99: 'X',
 101: 'SPACE'}

## top unigrams of the document.

In [19]:
#define some parameters  
noisy_pos_tags = ['PROP']
min_token_length = 2

#Function to check if the token is a noise or not  
def isNoise(token):     
    is_noise = False
    if token.pos_ in noisy_pos_tags:
        is_noise = True 
    elif token.is_stop == True:
        is_noise = True
    elif len(token.string) <= min_token_length:
        is_noise = True
    return is_noise 
def cleanup(token, lower = True):
    if lower:
        token = token.lower()
    return token.strip()

# top unigrams used in the reviews 
from collections import Counter
cleaned_list = [cleanup(word.string) for word in document if not isNoise(word)]
Counter(cleaned_list) .most_common(5)


[('member', 35),
 ('bia', 24),
 ('petitioners', 17),
 ('single', 15),
 ('review', 14)]

## Entity Detection

In [27]:
labels = set([w.label_ for w in document.ents]) 
for label in labels: 
    entities = [cleanup(e.string, lower=False) for e in document.ents if label==e.label_] 
    entities = list(set(entities)) 
    print label,entities

ORDINAL [u'7th', u'first', u'First']
NORP [u'Respondent']
GPE [u'the United States', u'\xa7 1003.1(e', u'Philippines', u'Washington', u'Garcia-Flores', u'a United States', u'Plews', u'South Bend', u'Shadley']
PERCENT [u'8 U.S.C. \xa7', u'8 C.F.R. \xa7']
PERSON [u'MANION', u'Cainto', u'Eric H. HOLDER', u'Elizabeth P. WARD', u'I. BACKGROUNDPetitioner Ward', u'Martinez-Camargo', u'\u201d Martinez-Camargo v.', u'Petitioners', u'Timothy G. Hayes', u'Garcia', u'\u201d Garcia-Flores', u'Patel', u'Ward', u'Daniel P. Cory', u'e)(5']
CARDINAL [u'two', u'\xa7 1003.1(e', u'one', u'2010.II', u'491', u'692', u'691', u'three', u'8', u'328', u'329', u'568', u'469', u'563', u'565', u'1003.1(e)(5', u'487', u'10', u'282', u'962', u'17', u'1003.1(e', u'more than a cookie-cutter', u'683']
DATE [u'January 21, 2011', u'March 2004', u'November 2006', u'December 2008', u'2002', u'1252', u'March 31', u'May 2004', u'March 31, 2010', u'under 8', u'April 2007']
ORG [u'HAMILTON, Circuit Judges', u'Title', u'Martine

In [20]:
document

United States Court of Appeals,Seventh Circuit.
Elizabeth P. WARD, individually and as parent and natural guardian to Estefanie Ebrada Cainto, a minor child, Petitioners, v. Eric H. HOLDER, Jr., Attorney General of the United States, Respondent.
No. 10-2063.
Decided: January 21, 2011

Before BAUER, MANION and HAMILTON, Circuit Judges.Daniel P. Cory, Plews, Shadley, Racher & Braun, South Bend, IN, for Petitioners. Timothy G. Hayes, Department of Justice, Washington, DC, for Respondent.
Petitioners seek review of a decision of the Board of Immigration Appeals (the “BIA”) upholding an immigration judge's finding that they are removable from the United States under 8 U.S.C. § 1227(a)(1) and ineligible for cancellation of removal under 8 U.S.C. § 1154(a)(1)(A)(iii). Petitioners contend that a single member of the BIA issued an improper written opinion that went beyond the scope of an individual BIA member's power to affirm, modify, or remand an immigration judge's decision in a “brief order

In [20]:
from pyparsing import *
 
# raw = """Indemnified Capital Investments, S.A. v. R.J. O'Brien & Assoc., Inc., 
# 12 F.3d 1406, 1409 (7th Cir.1993). The New Jersey Superior Court's Appellate 
# Division affirmed the dismissal of Dale's common-law claim, but otherwise 
# reversed and remanded for further proceedings, 308 N. J. Super. 516, blah 70 A. 2d 270 (1998). 
# See also Warth v. Seldin, 422 U.S. 490, 499 n. 10, 95 S.Ct. 2197, 2205 n. 10, 45 L.Ed.2d 343 (1975).
# NFMA, NEPA, or MUSYA. Sierra Club v. Marita, 843 F.Supp. 1526 (E.D.Wis.1994) ("Nicolet ")."""

raw = document
    

# some basic things to match for
integer = Word(nums)
ordinal = Combine(integer + oneOf("d nd st rd th"))
ndot = Literal("n.")
print ndot
 
# need to forward declare citation, since we will also refer to it
# within its own definition (as negative lookahead when parsing
# page numbers)
citation = Forward()
 
# source_abbrev is very tricky, have to incorporate negative lookaheads
# to avoid reading non page numbers as page numbers
source_abbrev = originalTextFor(
    OneOrMore(ordinal | 
              (~integer + Word(alphanums+'.')))
    )
 
# use the first expression for pages if you want to iterate over each 
# separate page in the list
pages = Group(delimitedList((Group(integer + ndot + integer) | ~citation + integer)))
# use the second expression for pages if you just want the list of pages
# as a string
pages = originalTextFor(delimitedList((Group(integer + ndot + integer) | ~citation + integer)))
 
citation << integer("volume") + source_abbrev("abbrev") + pages("pages")
 
for cit in citation.searchString(raw):
    print cit.dump()
    # can also reference individual fields as cit.volume, cit.pages, etc.
    print
    
    
# GOAL: Patel v. Holder, 563 F.3d 565, 568 (7th Cir.2009)
# CURRENTLY: ['563', 'F.3d', '565, 568']

"n."
['1252', '. The written order issued by the BIA on March', '31']
- abbrev: . The written order issued by the BIA on March
- pages: 31
- volume: 1252

['2010', 'constitutes a final order of removal under', '8 ']
- abbrev: constitutes a final order of removal under
- pages: 8 
- volume: 2010

['2004', '. The marriage dissolved shortly thereafter and Ward never applied for permanent resident status based upon her marriage. In November', '2006']
- abbrev: . The marriage dissolved shortly thereafter and Ward never applied for permanent resident status based upon her marriage. In November
- pages: 2006
- volume: 2004

['2007', '. In December', '2008']
- abbrev: . In December
- pages: 2008
- volume: 2007

['563', 'F.3d', '565, 568']
- abbrev: F.3d
- pages: 565, 568
- volume: 563

['282', 'F.3d', '487, 491']
- abbrev: F.3d
- pages: 487, 491
- volume: 282

['282', 'F.3d at', '491']
- abbrev: F.3d at
- pages: 491
- volume: 282

['1003', '.1 of Title', '8 ']
- abbrev: .1 of Title
- pages: 8 

In [14]:
document

United States Court of Appeals,Seventh Circuit.
Elizabeth P. WARD, individually and as parent and natural guardian to Estefanie Ebrada Cainto, a minor child, Petitioners, v. Eric H. HOLDER, Jr., Attorney General of the United States, Respondent.
No. 10-2063.
Decided: January 21, 2011

Before BAUER, MANION and HAMILTON, Circuit Judges.Daniel P. Cory, Plews, Shadley, Racher & Braun, South Bend, IN, for Petitioners. Timothy G. Hayes, Department of Justice, Washington, DC, for Respondent.
Petitioners seek review of a decision of the Board of Immigration Appeals (the “BIA”) upholding an immigration judge's finding that they are removable from the United States under 8 U.S.C. § 1227(a)(1) and ineligible for cancellation of removal under 8 U.S.C. § 1154(a)(1)(A)(iii). Petitioners contend that a single member of the BIA issued an improper written opinion that went beyond the scope of an individual BIA member's power to affirm, modify, or remand an immigration judge's decision in a “brief order

## Sentiment Analysis

In [17]:
from textblob import TextBlob
str_= str(document.text.encode("ascii", "ignore").replace("\n", ''))

opinion = TextBlob(str_)
print opinion.sentiment

from textblob.sentiments import NaiveBayesAnalyzer
opinion = TextBlob(str_, analyzer=NaiveBayesAnalyzer())
print opinion.sentiment



Sentiment(polarity=0.033440583440583434, subjectivity=0.4278635778635779)
Sentiment(classification='pos', p_pos=1.0, p_neg=4.326180711426607e-42)


## Testing possible Citation Extractor

In [12]:
class Citation(object):
    '''Convenience class which represents a single citation found in a document.
    
    '''
    def __init__(self, reporter, page, volume):
        self.reporter = reporter
        self.volume = volume
        self.page = page
        self.extra = None
        self.defendant = None
        self.plaintiff = None
        self.court = None
        self.year = None
        self.match_url = None

    def base_citation(self):
        return u"%d %s %d" % (self.volume, self.reporter, self.page)

    def as_regex(self):
        return r"%d(\s+)%s(\s+)%d" % (self.volume, self.reporter, self.page)

    # TODO: Update css for no-link citations
#     def as_html(self):
#         template = u'<span class="volume">%(volume)d</span>\\1' \
#             u'<span class="reporter">%(reporter)s</span>\\2' \
#             u'<span class="page">%(page)d</span>'
#         inner_html = template % self.__dict__
#         span_class = "citation"
#         if self.match_url:
#             inner_html = u'<a href="%s">' % self.match_url + inner_html + u'</a>'
#         else:
#             span_class += " no-link"
#         return u'<span class="%s">%s</span>' % (span_class, inner_html)

    def __repr__(self):
        print_string = self.base_citation()
        if self.defendant:
            print_string = u' '.join([self.defendant, print_string])
            if self.plaintiff:
                print_string = u' '.join([self.plaintiff, 'v.', print_string])
        if self.extra:
            print_string = u' '.join([print_string, self.extra])
        if self.court and self.year:
            paren = u"(%s %d)" % (self.court, self.year)
        elif self.year:
            paren = u'(%d)' % self.year
        elif self.court:
            paren = u"(%s)" % self.court
        else:
            paren = ''
        print_string = u' '.join([print_string, paren])
        return print_string
    
# Adapted from nltk Penn Treebank tokenizer
def strip_punct(text):
    #starting quotes
    text = re.sub(r'^\"', r'', text)
    text = re.sub(r'(``)', r'', text)
    text = re.sub(r'([ (\[{<])"', r'', text)

    #punctuation
    text = re.sub(r'\.\.\.', r'', text)
    text = re.sub(r'[,;:@#$%&]', r'', text)
    text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1', text)
    text = re.sub(r'[?!]', r'', text)

    text = re.sub(r"([^'])' ", r"", text)

    #parens, brackets, etc.
    text = re.sub(r'[\]\[\(\)\{\}\<\>]', r'', text)
    text = re.sub(r'--', r'', text)

    #ending quotes
    text = re.sub(r'"', "", text)
    text = re.sub(r'(\S)(\'\')', r'', text)

    return text.strip()

def get_year(token):
    '''Given a string token, look for a valid 4-digit number at the start and 
    return its value.
    '''
    token = strip_punct(token)
    if not token.isdigit():
        # Sometimes funny stuff happens?
        token = re.sub(r'(\d{4}).*', r'\1', token)
        if not token.isdigit():
            return None
    if len(token) != 4:
        return None
    year = int(token)
    if year < 1754: # Earliest case in the database
        return None
    return year

def get_court(paren_string, year):
    if year is None:
        return strip_punct(paren_string)
    year_index = paren_string.find(str(year))
    return strip_punct(paren_string[:year_index])


In [13]:
import reporter_tokenizer

import os
import re
import sys

FORWARD_SEEK = 20

BACKWARD_SEEK = 70 # Average case name length in the db is 67

STOP_TOKENS = ['v', 're', 'parte', 'denied', 'citing', "aff'd", "affirmed",
               "remanded", "see", "granted", "dismissed"]

def add_defendant(citation, words, reporter_index):
    '''Scan backwards from 2 tokens before reporter until you find v., in re, etc.
    If no known stop-token is found, no defendant name is stored.  In the future, 
    this could be improved.'''
    start_index = None
    for index in range(reporter_index - 1, max(reporter_index - BACKWARD_SEEK, 0), -1):
        word = words[index]
        if word == ',':
            # Skip it
            continue
        if strip_punct(word).lower() in STOP_TOKENS:
            if word == 'v.':
                citation.plaintiff = words[index - 1]
            start_index = index + 1
            break
        if word.endswith(';'):
            # String citation
            break
    if start_index:
        citation.defendant = u' '.join(words[start_index:reporter_index - 1])


def add_post_citation(citation, words, reporter_index):
    '''Add to a citation object any additional information found after the base
    citation, including court, year, and possibly page range.

    Examples:
        Full citation: 123 U.S. 345 (1894)
        Post-citation info: year=1894

        Full citation: 123 F.2d 345, 347-348 (4th Cir. 1990)
        Post-citation info: year=1990, court="4th Cir.", extra (page range)="347-348"
    '''
    end_position = reporter_index + 2
    # Start looking 2 tokens after the reporter (1 after page)
    for start in range(reporter_index + 2, min(reporter_index + FORWARD_SEEK, len(words))):
        if words[start].startswith('('):
            for end in range(start, start + FORWARD_SEEK):
                if words[end].find(')') > -1:
                    # Sometimes the paren gets split from the preceding content
                    if words[end].startswith(')'):
                        citation.year = get_year(words[end - 1])
                    else:
                        citation.year = get_year(words[end])
                    citation.court = get_court(u' '.join(words[start:end + 1]), citation.year)
                    end_position = end
                    break
            if start > reporter_index + 2:
                # Then there's content between page and (), starting with a comma, which we skip
                citation.extra = u' '.join(words[reporter_index + 3:start])
            break
    return end_position

def extract_base_citation(words, reporter_index):
    '''Given a list of words and the index of a federal reporter, look before and after
    for volume and page number.  If found, construct and return a Citation object.'''
    reporter = words[reporter_index]
    # Get rid of extra space so that we only have one version to check
    if reporter == 'U. S.':
        reporter = 'U.S.'
    if words[reporter_index - 1].isdigit():
        volume = int(words[reporter_index - 1])
    else: # No volume, therefore not a valid citation
        return None
    page_str = words[reporter_index + 1]
    if page_str.find(',') == len(page_str) - 1:
        # Strip off ending comma, which occurs when there is a page range next
        page_str = page_str[:-1]
    if page_str.isdigit():
        page = int(page_str)
    else: # No page, therefore not a valid citation
        return None

    return Citation(reporter, page, volume)

def get_citations(text, html=True):
    if html:
        text = get_visible_text(text)
    words = reporter_tokenizer.tokenize(text)
    citations = []
    previous_end_position = 0
    # Exclude first and last tokens when looking for reporters, because valid
    # citations must have a volume before and a page number after the reporter.
    for i in range(1,len(words)-1):
        # Find reporter
        if words[i] in reporter_tokenizer.REPORTERS:
            citation = extract_base_citation(words, i)
            if citation is None:
                # Not a valid citation; continue looking
                continue
            end_position = add_post_citation(citation, words, i)
            add_defendant(citation, words, i)
            citations.append(citation)

            # Advance the counter; no need to re-check tokens in this citation
            i = end_position
            previous_end_position = end_position + 1

    return citations

get_citations(str(document), html=False)

[Patel v. Holder, 563 F.3d 565 (7th Cir.2009),
 Martinez-Camargo v. INS, 282 F.3d 487 (7th Cir.2002),
 Gutnik v. Gonzales, 469 F.3d 683 (7th Cir.2006),
 Gutnik v. Gonzales, 469 F.3d 683 (7th Cir.2006),
 Georgis v. Ashcroft, 328 F.3d 962 (7th Cir.2003.BAUER)]

In [48]:
type(document)

spacy.tokens.doc.Doc

In [73]:
import re

# re.findall()


# regex = #("(\\d+)\\s? C\\.?\\s?F\\.?\\s?R\\.?")
regex = r"(\\d+\\s? C\\.?\\s?F\\.?\\s?R\\.?), (?:[\\s,]+(?:§+|parts?))?, (?:[\\s,]+(?:§+|parts?))?" 

# Extra separators (section sign, part)
#         "\\s*(\\d+(?:(?:[-–—]\\d+)?[a-z]?", # Part number
#         "(?:\\.(?:13h[-–—]l|\\d+[-–—]?\\d*\\.5\\d|(?:\\d+T|T|\\d+[-–—]DD[-–—]|\\d+[-–—]WH[-–—]|\\d+[a-z]{1,2}\\d*[-–—])?\\d+)[a-z]{0,2}(?:(?:(?:\\([a-z\\d]{1,2}\\))*[-–—]\\d+)+[a-z]{0,2})?)?",  # Optionally: period and section number
#         "(?:(?:\\s*\\((?:[a-z\\d]{1,2}|[ixv]+)\\))+)?)?)"
        
        
re.findall(regex , str(document))


[]

In [41]:
words = """ Hey, how is\
 it going? """
    
print (words)

 Hey, how is it going? 
