# Text Preprocessing

##### Author: Alex Sherman | alsherman@deloitte.com

In [16]:
import os
import configparser
from IPython.core.display import display, HTML

config = configparser.ConfigParser()
config.read('../../config.ini')
DB_PATH = config['NLP']['DB_PATH']

In [17]:
# confirm DB_PATH is correct db directory, otherwise the rest of the code will not work
DB_PATH

'sqlite:///C:\\Users\\alsherman\\Desktop\\PycharmProjects\\firm_initiatives\\ml_guild\\raw_data\\annual_report.db'

In [18]:
# read the oracle 10k documents 

import pandas as pd
from sqlalchemy import create_engine
engine = create_engine(DB_PATH)

df = pd.read_sql("SELECT * FROM annual_report WHERE COMPANY = 'oracle'", con=engine)
df.head(25)

Unnamed: 0,annual_report_id,company,report_name,report_year,section_name,section_text,section_type
0,211,oracle,oracle-corporation_annual_report_1994.docx,1994,ORACLE SYSTEMS,,bold
1,212,oracle,oracle-corporation_annual_report_1994.docx,1994,FORM 10-K,(Annual Report) Filed 07/27/94 for the...,bold
2,213,oracle,oracle-corporation_annual_report_1994.docx,1994,SECURITIES AND EXCHANGE COMMISSION,"Washington, D.C. 20549",bold
3,214,oracle,oracle-corporation_annual_report_1994.docx,1994,Form 10-K [X] ANNUAL REPORT PURSUANT TO SECTIO...,,bold
4,215,oracle,oracle-corporation_annual_report_1994.docx,1994,"FOR THE FISCAL YEAR ENDED MAY 31, 1994",OR,bold
5,216,oracle,oracle-corporation_annual_report_1994.docx,1994,[ ] TRANSITION REPORT PURSUANT TO SECTION 13 O...,COMMISSION FILE NUMBER 0-14376,heading
6,217,oracle,oracle-corporation_annual_report_1994.docx,1994,Oracle Systems Corporation,(Exact name of registrant as specified in its ...,bold
7,218,oracle,oracle-corporation_annual_report_1994.docx,1994,SECURITIES REGISTERED PURSUANT TO SECTION 12(B...,(Title of class) Indicate by check mark wheth...,heading
8,219,oracle,oracle-corporation_annual_report_1994.docx,1994,ORACLE SYSTEMS CORPORATION 1994 FORM 10-K ANNU...,PART I i,heading
9,220,oracle,oracle-corporation_annual_report_1994.docx,1994,PART I,,heading


In [19]:
df[df.section_text.str.contains('CEO')].section_name

2452          Board of Directors Composition and category
2454    Attendance of each Director at the Board Meeti...
2458    Brief resume of Directors proposed to be appoi...
2467                                  Compensation policy
2468    Details of remuneration paid to the Directors ...
2496                  Chaitanya Kamat\tMakarand  Padalkar
2594                                  Mr. Chaitanya Kamat
2596                                  Mr. Robert K Weiler
Name: section_name, dtype: object

In [20]:
# example text
text = df.section_text[2452]
text

'The composition of the Board of Directors of the Company (“the Board”) as on March 31, 2011, was as under:  * Only the Audit Committee and Shareholders’ Grievances Committee are considered. All Directorships of Mr. William T Comfort, Jr., Mr. Frank Brienzi, Ms. Dorian Daley, Mr. William Corey West and Mr. Derek H Williams are in foreign companies. None of the directors are related inter se. 1   Mr. Frank Brienzi was appointed as a Director in the Annual General Meeting held on August 25, 2010. 2   Mr. Joseph John was appointed as a Director and Whole‑time Director in the Annual General Meeting held on August 25, 2010. He ceased to be a director with effect from March 31, 2011. 3  Mr. Chaitanya Kamat was appointed as an Additional Director and as the Managing Director and CEO with effect from October 25, 2010 subject to the approval of the members of the Company. 4  Mr. S Venkatachalam was appointed as an Additional Director with effect from October 25, 2010. 5   Mr. William Corey West

### SpaCy

#### Installation:
- conda install -c conda-forge spacy
- Download Microsoft Visual C++: http://landinghub.visualstudio.com/visual-cpp-build-tools

spaCy is a free, open-source library for advanced Natural Language Processing (NLP) in Python.

If you're working with a lot of text, you'll eventually want to know more about it. For example, what's it about? What do the words mean in context? Who is doing what to whom? What companies and products are mentioned? Which texts are similar to each other?

spaCy is designed specifically for production use and helps you build applications that process and "understand" large volumes of text. It can be used to build information extraction or natural language understanding systems, or to pre-process text for deep learning.

spaCy is not research software. It's built on the latest research, but it's designed to get things done. This leads to fairly different design decisions than NLTK or CoreNLP, which were created as platforms for teaching and research. The main difference is that spaCy is integrated and opinionated. spaCy tries to avoid asking the user to choose between multiple algorithms that deliver equivalent functionality. Keeping the menu small lets spaCy deliver generally better performance and developer experience.

### SpaCy Features 

NAME |	DESCRIPTION |
:----- |:------|
Tokenization|Segmenting text into words, punctuations marks etc.|
Part-of-speech (POS) Tagging|Assigning word types to tokens, like verb or noun.|
Dependency Parsing|	Assigning syntactic dependency labels, describing the relations between individual tokens, like subject or object.|
Lemmatization|	Assigning the base forms of words. For example, the lemma of "was" is "be", and the lemma of "rats" is "rat".|
Sentence Boundary Detection (SBD)|	Finding and segmenting individual sentences.|
Named Entity Recognition (NER)|	Labelling named "real-world" objects, like persons, companies or locations.|
Similarity|	Comparing words, text spans and documents and how similar they are to each other.|
Text Classification|	Assigning categories or labels to a whole document, or parts of a document.|
Rule-based Matching|	Finding sequences of tokens based on their texts and linguistic annotations, similar to regular expressions.|
Training|	Updating and improving a statistical model's predictions.|
Serialization|	Saving objects to files or byte strings.|

SOURCE: https://spacy.io/usage/spacy-101

In [21]:
import spacy
from spacy import displacy

In [22]:
# read in a English language model
nlp = spacy.load('en')

In [23]:
# instantiate the document text
doc = nlp(text)

In [24]:
# view the text
doc

The composition of the Board of Directors of the Company (“the Board”) as on March 31, 2011, was as under:  * Only the Audit Committee and Shareholders’ Grievances Committee are considered. All Directorships of Mr. William T Comfort, Jr., Mr. Frank Brienzi, Ms. Dorian Daley, Mr. William Corey West and Mr. Derek H Williams are in foreign companies. None of the directors are related inter se. 1   Mr. Frank Brienzi was appointed as a Director in the Annual General Meeting held on August 25, 2010. 2   Mr. Joseph John was appointed as a Director and Whole‑time Director in the Annual General Meeting held on August 25, 2010. He ceased to be a director with effect from March 31, 2011. 3  Mr. Chaitanya Kamat was appointed as an Additional Director and as the Managing Director and CEO with effect from October 25, 2010 subject to the approval of the members of the Company. 4  Mr. S Venkatachalam was appointed as an Additional Director with effect from October 25, 2010. 5   Mr. William Corey West 

In [25]:
string_formatting_url = 'https://spacy.io/assets/img/pipeline.svg'
iframe = '<iframe src={} width=1000 height=200></iframe>'.format(string_formatting_url)
HTML(iframe)

### Tokenization

spaCy first tokenizes the text, i.e. segments it into words, punctuation and so on. This is done by applying rules specific to each language. For example, punctuation at the end of a sentence should be split off – whereas "U.K." should remain one token. 

In [26]:
string_formatting_url = 'https://spacy.io/assets/img/tokenization.svg'
iframe = '<iframe src={} width=650 height=400></iframe>'.format(string_formatting_url)
HTML(iframe)

### Part-of-speech (POS) Tagging

After tokenization, spaCy can parse and tag a given Doc. This is where the statistical model comes in, which enables spaCy to make a prediction of which tag or label most likely applies in this context. A model consists of binary data and is produced by showing a system enough examples for it to make predictions that generalise across the language – for example, a word following "the" in English is most likely a noun.

Annotation | Description
:----- |:------|
Text |The original word text|
Lemma |The base form of the word.|
POS |The simple part-of-speech tag.|
Tag |The detailed part-of-speech tag.|
Dep |Syntactic dependency, i.e. the relation between tokens.|
Shape |The word shape – capitalisation, punctuation, digits.|
Is Alpha |Is the token an alpha character?|
Is Stop |Is the token part of a stop list, i.e. the most common words of the language?|

In [91]:
print('{:13} | {:13} | {:8} | {:8} | {:11} | {:8} | {:8} | {:8} | '.format(
    'text', 'lemma_', 'pos_', 'tag_', 'dep_', 'shape_', 'is_alpha', 'is_stop'))
print('_'*100)

for token in doc:
    print('{:13} | {:13} | {:8} | {:8} | {:11} | {:8} | {:8} | {:8} |'.format(
          token.text, token.lemma_, token.pos_, token.tag_, token.dep_
        , token.shape_, token.is_alpha, token.is_stop))

text          | lemma_        | pos_     | tag_     | dep_        | shape_   | is_alpha | is_stop  | 
____________________________________________________________________________________________________
              |               | SPACE    |          |             |          |        0 |        0 |
The           | the           | DET      | DT       | det         | Xxx      |        1 |        0 |
Company       | company       | PROPN    | NNP      | compound    | Xxxxx    |        1 |        0 |
designs       | design        | VERB     | VBZ      | ROOT        | xxxx     |        1 |        0 |
,             | ,             | PUNCT    | ,        | punct       | ,        |        0 |        0 |
develops      | develop       | VERB     | VBZ      | conj        | xxxx     |        1 |        0 |
,             | ,             | PUNCT    | ,        | punct       | ,        |        0 |        0 |
markets       | market        | NOUN     | NNS      | conj        | xxxx     |        1 | 

,             | ,             | PUNCT    | ,        | punct       | ,        |        0 |        0 |
was           | be            | VERB     | VBD      | auxpass     | xxx      |        1 |        1 |
incorporated  | incorporate   | VERB     | VBN      | ROOT        | xxxx     |        1 |        0 |
in            | in            | ADP      | IN       | prep        | xx       |        1 |        1 |
June          | june          | PROPN    | NNP      | pobj        | Xxxx     |        1 |        0 |
1977          | 1977          | NUM      | CD       | nummod      | dddd     |        0 |        0 |
.             | .             | PUNCT    | .        | punct       | .        |        0 |        0 |
Unless        | unless        | ADP      | IN       | mark        | Xxxxx    |        1 |        0 |
the           | the           | DET      | DT       | det         | xxx      |        1 |        1 |
context       | context       | NOUN     | NN       | nsubj       | xxxx     |        1 |  

In [105]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

matched_sents = [] # collect data of matched sentences to be visualized

def collect_sents(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    span = doc[start : end] # matched span
    sent = span.sent # sentence containing matched span
    # append mock entity for match in displaCy style to matched_sents
    # get the match span by ofsetting the start and end of the span with the
    # start and end of the sentence in the doc
    match_ents = [{'start': span.start_char - sent.start_char, 
                   'end': span.end_char - sent.start_char,
                   'label': 'MATCH'}]
    matched_sents.append({'text': sent.text, 'ents': match_ents,'span':span })

pattern = [{'POS':'NOUN', 'OP':'+'},{'LOWER':'services'}]

#pattern = [{'TAG': 'VBN'},{'TAG':'IN','OP': '+'},{'TAG':'DT','OP': '+'},{'TAG': 'NNP', 'OP': '*'}]#,{},{'TAG': 'CD'}]#,{'TAG': 'CD', 'OP': '+'}]#{'POS': 'ADV', 'OP': '*'}]#,{'POS': 'ADJ'}]
matcher.add('FacebookIs', collect_sents, pattern) # add pattern

In [138]:
from spacy.matcher import Matcher
from collections import defaultdict

matcher = Matcher(nlp.vocab)
entities = defaultdict(int)

def collect_sents(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    span = doc[start : end] # matched span
    entities[span.text] += 1

pattern = [{'POS':'NOUN', 'OP':'+'},{'LOWER':'services'}]

matcher.add('entities', collect_sents, pattern) # add pattern

In [140]:
for section in text[0:30]:
    matcher(nlp(section)) # match on your text
entities

defaultdict(int,
            {'consulting services': 2,
             'customer support services': 2,
             'education services': 9,
             'integration services': 5,
             'maintenance services': 1,
             'multimedia services': 1,
             'support services': 10,
             'systems integration services': 5})

### Exercise 
get all sentences with word risk for topic analysis

In [236]:
matcher = Matcher(nlp.vocab)
pattern = [{'POS':'NOUN', 'OP':'+'},{'LOWER':'risk'}]
risks = defaultdict(int)

def collect_sents(matcher, doc, i, matches):
    print(matches)
    print()
    match_id, start, end = matches[i]
    span = doc[start : end] # matched span
    risks[span.text.lower()] += 1

matcher.add('risk', collect_sents, pattern) # add pattern

In [None]:
groupby_text = df.groupby('report_year')['section_text'].sum()
section_text_by_year = pd.DataFrame(groupby_text).reset_index()

years = {}
for ind, row in section_text_by_year.iterrows():
    print(ind)
    #if ind == 1:
    #    break
    year = row['report_year']
    doc = nlp(row['section_text'])
    matcher(doc) # match on your text
    years[year] = risks.copy()

0
[(14326900376835226264, 15001, 15003)]

1
[(14326900376835226264, 14312, 14314)]

2
[(14326900376835226264, 17458, 17460)]

3
[(14326900376835226264, 18108, 18110)]

4
[(14326900376835226264, 13084, 13087), (14326900376835226264, 13085, 13087), (14326900376835226264, 13093, 13096), (14326900376835226264, 13094, 13096), (14326900376835226264, 13162, 13164), (14326900376835226264, 13236, 13238), (14326900376835226264, 17443, 17445)]

[(14326900376835226264, 13084, 13087), (14326900376835226264, 13085, 13087), (14326900376835226264, 13093, 13096), (14326900376835226264, 13094, 13096), (14326900376835226264, 13162, 13164), (14326900376835226264, 13236, 13238), (14326900376835226264, 17443, 17445)]

[(14326900376835226264, 13084, 13087), (14326900376835226264, 13085, 13087), (14326900376835226264, 13093, 13096), (14326900376835226264, 13094, 13096), (14326900376835226264, 13162, 13164), (14326900376835226264, 13236, 13238), (14326900376835226264, 17443, 17445)]

[(14326900376835226264, 13

[(14326900376835226264, 18267, 18269), (14326900376835226264, 18323, 18325), (14326900376835226264, 18326, 18328), (14326900376835226264, 18334, 18336), (14326900376835226264, 19836, 19842), (14326900376835226264, 19837, 19842), (14326900376835226264, 19838, 19842), (14326900376835226264, 19839, 19842), (14326900376835226264, 19840, 19842), (14326900376835226264, 19870, 19875), (14326900376835226264, 19871, 19875), (14326900376835226264, 19872, 19875), (14326900376835226264, 19873, 19875), (14326900376835226264, 24383, 24385), (14326900376835226264, 25318, 25320), (14326900376835226264, 25374, 25376), (14326900376835226264, 25377, 25379), (14326900376835226264, 26088, 26094), (14326900376835226264, 26089, 26094), (14326900376835226264, 26090, 26094), (14326900376835226264, 26091, 26094), (14326900376835226264, 26092, 26094), (14326900376835226264, 26126, 26131), (14326900376835226264, 26127, 26131), (14326900376835226264, 26128, 26131), (14326900376835226264, 26129, 26131)]

[(14326900

[(14326900376835226264, 20333, 20335), (14326900376835226264, 20356, 20359), (14326900376835226264, 20357, 20359), (14326900376835226264, 20980, 20986), (14326900376835226264, 20981, 20986), (14326900376835226264, 20982, 20986), (14326900376835226264, 20983, 20986), (14326900376835226264, 20984, 20986), (14326900376835226264, 26289, 26291), (14326900376835226264, 27827, 27829), (14326900376835226264, 27883, 27885), (14326900376835226264, 27886, 27888), (14326900376835226264, 29128, 29131), (14326900376835226264, 29129, 29131), (14326900376835226264, 29305, 29311), (14326900376835226264, 29306, 29311), (14326900376835226264, 29307, 29311), (14326900376835226264, 29308, 29311), (14326900376835226264, 29309, 29311)]

[(14326900376835226264, 20333, 20335), (14326900376835226264, 20356, 20359), (14326900376835226264, 20357, 20359), (14326900376835226264, 20980, 20986), (14326900376835226264, 20981, 20986), (14326900376835226264, 20982, 20986), (14326900376835226264, 20983, 20986), (14326900

[(14326900376835226264, 23173, 23175), (14326900376835226264, 23895, 23900), (14326900376835226264, 23896, 23900), (14326900376835226264, 23897, 23900), (14326900376835226264, 23898, 23900), (14326900376835226264, 28332, 28334), (14326900376835226264, 30100, 30102), (14326900376835226264, 30156, 30158), (14326900376835226264, 30159, 30161), (14326900376835226264, 30162, 30164), (14326900376835226264, 30905, 30908), (14326900376835226264, 30906, 30908), (14326900376835226264, 31168, 31174), (14326900376835226264, 31169, 31174), (14326900376835226264, 31170, 31174), (14326900376835226264, 31171, 31174), (14326900376835226264, 31172, 31174)]

[(14326900376835226264, 23173, 23175), (14326900376835226264, 23895, 23900), (14326900376835226264, 23896, 23900), (14326900376835226264, 23897, 23900), (14326900376835226264, 23898, 23900), (14326900376835226264, 28332, 28334), (14326900376835226264, 30100, 30102), (14326900376835226264, 30156, 30158), (14326900376835226264, 30159, 30161), (14326900

[(14326900376835226264, 26826, 26828), (14326900376835226264, 27801, 27806), (14326900376835226264, 27802, 27806), (14326900376835226264, 27803, 27806), (14326900376835226264, 27804, 27806), (14326900376835226264, 34370, 34372), (14326900376835226264, 42924, 42926), (14326900376835226264, 42980, 42982), (14326900376835226264, 42983, 42985), (14326900376835226264, 42986, 42988), (14326900376835226264, 43478, 43481), (14326900376835226264, 43479, 43481), (14326900376835226264, 43716, 43721), (14326900376835226264, 43717, 43721), (14326900376835226264, 43718, 43721), (14326900376835226264, 43719, 43721)]

[(14326900376835226264, 26826, 26828), (14326900376835226264, 27801, 27806), (14326900376835226264, 27802, 27806), (14326900376835226264, 27803, 27806), (14326900376835226264, 27804, 27806), (14326900376835226264, 34370, 34372), (14326900376835226264, 42924, 42926), (14326900376835226264, 42980, 42982), (14326900376835226264, 42983, 42985), (14326900376835226264, 42986, 42988), (14326900

[(14326900376835226264, 4098, 4100), (14326900376835226264, 10193, 10195), (14326900376835226264, 29436, 29441), (14326900376835226264, 29437, 29441), (14326900376835226264, 29438, 29441), (14326900376835226264, 29439, 29441), (14326900376835226264, 35848, 35850)]

[(14326900376835226264, 4098, 4100), (14326900376835226264, 10193, 10195), (14326900376835226264, 29436, 29441), (14326900376835226264, 29437, 29441), (14326900376835226264, 29438, 29441), (14326900376835226264, 29439, 29441), (14326900376835226264, 35848, 35850)]

[(14326900376835226264, 4098, 4100), (14326900376835226264, 10193, 10195), (14326900376835226264, 29436, 29441), (14326900376835226264, 29437, 29441), (14326900376835226264, 29438, 29441), (14326900376835226264, 29439, 29441), (14326900376835226264, 35848, 35850)]

[(14326900376835226264, 4098, 4100), (14326900376835226264, 10193, 10195), (14326900376835226264, 29436, 29441), (14326900376835226264, 29437, 29441), (14326900376835226264, 29438, 29441), (143269003768

In [None]:
years

In [201]:
pd.DataFrame(years).T

Unnamed: 0,credit risk,currency exchange risk,currency risk,customer risk,default risk,enterprise risk,equity hedge minimizes currency risk,equity price risk,exchange risk,hedge minimizes currency risk,...,liquidity risk,litigation risk,market rate risk,market risk,minimizes currency risk,mitigates credit risk,price risk,rate risk,reinvestment risk,yen equity hedge minimizes currency risk
1994,15.0,,,,,,,,,,...,,,,,,,,,,
1995,15.0,,,,,,,,,,...,,,,,,,,,,
1996,15.0,,,,,,,,,,...,,,,,,,,,,
1997,15.0,,,,,,,,,,...,,,,,,,,,,
1998,15.0,,15.0,,,,,,,,...,,,15.0,,,,,30.0,15.0,
1999,15.0,,,,30.0,,,,,,...,,,,30.0,,,,,,
2000,15.0,,,,30.0,,,,,,...,,,,30.0,,,,,,
2001,15.0,,,,30.0,,,,,,...,,,,30.0,,,,,15.0,
2002,15.0,,60.0,,45.0,,60.0,,,60.0,...,,,,60.0,60.0,,,,,30.0
2003,15.0,,45.0,,15.0,,30.0,15.0,,30.0,...,,,,30.0,30.0,,15.0,15.0,,30.0


In [216]:
df[df['section_text'].str.contains('risk')]

Unnamed: 0,annual_report_id,company,report_name,report_year,section_name,section_text,section_type
28,239,oracle,oracle-corporation_annual_report_1994.docx,1994,Additional Customer Information,Revenues from international customers (includ...,heading
68,279,oracle,oracle-corporation_annual_report_1994.docx,1994,Concentration of Credit Risk,Financial instruments which potentially subje...,heading
124,443,oracle,oracle-corporation_annual_report_1995.docx,1995,Additional Customer Information,Revenues from international customers (includ...,bold
165,484,oracle,oracle-corporation_annual_report_1995.docx,1995,Concentration of Credit Risk,Financial instruments which potentially subje...,bold
203,522,oracle,oracle-corporation_annual_report_1996.docx,1996,FORWARD-LOOKING STATEMENTS,"In addition to historical information, this A...",heading
238,557,oracle,oracle-corporation_annual_report_1996.docx,1996,FACTORS THAT MAY AFFECT FUTURE RESULTS AND MAR...,The Company operates in a rapidly changing en...,heading
261,580,oracle,oracle-corporation_annual_report_1996.docx,1996,Concentration of Credit Risk,Financial instruments which potentially subje...,heading
292,1188,oracle,oracle-corporation_annual_report_1997.docx,1997,FORWARD-LOOKING STATEMENTS,"In addition to historical information, this A...",heading
325,1221,oracle,oracle-corporation_annual_report_1997.docx,1997,FACTORS THAT MAY AFFECT FUTURE RESULTS AND MAR...,The Company operates in a rapidly changing en...,heading
350,1246,oracle,oracle-corporation_annual_report_1997.docx,1997,Concentration of Credit Risk,Financial instruments which potentially subje...,bold


In [230]:
displacy.serve(matched_sents, style='ent', manual=True)

TypeError: tuple indices must be integers or slices, not str

In [188]:
displacy.serve(doc, style='dep')

  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)



    Serving on port 5000...
    Using the 'dep' visualizer



127.0.0.1 - - [25/Dec/2017 21:53:24] "GET / HTTP/1.1" 200 154334
127.0.0.1 - - [25/Dec/2017 21:53:24] "GET /favicon.ico HTTP/1.1" 200 154334



    Shutting down server on port 5000.



### Named Entity Recognition (NER)

A named entity is a "real-world object" that's assigned a name – for example, a person, a country, a product or a book title. spaCy can recognise various types of named entities in a document, by asking the model for a prediction. 

In [82]:
for ent in doc.ents:
    print('{:10} | {:50} '.format(ent.label_, ent.text))

ORG        | the Board of Directors of the Company (“the Board” 
DATE       | March 31, 2011,                                    
NORP       |                                                    
ORG        | the Audit Committee                                
ORG        | Directorships                                      
PERSON     | William T Comfort                                  
GPE        | Jr.                                                
PERSON     | Frank Brienzi                                      
PERSON     | Dorian Daley                                       
PERSON     | William Corey West                                 
PERSON     | Derek H Williams                                   
CARDINAL   | 1                                                  
PERSON     | Frank Brienzi                                      
ORG        | the Annual General Meeting                         
DATE       | August 25, 2010                                    
CARDINAL   | 2           

In [None]:
displacy.serve(doc, style='ent')


    Serving on port 5000...
    Using the 'ent' visualizer



In [83]:
# observe the named entities tagged as PERSON
for ent in doc.ents:
    if 'PERSON' in ent.label_:
        print(ent)

William T Comfort
Frank Brienzi
Dorian Daley
William Corey West
Derek H Williams
Frank Brienzi
Joseph John
Chaitanya Kamat
S Venkatachalam
William Corey West
Robert K Weiler


In [29]:
# observe the named entities tagged as ORG (organization)
for ent in doc.ents:
    if 'ORG' in ent.label_:
        print(ent, ent.label_)

the Board of Directors of the Company (“the Board” ORG
the Audit Committee ORG
Directorships ORG
the Annual General Meeting ORG
the Annual General Meeting ORG
the Company ORG
   ORG
the Annual General Meeting ORG


In [187]:
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_,
          chunk.root.head.text)

The composition composition nsubj was
the Board Board pobj of
Directors Directors pobj of
the Company Company pobj of
the Board Board dobj “
March March pobj on
Only the Audit Committee Committee nsubjpass considered
Shareholders’ Grievances Committee Committee conj Committee
All Directorships Directorships nsubj are
Mr. William T Comfort, Jr. Jr. pobj of
Mr. Frank Brienzi Brienzi conj Jr.
Ms. Dorian Daley Daley conj Brienzi
Mr. William Corey West West conj Daley
Mr. Derek H Williams Williams conj West
foreign companies companies pobj in
None None nsubjpass related
the directors directors pobj of
1   Mr. Frank Brienzi Brienzi nsubjpass appointed
a Director Director pobj as
the Annual General Meeting Meeting pobj in
August August pobj on
Mr. Joseph John John nsubjpass appointed
a Director Director pobj as
Whole‑time Director Director conj Director
the Annual General Meeting Meeting pobj in
August August pobj on
He He nsubj ceased
a director director attr be
effect effect pobj with
March

In [171]:
[(ent, ent.label_) for ent in doc.ents]

[(the Board of Directors of the Company (“the Board”, 'ORG'),
 (March 31, 2011,, 'DATE'),
 ( , 'NORP'),
 (the Audit Committee, 'ORG'),
 (Directorships, 'ORG'),
 (William T Comfort, 'PERSON'),
 (Jr., 'GPE'),
 (Frank Brienzi, 'PERSON'),
 (Dorian Daley, 'PERSON'),
 (William Corey West, 'PERSON'),
 (Derek H Williams, 'PERSON'),
 (1, 'CARDINAL'),
 (Frank Brienzi, 'PERSON'),
 (the Annual General Meeting, 'ORG'),
 (August 25, 2010, 'DATE'),
 (2, 'CARDINAL'),
 (Joseph John, 'PERSON'),
 (the Annual General Meeting, 'ORG'),
 (August 25, 2010, 'DATE'),
 (March 31, 2011, 'DATE'),
 (3, 'CARDINAL'),
 (Chaitanya Kamat, 'PERSON'),
 (October 25, 2010 subject, 'DATE'),
 (the Company, 'ORG'),
 (4, 'CARDINAL'),
 (S Venkatachalam, 'PERSON'),
 (October 25, 2010, 'DATE'),
 (5, 'CARDINAL'),
 (  , 'ORG'),
 (William Corey West, 'PERSON'),
 (the Annual General Meeting, 'ORG'),
 (August 25, 2010, 'DATE'),
 ( , 'NORP'),
 (Robert K Weiler, 'PERSON'),
 (July 4, 2011, 'DATE')]

In [182]:
for chunk in doc.noun_chunks:
    print(chunk.text, '|', chunk.root)

The composition | composition
the Board | Board
Directors | Directors
the Company | Company
the Board | Board
March | March
Only the Audit Committee | Committee
Shareholders’ Grievances Committee | Committee
All Directorships | Directorships
Mr. William T Comfort, Jr. | Jr.
Mr. Frank Brienzi | Brienzi
Ms. Dorian Daley | Daley
Mr. William Corey West | West
Mr. Derek H Williams | Williams
foreign companies | companies
None | None
the directors | directors
1   Mr. Frank Brienzi | Brienzi
a Director | Director
the Annual General Meeting | Meeting
August | August
Mr. Joseph John | John
a Director | Director
Whole‑time Director | Director
the Annual General Meeting | Meeting
August | August
He | He
a director | director
effect | effect
March | March
Mr. Chaitanya Kamat | Kamat
an Additional Director | Director
the Managing Director | Director
CEO | CEO
effect | effect
October | October
subject | subject
the approval | approval
the members | members
the Company | Company
4  Mr. S Venkatachala

### Text Similarity - move to other notebook !!!

spaCy is able to compare two objects, and make a prediction of how similar they are. Predicting similarity is useful for building recommendation systems or flagging duplicates. For example, you can suggest a user content that's similar to what they're currently looking at, or label a support ticket as a duplicate if it's very similar to an already existing one.

In [122]:
df.head()

Unnamed: 0,section1,section2,similarity
6493101,Item 7. Management’s Discussion and Analysis...,Business Overview|2006,1.0
5819843,Business Overview|2006,Item 7. Management’s Discussion and Analysis...,1.0
9968598,Item 9. Changes In and Disagreements with Ac...,Item 9. Changes In and Disagreements with Ac...,1.0
7244285,Item 1B. Unresolved Staff Comments|2008,Item 9B. Other Information|2008,1.0
7244210,Item 1B. Unresolved Staff Comments|2008,Item 4. Submission of Matters to a Vote of S...,1.0


In [126]:
for section in df['section_text']:
    doc = nlp(section)

    for ent in doc.ents:
        if 'PERSON' in ent.label_:
            print(ent, ent.label_)

        Filed 07/27/94 PERSON
ORACLE PKWY PERSON
EDGAR Online PERSON
EDGAR Online PERSON
SQL PERSON
Oracle PERSON
ORACLE PERSON
Oracle PERSON
Oracle PERSON
ORACLE PERSON
Oracle Transparent Gateways PERSON
Kit PERSON
ORACLE Parallel Server PERSON
ORACLE PERSON
Lawrence J. Ellison PERSON
James A. Abrahamson PERSON
Jeffrey O. Henley PERSON
Raymond J. Lane PERSON
Raymond L. Ocampo PERSON
Jr. PERSON
Thomas A. Williams PERSON
Ellison PERSON
Ellison PERSON
Ellison PERSON
Abrahamson PERSON
Abrahamson PERSON
Abrahamson PERSON
Henley PERSON
Henley PERSON
Lane PERSON
Lane PERSON
Lane PERSON
Lane PERSON
Ocampo PERSON
Ocampo PERSON
Williams PERSON
Corporate Controller PERSON
Williams PERSON
Common Stock PERSON
Quarterly PERSON
LICENSES PERSON
Quarter Ended ------------------------------------------------- PERSON
Common Stock PERSON
Common Stock PERSON
Common Stock PERSON
Common Stock PERSON
Common Stock PERSON
Common Stock PERSON
Common Stock PERSON
Lawrence J. Ellison PERSON
Common Stock PERSON
Ja

Lawrence J. Ellison PERSON
Jeffrey O. Henley PERSON
Barry Ariko PERSON
Americas Gary Bloom PERSON
David J. Roux PERSON
Daniel Cooperman PERSON
Thomas A. Williams PERSON
Ellison PERSON
Ellison PERSON
Ellison PERSON
Lane PERSON
Lane PERSON
Lane PERSON
Lane PERSON
Henley PERSON
Ariko PERSON
Ariko PERSON
Ariko PERSON
Bloom PERSON
the Massively Parallel Computing Division PERSON
Bloom PERSON
Roux PERSON
Roux PERSON
Shaw PERSON
 Shaw PERSON
Shaw PERSON
Cooperman PERSON
Cooperman PERSON
Cooperman PERSON
Williams PERSON
Corporate Controller PERSON
Williams PERSON
Common Stock PERSON
Quarterly PERSON
Navio PERSON
Common Stock PERSON
Common Stock PERSON
Common Stock PERSON
Common Stock PERSON
Common Stock PERSON
Common Stock PERSON
Common Stock PERSON
Common Stock PERSON
Common Stock PERSON
Navio PERSON
Common Stock PERSON
Gains PERSON
Common Stock PERSON
4.09(18) PERSON
10.05(9 PERSON
David Roux PERSON
File No PERSON
File No PERSON
File No PERSON
  PERSON
REVENUES Licenses PERSON
EXPENSE PERSON

Varasano PERSON
Booz Allen & Hamilton PERSON
Booz Allen & Hamilton's PERSON
Wohl PERSON
Wohl PERSON
Cooperman PERSON
Cooperman PERSON
Cooperman PERSON
Minton PERSON
Corporate Controller PERSON
Corporate Controller PERSON
Minton PERSON
Minton PERSON
Common Stock PERSON
Sale Price PERSON
Common Stock PERSON
Common PERSON
Quarterly PERSON
Licenses PERSON
Research PERSON
Navio PERSON
Navio PERSON
Navio PERSON
R&D. PERSON
Navio PERSON
Navio PERSON
Navio PERSON
Navio PERSON
Marketable   PERSON
Common Stock PERSON
Common Stock PERSON
Common Stock PERSON
Oracle Japan PERSON
Quarter Ended PERSON
Common Stock PERSON
Common Stock PERSON
Common Stock PERSON
Common Stock PERSON
Common Stock PERSON
Common Stock PERSON
Common Stock PERSON
Accounting Bulletin No PERSON
Common Stock PERSON
Common Stock PERSON
Common Stock PERSON
Common Stock PERSON
Common Stock PERSON
Common Stock PERSON
File No PERSON
File No PERSON
  PERSON
Common PERSON
REVENUES  OTHER INCOME PERSON
EXPENSE PERSON
Common PERSON
Comm

Jeffrey O. Henley PERSON
     Cash Flows PERSON
      PERSON
Returns PERSON
Quarterly PERSON
Be Sold PERSON
   PERSON
Yen PERSON
Yen PERSON
Yen PERSON
Yen PERSON
Yen PERSON
Yen PERSON
         Recorded PERSON
Advance Pricing Agreements PERSON
Rights PERSON
Oracle Outsourcing PERSON
         Margin PERSON
     Margin PERSON
Margin PERSON
     Margin PERSON
     Margin PERSON
     Margin PERSON
Margin PERSON
 Disclosed PERSON
defendants’ PERSON
defendants’ PERSON
J.D. Edwards PERSON
J.D. Edwards PERSON
J.D. Edwards PERSON
J.D. Edwards PERSON
J.D. Edwards PERSON
ORACLE CORPORATION PERSON
LAWRENCE J. ELLISON PERSON
Lawrence J. Ellison PERSON
 Pursuant PERSON
LAWRENCE J. ELLISON PERSON
Lawrence J. Ellison PERSON
JEFFREY O. PERSON
Jeffrey O. Henley PERSON
JENNIFER L. MINTON   PERSON
Jennifer L. Minton   PERSON
JEFFREY BERG PERSON
Jeffrey Berg PERSON
MICHAEL J. BOSKIN PERSON
Michael J. Boskin PERSON
SAFRA A. CATZ PERSON
Safra A. Catz PERSON
Garcia-Molina PERSON
JOSEPH A. GRUNDFEST PERSON
Jose

Oracle JDeveloper PERSON
Oracle WebCenter PERSON
EPPM PERSON
Enterprise Resource Planning PERSON
Linux Support PERSON
Enterprise Linux PERSON
Windows Server PERSON
Sun PERSON
Ingres PERSON
PostgreSQL PERSON
TIBCO Software PERSON
Lombardi Software PERSON
Ascential Software PERSON
Dynamics NAV PERSON
Dynamics Snap PERSON
Lawson Software PERSON
Baan PERSON
SSA Global Technologies PERSON
HP PERSON
Sun PERSON
Sybase PERSON
BMC Software PERSON
Quest Software PERSON
Be Sold PERSON
Jeffrey O. Henley PERSON
Charles E. Phillips PERSON
Jr. PERSON
Keith G. Block PERSON
Consulting Jeff Epstein PERSON
Loic Le Guisquet PERSON
Consulting Luiz Meisler PERSON
Charles A. Rozwat PERSON
Consulting Dorian E. Daley PERSON
William Corey West PERSON
Corporate Controller PERSON
Ellison PERSON
Henley PERSON
Catz PERSON
Phillips PERSON
Partnerships PERSON
Block PERSON
Block PERSON
Epstein PERSON
Epstein PERSON
Epstein PERSON
Epstein PERSON
Le Guisquet PERSON
Meisler PERSON
Rottler PERSON
Oracle Support PERSON
Roz

Epstein PERSON
Non-Disparagement PERSON
Epstein PERSON
Oracle PERSON
Epstein PERSON
Epstein PERSON
Epstein PERSON
Epstein PERSON
Fees PERSON
Modifications PERSON
Epstein PERSON
Epstein PERSON
Epstein PERSON
Oracle PERSON
Juana Schurman PERSON
Jeff Epstein PERSON
Jeff Epstein PERSON
Oracle America PERSON
Juana M. Schurman PERSON
Assoc PERSON
Oracle America PERSON
 PERSON
Common Stock PERSON
Indicate PERSON
Cloud PERSON
Oracle PERSON
Cloud PERSON
Cloud PERSON
HCM PERSON
Cloud PERSON
Oracle PERSON
Oracle Solaris PERSON
Oracle Linux PERSON
Oracle Cloud PERSON
Oracle Cloud PERSON
Oracle Cloud PERSON
Oracle Cloud PERSON
Oracle JDeveloper PERSON
Oracle Solaris PERSON
Oracle Exadata PERSON
Oracle Solaris PERSON
Oracle Linux PERSON
Middleware Software PERSON
Oracle TimesTen In-Memory PERSON
Oracle In-Memory Database Cache PERSON
Oracle Databases PERSON
Standard PERSON
Oracle Berkeley DB PERSON
Oracle NoSQL Database PERSON
Oracle PERSON
Development Tools PERSON
Application Server PERSON
Oracle 

Exercise Price PERSON
Entire Agreement PERSON
Grant PERSON
     Grant Date PERSON
  PERSON
ORACLE CORPORATION PERSON
mm PERSON
Common Stock PERSON
Rule 13d-3 PERSON
Entire Agreement PERSON
409A Disclaimer PERSON
Code Section 409A PERSON
Code Section 409A PERSON
LLP PERSON
Mark V. Hurd PERSON
Mark V. Hurd PERSON
Safra A. Catz PERSON
Safra A. Catz PERSON
Principal Executive PERSON


In [71]:
df['section_year'] = df[['section_name','report_year']].apply(lambda x: '|'.join(x.astype(str).values), axis=1)

In [72]:
docs = [(section['section_year'], nlp(section['section_text'])) for ind, section in df.iterrows()]

In [65]:
# Use cases:
#grant similarity
#complaint similarity

In [74]:
sections = []
for section1_name, doc1 in docs:
    for section2_name, doc2 in docs:
        if section1_name != section2_name:
            sections.append([section1_name, section2_name, doc1.similarity(doc2)])

df = pd.DataFrame(sections).sort_values(2, ascending=False)

In [108]:
df[df.section1 == 'PRODUCTS|2014']
#for s in df[df.section1.str.contains("Risk")].section1.values:
#    print(s)

Unnamed: 0,section1,section2,similarity


In [77]:
df.columns = ['section1','section2','similarity']
df[(df['similarity'] < .99) & (df['similarity'] > .8)]

Unnamed: 0,section1,section2,similarity
9685380,Hardware Systems Support|2011,Hardware Systems Support|2014,0.989997
12150311,Hardware Systems Support|2014,Hardware Systems Support|2011,0.989997
10634630,Our hardware systems revenues and profitabilit...,Item 1A. Risk Factors|2013,0.989994
11451369,Item 1A. Risk Factors|2013,Our hardware systems revenues and profitabilit...,0.989994
191941,Item 10. Directors and Executive Officers of t...,Item 10. Directors and Executive Officers of t...,0.989993
2141812,Item 10. Directors and Executive Officers of t...,Item 10. Directors and Executive Officers of t...,0.989993
12064758,FORM 10-K|2014,FORM 10-K|2007,0.989991
6325517,FORM 10-K|2007,FORM 10-K|2014,0.989991
12065212,FORM 10-K|2014,FORM 10-K|2009,0.989991
5650971,FORM 10-K|2006,FORM 10-K|2009,0.989991


In [33]:
text = product['section_text'].values[0]
doc1 = nlp(text)


In [34]:
text = product['section_text'].values[1]
doc2 = nlp(text)

In [35]:
doc1.similarity(doc2)

0.77266057396661558

In [32]:
#tokens = nlp(u'dog cat banana')

similarity = []
for token1 in doc:
    for token2 in doc:
        if token1.text != token2.text:
            similarity.append([token1, token2, token1.similarity(token2)])

In [33]:
for np in doc.noun_chunks:
    print(np)

TypeError: 'generator' object is not subscriptable

In [None]:
#### lemmatization

In [39]:
for token in doc[0:5]:
    print(token.text, token.lemma_)

The the
composition composition
of of
the the
Board board


In [41]:
# print all the words that are modified by lemmatization
for token in doc:
    if token.text.lower() != token.lemma_:
        print(token.text, token.lemma_)

“ "
” "
as a
was be
as a
are be
considered consider
are be
companies company
directors director
are be
related relate
was be
appointed appoint
as a
held hold
was be
appointed appoint
as a
held hold
He -PRON-
ceased cease
was be
appointed appoint
as a
an a
as a
members member
was be
appointed appoint
as a
an a
was be
appointed appoint
as a
held hold
was be
appointed appoint
as a
an a


In [42]:
for token in doc:
    print(token.text, token.pos_, token.tag_)

The DET DT
composition NOUN NN
of ADP IN
the DET DT
Board PROPN NNP
of ADP IN
Directors PROPN NNPS
of ADP IN
the DET DT
Company PROPN NNP
( PUNCT -LRB-
“ PUNCT ``
the DET DT
Board PROPN NNP
” PUNCT ''
) PUNCT -RRB-
as ADP IN
on ADP IN
March PROPN NNP
31 NUM CD
, PUNCT ,
2011 NUM CD
, PUNCT ,
was VERB VBD
as ADP IN
under ADP IN
: PUNCT :
  SPACE 
* PUNCT .
Only ADV RB
the DET DT
Audit PROPN NNP
Committee PROPN NNP
and CCONJ CC
Shareholders’ PROPN NNP
Grievances PROPN NNP
Committee PROPN NNP
are VERB VBP
considered VERB VBN
. PUNCT .
All DET DT
Directorships PROPN NNP
of ADP IN
Mr. PROPN NNP
William PROPN NNP
T PROPN NNP
Comfort PROPN NNP
, PUNCT ,
Jr. PROPN NNP
, PUNCT ,
Mr. PROPN NNP
Frank PROPN NNP
Brienzi PROPN NNP
, PUNCT ,
Ms. PROPN NNP
Dorian PROPN NNP
Daley PROPN NNP
, PUNCT ,
Mr. PROPN NNP
William PROPN NNP
Corey PROPN NNP
West PROPN NNP
and CCONJ CC
Mr. PROPN NNP
Derek PROPN NNP
H PROPN NNP
Williams PROPN NNP
are VERB VBP
in ADP IN
foreign ADJ JJ
companies NOUN NNS
. PUNCT .
No

In [43]:
for token in doc:
    print(token.text, token.dep_, token.shape_)

The det Xxx
composition nsubj xxxx
of prep xx
the det xxx
Board pobj Xxxxx
of prep xx
Directors pobj Xxxxx
of prep xx
the det xxx
Company pobj Xxxxx
( punct (
“ parataxis “
the det xxx
Board dobj Xxxxx
” punct ”
) punct )
as prep xx
on prep xx
March pobj Xxxxx
31 nummod dd
, punct ,
2011 nummod dddd
, punct ,
was ROOT xxx
as advmod xx
under acomp xxxx
: punct :
    
* punct *
Only advmod Xxxx
the det xxx
Audit compound Xxxxx
Committee nsubjpass Xxxxx
and cc xxx
Shareholders’ compound Xxxxx’
Grievances compound Xxxxx
Committee conj Xxxxx
are auxpass xxx
considered ROOT xxxx
. punct .
All det Xxx
Directorships nsubj Xxxxx
of prep xx
Mr. compound Xx.
William compound Xxxxx
T compound X
Comfort nmod Xxxxx
, punct ,
Jr. pobj Xx.
, punct ,
Mr. compound Xx.
Frank compound Xxxxx
Brienzi conj Xxxxx
, punct ,
Ms. compound Xx.
Dorian compound Xxxxx
Daley conj Xxxxx
, punct ,
Mr. compound Xx.
William compound Xxxxx
Corey compound Xxxxx
West conj Xxxx
and cc xxx
Mr. compound Xx.
Derek compound Xxxx

In [45]:
for token in doc:
    print(token.text, token.is_alpha, token.is_stop)

The True False
composition True False
of True True
the True True
Board True False
of True True
Directors True False
of True True
the True True
Company True False
( False False
“ False False
the True True
Board True False
” False False
) False False
as True True
on True True
March True False
31 False False
, False False
2011 False False
, False False
was True True
as True True
under True True
: False False
  False False
* False False
Only True False
the True True
Audit True False
Committee True False
and True True
Shareholders’ False False
Grievances True False
Committee True False
are True True
considered True False
. False False
All True False
Directorships True False
of True True
Mr. False False
William True False
T True False
Comfort True False
, False False
Jr. False False
, False False
Mr. False False
Frank True False
Brienzi True False
, False False
Ms. False False
Dorian True False
Daley True False
, False False
Mr. False False
William True False
Corey True False
West True False

## Exercise

#### identify all sections with a person

### Customizing the vector classes

**preprocessor**: a callable that takes an entire document as input (as a single string), and returns a possibly transformed version of the document, still as an entire string. This can be used to remove HTML tags, lowercase the entire document, etc.

**tokenizer**: a callable that takes the output from the preprocessor and splits it into tokens, then returns a list of these.

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(max_features=5)
vect.fit(text)
print(vect.get_feature_names())

ModuleNotFoundError: No module named 'sklearn'

In [None]:
def pre(s):
    return ' '.join([term[0] for term in nltk.pos_tag(s.split()) if term[1] == 'NNP'])

vect = CountVectorizer(max_features=5, preprocessor=pre)
vect.fit(text)
print(vect.get_feature_names())

In [None]:
reviews = ['fast battery', 'slow charger', 'the charge']

def pre(s):
    synonyms = {'charger':'battery','charge':'battery'}
    new_sentence = [synonyms.get(term, term) for term in s.split()]
    return ' '.join(new_sentence)

vect = CountVectorizer(max_features=5, preprocessor=pre)
vect.fit(reviews)
print(vect.get_feature_names())

In [None]:
# who are the (people) being talked about
# other companies being talked about?

In [None]:
synonyms = {'charger':'battery','charge':'battery'}
synonyms.get('charge', 's')

William T Comfort PERSON
Frank Brienzi PERSON
Dorian Daley PERSON
William Corey West PERSON
Derek H Williams PERSON
Frank Brienzi PERSON
Joseph John PERSON
Chaitanya Kamat PERSON
S Venkatachalam PERSON
William Corey West PERSON
Robert K Weiler PERSON


### NLTK

In [None]:
import nltk 

# download nltk parsers
# nltk.download()

html = 'And now for something completely different'
nltk.pos_tag(html.split())

pos_tags = nltk.pos_tag(text.split())
pos_tags

for chunk in nltk.ne_chunk(pos_tags):
    if 'PERSON' in str(chunk):
        print(chunk)
        
def ie_preprocess(document):
    pos_tags = nltk.pos_tag(document.split())
    
    for chunk in nltk.ne_chunk(pos_tags):
        if 'ORGANIZATION' in str(chunk):
            yield chunk

person_sections = []

for ind, row in df.iterrows():
    if ind > 50:
        break
    for chunk in ie_preprocess(row['section_text']):         
        person_sections.append([row['section_name'], str(chunk)])

pd.DataFrame(person_sections, columns=['section_name','person'])