# NLP

##### Author: Alex Sherman | alsherman@deloitte.com

In [2]:
import os
import configparser

config = configparser.ConfigParser()
config.read('../../config.ini')
DB_PATH = config['NLP']['DB_PATH']

In [3]:
# confirm DB_PATH is correct db directory, otherwise the rest of the code will not work
DB_PATH

'sqlite:///C:\\Users\\alsherman\\Desktop\\PycharmProjects\\firm_initiatives\\ml_guild\\raw_data\\annual_report.db'

In [4]:
# read the oracle 10k documents 

import pandas as pd
from sqlalchemy import create_engine
engine = create_engine(DB_PATH)

df = pd.read_sql("SELECT * FROM annual_report WHERE COMPANY = 'oracle'", con=engine)
df.head()

Unnamed: 0,annual_report_id,company,report_name,report_year,section_name,section_text,section_type
0,211,oracle,oracle-corporation_annual_report_1994.docx,1994,ORACLE SYSTEMS,,bold
1,212,oracle,oracle-corporation_annual_report_1994.docx,1994,FORM 10-K,(Annual Report) Filed 07/27/94 for the...,bold
2,213,oracle,oracle-corporation_annual_report_1994.docx,1994,SECURITIES AND EXCHANGE COMMISSION,"Washington, D.C. 20549",bold
3,214,oracle,oracle-corporation_annual_report_1994.docx,1994,Form 10-K [X] ANNUAL REPORT PURSUANT TO SECTIO...,,bold
4,215,oracle,oracle-corporation_annual_report_1994.docx,1994,"FOR THE FISCAL YEAR ENDED MAY 31, 1994",OR,bold


In [20]:
df[df.section_text.str.contains('CEO')].section_name

2452          Board of Directors Composition and category
2454    Attendance of each Director at the Board Meeti...
2458    Brief resume of Directors proposed to be appoi...
2467                                  Compensation policy
2468    Details of remuneration paid to the Directors ...
2496                  Chaitanya Kamat\tMakarand  Padalkar
2594                                  Mr. Chaitanya Kamat
2596                                  Mr. Robert K Weiler
Name: section_name, dtype: object

In [34]:
# example text
text = df.section_text[2452]
text

'The composition of the Board of Directors of the Company (“the Board”) as on March 31, 2011, was as under:  * Only the Audit Committee and Shareholders’ Grievances Committee are considered. All Directorships of Mr. William T Comfort, Jr., Mr. Frank Brienzi, Ms. Dorian Daley, Mr. William Corey West and Mr. Derek H Williams are in foreign companies. None of the directors are related inter se. 1   Mr. Frank Brienzi was appointed as a Director in the Annual General Meeting held on August 25, 2010. 2   Mr. Joseph John was appointed as a Director and Whole‑time Director in the Annual General Meeting held on August 25, 2010. He ceased to be a director with effect from March 31, 2011. 3  Mr. Chaitanya Kamat was appointed as an Additional Director and as the Managing Director and CEO with effect from October 25, 2010 subject to the approval of the members of the Company. 4  Mr. S Venkatachalam was appointed as an Additional Director with effect from October 25, 2010. 5   Mr. William Corey West

### SpaCy

Installation:
- conda install -c conda-forge spacy
- Download Microsoft Visual C++: http://landinghub.visualstudio.com/visual-cpp-build-tools

In [23]:
import spacy
nlp = spacy.load('en')

In [24]:
doc = nlp(text)

In [35]:
from spacy import displacy
displacy.serve(doc, style='ent')


    Serving on port 5000...
    Using the 'ent' visualizer



127.0.0.1 - - [28/Nov/2017 00:23:50] "GET / HTTP/1.1" 200 15146



    Shutting down server on port 5000.



In [36]:
displacy.serve(doc, style='dep')

  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)



    Serving on port 5000...
    Using the 'dep' visualizer



127.0.0.1 - - [28/Nov/2017 00:25:48] "GET / HTTP/1.1" 200 154334



    Shutting down server on port 5000.



#### NER Named Entity Recognition

To access named entities, we iterate through doc.ents.

the entity label is available on the attribute label_

In [25]:
for ent in doc.ents:
    print(ent, ent.label_)

the Board of Directors of the Company (“the Board” ORG
March 31, 2011, DATE
  NORP
the Audit Committee ORG
Directorships ORG
William T Comfort PERSON
Jr. GPE
Frank Brienzi PERSON
Dorian Daley PERSON
William Corey West PERSON
Derek H Williams PERSON
1 CARDINAL
Frank Brienzi PERSON
the Annual General Meeting ORG
August 25, 2010 DATE
2 CARDINAL
Joseph John PERSON
the Annual General Meeting ORG
August 25, 2010 DATE
March 31, 2011 DATE
3 CARDINAL
Chaitanya Kamat PERSON
October 25, 2010 subject DATE
the Company ORG
4 CARDINAL
S Venkatachalam PERSON
October 25, 2010 DATE
5 CARDINAL
   ORG
William Corey West PERSON
the Annual General Meeting ORG
August 25, 2010 DATE
  NORP
Robert K Weiler PERSON
July 4, 2011 DATE


In [26]:
# observe the named entities tagged as PERSON
for ent in doc.ents:
    if 'PERSON' in ent.label_:
        print(ent, ent.label_)

William T Comfort PERSON
Frank Brienzi PERSON
Dorian Daley PERSON
William Corey West PERSON
Derek H Williams PERSON
Frank Brienzi PERSON
Joseph John PERSON
Chaitanya Kamat PERSON
S Venkatachalam PERSON
William Corey West PERSON
Robert K Weiler PERSON


In [29]:
# observe the named entities tagged as ORG (organization)
for ent in doc.ents:
    if 'ORG' in ent.label_:
        print(ent, ent.label_)

the Board of Directors of the Company (“the Board” ORG
the Audit Committee ORG
Directorships ORG
the Annual General Meeting ORG
the Annual General Meeting ORG
the Company ORG
   ORG
the Annual General Meeting ORG


In [33]:
for np in doc.noun_chunks:
    print(np)

TypeError: 'generator' object is not subscriptable

In [None]:
#### lemmatization

In [39]:
for token in doc[0:5]:
    print(token.text, token.lemma_)

The the
composition composition
of of
the the
Board board


In [41]:
# print all the words that are modified by lemmatization
for token in doc:
    if token.text.lower() != token.lemma_:
        print(token.text, token.lemma_)

“ "
” "
as a
was be
as a
are be
considered consider
are be
companies company
directors director
are be
related relate
was be
appointed appoint
as a
held hold
was be
appointed appoint
as a
held hold
He -PRON-
ceased cease
was be
appointed appoint
as a
an a
as a
members member
was be
appointed appoint
as a
an a
was be
appointed appoint
as a
held hold
was be
appointed appoint
as a
an a


In [42]:
for token in doc:
    print(token.text, token.pos_, token.tag_)

The DET DT
composition NOUN NN
of ADP IN
the DET DT
Board PROPN NNP
of ADP IN
Directors PROPN NNPS
of ADP IN
the DET DT
Company PROPN NNP
( PUNCT -LRB-
“ PUNCT ``
the DET DT
Board PROPN NNP
” PUNCT ''
) PUNCT -RRB-
as ADP IN
on ADP IN
March PROPN NNP
31 NUM CD
, PUNCT ,
2011 NUM CD
, PUNCT ,
was VERB VBD
as ADP IN
under ADP IN
: PUNCT :
  SPACE 
* PUNCT .
Only ADV RB
the DET DT
Audit PROPN NNP
Committee PROPN NNP
and CCONJ CC
Shareholders’ PROPN NNP
Grievances PROPN NNP
Committee PROPN NNP
are VERB VBP
considered VERB VBN
. PUNCT .
All DET DT
Directorships PROPN NNP
of ADP IN
Mr. PROPN NNP
William PROPN NNP
T PROPN NNP
Comfort PROPN NNP
, PUNCT ,
Jr. PROPN NNP
, PUNCT ,
Mr. PROPN NNP
Frank PROPN NNP
Brienzi PROPN NNP
, PUNCT ,
Ms. PROPN NNP
Dorian PROPN NNP
Daley PROPN NNP
, PUNCT ,
Mr. PROPN NNP
William PROPN NNP
Corey PROPN NNP
West PROPN NNP
and CCONJ CC
Mr. PROPN NNP
Derek PROPN NNP
H PROPN NNP
Williams PROPN NNP
are VERB VBP
in ADP IN
foreign ADJ JJ
companies NOUN NNS
. PUNCT .
No

In [43]:
for token in doc:
    print(token.text, token.dep_, token.shape_)

The det Xxx
composition nsubj xxxx
of prep xx
the det xxx
Board pobj Xxxxx
of prep xx
Directors pobj Xxxxx
of prep xx
the det xxx
Company pobj Xxxxx
( punct (
“ parataxis “
the det xxx
Board dobj Xxxxx
” punct ”
) punct )
as prep xx
on prep xx
March pobj Xxxxx
31 nummod dd
, punct ,
2011 nummod dddd
, punct ,
was ROOT xxx
as advmod xx
under acomp xxxx
: punct :
    
* punct *
Only advmod Xxxx
the det xxx
Audit compound Xxxxx
Committee nsubjpass Xxxxx
and cc xxx
Shareholders’ compound Xxxxx’
Grievances compound Xxxxx
Committee conj Xxxxx
are auxpass xxx
considered ROOT xxxx
. punct .
All det Xxx
Directorships nsubj Xxxxx
of prep xx
Mr. compound Xx.
William compound Xxxxx
T compound X
Comfort nmod Xxxxx
, punct ,
Jr. pobj Xx.
, punct ,
Mr. compound Xx.
Frank compound Xxxxx
Brienzi conj Xxxxx
, punct ,
Ms. compound Xx.
Dorian compound Xxxxx
Daley conj Xxxxx
, punct ,
Mr. compound Xx.
William compound Xxxxx
Corey compound Xxxxx
West conj Xxxx
and cc xxx
Mr. compound Xx.
Derek compound Xxxx

In [45]:
for token in doc:
    print(token.text, token.is_alpha, token.is_stop)

The True False
composition True False
of True True
the True True
Board True False
of True True
Directors True False
of True True
the True True
Company True False
( False False
“ False False
the True True
Board True False
” False False
) False False
as True True
on True True
March True False
31 False False
, False False
2011 False False
, False False
was True True
as True True
under True True
: False False
  False False
* False False
Only True False
the True True
Audit True False
Committee True False
and True True
Shareholders’ False False
Grievances True False
Committee True False
are True True
considered True False
. False False
All True False
Directorships True False
of True True
Mr. False False
William True False
T True False
Comfort True False
, False False
Jr. False False
, False False
Mr. False False
Frank True False
Brienzi True False
, False False
Ms. False False
Dorian True False
Daley True False
, False False
Mr. False False
William True False
Corey True False
West True False

## Exercise

#### identify all sections with a person

### Customizing the vector classes

**preprocessor**: a callable that takes an entire document as input (as a single string), and returns a possibly transformed version of the document, still as an entire string. This can be used to remove HTML tags, lowercase the entire document, etc.

**tokenizer**: a callable that takes the output from the preprocessor and splits it into tokens, then returns a list of these.

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(max_features=5)
vect.fit(text)
print(vect.get_feature_names())

ModuleNotFoundError: No module named 'sklearn'

In [None]:
def pre(s):
    return ' '.join([term[0] for term in nltk.pos_tag(s.split()) if term[1] == 'NNP'])

vect = CountVectorizer(max_features=5, preprocessor=pre)
vect.fit(text)
print(vect.get_feature_names())

In [None]:
reviews = ['fast battery', 'slow charger', 'the charge']

def pre(s):
    synonyms = {'charger':'battery','charge':'battery'}
    new_sentence = [synonyms.get(term, term) for term in s.split()]
    return ' '.join(new_sentence)

vect = CountVectorizer(max_features=5, preprocessor=pre)
vect.fit(reviews)
print(vect.get_feature_names())

In [None]:
# who are the (people) being talked about
# other companies being talked about?

In [None]:
synonyms = {'charger':'battery','charge':'battery'}
synonyms.get('charge', 's')

William T Comfort PERSON
Frank Brienzi PERSON
Dorian Daley PERSON
William Corey West PERSON
Derek H Williams PERSON
Frank Brienzi PERSON
Joseph John PERSON
Chaitanya Kamat PERSON
S Venkatachalam PERSON
William Corey West PERSON
Robert K Weiler PERSON


### NLTK

In [None]:
import nltk 

# download nltk parsers
# nltk.download()

html = 'And now for something completely different'
nltk.pos_tag(html.split())

pos_tags = nltk.pos_tag(text.split())
pos_tags

for chunk in nltk.ne_chunk(pos_tags):
    if 'PERSON' in str(chunk):
        print(chunk)
        
def ie_preprocess(document):
    pos_tags = nltk.pos_tag(document.split())
    
    for chunk in nltk.ne_chunk(pos_tags):
        if 'ORGANIZATION' in str(chunk):
            yield chunk

person_sections = []

for ind, row in df.iterrows():
    if ind > 50:
        break
    for chunk in ie_preprocess(row['section_text']):         
        person_sections.append([row['section_name'], str(chunk)])

pd.DataFrame(person_sections, columns=['section_name','person'])