# NLP

##### Author: Alex Sherman | alsherman@deloitte.com

In [17]:
import os
import configparser

# Load notebook settings (the database connection string) from the config file.
CONFIG_PATH = '../config.ini'

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed, so fail loudly here rather than with a bare
# KeyError: 'NLP' on the lookup below.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError('Could not read config file: {}'.format(CONFIG_PATH))
DB_PATH = config['NLP']['DB_PATH']

In [18]:
# Sanity check: display DB_PATH and confirm it points at the intended database.
# The SQLAlchemy engine created further down is built from this value, so the
# queries that follow will not work if it is wrong.
DB_PATH

'sqlite:///C:\\Users\\alsherman\\Desktop\\PycharmProjects\\firm_initiatives\\ml_guild\\raw_data\\annual_report.db'

In [38]:
# Load every Oracle row of the annual_report table into a DataFrame.

import pandas as pd
from sqlalchemy import create_engine

# Query kept in a named constant so the filter is easy to spot and change.
ORACLE_REPORTS_SQL = "SELECT * FROM annual_report WHERE COMPANY = 'oracle'"

engine = create_engine(DB_PATH)
df = pd.read_sql(ORACLE_REPORTS_SQL, con=engine)
df.head()

Unnamed: 0,annual_report_id,company,report_name,report_year,section_name,section_text,section_type
0,211,oracle,oracle-corporation_annual_report_1994.docx,1994,ORACLE SYSTEMS,,bold
1,212,oracle,oracle-corporation_annual_report_1994.docx,1994,FORM 10-K,(Annual Report) Filed 07/27/94 for the...,bold
2,213,oracle,oracle-corporation_annual_report_1994.docx,1994,SECURITIES AND EXCHANGE COMMISSION,"Washington, D.C. 20549",bold
3,214,oracle,oracle-corporation_annual_report_1994.docx,1994,Form 10-K [X] ANNUAL REPORT PURSUANT TO SECTIO...,,bold
4,215,oracle,oracle-corporation_annual_report_1994.docx,1994,"FOR THE FISCAL YEAR ENDED MAY 31, 1994",OR,bold


### Named Entity Recognition (NER)

In [44]:
import nltk 

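# One-time setup: uncomment the next line to open the NLTK downloader and fetch
# the tagger / chunker data that pos_tag and ne_chunk rely on.
# nltk.download()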

[('And', 'CC'),
 ('now', 'RB'),
 ('for', 'IN'),
 ('something', 'NN'),
 ('completely', 'RB'),
 ('different', 'JJ')]

In [None]:
# Part-of-speech tag a short example sentence; pos_tag takes a list of tokens.
html = 'And now for something completely different'
example_tokens = html.split()
nltk.pos_tag(example_tokens)

In [134]:
# Use the section stored under row label 2452 as the running example text.
text = df.loc[2452, 'section_text']
text

'The composition of the Board of Directors of the Company (“the Board”) as on March 31, 2011, was as under:  * Only the Audit Committee and Shareholders’ Grievances Committee are considered. All Directorships of Mr. William T Comfort, Jr., Mr. Frank Brienzi, Ms. Dorian Daley, Mr. William Corey West and Mr. Derek H Williams are in foreign companies. None of the directors are related inter se. 1   Mr. Frank Brienzi was appointed as a Director in the Annual General Meeting held on August 25, 2010. 2   Mr. Joseph John was appointed as a Director and Whole‑time Director in the Annual General Meeting held on August 25, 2010. He ceased to be a director with effect from March 31, 2011. 3  Mr. Chaitanya Kamat was appointed as an Additional Director and as the Managing Director and CEO with effect from October 25, 2010 subject to the approval of the members of the Company. 4  Mr. S Venkatachalam was appointed as an Additional Director with effect from October 25, 2010. 5   Mr. William Corey West

In [91]:
# Part-of-speech tag the example section.
# NOTE(review): splitting on whitespace leaves punctuation attached to tokens
# (e.g. 'Comfort,' and '2011,' in the output); a real tokenizer such as
# nltk.word_tokenize would separate it - consider switching.
section_tokens = text.split()
pos_tags = nltk.pos_tag(section_tokens)
pos_tags

[('The', 'DT'),
 ('composition', 'NN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('Board', 'NNP'),
 ('of', 'IN'),
 ('Directors', 'NNP'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('Company', 'NNP'),
 ('(“the', 'NNP'),
 ('Board”)', 'NNP'),
 ('as', 'IN'),
 ('on', 'IN'),
 ('March', 'NNP'),
 ('31,', 'CD'),
 ('2011,', 'CD'),
 ('was', 'VBD'),
 ('as', 'IN'),
 ('under:', 'JJ'),
 ('*', 'NNP'),
 ('Only', 'RB'),
 ('the', 'DT'),
 ('Audit', 'NNP'),
 ('Committee', 'NNP'),
 ('and', 'CC'),
 ('Shareholders’', 'NNP'),
 ('Grievances', 'NNP'),
 ('Committee', 'NNP'),
 ('are', 'VBP'),
 ('considered.', 'VBG'),
 ('All', 'NNP'),
 ('Directorships', 'NNP'),
 ('of', 'IN'),
 ('Mr.', 'NNP'),
 ('William', 'NNP'),
 ('T', 'NNP'),
 ('Comfort,', 'NNP'),
 ('Jr.,', 'NNP'),
 ('Mr.', 'NNP'),
 ('Frank', 'NNP'),
 ('Brienzi,', 'NNP'),
 ('Ms.', 'NNP'),
 ('Dorian', 'NNP'),
 ('Daley,', 'NNP'),
 ('Mr.', 'NNP'),
 ('William', 'NNP'),
 ('Corey', 'NNP'),
 ('West', 'NNP'),
 ('and', 'CC'),
 ('Mr.', 'NNP'),
 ('Derek', 'NNP'),
 ('H', 'NNP'),
 ('Williams', '

In [109]:
# Print only the chunks that the named-entity chunker labelled PERSON.
for chunk in nltk.ne_chunk(pos_tags):
    # Compare the chunk's label instead of searching its string form: a substring
    # test would also match a plain (word, tag) token, or a differently labelled
    # chunk, whose text merely contains the word "PERSON".
    if hasattr(chunk, 'label') and chunk.label() == 'PERSON':
        print(chunk)

(PERSON Mr./NNP William/NNP)
(PERSON Mr./NNP Frank/NNP)
(PERSON Dorian/NNP)
(PERSON Mr./NNP William/NNP Corey/NNP West/NNP)
(PERSON Mr./NNP Derek/NNP)
(PERSON Mr./NNP Frank/NNP Brienzi/NNP)
(PERSON Mr./NNP Joseph/NNP)
(PERSON John/NNP)
(PERSON Mr./NNP Chaitanya/NNP Kamat/NNP)
(PERSON Mr./NNP S/NNP Venkatachalam/NNP)
(PERSON Mr./NNP William/NNP Corey/NNP West/NNP)
(PERSON Mr./NNP Robert/NNP)
(PERSON Weiler/NNP)


#### Identify all sections that mention a named entity (the code below extracts ORGANIZATION chunks)

In [167]:
def ie_preprocess(document, entity_label='ORGANIZATION'):
    """Yield the named-entity chunks of ``document`` labelled ``entity_label``.

    The document is split on whitespace, part-of-speech tagged with
    ``nltk.pos_tag`` and chunked with ``nltk.ne_chunk``.

    Parameters
    ----------
    document : str
        Raw text to analyse. Anything that is not a non-blank string
        (``None``, NaN, ``''``, whitespace only) yields nothing.
    entity_label : str, optional
        Chunk label to keep, e.g. ``'ORGANIZATION'`` or ``'PERSON'``.
        Defaults to ``'ORGANIZATION'``.

    Yields
    ------
    The matching chunks, exactly as produced by ``nltk.ne_chunk``.
    """
    # Missing text (None / NaN from the database) has no .split(); blank text
    # has no tokens. Either way there is nothing to tag.
    if not isinstance(document, str) or not document.strip():
        return

    pos_tags = nltk.pos_tag(document.split())

    for chunk in nltk.ne_chunk(pos_tags):
        # Compare the chunk's label instead of searching its string form: a
        # substring test would also match a plain (word, tag) token, or a
        # differently labelled chunk, whose text contains the label word.
        if hasattr(chunk, 'label') and chunk.label() == entity_label:
            yield chunk

In [163]:
# Collect one [section_name, chunk] record per entity chunk found by ie_preprocess.
person_sections = []

for ind, row in df.iterrows():
    # Only scan the leading rows: stop at the first index label above 50.
    if ind > 50:
        break
    section_title = row['section_name']
    person_sections.extend(
        [section_title, str(chunk)]
        for chunk in ie_preprocess(row['section_text'])
    )

pd.DataFrame(person_sections, columns=['section_name', 'person'])

Unnamed: 0,section_name,person
0,FORM 10-K,(ORGANIZATION Period/NNP)
1,FORM 10-K,(ORGANIZATION ORACLE/NNP)
2,FORM 10-K,(ORGANIZATION SIC/NNP)
3,FORM 10-K,(ORGANIZATION EDGAR/NNP)
4,FORM 10-K,(ORGANIZATION EDGAR/NNP)
5,FORM 10-K,(ORGANIZATION CORP/NNP)
6,[ ] TRANSITION REPORT PURSUANT TO SECTION 13 O...,(ORGANIZATION COMMISSION/NNP)
7,[ ] TRANSITION REPORT PURSUANT TO SECTION 13 O...,(ORGANIZATION FILE/NNP)
8,Oracle Systems Corporation,(ORGANIZATION ORACLE/NNP)
9,SECURITIES REGISTERED PURSUANT TO SECTION 12(B...,(ORGANIZATION Securities/NNPS Exchange/NNP Act...


### Customizing the vectorizer classes

**preprocessor**: a callable that takes an entire document as input (as a single string), and returns a possibly transformed version of the document, still as an entire string. This can be used to remove HTML tags, lowercase the entire document, etc.

**tokenizer**: a callable that takes the output from the preprocessor and splits it into tokens, then returns a list of these.

In [74]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer.fit expects an iterable of documents and rejects a bare string,
# so wrap a single document in a list (an existing iterable is passed through).
documents = [text] if isinstance(text, str) else text

vect = CountVectorizer(max_features=5)
vect.fit(documents)
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 in favour
# of get_feature_names_out(); switch when upgrading.
print(vect.get_feature_names())

['and', 'cooperative', 'technology', 'the', 'three']


In [79]:
def pre(s):
    """Reduce ``s`` to the tokens that ``nltk.pos_tag`` tags as 'NNP'.

    The string is split on whitespace, tagged, and the words tagged 'NNP'
    (proper noun in the Penn Treebank tag set) are re-joined with single
    spaces, in their original order.
    """
    tagged_tokens = nltk.pos_tag(s.split())
    proper_nouns = [word for word, tag in tagged_tokens if tag == 'NNP']
    return ' '.join(proper_nouns)

# CountVectorizer.fit expects an iterable of documents and rejects a bare string,
# so wrap a single document in a list (an existing iterable is passed through).
documents = [text] if isinstance(text, str) else text

# A custom preprocessor replaces the vectorizer's built-in preprocessing step
# (including lowercasing), which is why the resulting features keep their case.
vect = CountVectorizer(max_features=5, preprocessor=pre)
vect.fit(documents)
print(vect.get_feature_names())

['Applications', 'Company', 'Cooperative', 'Development', 'End']


In [None]:
# Toy corpus: three short reviews that refer to the same concept in different words.
reviews = [
    'fast battery',
    'slow charger',
    'the charge',
]

def pre(s):
    """Map each whitespace-separated term of ``s`` to its canonical synonym.

    'charger' and 'charge' are both rewritten to 'battery'; every other term
    is kept as is. The terms are re-joined with single spaces.
    """
    synonyms = {'charger': 'battery', 'charge': 'battery'}
    normalized_terms = []
    for term in s.split():
        # Fall back to the term itself when it has no synonym entry.
        normalized_terms.append(synonyms.get(term, term))
    return ' '.join(normalized_terms)

# Fit on the toy reviews with the synonym-mapping preprocessor, then show the
# learned vocabulary. fit() returns the vectorizer itself, so the call is chained.
vect = CountVectorizer(preprocessor=pre, max_features=5).fit(reviews)
print(vect.get_feature_names())

In [None]:
# Open questions:
# - Who are the people being talked about?
# - Which other companies are being talked about?

In [None]:
# Quick check of dict.get with a fallback: 'charge' has an entry, so its
# synonym is returned rather than the default 's'.
synonyms = {
    'charger': 'battery',
    'charge': 'battery',
}
synonyms.get('charge', 's')

In [None]:
### SpaCy

Installation:
- conda install -c conda-forge spacy
- Download Microsoft Visual C++: http://landinghub.visualstudio.com/visual-cpp-build-tools
- 