# NLP basics

In [14]:
import nltk

In [876]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

### Tokenizing

In [15]:
from nltk.tokenize import sent_tokenize, word_tokenize


In [767]:
text = "This is a sentence. This is another sentence. Woo hoo!"

In [855]:
stoks = sent_tokenize(text)

In [856]:
stoks

['This is a sentence.', 'This is another sentence.', 'Woo hoo!']

In [None]:
### EXCERCISE 1: use the word_tokenizer to split the text into word tokens.

In [857]:
wtoks = word_tokenize(text)

In [858]:
wtoks

['This',
 'is',
 'a',
 'sentence',
 '.',
 'This',
 'is',
 'another',
 'sentence',
 '.',
 'Woo',
 'hoo',
 '!']

### Stripping

#### remove stopwords from a text

In [35]:
text = "This is a very normal sentence but it has a lot of stopwords in it."

In [37]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [38]:
stopwords = nltk.corpus.stopwords.words('english')

In [42]:
toks = word_tokenize(text)

In [45]:
[w for w in toks if w.lower() not in stopwords]

['normal', 'sentence', 'lot', 'stopwords', '.']

### Stemming

In [124]:
stemmer = nltk.PorterStemmer()

In [125]:
text = ["apples", "animals", "animation", "several"]

In [126]:
[stemmer.stem(w) for w in text]

['appl', 'anim', 'anim', 'sever']

In [889]:
### EXCERCISE 2: There's another stemmer called the LancasterStemmer. Try it out and compare the results.

### Lemmatization

In [31]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lon\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [32]:
lem = nltk.WordNetLemmatizer()

In [33]:
[lem.lemmatize(w) for w in text]

['apple', 'animal', 'animation', 'several']

### POS Tagging

In [11]:
from nltk.tag import pos_tag

In [12]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\gus\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [13]:
nltk.pos_tag(wtoks)

[('This', 'DT'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('sentence', 'NN'),
 ('.', '.'),
 ('This', 'DT'),
 ('is', 'VBZ'),
 ('another', 'DT'),
 ('sentence', 'NN'),
 ('.', '.'),
 ('Help', 'NN'),
 ('!', '.')]

### A reusable pre-processing function 

In [890]:
### Excercise 3: In the next examples, we're going to be doing a lot of initial preprocessing.
### Write a reusable function called "preprocess" that takes a document and returns a list of tokenized sentences
### Hint. First split it into sentences, then each sentence into words and then tag everything

In [40]:
### Preprocess a document for NLP.
### Takes a plain text document as input
### Returns a list of a list of POS-tagged tuples

def preprocess(document):
    sentences = sent_tokenize(document)
    sentences = [word_tokenize(sent) for sent in sentences]
    sentences = [pos_tag(sent) for sent in sentences]
    return sentences

### Noun-phrase chunking

In [894]:
text = """Software Development is such an important industry it is hard to imagine a world without it.
        Coal mining is also important, but it is usually a more dangerous job than software engineering."""

In [895]:
### user-defined regular expression to chunk noun-phrases

grammar = ("NP: {"           ### label: every noun-phrase that we detect will get this label
           "<DT>?"           ### zero or one determiners, 
           "<JJ>*"           ### followed by an optional number of adjectives, 
           "<NN.*>+"         ### followed by any type or number of nouns
           "}")
                                        
                                        

#### (helpful app to develop regex grammars)

In [896]:
nltk.app.chunkparser()

#### NP chunker

In [897]:
### takes a pre-processed text, and a grammar (string) and return a result with the noun phrases "chunked"
def np_chunker(text, grammar):
    parser = nltk.RegexpParser(grammar)
    pp_text = preprocess(text)
    result = []
    for sent in pp_text:
        result.append(parser.parse(sent))
    return result

In [898]:
chunked = np_chunker(text, grammar)

In [899]:
chunked

[Tree('S', [Tree('NP', [('Software', 'NNP'), ('Development', 'NNP')]), ('is', 'VBZ'), ('such', 'JJ'), Tree('NP', [('an', 'DT'), ('important', 'JJ'), ('industry', 'NN')]), ('it', 'PRP'), ('is', 'VBZ'), ('hard', 'JJ'), ('to', 'TO'), ('imagine', 'VB'), Tree('NP', [('a', 'DT'), ('world', 'NN')]), ('without', 'IN'), ('it', 'PRP'), ('.', '.')]),
 Tree('S', [Tree('NP', [('Coal', 'NN'), ('mining', 'NN')]), ('is', 'VBZ'), ('also', 'RB'), ('important', 'JJ'), (',', ','), ('but', 'CC'), ('it', 'PRP'), ('is', 'VBZ'), ('usually', 'RB'), ('a', 'DT'), ('more', 'RBR'), Tree('NP', [('dangerous', 'JJ'), ('job', 'NN')]), ('than', 'IN'), Tree('NP', [('software', 'NN'), ('engineering', 'NN')]), ('.', '.')])]

In [900]:
### now let's go through this result and print out only the noun phrase parts

for sentence in chunked:
    for n in sentence:
        if isinstance(n, nltk.tree.Tree):
            if n.label() == 'NP':
                print(n)
        

(NP Software/NNP Development/NNP)
(NP an/DT important/JJ industry/NN)
(NP a/DT world/NN)
(NP Coal/NN mining/NN)
(NP dangerous/JJ job/NN)
(NP software/NN engineering/NN)


### Named entity recognition

In [165]:
### in fact all the hard work has been done for us, with the standard ne_chunk function built into nltk

In [852]:
text = """ERNI Consulting AG is a technology consulting company.
        The Chief Executive is Daniel Leichti and the Managing Director of Switzerland is Christoph Aeschlimann."""

In [853]:
pp_text = preprocess(text)

In [854]:
for sentence in pp_text:
        print(nltk.ne_chunk(sentence))

(S
  (ORGANIZATION ERNI/NNP)
  Consulting/NNP
  AG/NNP
  is/VBZ
  a/DT
  technology/NN
  consulting/VBG
  company/NN
  ./.)
(S
  The/DT
  Chief/NNP
  Executive/NNP
  is/VBZ
  (PERSON Daniel/NNP Leichti/NNP)
  and/CC
  the/DT
  Managing/NNP
  Director/NNP
  of/IN
  (GPE Switzerland/NNP)
  is/VBZ
  (PERSON Christoph/NNP Aeschlimann/NNP)
  ./.)


In [179]:
print (entities.__repr__())    ### workaround if you get an error above because ghostscript is not installed

Tree('S', [('It', 'PRP'), ('was', 'VBD'), ('built', 'VBN'), ('in', 'IN'), ('honor', 'NN'), ('of', 'IN'), Tree('PERSON', [('George', 'NNP'), ('Washington', 'NNP')]), (',', ','), ('who', 'WP'), ('led', 'VBD'), ('the', 'DT'), ('country', 'NN'), ('to', 'TO'), ('independence', 'VB'), ('and', 'CC'), ('then', 'RB'), ('became', 'VBD'), ('its', 'PRP$'), ('first', 'JJ'), ('President', 'NNP'), ('.', '.')])


# TextRazor demo

An API key for up to 500 free API calls per day is available: https://www.textrazor.com

## Text Classification

In [810]:
from textrazor import TextRazor
from bs4 import BeautifulSoup
import requests

In [885]:
### get the wikipedia article on Donald Trump and use BeautifulSoup to extract only the text

url = 'https://en.wikipedia.org/wiki/Donald_Trump'
res = requests.get(url)
html = res.text
text = BeautifulSoup(html, 'lxml').get_text()

In [None]:
### create and initialise the TextRazor client

In [803]:
API_KEY = '5db1b63a67374cc2a133486d81d5bd0295fce7edd89b9040b777682e'

In [841]:
client = TextRazor(API_KEY, extractors=["topics", "categories", "relations"])

In [842]:
client.set_classifiers(['textrazor_newscodes'])

In [843]:
response = client.analyze(text[8736:16000])      # the main article starts at 8736 chars into the text

In [None]:
### assign the topics, categories and relations to variables for later use

In [844]:
topics = list(response.topics())

In [845]:
categories = list(response.categories())

In [846]:
relations = list(response.relations())

In [847]:
for topic in topics:
    if topic.score > 0.8:                     ### just the most relevant topics in the text
        print (topic.label)

Donald Trump
Donald Trump presidential campaign, 2016
Politics
United States
New York Military Academy
Politics of the United States
Government


In [877]:
for category in categories:
    if category.score > 0.5:
        print (category.label, category.score)

politics>election 0.7668
politics 0.7489
social issue>family 0.6464
politics>politics (general) 0.5866
politics>government 0.5528


## Relations detection

In [867]:
client = TextRazor(API_KEY, extractors=["words", "entities", "entailments", "relations"])

In [868]:
client.set_entity_freebase_type_filters(["/organization/organization"])
client.set_entity_dbpedia_type_filters(["Company"])

In [869]:
text = "Accenture to acquire OCTO Technology within 6 months."

In [870]:
response = client.analyze(text)

In [871]:
relations = response.relations()

In [872]:
### takes a list of words and returns the relations in the text
def get_relations(wordslist):
    relations_list = []
    for relation in relations:
        for word in relation.predicate_words:
            if word.lemma in wordslist:
                relations_list.append(relation)
                return relations_list

In [873]:
buy_relations = get_relations(["sell", "buy", "acquire"])

In [874]:
buy_relations

[TextRazor Relation at positions [TextRazor Word:"b'to'" at position 1, TextRazor Word:"b'acquire'" at position 2, TextRazor Word:"b'within'" at position 5]]

In [875]:
for relation in buy_relations:
    entity_params = []
    for param in relation.params:
        all_entities = list(param.entities())

        if all_entities:
            entity_params.append(all_entities[0])

    if len(entity_params) > 1:
        print ("Found valid sell relationship between: ", entity_params)

Found valid sell relationship between:  [TextRazor Entity b'Accenture' at positions [0], TextRazor Entity b'OCTO Technology' at positions [3, 4]]
