# NLP basics

In [None]:
import nltk

### Downloading the NLTK resources and models
The NLTK toolkit contains a lot of corpora (texts) which are useful for different tasks. It also contains all the pre-trained processing algorithms. If you are planning on working with it a lot and have enough hard drive space, then the best thing is simply to download everything from the downloader. The downloader is launched as follows:

In [None]:
nltk.download()

Important note: you must close the downloader window when you are finished, otherwise it will block the notebook from executing any further commands.

### Tokenizing

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize


In [None]:
text = "This is a sentence. This is another sentence. Woo hoo!"

In [None]:
stoks = sent_tokenize(text)

In [None]:
stoks

### EXERCISE 1: use word_tokenize to split the text into word tokens.

In [None]:
wtoks = word_tokenize(text)

In [None]:
wtoks

### Stripping

#### remove stopwords from a text

In [None]:
text = "This is a very normal sentence but it has a lot of stopwords in it."

In [None]:
nltk.download('stopwords')

In [None]:
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
toks = word_tokenize(text)

In [None]:
[w for w in toks if w.lower() not in stopwords]

### Stemming

In [None]:
stemmer = nltk.PorterStemmer()

In [None]:
text = ["apples", "animals", "animation", "several"]

In [None]:
[stemmer.stem(w) for w in text]

### EXERCISE 2: There's another stemmer called the LancasterStemmer. Try it out and compare the results.

### Lemmatization

In [None]:
nltk.download('wordnet')

In [None]:
lem = nltk.WordNetLemmatizer()

In [None]:
[lem.lemmatize(w) for w in text]

### POS Tagging

In [None]:
from nltk.tag import pos_tag

In [None]:
nltk.download('averaged_perceptron_tagger')

In [None]:
nltk.pos_tag(wtoks)

### A reusable pre-processing function 

### Exercise 3: In the next examples, we're going to be doing a lot of initial preprocessing.
Write a reusable function called "preprocess" that takes a document and returns a list of POS-tagged sentences (each sentence being a list of (word, tag) tuples).
Hint: first split the document into sentences, then split each sentence into words, and finally POS-tag every sentence.

In [None]:
def preprocess(document):
    """Preprocess a plain-text document for NLP.

    The document is split into sentences, each sentence is split into word
    tokens, and each token list is POS-tagged.

    Takes a plain text document as input.
    Returns a list (one entry per sentence) of lists of POS-tagged tuples.
    """
    return [
        pos_tag(word_tokenize(sentence))
        for sentence in sent_tokenize(document)
    ]

### Noun-phrase chunking

In [None]:
text = """Software Development is such an important industry it is hard to imagine a world without it.
        Coal mining is also important, but it is usually a more dangerous job than software engineering."""

In [None]:
### user-defined grammar, written as a regular expression over POS tags, used to chunk noun-phrases

grammar = ("NP: {"           ### label: every noun-phrase that we detect will get this label
           "<DT>?"           ### zero or one determiner,
           "<JJ>*"           ### followed by zero or more adjectives,
           "<NN.*>+"         ### followed by one or more nouns of any type (any tag starting with NN)
           "}")
                                        
                                        

#### (helpful app to develop regex grammars)

In [None]:
nltk.app.chunkparser()

#### NP chunker

In [None]:
def np_chunker(text, grammar):
    """Chunk the noun phrases of a text.

    Takes a plain text (it is run through ``preprocess`` here, so it must not
    be pre-processed already) and a chunk grammar (string).
    Returns a list with one parse result per sentence, in which the phrases
    matched by the grammar are "chunked".
    """
    chunk_parser = nltk.RegexpParser(grammar)
    return [chunk_parser.parse(tagged_sentence) for tagged_sentence in preprocess(text)]

In [None]:
chunked = np_chunker(text, grammar)

In [None]:
chunked

### Exercise 4: write a function that takes in a chunked text and outputs only the noun phrases

Useful: http://stackoverflow.com/questions/14841997/how-to-navigate-a-nltk-tree-tree
   
Hint: for each sentence in a chunked text, iterate through the tags. If the label of the tag is NP then append it to the list (return a list of nodes).

In [None]:
### solution:
def noun_phrase_extractor(chunked):
    """Collect the noun-phrase nodes of a chunked text.

    Takes a chunked text (an iterable of parsed sentences).
    Returns a flat list of the direct children of each sentence that are
    trees labelled 'NP', in document order.
    """
    return [
        node
        for sentence in chunked
        for node in sentence
        # anything that is not a Tree is skipped; only subtrees carry a label
        if isinstance(node, nltk.tree.Tree) and node.label() == 'NP'
    ]
        

In [None]:
noun_phrase_extractor(chunked)

### Named entity recognition

In [None]:
### in fact all the hard work has been done for us, with the standard ne_chunk function built into nltk

In [None]:
text = """ERNI Consulting AG is a technology consulting company.
        The Chief Executive is Daniel Liechti and the Managing Director of Switzerland is Christoph Aeschlimann."""

In [None]:
pp_text = preprocess(text)

In [None]:
for sentence in pp_text:
        print(nltk.ne_chunk(sentence))

In [None]:
for sentence in pp_text:
    # Render the tree as plain text explicitly. The original referenced
    # `.__str__` without calling it, which printed the bound-method repr
    # instead of the formatted tree.
    print(str(nltk.ne_chunk(sentence)))    ### workaround if you get an error above because ghostscript is not installed

# TextRazor demo

An API key for up to 500 free API calls per day is available: https://www.textrazor.com

## Text Classification

Preconditions:
1. Install beautifulsoup, the lxml parser and requests (all used below), e.g. pip install bs4 lxml requests
2. Install the textrazor python SDK: pip install textrazor (see https://github.com/TextRazor/textrazor-python)
3. Register for TextRazor and get an API key

In [None]:
from textrazor import TextRazor
from bs4 import BeautifulSoup
import requests

In [None]:
### get the wikipedia article on Donald Trump and use BeautifulSoup to extract only the text

url = 'https://en.wikipedia.org/wiki/Donald_Trump'
res = requests.get(url)
html = res.text
text = BeautifulSoup(html, 'lxml').get_text()

In [None]:
### create and initialise the TextRazor client

In [None]:
API_KEY = 'YOUR API KEY GOES HERE'

In [None]:
client = TextRazor(API_KEY, extractors=["topics", "categories", "relations"])

In [None]:
client.set_classifiers(['textrazor_newscodes'])

In [None]:
response = client.analyze(text[8736:16000])      # the main article starts at 8736 chars into the text

In [None]:
### assign the topics, categories and relations to variables for later use

In [None]:
topics = list(response.topics())

In [None]:
categories = list(response.categories())

In [None]:
relations = list(response.relations())

In [None]:
for topic in topics:
    if topic.score > 0.8:                     ### just the most relevant topics in the text
        print (topic.label)

In [None]:
for category in categories:
    if category.score > 0.5:
        print (category.label, category.score)

## Relations detection

In [None]:
client = TextRazor(API_KEY, extractors=["words", "entities", "entailments", "relations"])

In [None]:
client.set_entity_freebase_type_filters(["/organization/organization"])
client.set_entity_dbpedia_type_filters(["Company"])

In [None]:
text = "Accenture to acquire OCTO Technology within 6 months."

In [None]:
response = client.analyze(text)

In [None]:
relations = response.relations()

In [None]:
### takes a list of words and returns the relations in the text
def get_relations(wordslist, candidates=None):
    """Return the relations whose predicate contains one of the given words.

    wordslist:  iterable of lemmas to look for (e.g. ["sell", "buy"]).
    candidates: optional iterable of relation objects to search; each must
                expose ``predicate_words``, whose items expose ``lemma``.
                When omitted, the notebook-level ``relations`` variable is
                searched, as before.

    Returns a list of every matching relation, in input order, each at most
    once. The list is empty (never None) when nothing matches, so callers can
    always iterate over the result.
    """
    if candidates is None:
        candidates = relations
    wanted = set(wordslist)     # built once for constant-time membership tests
    return [
        relation
        for relation in candidates
        # any() stops at the first matching predicate word, so a relation with
        # several matching words is still reported only once
        if any(word.lemma in wanted for word in relation.predicate_words)
    ]

In [None]:
buy_relations = get_relations(["sell", "buy", "acquire"])

In [None]:
buy_relations

In [None]:
### report every detected relation that links at least two recognised entities
for relation in buy_relations:
    # materialise the entities of each parameter, then keep the first entity
    # of every parameter that has any
    entities_per_param = [list(param.entities()) for param in relation.params]
    entity_params = [found[0] for found in entities_per_param if found]

    if len(entity_params) >= 2:
        print("Found valid sell relationship between: ", entity_params)