# SECTION 1: Introduction

## SECTION 1.1: Corpora

A corpus contains raw text (ASCII/UTF-8) together with its associated metadata.

In [1]:
import nltk

In [4]:
from nltk.corpus import words

In [5]:
from nltk.corpus import reuters

In [8]:
from nltk.corpus import brown

In [9]:
# Show the category labels of the Brown corpus; as the last expression in the cell, the result is displayed as its output.
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

## SECTION 1.2: Tokenization

2 types of words:
1. content words
2. stopwords

Tokenization can be done with pure Python, spaCy, or NLTK.

In [10]:
import spacy

In [11]:
import en_core_web_sm

In [12]:
# Load the spaCy pipeline from the imported `en_core_web_sm` package; `nlp` is reused by later cells.
nlp = en_core_web_sm.load()
text = "Mary, don't slap the green witch"

# Lowercase the sentence, run it through the pipeline, and convert each
# resulting token back to a plain string before printing the list.
print(list(map(str, nlp(text.lower()))))

['mary', ',', 'do', "n't", 'slap', 'the', 'green', 'witch']


In [14]:
from nltk.tokenize import \
TweetTokenizer

In [15]:
# Sample tweet containing a hashtag, a handle, and an emoticon.
tweet = "Snow White and the Seven Degrees #MakeAMovieCold @midnight :-)"
tokenizer = TweetTokenizer()

# Lowercase the tweet first, then tokenize it and print the token list.
print(tokenizer.tokenize(tweet.lower()))

['snow', 'white', 'and', 'the', 'seven', 'degrees', '#makeamoviecold', '@midnight', ':-)']


NLTK's `TweetTokenizer` preserves hashtags, handles, and emoticons as single tokens.

## SECTION 1.3: WordNet

WordNet is a large lexical database of English.

In [16]:
from nltk.corpus import wordnet

# Collect every lemma name (synonym) across all synsets of 'good', plus the
# first antonym of each lemma that has one.
synonyms = []
antonyms = []

for synset in wordnet.synsets('good'):
    for lemma in synset.lemmas():
        synonyms.append(lemma.name())
        # Look the antonyms up once and reuse the result.
        opposites = lemma.antonyms()
        if opposites:
            antonyms.append(opposites[0].name())

# Convert to sets so that repeated names are printed only once.
print(set(synonyms))
print(set(antonyms))

{'skillful', 'just', 'salutary', 'commodity', 'dependable', 'full', 'sound', 'near', 'in_effect', 'proficient', 'adept', 'in_force', 'serious', 'upright', 'honorable', 'undecomposed', 'ripe', 'beneficial', 'right', 'skilful', 'respectable', 'practiced', 'unspoilt', 'good', 'thoroughly', 'secure', 'dear', 'estimable', 'soundly', 'effective', 'unspoiled', 'expert', 'safe', 'honest', 'well', 'trade_good', 'goodness'}
{'evil', 'badness', 'bad', 'evilness', 'ill'}


## SECTION 1.4: Grammatical Analysis

In [17]:
# Run the pipeline on a sample sentence; `doc` is reused by the rendering cell below.
doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion")

# Print one line per token: its text, part-of-speech tag, and dependency label.
for token in doc:
    print(" ".join((token.text, token.pos_, token.dep_)))

Apple PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K. PROPN dobj
startup NOUN dep
for ADP prep
$ SYM quantmod
1 NUM compound
billion NUM pobj


## SECTION 1.5: Dependency Parsing

In [20]:
from spacy import displacy

# Draw the dependency parse of `doc` inline in the notebook.
# `jupyter` must be the boolean True: the string 'True' only worked because any
# non-empty string is truthy (so 'False' would have rendered as well).
# The 'distance' option sets the spacing between words in the arc diagram.
displacy.render(
    doc,
    style='dep',
    jupyter=True,
    options={'distance': 90},
)

## SECTION 1.6: Named Entity Recognition (NER)

In [22]:
# Run the pipeline on a sentence containing quantities, a time, a percentage,
# a duration, and an organisation abbreviation.
doc = nlp(
    'I just bought 2 shares at 9 am because the stock went up 30% in just 2 days according to the WSJ'
)

# Highlight the recognised entities inline in the notebook.
# `jupyter` must be the boolean True: the string 'True' only worked because any
# non-empty string is truthy (so 'False' would have rendered as well).
displacy.render(doc, style='ent', jupyter=True)