# CS5560 Knowledge Discovery Management

# 1- Perform the following NLP tasks on the sentences below
## Input: 
- The dog saw John in the park.
- The little bear saw the fine fat trout in the rocky brook.

In [1]:
# Sample sentences: t1/t2 feed the NLTK cells, t3 feeds the coreference cell.
t1, t2, t3 = (
    "The dog saw John in the park",
    "The little bear saw the fine fat trout in the rocky brook",
    "Michael reads books, he loves reading them",
)

# NLP Tasks:


### Importing the libraries

In [3]:
import nltk
from nltk import wordpunct_tokenize, pos_tag, ne_chunk
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer

In [4]:
# Split the first sample sentence into a list of word tokens.
tt1 = nltk.tokenize.word_tokenize(t1)

In [5]:
# Split the second sample sentence into a list of word tokens.
tt2 = nltk.tokenize.word_tokenize(t2)

In [6]:
# Display the token list produced for each sample sentence, one per line.
print("Text 1:", tt1)
print("Text 2:", tt2)

Text 1: ['The', 'dog', 'saw', 'John', 'in', 'the', 'park']
Text 2: ['The', 'little', 'bear', 'saw', 'the', 'fine', 'fat', 'trout', 'in', 'the', 'rocky', 'brook']


### a. Part-of-speech(POS) tagger

In [7]:
# Attach a part-of-speech tag to every token of the first sentence.
ta1 = pos_tag(tt1)

In [8]:
# Attach a part-of-speech tag to every token of the second sentence.
ta2 = pos_tag(tt2)

In [9]:
# Display the (token, tag) pairs produced for each sample sentence.
print("Text 1:", ta1)
print("Text 2:", ta2)

Text 1: [('The', 'DT'), ('dog', 'NN'), ('saw', 'VBD'), ('John', 'NNP'), ('in', 'IN'), ('the', 'DT'), ('park', 'NN')]
Text 2: [('The', 'DT'), ('little', 'JJ'), ('bear', 'NN'), ('saw', 'VBD'), ('the', 'DT'), ('fine', 'JJ'), ('fat', 'NN'), ('trout', 'NN'), ('in', 'IN'), ('the', 'DT'), ('rocky', 'JJ'), ('brook', 'NN')]


### b. Named entity recognizer (NER)

In [10]:
# Chunk named entities in the tagged first sentence and render the result.
# NOTE(review): draw() presumably opens a GUI window — confirm it works in
# the environment where this notebook is run.
namedEnt1 = ne_chunk(ta1)
namedEnt1.draw()

In [11]:
# Chunk named entities in the tagged second sentence and print the tree as text.
namedEnt2 = ne_chunk(ta2)
print(namedEnt2)

(S
  The/DT
  little/JJ
  bear/NN
  saw/VBD
  the/DT
  fine/JJ
  fat/NN
  trout/NN
  in/IN
  the/DT
  rocky/JJ
  brook/NN)


### c. Co-reference resolution system

In [2]:
# Load a spaCy English model; it hosts the coreference component below.
import spacy
nlp = spacy.load('en')

# Build NeuralCoref on the model's vocabulary and register it in the pipeline.
import neuralcoref
coref = neuralcoref.NeuralCoref(nlp.vocab)
nlp.add_pipe(coref, name='neuralcoref')

# Run the pipeline on the coreference sample sentence; results are read
# from the document's `doc._` extension attributes.
doc = nlp(t3)

# Print both results explicitly: as bare expressions, the first value is
# discarded and only the last one can be echoed by the notebook.
print("Has coref:", doc._.has_coref)
print("Coref clusters:", doc._.coref_clusters)

# 2- Create an NLP project for the following tasks using CoreNLP
## Input: 
- Choose Dataset from the web (a text file)

### Importing the libraries

In [1]:
from stanfordcorenlp import StanfordCoreNLP
import json

### Connecting with the API and Loading the file

In [2]:
# Connect to a CoreNLP server expected to be listening on localhost:9000.
host = 'http://localhost'
port = 9000
nlp = StanfordCoreNLP(host, port=port, timeout=30000)

# Read the whole input document. The context manager guarantees the file
# handle is closed once the text has been read.
# NOTE(review): no encoding is given, so the platform default is used —
# confirm it matches the file.
with open('C:/Users/Michael/Desktop/a.txt', 'r') as s:
    text = s.read()

### a. Part-of-speech(POS) tagger

In [3]:
# POS-tag the loaded document through the CoreNLP client and print the
# resulting (token, tag) pairs.
p = nlp.pos_tag(text)
print('POS:', p)

POS: [('She', 'PRP'), ('was', 'VBD'), ('walking', 'VBG'), ('lazily', 'RB'), (',', ','), ('for', 'IN'), ('the', 'DT'), ('fierce', 'JJ'), ('April', 'NNP'), ('sun', 'NN'), ('was', 'VBD'), ('directly', 'RB'), ('overhead', 'JJ'), ('.', '.'), ('Her', 'PRP$'), ('umbrella', 'NN'), ('blocked', 'VBD'), ('its', 'PRP$'), ('rays', 'NNS'), ('but', 'CC'), ('nothing', 'NN'), ('blocked', 'VBD'), ('the', 'DT'), ('heat', 'NN'), ('-', ':'), ('the', 'DT'), ('sort', 'NN'), ('of', 'IN'), ('raw', 'JJ'), (',', ','), ('wild', 'JJ'), ('heat', 'NN'), ('that', 'WDT'), ('crushes', 'VBZ'), ('you', 'PRP'), ('with', 'IN'), ('its', 'PRP$'), ('energy', 'NN'), ('.', '.'), ('A', 'DT'), ('few', 'JJ'), ('buffalo', 'NN'), ('were', 'VBD'), ('tethered', 'JJ'), ('under', 'IN'), ('coconuts', 'NNS'), (',', ','), ('browsing', 'VBG'), ('the', 'DT'), ('parched', 'JJ'), ('verges', 'NNS'), ('.', '.'), ('Occasionally', 'RB'), ('a', 'DT'), ('car', 'NN'), ('went', 'VBD'), ('past', 'RB'), (',', ','), ('leaving', 'VBG'), ('its', 'PRP$'), (

### b. Named entity recognizer (NER)

In [4]:
# Run named-entity recognition on the loaded document through the CoreNLP
# client and print the (token, entity-label) pairs.
t = nlp.ner(text)
# The label names the task actually performed (NER, not tokenization).
print('NER:', t)

Tokenize: [('She', 'O'), ('was', 'O'), ('walking', 'O'), ('lazily', 'O'), (',', 'O'), ('for', 'O'), ('the', 'O'), ('fierce', 'O'), ('April', 'DATE'), ('sun', 'DATE'), ('was', 'O'), ('directly', 'O'), ('overhead', 'O'), ('.', 'O'), ('Her', 'O'), ('umbrella', 'O'), ('blocked', 'O'), ('its', 'O'), ('rays', 'O'), ('but', 'O'), ('nothing', 'O'), ('blocked', 'O'), ('the', 'O'), ('heat', 'O'), ('-', 'O'), ('the', 'O'), ('sort', 'O'), ('of', 'O'), ('raw', 'O'), (',', 'O'), ('wild', 'O'), ('heat', 'O'), ('that', 'O'), ('crushes', 'O'), ('you', 'O'), ('with', 'O'), ('its', 'O'), ('energy', 'O'), ('.', 'O'), ('A', 'O'), ('few', 'O'), ('buffalo', 'O'), ('were', 'O'), ('tethered', 'O'), ('under', 'O'), ('coconuts', 'O'), (',', 'O'), ('browsing', 'O'), ('the', 'O'), ('parched', 'O'), ('verges', 'O'), ('.', 'O'), ('Occasionally', 'O'), ('a', 'O'), ('car', 'O'), ('went', 'O'), ('past', 'DATE'), (',', 'O'), ('leaving', 'O'), ('its', 'O'), ('treads', 'O'), ('in', 'O'), ('the', 'O'), ('melting', 'O'), ('

### c. Co-reference resolution system

In [12]:
# Request coreference annotation for the loaded document from the CoreNLP
# server. The language property key is spelled 'pipelineLanguage'; a
# misspelled key would not be recognized as the language setting.
res1 = nlp.annotate(text,
                    properties={
                        'annotators': 'coref',
                        'pipelineLanguage': 'en',
                        'timeout': 30000,
                    })

In [13]:
# Show the coreference annotation response exactly as returned by annotate().
print(res1)

{
  "sentences": [
    {
      "index": 0,
      "basicDependencies": [
        {
          "dep": "ROOT",
          "governor": 0,
          "governorGloss": "ROOT",
          "dependent": 13,
          "dependentGloss": "overhead"
        },
        {
          "dep": "nsubj",
          "governor": 3,
          "governorGloss": "walking",
          "dependent": 1,
          "dependentGloss": "She"
        },
        {
          "dep": "aux",
          "governor": 3,
          "governorGloss": "walking",
          "dependent": 2,
          "dependentGloss": "was"
        },
        {
          "dep": "csubj",
          "governor": 13,
          "governorGloss": "overhead",
          "dependent": 3,
          "dependentGloss": "walking"
        },
        {
          "dep": "advmod",
          "governor": 3,
          "governorGloss": "walking",
          "dependent": 4,
          "dependentGloss": "lazily"
        },
        {
          "dep": "punct",
          "governor": 3,
       

### d. Sentiment Analysis

In [17]:
# Request sentiment annotation for the loaded document from the CoreNLP
# server. 'outputFormat' is deliberately left unset; the response recorded
# below this cell is JSON.
res = nlp.annotate(
    text,
    properties=dict(annotators='sentiment', timeout=30000),
)

In [18]:
# Show the sentiment annotation response exactly as returned by annotate().
print(res)

{
  "sentences": [
    {
      "index": 0,
      "parse": "(ROOT\r\n  (S\r\n    (S\r\n      (NP (PRP She))\r\n      (VP (VBD was)\r\n        (VP (VBG walking)\r\n          (ADVP (RB lazily)))))\r\n    (, ,)\r\n    (S\r\n      (PP (IN for)\r\n        (NP (DT the) (JJ fierce) (NNP April) (NN sun)))\r\n      (VP (VBD was)\r\n        (ADJP (RB directly) (JJ overhead))))\r\n    (. .)))",
      "basicDependencies": [
        {
          "dep": "ROOT",
          "governor": 0,
          "governorGloss": "ROOT",
          "dependent": 3,
          "dependentGloss": "walking"
        },
        {
          "dep": "nsubj",
          "governor": 3,
          "governorGloss": "walking",
          "dependent": 1,
          "dependentGloss": "She"
        },
        {
          "dep": "aux",
          "governor": 3,
          "governorGloss": "walking",
          "dependent": 2,
          "dependentGloss": "was"
        },
        {
          "dep": "advmod",
          "governor": 3,
          "gove