# POS tagging

In [1]:
from nltk.tag import StanfordPOSTagger
# Relative paths to the Stanford POS tagger model and jar.
english_model = 'files/english-left3words-distsim.tagger'
path_to_jar = 'files/stanford-postagger.jar'


In [2]:
import os
# Expose the Java executable location through the JAVAHOME environment
# variable (presumably consulted by NLTK's Stanford wrappers to launch Java).
# A JAVAHOME value already present in the environment is kept, so the notebook
# is not tied to one machine's install path; otherwise the Windows default
# below is used. `java_path` always holds the value that ends up in effect.
java_path = os.environ.setdefault(
    'JAVAHOME', "C:/Program Files/Java/jdk1.8.0_111/bin/java.exe"
)

In [3]:
# Build the Stanford POS tagger from the model and jar paths defined above.
tagger = StanfordPOSTagger(
    model_filename=english_model,
    path_to_jar=path_to_jar,
    encoding='utf-8',
)


In [5]:
import pandas as pd

In [7]:
# Tag a whitespace-split sample sentence and show the (token, tag) pairs
# as a two-column table.
tagged_sent = tagger.tag("My name is Emna Amor".split())
df = pd.DataFrame(data=tagged_sent, columns=['token', 'Pos-Tag'])
df

Unnamed: 0,token,Pos-Tag
0,My,PRP$
1,name,NN
2,is,VBZ
3,Emna,NNP
4,Amor,NNP


# Parser

In [9]:
from nltk.parse import stanford
# Publish the parser jar and its models jar through environment variables
# (both paths are relative to the working directory).
os.environ.update({
    'STANFORD_PARSER': 'files/stanford-parser.jar',
    'STANFORD_MODELS': 'files/stanford-parser-3.8.0-models.jar',
})



In [10]:
# Path to the serialized English parser model. A dedicated name is used so the
# POS tagger's `english_model` path is not overwritten: with the old shared
# name, re-running the tagger cell after this one would have built the tagger
# from the parser model.
parser_model = 'files/englishFactored.ser.gz'
parser = stanford.StanfordParser(model_path=parser_model)

In [31]:
# raw_parse hands back a one-shot iterator of parse trees; storing the parses
# in a list lets the cells below be re-run without getting an empty result.
sentences = list(parser.raw_parse("Barack Obama watched a game in Germany, the 2nd of august 1992."))

In [32]:
# Each parse can either be drawn (one sentence at a time) or listed as nested
# Tree objects. Only one sentence was parsed here, so one list is printed.
for tree in sentences:
    # tree.draw()
    print(list(tree))

[Tree('S', [Tree('NP', [Tree('NNP', ['Barack']), Tree('NNP', ['Obama'])]), Tree('VP', [Tree('VBD', ['watched']), Tree('NP', [Tree('DT', ['a']), Tree('NN', ['game'])]), Tree('PP', [Tree('IN', ['in']), Tree('NP', [Tree('NP', [Tree('NNP', ['Germany'])]), Tree(',', [',']), Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('NN', ['2nd'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('NNP', ['august']), Tree('CD', ['1992'])])])])])])]), Tree('.', ['.'])])]


## NER

In [33]:
from nltk.tag.stanford import StanfordNERTagger

In [34]:
# Stanford NER tagger built from a classifier model file and the NER jar
# (both paths are relative to the working directory).
classifier = 'files/english.muc.7class.distsim.crf.ser.gz'
ner_jar = 'files/stanford-ner.jar'
st = StanfordNERTagger(model_filename=classifier, path_to_jar=ner_jar)

In [35]:
# Sample sentence used by the NER and coreference cells below.
text = (
    "Harold Frederick Shipman is the most dangerous serial killer "
    "in 1970-2008 period."
)

In [38]:
import pandas as pd
import re
from nltk.corpus import stopwords
# Run the NER tagger over whitespace-separated tokens and tabulate the
# resulting (token, entity) pairs.
# Note: splitting on whitespace keeps punctuation attached to words
# (e.g. "period."), so such tokens are tagged exactly as they appear.
classified_text = st.tag(text.split())
df = pd.DataFrame(data=classified_text, columns=['token', 'entity'])

In [39]:
# Display the token/entity table built in the previous cell.
df

Unnamed: 0,token,entity
0,Harold,PERSON
1,Frederick,PERSON
2,Shipman,PERSON
3,is,O
4,the,O
5,most,O
6,dangerous,O
7,serial,O
8,killer,O
9,in,O


In [40]:
# Keep only the rows whose entity label differs from 'O', then display them.
is_entity = df['entity'] != 'O'
df = df.loc[is_entity]
df

Unnamed: 0,token,entity
0,Harold,PERSON
1,Frederick,PERSON
2,Shipman,PERSON
10,1970-2008,DATE
11,period.,TIME


## Coreference resolution

In [41]:
# I will use the CoreNLP server for coreference resolution.
# You can start the server with this command:
#java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer

# You can also use the server for parsing, part-of-speech tagging, or any other Stanford CoreNLP tool.



In [45]:
from pycorenlp import StanfordCoreNLP
# Client for the CoreNLP server, expected to be listening on localhost:9000.
nlp = StanfordCoreNLP('http://localhost:9000')

In [52]:
def parsed_text(text, server_url='http://127.0.0.1:9000'):
    """Annotate ``text`` with a running Stanford CoreNLP server.

    Parameters
    ----------
    text : str
        Raw text to send to the server.
    server_url : str, optional
        Base URL of the CoreNLP server. Defaults to the local server on
        port 9000, which was previously hard-coded.

    Returns
    -------
    The value returned by the client's ``annotate`` call for a JSON request
    covering tokenization, sentence splitting, POS tagging, dependency and
    constituency parsing, and coreference (dcoref).

    NOTE(review): the client may hand back the raw response text instead of a
    parsed structure when the server reply is not valid JSON -- confirm and
    handle at the call site if needed.
    """
    client = StanfordCoreNLP(server_url)
    # Ask for every annotation in one request; the result is returned as-is.
    request_properties = {
        'annotators': 'tokenize, ssplit, pos, depparse, parse,dcoref',
        'outputFormat': 'json',
    }
    return client.annotate(text, properties=request_properties)

In [53]:
# Annotate the sample sentence; the result is inspected in the next cell.
s = parsed_text(text)

In [54]:
# Report every coreference link instead of hard-coding chain '1' and mention
# positions 0/1, which raise KeyError/IndexError for other texts.
# Assumes each mention dict may carry CoreNLP's 'isRepresentativeMention'
# flag (TODO confirm); when no mention is flagged, the first mention of the
# chain is treated as the antecedent, as the original cell did.
for chain in s['corefs'].values():
    if len(chain) < 2:
        # A single mention has nothing to refer back to.
        continue
    representative = next(
        (m for m in chain if m.get('isRepresentativeMention')), chain[0]
    )
    for mention in chain:
        if mention is not representative:
            print(mention['text'], "refers to " + representative['text'])

the most dangerous serial killer in 1970-2008 period refers to Harold Frederick Shipman
