In [None]:
"""
# Command line
D:
cd Backup\fintech-assignments\AML\stanford-corenlp-full-2018-10-05

english
java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -preload tokenize,ssplit,pos,lemma,ner,parse,depparse -status_port 9000 -port 9000 -timeout 50000 & 

chinese:
java -Xmx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -serverProperties StanfordCoreNLP-chinese.properties -preload tokenize,ssplit,pos,lemma,ner,parse -status_port 9001  -port 9001 -timeout 50000


# Set environment variable
os.environ['STANFORD_MODELS'] = './stanford-corenlp-full-2018-10-05'
os.environ['CLASSPATH'] = './stanford-corenlp-full-2018-10-05/stanford-corenlp-3.9.2'
os.environ["JAVA_HOME"] = r'D:\Java\jre1.8.0_211'
"""

In [1]:
import os
import re 
import jieba
import pandas as pd
from opencc import OpenCC
from nltk.parse import CoreNLPParser

In [None]:
# Load the crawled news articles from the local CSV and preview the first rows.
news_csv_path = os.path.join('crawl_data_csv', 'news_data.csv')
df = pd.read_csv(news_csv_path)
df.head()

In [2]:
# CoreNLP client pointed at the local server on port 9001.
# `tagtype` selects what parser.tag() returns: 'pos' or 'ner'.
parser = CoreNLPParser(
    url='http://localhost:9001',
    tagtype='ner',
)

In [6]:
# Characters that terminate a sentence (full-width and ASCII variants).
SENTENCE_END_MARKS = ('。', '！', '!', '？', '?')


def extract_keyword_sentences(content, keyword, end_marks=SENTENCE_END_MARKS):
    """Return the sentence surrounding every occurrence of ``keyword``.

    A sentence runs from just after the closest preceding end mark (or the
    start of ``content``) up to and including the closest following end mark
    (or the end of ``content``).

    Parameters
    ----------
    content : str
        Text to search.
    keyword : str
        Literal text to look for. It is escaped before being handed to
        ``re``, so regex metacharacters such as ``+`` or ``(`` are matched
        verbatim instead of raising ``re.error`` or matching something else.
    end_marks : iterable of str
        Strings that terminate a sentence.

    Returns
    -------
    list of str
        One sentence per keyword occurrence, in order of appearance. A
        sentence containing the keyword several times appears several times.
        Empty ``content`` or ``keyword`` yields an empty list.
    """
    if not content or not keyword:
        return []

    found = []
    for match in re.finditer(re.escape(keyword), content):
        # Sentence start: right after the nearest end mark before the keyword.
        start = 0
        for mark in end_marks:
            pos = content.rfind(mark, 0, match.start())
            if pos != -1:
                start = max(start, pos + len(mark))

        # Sentence end (exclusive): right after the nearest end mark that
        # follows the keyword, or the end of the text when there is none.
        stop = len(content)
        for mark in end_marks:
            pos = content.find(mark, match.end())
            if pos != -1:
                stop = min(stop, pos + len(mark))

        found.append(content[start:stop])
    return found


content = '高雄醫學大學去年7月校務會議通過對第16屆董事會在8、9年前，數度修改捐助章程，不實登載「陳啟川為創辦人」，經提起偽造文書訴訟，5日首度在高雄地檢署開庭。10多位校友趕來聲援，拉布條、高喊「解散董事會」。高醫大董事會5日則聲明，無任何偽造文書事實，已故陳啟川先生捐出10餘甲土地，讓高醫大順利設校史實不容抹煞；高醫現有土地所有權狀大部分登載轉移原因為「捐贈」，是由陳啟川母親擔任董事長的南和興產公司捐贈。高醫大校友藍傳盛表示，台灣省教育廳1954年函轉教育部案由「私立高雄醫學院創辦人杜聰明呈送設院事項」，附件即載明杜聰明詳細履歷及擬聘董事名單，因此學校創辦人為杜聰明毫無疑問，校友會立場支持學校提告。'
keyword = '偽造文書'
sentences = extract_keyword_sentences(content, keyword)

print(sentences)

['高雄醫學大學去年7月校務會議通過對第16屆董事會在8、9年前，數度修改捐助章程，不實登載「陳啟川為創辦人」，經提起偽造文書訴訟，5日首度在高雄地檢署開庭。', '高醫大董事會5日則聲明，無任何偽造文書事實，已故陳啟川先生捐出10餘甲土地，讓高醫大順利設校史實不容抹煞；高醫現有土地所有權狀大部分登載轉移原因為「捐贈」，是由陳啟川母親擔任董事長的南和興產公司捐贈。']


In [7]:
def merge_entities(tagged_tokens, label):
    """Join runs of consecutive tokens tagged ``label`` into entity strings.

    Parameters
    ----------
    tagged_tokens : iterable of (str, str)
        ``(word, tag)`` pairs in sentence order.
    label : str
        The tag to collect, e.g. ``'PERSON'`` or ``'ORGANIZATION'``.

    Returns
    -------
    list of str
        One string per maximal run of adjacent tokens carrying ``label``;
        the words of a run are concatenated without a separator.
    """
    entities = []
    previous_tag = None  # the first token has no predecessor to merge with
    for word, tag in tagged_tokens:
        if tag == label:
            if previous_tag == label:
                entities[-1] += word
            else:
                entities.append(word)
        # Track the real predecessor by position; looking tokens up by value
        # picks the wrong neighbour whenever a (word, tag) pair repeats.
        previous_tag = tag
    return entities


# (Optional) convert from Traditional Chinese to Simplified Chinese ('t2s')
# before tokenizing and tagging.
# NOTE(review): presumably because the CoreNLP Chinese models expect
# Simplified text - confirm.
cc = OpenCC('t2s')
for sentence in sentences:
    sentence = cc.convert(sentence)

    # Tokenize with both CoreNLP and jieba so the segmentations can be compared.
    nltk_cut = list(parser.tokenize(sentence))
    jieba_cut = list(jieba.cut(sentence, cut_all=False))
    print('nltk_cut:', nltk_cut, '\n')
    print('jieba_cut:', jieba_cut, '\n')

    # Tag the CoreNLP tokens (the parser was created with tagtype='ner').
    nltk_tag = list(parser.tag(nltk_cut))
    print('nltk_tag:', nltk_tag, '\n')

    # Collapse adjacent PERSON / ORGANIZATION tokens into whole names.
    per = merge_entities(nltk_tag, 'PERSON')
    org = merge_entities(nltk_tag, 'ORGANIZATION')

    print(org, '\n')
    print(per, '\n')

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\孟均\AppData\Local\Temp\jieba.cache
Loading model cost 2.635 seconds.
Prefix dict has been built succesfully.


nltk_cut: ['高雄', '医学', '大学', '去年', '7月', '校务', '会议', '通过', '对第', '16', '届', '董事会', '在', '8', '、', '9', '年', '前', '，', '数度', '修改', '捐助', '章程', '，', '不实', '登载', '「', '陈启川', '为', '创办人', '」', '，', '经', '提起', '伪造', '文书', '诉讼', '，', '5', '日', '首', '度', '在', '高雄', '地检署', '开庭', '。'] 

jieba_cut: ['高雄医学', '大学', '去年', '7', '月', '校务', '会议', '通过', '对', '第', '16', '届', '董事会', '在', '8', '、', '9', '年前', '，', '数度', '修改', '捐助', '章程', '，', '不', '实', '登载', '「', '陈启川', '为', '创办人', '」', '，', '经', '提起', '伪造文书', '诉讼', '，', '5', '日', '首度', '在', '高雄地', '检署', '开庭', '。'] 

nltk_tag: [('高雄', 'ORGANIZATION'), ('医学', 'ORGANIZATION'), ('大学', 'ORGANIZATION'), ('去年', 'DATE'), ('7月', 'DATE'), ('校务', 'O'), ('会议', 'O'), ('通过', 'O'), ('对第', 'MISC'), ('16', 'NUMBER'), ('届', 'MISC'), ('董事会', 'MISC'), ('在', 'O'), ('8', 'NUMBER'), ('、', 'MISC'), ('9', 'NUMBER'), ('年', 'MISC'), ('前', 'MISC'), ('，', 'O'), ('数度', 'O'), ('修改', 'O'), ('捐助', 'O'), ('章程', 'O'), ('，', 'O'), ('不实', 'O'), ('登载', 'O'), ('「', 'O'), ('陈启川', 'PERSON'), ('为

In [14]:
# Constituency-parse the tokens currently held in `nltk_cut`
# (whatever the tokenizing loop assigned last) and dump the resulting trees.
parsed_trees = parser.parse(nltk_cut)
nltk_sentence = list(parsed_trees)
print('nltk_tag:', nltk_sentence, '\n\n')

nltk_tag: [Tree('ROOT', [Tree('IP', [Tree('NP', [Tree('NP', [Tree('ADJP', [Tree('JJ', ['高'])]), Tree('NP', [Tree('NN', ['医大'])])]), Tree('NP', [Tree('NN', ['董事会'])])]), Tree('QP', [Tree('CD', ['5']), Tree('CLP', [Tree('M', ['日'])])]), Tree('VP', [Tree('ADVP', [Tree('AD', ['则'])]), Tree('VP', [Tree('VV', ['声明']), Tree('PU', ['，']), Tree('IP', [Tree('IP', [Tree('IP', [Tree('VP', [Tree('VE', ['无']), Tree('NP', [Tree('DP', [Tree('DT', ['任何'])]), Tree('CP', [Tree('IP', [Tree('VP', [Tree('VV', ['伪造'])])])]), Tree('NP', [Tree('NN', ['文书']), Tree('NN', ['事实'])])])])]), Tree('PU', ['，']), Tree('VP', [Tree('ADVP', [Tree('AD', ['已'])]), Tree('VP', [Tree('VV', ['故']), Tree('IP', [Tree('NP', [Tree('NR', ['陈启川']), Tree('NN', ['先生'])]), Tree('VP', [Tree('VV', ['捐出']), Tree('NP', [Tree('NP', [Tree('QP', [Tree('CD', ['10'])]), Tree('NP', [Tree('NN', ['余'])])]), Tree('QP', [Tree('CD', ['甲'])]), Tree('NP', [Tree('NN', ['土地'])])])])])])])]), Tree('PU', ['，']), Tree('IP', [Tree('VP', [Tree('VV', ['让']), Tr

In [15]:
# Open a drawing window for every direct child of each parsed tree.
for root_tree in nltk_sentence:
    for subtree in root_tree:
        subtree.draw()