In [11]:
from __future__ import unicode_literals
import pandas as pd
import regex as re
import jieba
import logging
import json

In [20]:
from stanfordcorenlp import StanfordCoreNLP

In [12]:
# Read only the `words` column of test.csv, decoding the file as UTF-8.
df = pd.read_csv(
    'test.csv',
    usecols=['words'],
    encoding='utf-8',
)

In [13]:
# Preview the first ten rows of the loaded frame.
df.head(n=10)

Unnamed: 0,words
0,中国移动
1,台湾
2,中国
3,你好新加坡几多
4,apa digital town
5,to digi square
6,Singtel welcomes you to Singapore
7,Enjoy free data roaming with XL when you buy


In [21]:
# Testing out plug-and-play code
'''
A sample code usage of the python package stanfordcorenlp to access a Stanford CoreNLP server.
Written as part of the blog post: https://www.khalidalnajjar.com/how-to-setup-and-use-stanford-corenlp-server-with-python/ 
'''

class StanfordNLP:
    """Thin convenience wrapper around a ``StanfordCoreNLP`` client.

    Most methods forward the sentence unchanged to the client method with the
    matching purpose. ``annotate`` additionally sends ``self.props`` and
    decodes the JSON text that comes back.
    """

    def __init__(self, host='http://localhost', port=9000):
        """Create the client and the default annotation properties.

        Args:
            host: Passed to ``StanfordCoreNLP`` as its first argument.
            port: Passed to ``StanfordCoreNLP`` as ``port``.
        """
        self.nlp = StanfordCoreNLP(host, port=port,
                                   timeout=30000)  # , quiet=False, logging_level=logging.DEBUG)
        # Properties sent along with every ``annotate`` request.
        self.props = {
            'annotators': 'tokenize,ssplit,pos,lemma,ner,parse,depparse,dcoref,relation',
            'pipelineLanguage': 'en',
            'outputFormat': 'json'
        }

    def word_tokenize(self, sentence):
        """Return the client's ``word_tokenize`` result for ``sentence``."""
        return self.nlp.word_tokenize(sentence)

    def pos(self, sentence):
        """Return the client's ``pos_tag`` result for ``sentence``."""
        return self.nlp.pos_tag(sentence)

    def ner(self, sentence):
        """Return the client's ``ner`` result for ``sentence``."""
        return self.nlp.ner(sentence)

    def parse(self, sentence):
        """Return the client's ``parse`` result for ``sentence``."""
        return self.nlp.parse(sentence)

    def dependency_parse(self, sentence):
        """Return the client's ``dependency_parse`` result for ``sentence``."""
        return self.nlp.dependency_parse(sentence)

    def annotate(self, sentence):
        """Annotate ``sentence`` using ``self.props`` and decode the JSON reply.

        Returns:
            The object produced by ``json.loads`` on the client's response.

        Raises:
            ValueError: Propagated from ``json.loads`` when the response text
                is not valid JSON.
        """
        return json.loads(self.nlp.annotate(sentence, properties=self.props))

    @staticmethod
    def tokens_to_dict(_tokens):
        """Index token records by their integer ``index`` field.

        Args:
            _tokens: Iterable of mappings, each providing the keys ``index``,
                ``word``, ``lemma``, ``pos`` and ``ner``. Any other keys are
                ignored.

        Returns:
            A ``defaultdict(dict)`` mapping ``int(token['index'])`` to a dict
            with that token's ``word``, ``lemma``, ``pos`` and ``ner``.
            Looking up an index that was never seen yields an empty dict.
        """
        # Function-scope import: ``defaultdict`` is not imported in the
        # visible import cells, so the name was previously undefined here and
        # every call raised NameError.
        from collections import defaultdict

        tokens = defaultdict(dict)
        for token in _tokens:
            tokens[int(token['index'])] = {
                'word': token['word'],
                'lemma': token['lemma'],
                'pos': token['pos'],
                'ner': token['ner']
            }
        return tokens

In [None]:
# Demo: run every wrapper method against one sample text and print each result.
# Requires the CoreNLP server that ``StanfordNLP()`` connects to by default.
if __name__ == '__main__':
    sNLP = StanfordNLP()
    text = 'A blog post using Stanford CoreNLP Server. Visit www.khalidalnajjar.com for more details.'

    # (label, bound method) pairs, in the order the results are printed.
    demos = [
        ('Annotate', sNLP.annotate),
        ('POS', sNLP.pos),
        ('Tokens', sNLP.word_tokenize),
        ('NER', sNLP.ner),
        ('Parse', sNLP.parse),
        ('Dep Parse', sNLP.dependency_parse),
    ]
    for label, method in demos:
        # A single pre-formatted argument keeps the output identical under
        # both the Python 2 print statement and the Python 3 print function;
        # the original ``print "x:", y`` form is a SyntaxError on Python 3.
        print('{}: {}'.format(label, method(text)))