In [3]:
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

In [4]:
# Filters punctuation out of a list of plain-string tokens (spaCy's Token.is_punct is only available on Token objects, not on lemma strings).
def remove_punctuations(normalized_tokens):
    """Strip punctuation tokens from ``normalized_tokens`` in place.

    Only tokens exactly equal to one of the listed punctuation strings are
    removed; anything else (for example a single hyphen or a double-newline
    token) is kept.

    Args:
        normalized_tokens: Mutable list of string tokens. It is modified
            in place.

    Returns:
        The same list object that was passed in, with every listed
        punctuation token removed.
    """
    punctuations = ('?', ':', '!', ',', '.', ';', '|', '(', ')', '--', '\n')
    # Rebuild the contents through a slice assignment instead of calling
    # list.remove() inside a loop over the same list: removing while iterating
    # shifts the remaining items left, so the token right after each match was
    # skipped and consecutive punctuation tokens survived.
    normalized_tokens[:] = [
        word for word in normalized_tokens if word not in punctuations
    ]
    return normalized_tokens

In [5]:
_NORMALIZER_NLP = None  # lazily loaded spaCy pipeline shared by normalized_text calls


def _get_normalizer_nlp():
    """Return the spaCy pipeline used for lemmatization, loading it on first use."""
    global _NORMALIZER_NLP
    if _NORMALIZER_NLP is None:
        # Only the parser and NER are switched off. The tagger reads the shared
        # tok2vec layer, and the rule-based lemmatizer reads the POS values set
        # by tagger + attribute_ruler, so those components must stay enabled
        # for the POS tagger and lemmatizer to produce meaningful output.
        _NORMALIZER_NLP = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    return _NORMALIZER_NLP


def normalized_text(doc: str):
    """Lemmatize ``doc`` and drop stop words and punctuation tokens.

    Args:
        doc: Raw text to normalize.

    Returns:
        List of lemma strings, in document order, excluding lemmas that the
        pipeline vocabulary flags as stop words and excluding whatever
        ``remove_punctuations`` filters out.
    """
    nlp = _get_normalizer_nlp()
    parsed = nlp(doc)
    # The stop-word flag is looked up on the lemma (not the surface form), so
    # e.g. an inflected form whose lemma is a stop word is also dropped.
    normalized_tokens = [
        token.lemma_
        for token in parsed
        if not nlp.vocab[token.lemma_].is_stop
    ]
    return remove_punctuations(normalized_tokens)

In [6]:
_POS_TAGGER_NLP = None  # lazily loaded spaCy pipeline shared by find_nouns calls


def _get_pos_tagger_nlp():
    """Return the English spaCy pipeline, loading it only on the first call."""
    global _POS_TAGGER_NLP
    if _POS_TAGGER_NLP is None:
        _POS_TAGGER_NLP = spacy.load("en_core_web_sm")
    return _POS_TAGGER_NLP


def find_nouns(text):
    """Collect the tokens tagged as NOUN or VERB from a collection of strings.

    Despite the function name, verbs are returned as well as nouns.

    Args:
        text: Iterable of strings; each entry is run through the pipeline as
            its own document. A single ``str`` is treated as one document
            instead of being iterated character by character.

    Returns:
        List of token texts whose ``pos_`` is ``"NOUN"`` or ``"VERB"``, in
        input order.
    """
    if isinstance(text, str):
        text = [text]
    nlp2 = _get_pos_tagger_nlp()
    # nlp.pipe processes the entries as a stream instead of one call per word.
    return [
        token.text
        for doc_nouns in nlp2.pipe(text)
        for token in doc_nouns
        if token.pos_ in ("NOUN", "VERB")
    ]


In [18]:


# Sample news article used as demo input.
# NOTE: despite its name, word_list is a single multi-line string, not a list of words.
word_list = '''Xi Jinping is set to deepen his control of China's government and economy, as lawmakers meet in Beijing to pass far-reaching reforms.

The National People's Congress (NPC), a rubber-stamp parliament, will confirm Mr Xi's third term as president, and the appointments of his top team.

They will also name a new premier, the second-in-command after Mr Xi, as the incumbent Li Keqiang departs.

The Two Sessions, as the meetings are known, are an annual affair.

But this year's sessions are particularly significant as delegates are expected to reshape several key Communist Party and state institutions.

They will also tighten control over bodies overseeing the finance sector and scientific and technology work, while "strengthening party-building work" in private businesses, according to state media.

The moves will likely further blur the lines between the Chinese Communist Party and the government, and consolidate the party's control of the private sector.

This comes amid an ongoing corruption crackdown which has seen a string of high-profile businessmen disappear in recent years. The latest person to go missing was one of China's top dealmakers in the tech sector.'''



Tokenize+Lemmatize:
['Xi', 'Jinping', 'be', 'set', 'to', 'deepen', 'his', 'control', 'of', 'China', "'s", 'government', 'and', 'economy', ',', 'as', 'lawmaker', 'meet', 'in', 'Beijing', 'to', 'pass', 'far', '-', 'reach', 'reform', '.', '\n\n', 'the', 'National', 'People', "'s", 'Congress', '(', 'NPC', ')', ',', 'a', 'rubber', '-', 'stamp', 'parliament', ',', 'will', 'confirm', 'Mr', 'Xi', "'s", 'third', 'term', 'as', 'president', ',', 'and', 'the', 'appointment', 'of', 'his', 'top', 'team', '.', '\n\n', 'they', 'will', 'also', 'name', 'a', 'new', 'premier', ',', 'the', 'second', '-', 'in', '-', 'command', 'after', 'Mr', 'Xi', ',', 'as', 'the', 'incumbent', 'Li', 'Keqiang', 'depart', '.', '\n\n', 'the', 'two', 'Sessions', ',', 'as', 'the', 'meeting', 'be', 'know', ',', 'be', 'an', 'annual', 'affair', '.', '\n\n', 'but', 'this', 'year', "'s", 'session', 'be', 'particularly', 'significant', 'as', 'delegate', 'be', 'expect', 'to', 'reshape', 'several', 'key', 'Communist', 'Party', 'and', '

In [23]:
import spacy
# Load the small English pipeline once at module level; spacy_process reads this global.
nlp = spacy.load('en_core_web_sm')
def spacy_process(text):
    """Lemmatize ``text`` and drop stop words and basic punctuation.

    Relies on the module-level ``nlp`` pipeline.

    Args:
        text: Raw text to process.

    Returns:
        List of lemma strings in document order, without lemmas flagged as
        stop words by ``nlp.vocab`` and without the punctuation marks
        ``? : ! . , ;``.
    """
    doc = nlp(text)

    # Tokenization and lemmatization
    lemma_list = [token.lemma_ for token in doc]

    # Filter the stop words (the flag is looked up on the lemma string)
    filtered_sentence = [
        word for word in lemma_list if not nlp.vocab[word].is_stop
    ]

    # Remove punctuation by building a new list. Calling list.remove() while
    # looping over the same list skipped the element after every removal, so
    # back-to-back punctuation tokens were left in the result.
    # `in` on a str is a substring test, kept as-is from the original check.
    punctuations = "?:!.,;"
    return [word for word in filtered_sentence if word not in punctuations]


In [24]:
# Demo input: the stop-word lemmas ('be', 'and', 'have') should be missing from the result.
text = "Jeng is eating friedrice and Ping is eating noodle and Moss has ate apple"
spacy_process(text)

Tokenize+Lemmatize:
['Jeng', 'be', 'eat', 'friedrice', 'and', 'Ping', 'be', 'eat', 'noodle', 'and', 'Moss', 'have', 'eat', 'apple']
 
Remove stopword & punctuation: 
['Jeng', 'eat', 'friedrice', 'Ping', 'eat', 'noodle', 'Moss', 'eat', 'apple']


In [30]:
# Demo input ending in a period, to show that the trailing punctuation token is stripped.
text = "Narin Sirinapuk is studying Software Development."
spacy_process(text)

Tokenize+Lemmatize:
['Narin', 'Sirinapuk', 'be', 'study', 'Software', 'Development', '.']
 
Remove stopword & punctuation: 
['Narin', 'Sirinapuk', 'study', 'Software', 'Development']


In [31]:
class SpacyProcessor:
    """Holds a spaCy English pipeline and uses it to lemmatize and filter text."""

    def __init__(self):
        """Load the ``en_core_web_sm`` pipeline and keep it on the instance."""
        self.nlp = spacy.load('en_core_web_sm')

    def process(self, text):
        """Return the lemmas of ``text`` minus stop words and basic punctuation.

        Args:
            text: Raw text to process.

        Returns:
            List of lemma strings in document order. Lemmas flagged as stop
            words by the pipeline vocabulary are dropped, as are lemmas that
            occur as a substring of ``"?:!.,;"``.
        """
        punctuation_chars = "?:!.,;"

        # Stage 1: tokenize and lemmatize the whole text up front.
        lemmas = [tok.lemma_ for tok in self.nlp(text)]

        # Stage 2: one pass that applies the stop-word check first and the
        # punctuation check second.
        kept = []
        for lemma in lemmas:
            if self.nlp.vocab[lemma].is_stop:
                continue
            if lemma in punctuation_chars:
                continue
            kept.append(lemma)
        return kept