### 1. Tokenization & Remove stopword without Stemming/Lemmatization.

In [26]:
import nltk
import re

def load_data(filepath):
    with open( filepath, 'r', encoding='UTF-8') as f:
        ff = f.read().lower()
    return ff

In [27]:
#read text and convert to lower character and then tokenize
data = load_data("carroll-alice.txt")
nltk.download('punkt')
# remove punctuation before tokenization
data = re.sub(r'[^\w\s]','',data)
# Tokenization
alice_tokens = nltk.word_tokenize(data)
print(alice_tokens)



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [28]:
# remove stop words
stopword = nltk.download('stopwords')
from nltk.corpus import stopwords
nltk_stop = stopwords.words('english')
#print("nltk:",nltk_stop)
user_stop = ["cant", "shall", "dont", "im", "thats"]
stopwords = nltk_stop + user_stop
#by using .isalnum() to remove some special charactor term such as "?",".""
alice_no_stopword = [term for term in alice_tokens if term not in stopwords and term.isalnum()]
print("---------------------------------------")
print(alice_no_stopword)

nltk: ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'th

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
def topword(data, n_top):
    ntop = nltk.FreqDist(data)
    return ntop.most_common(n_top)
    
top10word = topword(alice_no_stopword, 10)
print(top10word)

[('said', 462), ('alice', 385), ('little', 128), ('one', 101), ('know', 86), ('like', 85), ('would', 83), ('went', 83), ('could', 77), ('thought', 74)]


### 2. Tokenization & Remove stopwords with Stemming/Lemmatization.

###    (1) Porter stemming

In [30]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

alice_porter = []
for words in alice_no_stopword:
    alice_porter.append(ps.stem(words))

In [31]:
top10word = topword(alice_porter, 10)
print(top10word)

[('said', 462), ('alic', 397), ('littl', 128), ('look', 103), ('one', 103), ('like', 97), ('know', 90), ('would', 83), ('went', 83), ('thought', 80)]


###    (2) Lancaster stemming

In [32]:
from nltk.stem import LancasterStemmer

lan = LancasterStemmer()
alice_lancas = []
for words in alice_no_stopword:
    alice_lancas.append(lan.stem(words))

In [33]:
top10word = topword(alice_lancas, 10)
print(top10word)

[('said', 462), ('al', 400), ('littl', 128), ('on', 104), ('look', 103), ('lik', 97), ('know', 90), ('would', 83), ('went', 83), ('thought', 80)]


### (3)   Lemmatization function --- from nltk

##### NLTK 里这个词形还原工具的一个问题是需要手动指定词性，比如上面例子中的 "working" 这个词，如果不加后面那个 pos 参数，输出的结果将会是 "working" 本身。

In [34]:
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download("punkt")
nltk.download("maxent_treebank_pos_tagger")
nltk.download('averaged_perceptron_tagger')
 

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
    
def lemmatize_sentence(words):
    res = []
    lemmatizer = WordNetLemmatizer()
    for word, pos in pos_tag(words):
        wordnet_pos = get_wordnet_pos(pos) or wordnet.NOUN
        res.append(lemmatizer.lemmatize(word, pos=wordnet_pos))

    return res

lem = WordNetLemmatizer()
lem_and_porter = lemmatize_sentence(alice_porter)
lem_and_lancaster = lemmatize_sentence(alice_lancas)

print("Stemmed by Porter and Lemmatized top 10:\n\n",topword(lem_and_porter, 10),"\n\n")    
print("Stemmed by Lancaster and Lemmatized top 10:\n\n",topword(lem_and_lancaster, 10))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_treebank_pos_tagger to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_treebank_pos_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Stemmed by Porter and Lemmatized top 10:

 [('say', 531), ('alic', 397), ('go', 173), ('littl', 128), ('think', 121), ('get', 112), ('look', 103), ('one', 103), ('know', 101), ('like', 97)] 


Stemmed by Lancaster and Lemmatized top 10:

 [('say', 531), ('al', 400), ('go', 160), ('think', 132), ('littl', 128), ('get', 112), ('on', 104), ('look', 103), ('know', 98), ('lik', 97)]


###  (4) Lemmatization function --- from spacy

In [35]:
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(data)
tokens = [token.text for token in doc]
tokens_nostop = [term for term in tokens if term not in stopwords and term.isalnum()]
#print(tokens_nostop)
porter_spacy = []
lancas_spacy = []
for words in tokens_nostop:
    porter_spacy.append(ps.stem(words))

for words in tokens_nostop:
    lancas_spacy.append(lan.stem(words))
    
print("Stemmed by Porter and Lemmatized by spacy top 10:\n\n",topword(porter_spacy, 10),"\n\n")    
print("Stemmed by Lancaster and Lemmatized by spacy 10:\n\n",topword(lancas_spacy, 10))

Stemmed by Porter and Lemmatized by spacy top 10:

 [('said', 462), ('alic', 397), ('nt', 216), ('littl', 128), ('look', 103), ('one', 103), ('like', 97), ('would', 96), ('know', 90), ('could', 86)] 


Stemmed by Lancaster and Lemmatized by spacy 10:

 [('said', 462), ('al', 400), ('nt', 216), ('littl', 128), ('on', 104), ('look', 103), ('lik', 97), ('would', 96), ('know', 90), ('could', 86)]


### (5)   Lemmatization function --- from pattern

#### If you encounter error : RuntimeError: generator raised StopIteration, please just rerun the current cell.
#### I think it may be the python version problem.

In [37]:
from pattern.en import lemma

porter_and_pattern = []
lancas_and_pattern = []
for words in alice_porter:
    porter_and_pattern.append(lemma(words))
    
for words in alice_lancas:
    lancas_and_pattern.append(lemma(words))

print("Stemmed by Porter and Lemmatized by \"pattern\" top 10:\n\n",topword(porter_and_pattern, 10),"\n\n")    
print("Stemmed by Lancaster and Lemmatized by \"pattern\" top 10:\n\n",topword(lancas_and_pattern, 10))
    

Stemmed by Porter and Lemmatized by "pattern" top 10:

 [('say', 531), ('alic', 397), ('go', 173), ('think', 144), ('littl', 128), ('get', 113), ('know', 105), ('look', 103), ('one', 103), ('like', 97)] 


Stemmed by Lancaster and Lemmatized by "pattern" top 10:

 [('say', 531), ('al', 400), ('go', 160), ('think', 144), ('littl', 128), ('get', 113), ('know', 105), ('on', 104), ('look', 103), ('lik', 97)]


### (6)   Lemmatization function --- from TextBlob

In [38]:
from textblob import TextBlob

porter_blob = []
lancas_blob = []

blob = TextBlob(data)
for word in blob.words:
    porter_blob.append(word.lemmatize())
    lancas_blob.append(word.lemmatize())

porter_blob = [term for term in porter_blob if term not in stopwords and term.isalnum()]
lancas_blob = [term for term in lancas_blob if term not in stopwords and term.isalnum()]

ps = PorterStemmer()
lan = LancasterStemmer()
porter_blob_f = []
lancas_blob_f = []

for words in porter_blob:
    porter_blob_f.append(ps.stem(words))
    
for words in lancas_blob:
    lancas_blob_f.append(lan.stem(words))

print("Stemmed by Porter and Lemmatized by \"TextBlob\" top 10:\n\n",topword(porter_blob_f, 10))
print("Stemmed by Lancaster and Lemmatized by \"TextBlob\" top 10:\n\n",topword(lancas_blob_f, 10))

Stemmed by Porter and Lemmatized by "TextBlob" top 10:

 [('said', 462), ('alic', 397), ('wa', 357), ('littl', 128), ('look', 103), ('one', 103), ('like', 97), ('know', 90), ('go', 84), ('would', 83)]
Stemmed by Lancaster and Lemmatized by "TextBlob" top 10:

 [('said', 462), ('al', 404), ('wa', 357), ('littl', 128), ('on', 104), ('look', 103), ('lik', 97), ('know', 90), ('would', 83), ('went', 83)]


### 3. Compare the result of question 1 (without steaming) and question 2 (with stemming and with Lemmatization).

#### After 