In [1]:
%matplotlib inline
import pandas as pd

# Read the training set and preview its first rows (head() defaults to 5).
df = pd.read_csv('./train.csv')
print(df.head())

   Id  Popularity                                       Page content
0   0          -1  <html><head><div class="article-info"> <span c...
1   1           1  <html><head><div class="article-info"><span cl...
2   2           1  <html><head><div class="article-info"><span cl...
3   3          -1  <html><head><div class="article-info"><span cl...
4   4          -1  <html><head><div class="article-info"><span cl...


In [2]:
# Raw page HTML for every row, taken from the 'Page content' column via `.values`.
orig_page_data = df['Page content'].values

## Preprocessing: Data Cleaning

In [3]:
import re
from bs4 import BeautifulSoup

# Emoticons such as :), :-P, ;D, =( or XD: eyes, an optional "-" nose, a mouth.
# Raw strings keep the regex backslashes from being read as (invalid) string
# escape sequences.
_EMOTICON_RE = re.compile(r'(?::|;|=|X)(?:-)?(?:\)|\(|D|P)')
# One or more consecutive characters that are not letters, digits or "_".
_NON_WORD_RE = re.compile(r'\W+')


def preprocessor(text):
    """Turn one raw HTML page into a lowercase, punctuation-free string.

    Steps:
      1. Drop the HTML markup and keep only the text content.
      2. Collect every emoticon, then remove the emoticons from the text so
         the punctuation clean-up does not destroy them.
      3. Lowercase the text and collapse each run of non-word characters
         into a single space.
      4. Append the collected emoticons (without the "-" nose), separated by
         spaces.

    Args:
        text: HTML markup of a page, as a string.

    Returns:
        The cleaned text, one space, then the emoticons joined by spaces.
        The separating space is always present, so the result ends with a
        space when the page contains no emoticon.
    """
    # remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Emoticons are matched before lowercasing, so "XD" / ":P" are recognised
    # and are appended in their original case.
    emoticons = _EMOTICON_RE.findall(text)
    text = _EMOTICON_RE.sub('', text)

    # replace('-', '') removes the nose of the emoticons, e.g. ":-)" -> ":)"
    cleaned = _NON_WORD_RE.sub(' ', text.lower())
    return cleaned + ' ' + ' '.join(emoticons).replace('-', '')

In [4]:
# Clean every raw page; the result keeps the order of orig_page_data.
cleaned_page_data = [preprocessor(page) for page in orig_page_data]

`cleaned_page_data` holds the original page text with the HTML tags removed; it has not been tokenized yet.

In [5]:
def tokenizer(text):
    """Split text into whitespace-delimited tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the tokens in order. Leading and trailing whitespace is
        ignored, and an empty or whitespace-only string yields an empty
        list (no '' token).
    """
    # str.split() without a separator splits on runs of whitespace and drops
    # leading/trailing whitespace, so blank input produces no tokens at all.
    return text.split()

# Quick check of tokenizer() on a sample sentence.
print(tokenizer('runners like running and thus they run'))

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']


## Preprocessing: Word Stemming

In [6]:
from nltk.stem.porter import PorterStemmer

def tokenizer_stem(text):
    """Split text on whitespace and stem every token with the Porter stemmer.

    Args:
        text: The string to tokenize.

    Returns:
        A list with the stemmed form of each whitespace-delimited token, in
        order (e.g. 'running' -> 'run', 'thus' -> 'thu'). An empty or
        whitespace-only string yields an empty list.
    """
    porter = PorterStemmer()
    # str.split() without a separator splits on runs of whitespace and never
    # produces an empty token, so blank input is not stemmed into [''].
    return [porter.stem(word) for word in text.split()]

# Quick check of tokenizer_stem() on a sample sentence.
print(tokenizer_stem('runners like running and thus they run'))

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']


In [7]:
# Tokenize and stem every cleaned page; one token list per page.
stemmed_page_data = [tokenizer_stem(page) for page in cleaned_page_data]

`stemmed_page_data`: each page tokenized and stemmed.

## Preprocessing: Stop-Word Removal

In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Fetch the NLTK stop-word corpus first (the log below shows it is skipped
# when the package is already up to date), then load the English stop words.
nltk.download('stopwords')
stop = stopwords.words('english')

def tokenizer_stem_nostop(text):
    """Tokenize text, drop stop words and non-alphabetic tokens, then stem.

    Args:
        text: The string to tokenize.

    Returns:
        A list with the Porter-stemmed form of every whitespace-delimited
        token that is not in the module-level `stop` collection and that
        starts with an ASCII letter. Blank input yields an empty list.
    """
    porter = PorterStemmer()
    # A set makes each membership test O(1) instead of scanning `stop`.
    stop_words = set(stop)
    # re.match anchors at the start of the token only, so this keeps tokens
    # that *begin* with an ASCII letter (e.g. 'a1') and drops the others
    # (e.g. '3d', '2014').
    starts_with_letter = re.compile(r'[a-zA-Z]+').match
    # The stop-word test runs on the token as given, before stemming.
    return [porter.stem(w) for w in text.split()
            if w not in stop_words and starts_with_letter(w)]

# Quick check of tokenizer_stem_nostop() on a sample sentence.
print(tokenizer_stem_nostop('runners like running and thus they run'))

['runner', 'like', 'run', 'thu', 'run']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ericy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Tokenize, remove stop words and stem every cleaned page; one token list per page.
stemmed_nostop_page_data = [tokenizer_stem_nostop(page) for page in cleaned_page_data]

`stemmed_nostop_page_data`: each page tokenized and stemmed, with stop words removed.

In [13]:
import numpy as np
# Persist the intermediate results as .npy files in the working directory.
# np.save('stemmed_nostop_page_data', stemmed_nostop_page_data)

# The per-page token lists have different lengths, so the array has to be
# built explicitly with dtype=object: recent NumPy versions refuse to create
# an array from ragged nested sequences implicitly. The file is stored as
# pickled objects; reload it with np.load(..., allow_pickle=True).
np.save('stemmed_page_data', np.array(stemmed_page_data, dtype=object))
np.save('cleaned_page_data', cleaned_page_data)