In [31]:
%matplotlib inline
import pandas as pd
import re
from pandas import DataFrame
from bs4 import BeautifulSoup
from nltk.stem.porter import PorterStemmer
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import numpy as np
import scipy as sp
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#nltk.download('stopwords')
stop = stopwords.words('english')

def preprocessor(text):
    # remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()
    
    # regex for matching emoticons, keep emoticons, ex: :), :-P, :-D
    r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(r, text)
    text = re.sub(r, '', text)
    
    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
    return text


def tokenizer_stem(text):
    porter = PorterStemmer()
    return [porter.stem(word) for word in re.split('\s+', text.strip())]

def tokenizer_by_tense(sentence):
    tokens = nltk.word_tokenize(sentence)
    tagged = nltk.pos_tag(tokens)
    important_words = []
    for t in tagged:
        tag_name = t[1][0:2]
        word = t[0].lower()
        if tag_name=='NN' or tag_name=='JJ' or tag_name=='VB' or tag_name=='RB':
            porter = PorterStemmer()
            important_words.append(word)
        
    return [porter.stem(w) for w in important_words if w not in stop and re.match('[a-zA-Z]+', w)]

def tokenizer_stem_nostop(text):
    porter = PorterStemmer()
    return [porter.stem(w) for w in re.split('\s+', text.strip()) \
            if w not in stop and re.match('[a-zA-Z]+', w)]

df_train = pd.read_csv('datasets/train.csv')
df_test = pd.read_csv('datasets/test.csv')
#print(df_train.head(5))
#print(df_test.head(5))
page0 = df_train.loc[0,'Page content']
#print(page0)
df_small = df_train.sample(n=50,random_state=0)
#print(df_small.iloc[0]['Page content'])

In [38]:
# Parse datetime from html
from dateutil import parser

# Extract features from raw HTML
# Usage: 
# page0 = df.loc[0,'Page content']
# extract_feature(page0)
def extract_feature(html):
    
    bs = BeautifulSoup(html, 'html.parser')
    
    # link
    link_count = len(bs.findAll('a', href=True)) 
    
    # image
    img_count = len(bs.findAll('img'))
    
    # animation, chart or video
    iframe_count = len(bs.findAll('iframe'))
    
    # quote
    quote_count = len(bs.findAll('blockquote'))
    
    # tags
    tags = [] 
    for tag in bs.select('footer a'):
        tags.append(tag.string)
        
    # category
    category = ""
    for cat in bs.findAll('article'):
        try:
            category = cat['data-channel']
            break
        except:
            category = ""
            continue
            
    # author (seem not notable) -> discard(?)
    '''
    author_raw = bs.find("div", { "class" : "article-info" })
    try:
        author = author_raw.find('a')['href']
    except:
        pass
    
    try:
        author = author_raw.find('span',{'class':'author_name'}).string
    except:
        pass
    '''
    
    # title info. (h1) (must be helpful)
    title = bs.find("h1", { "class" : "title" }).string
    title_words_count = len(re.split(r'\s+', title))
    title_digits_count = len([int(s) for s in title.split() if s.isdigit()])
    title_question_mark = ('?' in title) # boolean
    title_exclamation_mark = ('!' in title) # boolean
    
    # sub-title (h2) (must be helpful)
    # count how many sub-title in the article
    try:
        h2 = bs.find('h2')
        sub_title_count = len(bs.find('h2'))
    except:
        sub_title_count = 0
    
    # get word set
    pre_text = preprocessor(html)
    tokens = tokenizer_by_tense(pre_text)
    total_word_count = len(tokens)
    
    # date
    try:
        datetime = bs.time['datetime']
        l = re.split(r'\s+', datetime)
        weekday = re.sub(',','',l[0])
        day = l[1]
        month = l[2]
        year = l[3]
        time = l[4]
        t = int(time.split(':')[0]) # 0~23
        if t in [0,3]: 
            time_interval = 1 # 0~5
        elif t in [4,7]: 
            time_interval = 2 # 6~11
        elif t in [8,11]: 
            time_interval = 3 # 12~17
        elif t in [12,15]:
            time_interval = 4 # 18~23  
        elif t in [16,19]:
            time_interval = 5
        else: 
            time_interval = 6
    except:
        weekday = ''
        day = 0
        month = ''
        year = 0
        time = 0
        time_interval = 0
    
    # return 
    tmp = []
    tmp.append(link_count)
    tmp.append(img_count)
    tmp.append(iframe_count)
    tmp.append(quote_count)
    tmp.append(tags)
    tmp.append(category)
    #tmp.append(author)
    tmp.append(total_word_count)
    
    #tmp.append(parsed_date)
    tmp.append(weekday)
    tmp.append(day)
    tmp.append(month)
    tmp.append(year)
    tmp.append(time)
    tmp.append(time_interval)
    
    tmp.append(title_words_count)
    tmp.append(title_digits_count)
    tmp.append(sub_title_count)
    tmp.append(title_question_mark)
    
    return tmp

In [40]:
%%time # about 25min
dsize = df_train.shape[0] #df_small.shape[0]
link_count=[]
img_count=[]
iframe_count=[]
quote_count=[]
tags=[]
categories=[]
authors=[]
total_word_count=[]
#parsed_date=[]
weekday=[]
day=[]
month=[]
year=[]
time=[]
time_interval=[]
title_words_count=[]
title_digits_count=[]
title_question_mark=[]
sub_title_count=[]

for i in range(dsize):
    features = extract_feature(df_train.iloc[i]['Page content'])
    link_count.append(features[0])
    img_count.append(features[1])
    iframe_count.append(features[2])
    quote_count.append(features[3])
    tags.append(features[4])
    categories.append(features[5])
    #authors.append(features[6])
    total_word_count.append(features[6])
    #parsed_date.append(features[8])
    weekday.append(features[7])
    day.append(features[8])
    month.append(features[9])
    year.append(features[10])
    time.append(features[11])
    time_interval.append(features[12])
    title_words_count.append(features[13])
    title_digits_count.append(features[14])
    sub_title_count.append(features[15])
    title_question_mark.append(features[16])

d = {'#link':link_count,
     '#img':img_count,
     '#iframe':iframe_count,
     '#quote':quote_count,
     'tags':tags,
     'categories':categories,
     #'authors':authors,
     '#total word':total_word_count,
     #'date':parsed_date,
     'weekday':weekday,
     'day':day,
     'month':month,
     'year':year,
     'time':time,
     'time interval(4hr)':time_interval,
     '#title word':title_words_count,
     '#title digits':title_digits_count,
     '#sub-title':sub_title_count,
     'If title contains "?"':title_question_mark,
     'popularity':df_train['Popularity'] #
    }

CPU times: user 25min 10s, sys: 1min 14s, total: 26min 24s
Wall time: 26min 36s


In [46]:
%%time
# import optimized pickle written in C for serializing and 
# de-serializing a Python object
import _pickle as pkl
import sys
sys.setrecursionlimit(10000)

df = DataFrame(data=d)

# dump to disk
pkl.dump(df, open('outputs/df.pkl', 'wb'))

# load from disk
#df = pkl.load(open('outputs/df.pkl', 'rb'))

CPU times: user 39.4 s, sys: 19.5 s, total: 58.9 s
Wall time: 1min 2s


In [47]:
df = pkl.load(open('outputs/df.pkl', 'rb'))

In [49]:
df

Unnamed: 0,#iframe,#img,#link,#quote,#sub-title,#title digits,#title word,#total word,"If title contains ""?""",categories,day,month,popularity,tags,time,time interval(4hr),weekday,year
0,0,1,22,0,0,0,8,351,False,world,19,Jun,-1,"[Asteroid, Asteroids, challenge, Earth, Space,...",15:04:30,4,Wed,2013
1,0,2,18,0,0,0,12,205,False,tech,28,Mar,1,"[Apps and Software, Google, open source, opn p...",17:40:55,3,Thu,2013
2,25,2,11,0,1,1,12,655,False,entertainment,07,May,1,"[Entertainment, NFL, NFL Draft, Sports, Televi...",19:15:20,4,Wed,2014
3,21,1,13,0,1,0,5,179,False,watercooler,11,Oct,-1,"[Sports, Video, Videos, Watercooler]",02:26:50,4,Fri,2013
4,1,52,16,1,1,0,10,994,False,entertainment,17,Apr,-1,"[Entertainment, instagram, instagram video, NF...",03:31:43,4,Thu,2014
5,0,1,30,0,0,0,7,452,False,startups,21,Nov,-1,"[government, internet, internet service provid...",18:00:42,4,Thu,2013
6,3,1,11,0,0,0,11,90,False,music,11,Aug,1,"[Entertainment, funny, Iggy Azalea, Music, par...",05:00:18,1,Mon,2014
7,0,2,13,1,0,0,12,213,False,entertainment,20,Nov,-1,"[bill cosby, Entertainment, Television]",00:30:41,1,Thu,2014
8,0,1,14,0,0,0,9,185,False,tech,30,Sep,1,"[Apps and Software, Gadgets, Mobile, Tech, ven...",04:30:01,4,Mon,2013
9,0,2,13,0,0,0,10,247,False,business,06,Feb,-1,"[Business, Media, The New York Times, paywall]",15:49:35,4,Thu,2014
