In [26]:
%matplotlib inline
import pandas as pd
import re
from bs4 import BeautifulSoup
from nltk.stem.porter import PorterStemmer
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import numpy as np
import scipy as sp
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# English stop-word list from NLTK, used by tokenizer_by_tense and
# tokenizer_stem_nostop to drop common words.
# Uncomment the download line if the 'stopwords' corpus is not installed yet.
#nltk.download('stopwords')
stop = stopwords.words('english')

def preprocessor(text):
    """Strip markup from ``text`` and normalise it to lowercase words plus emoticons.

    Steps, in order:
      1. Parse ``text`` with BeautifulSoup ('html.parser') and keep its text content.
      2. Collect emoticons such as ``:)``, ``:-P`` or ``;D`` and remove them
         from the text. Matching happens before lowercasing, so emoticons
         keep their original case.
      3. Lowercase the remaining text and replace every run of non-word
         characters with a single space.
      4. Append one space followed by the collected emoticons, joined by
         spaces, with the nose character ``-`` removed.

    Parameters
    ----------
    text : str
        Raw HTML (or plain text) to clean.

    Returns
    -------
    str
        The cleaned text. When no emoticon was found the result still ends
        with the single separating space.
    """
    # remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Raw string: eyes (: ; = X), optional nose (-), mouth ( ) ( D P ).
    # A non-raw literal here contains the invalid escapes "\)" and "\(".
    emoticon_pattern = r'(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(emoticon_pattern, text)
    text = re.sub(emoticon_pattern, '', text)

    # .replace('-', '') only touches the joined emoticons, not the words.
    emoticon_suffix = ' '.join(emoticons).replace('-', '')
    return re.sub(r'[\W]+', ' ', text.lower()) + ' ' + emoticon_suffix


def tokenizer_stem(text):
    """Split ``text`` on whitespace and Porter-stem every token.

    Parameters
    ----------
    text : str
        Text to tokenize; leading and trailing whitespace is ignored.

    Returns
    -------
    list
        One stem per whitespace-separated token, in input order.
    """
    porter = PorterStemmer()
    # Raw string avoids the invalid "\s" escape of a plain literal.
    return [porter.stem(word) for word in re.split(r'\s+', text.strip())]

def tokenizer_by_tense(sentence):
    """Return Porter stems of the content words found in ``sentence``.

    The sentence is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``. A token is kept when the first two characters of its
    tag are NN, JJ, VB or RB (the noun, adjective, verb and adverb families
    of the Penn Treebank tagset). Kept tokens are lowercased, then filtered
    to those that are not in the module-level ``stop`` list and that start
    with an ASCII letter, and finally stemmed.

    Parameters
    ----------
    sentence : str
        Text to tokenize.

    Returns
    -------
    list
        Stems of the selected tokens, in input order.
    """
    # One stemmer for the whole call; it does not depend on the token.
    porter = PorterStemmer()
    wanted_tag_prefixes = ('NN', 'JJ', 'VB', 'RB')

    tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    important_words = [
        word.lower()
        for word, tag in tagged
        if tag[0:2] in wanted_tag_prefixes
    ]

    # re.match anchors at the start, so this keeps words beginning with a letter.
    return [porter.stem(w) for w in important_words
            if w not in stop and re.match('[a-zA-Z]+', w)]

def tokenizer_stem_nostop(text):
    """Split ``text`` on whitespace, drop stop words, and Porter-stem the rest.

    A token is kept only when it is not in the module-level ``stop`` list and
    it starts with an ASCII letter (``re.match`` anchors at the start).

    Parameters
    ----------
    text : str
        Text to tokenize; leading and trailing whitespace is ignored.

    Returns
    -------
    list
        Stems of the kept tokens, in input order.
    """
    porter = PorterStemmer()
    # Raw string avoids the invalid "\s" escape of a plain literal.
    return [porter.stem(w) for w in re.split(r'\s+', text.strip())
            if w not in stop and re.match('[a-zA-Z]+', w)]

# Load the training set (path is relative to the notebook's working directory).
df = pd.read_csv('datasets/train.csv')
#print(df.head(5))
# 'Page content' value of the first row; used below as the sample page
# passed to extract_feature.
page0 = df.loc[0,'Page content']
#print(page0)

In [80]:
# Parse datetime from html
from dateutil import parser

# Extract features from raw HTML
# Usage: 
# page0 = df.loc[0,'Page content']
# extract_feature(page0)
def extract_feature(html):
    """Parse one page's raw HTML and print a summary of its features.

    Printed features, in order:
      * number of ``<a>`` tags that carry an ``href``
      * number of ``<img>`` tags
      * number of ``<iframe>`` tags
      * number of ``<blockquote>`` tags
      * tags: the ``.string`` of every ``<a>`` inside a ``<footer>``
      * category: ``data-channel`` of the first ``<article>`` that has one,
        otherwise ``""``
      * author: the ``href`` of the first link inside
        ``<div class="article-info">``, replaced by the text of
        ``<span class="author_name">`` when that span has a string;
        ``""`` when neither is available
      * total word count: number of tokens returned by
        ``tokenizer_by_tense(preprocessor(html))``
      * date: the ``<time>`` tag's text parsed with ``dateutil``; a fixed
        fallback date is used when the tag is missing or unparseable

    Parameters
    ----------
    html : str
        Raw HTML of a single page.

    Returns
    -------
    None
        The features are printed, not returned.
    """
    # Local import: the fallback date below needs `datetime`, which is not
    # imported by the import cells visible in this notebook.
    import datetime

    bs = BeautifulSoup(html, 'html.parser')

    # link
    link_count = len(bs.find_all('a', href=True))

    # image
    img_count = len(bs.find_all('img'))

    # animation, chart or video
    iframe_count = len(bs.find_all('iframe'))

    # quote
    quote_count = len(bs.find_all('blockquote'))

    # tags
    tags = [tag.string for tag in bs.select('footer a')]

    # category: first <article> that declares a data-channel attribute
    category = ""
    for article in bs.find_all('article'):
        if article.has_attr('data-channel'):
            category = article['data-channel']
            break

    # author: default to "" so the print below never hits an unbound name
    author = ""
    author_raw = bs.find("div", {"class": "article-info"})
    if author_raw is not None:
        author_link = author_raw.find('a')
        if author_link is not None and author_link.has_attr('href'):
            author = author_link['href']

        # An explicit author name wins over the publisher link, but only when
        # the span actually yields a string.
        author_span = author_raw.find('span', {'class': 'author_name'})
        if author_span is not None and author_span.string is not None:
            author = author_span.string

    # get word set
    pre_text = preprocessor(html)
    tokens = tokenizer_by_tense(pre_text)
    total_word_count = len(tokens)

    # date
    time_tag = bs.find('time')
    date = time_tag.string if time_tag is not None else None
    try:
        parsed_date = parser.parse(date)
    # a very little fraction of data has broken time tag
    # (TypeError covers a missing tag/string, ValueError an unparseable one)
    except (TypeError, ValueError, OverflowError):
        parsed_date = datetime.datetime.fromordinal(735305)  # arbitrary fallback date
    # Ordinal day number of the publish date; computed but not printed yet.
    publish_date = parsed_date.toordinal()
    # publish_time = parsed_date.hour*24*60 + parsed_date.minute*60 + parsed_date.second

    print("link count: ",link_count)
    print("image count: ",img_count)
    print("iframe count: ",iframe_count)
    print("quote count: ",quote_count)
    print("tags: ",tags)
    print("category: ",category)
    print("author: ",author)
    print("total word count:", total_word_count)
    print("Date: ",parsed_date)

In [81]:
# Run the extractor on the sample page; the features are printed below.
extract_feature(page0)

link count:  22
image count:  1
iframe count:  0
quote count:  0
tags:  ['Asteroid', 'Asteroids', 'challenge', 'Earth', 'Space', 'U.S.', 'World']
category:  world
author:  /publishers/space-com/
total word count: 351
Date:  2013-06-19 15:04:30+00:00
