In [77]:
%matplotlib inline
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.model_selection import train_test_split
import seaborn as sns
import pandas as pd
import re
from pandas import DataFrame
from bs4 import BeautifulSoup
from nltk.stem.porter import PorterStemmer
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import numpy as np
import scipy as sp
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#nltk.download('stopwords')
stop = stopwords.words('english')

def preprocessor(text):
    # remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()
    
    # regex for matching emoticons, keep emoticons, ex: :), :-P, :-D
    r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(r, text)
    text = re.sub(r, '', text)
    
    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
    return text


def tokenizer_stem(text):
    porter = PorterStemmer()
    return [porter.stem(word) for word in re.split('\s+', text.strip())]

def tokenizer_by_tense(sentence):
    tokens = nltk.word_tokenize(sentence)
    tagged = nltk.pos_tag(tokens)
    important_words = []
    for t in tagged:
        tag_name = t[1][0:2]
        word = t[0].lower()
        if tag_name=='NN' or tag_name=='JJ' or tag_name=='VB' or tag_name=='RB':
            porter = PorterStemmer()
            important_words.append(word)
        
    return [porter.stem(w) for w in important_words if w not in stop and re.match('[a-zA-Z]+', w)]

def tokenizer_stem_nostop(text):
    porter = PorterStemmer()
    return [porter.stem(w) for w in re.split('\s+', text.strip()) \
            if w not in stop and re.match('[a-zA-Z]+', w)]

df_train = pd.read_csv('datasets/train.csv')
df_test = pd.read_csv('datasets/test.csv')
#print(df_train.head(5))
#print(df_test.head(5))
page0 = df_train.loc[0,'Page content']
#print(page0)
df_small = df_train.sample(n=10000,random_state=0)
#print(df_small.iloc[0]['Page content'])

In [82]:
# Parse datetime from html
from dateutil import parser

# Extract features from raw HTML
# Usage: 
# page0 = df.loc[0,'Page content']
# extract_feature(page0)
def extract_feature(html):
    
    bs = BeautifulSoup(html, 'html.parser')
    
    # link
    link_count = len(bs.findAll('a', href=True)) 
    
    # image
    img_count = len(bs.findAll('img'))
    
    # animation, chart or video
    iframe_count = len(bs.findAll('iframe'))
    
    # quote
    quote_count = len(bs.findAll('blockquote'))
    
    # tags
    tags = [] 
    for tag in bs.select('footer a'):
        tags.append(tag.string)
        
    # category
    category = ""
    for cat in bs.findAll('article'):
        try:
            category = cat['data-channel']
            break
        except:
            category = ""
            continue
            
    # author (seem not notable) -> discard(?)
    '''
    author_raw = bs.find("div", { "class" : "article-info" })
    try:
        author = author_raw.find('a')['href']
    except:
        pass
    
    try:
        author = author_raw.find('span',{'class':'author_name'}).string
    except:
        pass
    '''
    
    # title info. (h1) (must be helpful)
    title = bs.find("h1", { "class" : "title" }).string
    title_words_count = len(re.split(r'\s+', title))
    title_digits_count = len([int(s) for s in title.split() if s.isdigit()])
    title_question_mark = ('?' in title) # boolean
    title_exclamation_mark = ('!' in title) # boolean
    
    # sub-title (h2) (must be helpful)
    # count how many sub-title in the article
    try:
        h2 = bs.find('h2')
        sub_title_count = len(bs.find('h2'))
    except:
        sub_title_count = 0
    
    # get word set
    pre_text = preprocessor(html)
    tokens = tokenizer_by_tense(pre_text)
    total_word_count = len(tokens)
    
    # date
    try:
        datetime = bs.time['datetime']
        l = re.split(r'\s+', datetime)
        weekday = re.sub(',','',l[0])
        day = l[1]
        month = l[2]
        year = l[3]
        time = l[4]
        t = int(time.split(':')[0]) # 0~23
        if t in [0,3]: 
            time_interval = 1 # 0~5
        elif t in [4,7]: 
            time_interval = 2 # 6~11
        elif t in [8,11]: 
            time_interval = 3 # 12~17
        elif t in [12,15]:
            time_interval = 4 # 18~23  
        elif t in [16,19]:
            time_interval = 5
        else: 
            time_interval = 6
    except:
        weekday = ''
        day = 0
        month = ''
        year = 0
        time = 0
        time_interval = 0
    
    # return 
    tmp = []
    tmp.append(link_count)
    tmp.append(img_count)
    tmp.append(iframe_count)
    tmp.append(quote_count)
    tmp.append(tags)
    tmp.append(category)
    #tmp.append(author)
    tmp.append(total_word_count)
    
    #tmp.append(parsed_date)
    tmp.append(weekday)
    tmp.append(day)
    tmp.append(month)
    tmp.append(year)
    tmp.append(time)
    tmp.append(time_interval)
    
    tmp.append(title_words_count)
    tmp.append(title_digits_count)
    tmp.append(sub_title_count)
    tmp.append(title_question_mark)
    
    return tmp

In [83]:
%%time 
# about 25min
dsize = df_train.shape[0] #df_small.shape[0] 
dd = df_train # df_small
link_count=[]
img_count=[]
iframe_count=[]
quote_count=[]
tags=[]
categories=[]
authors=[]
total_word_count=[]
#parsed_date=[]
weekday=[]
day=[]
month=[]
year=[]
time=[]
time_interval=[]
title_words_count=[]
title_digits_count=[]
title_question_mark=[]
sub_title_count=[]

for i in range(dsize):
    features = extract_feature(df_train.iloc[i]['Page content'])
    link_count.append(features[0])
    img_count.append(features[1])
    iframe_count.append(features[2])
    quote_count.append(features[3])
    tags.append(features[4])
    categories.append(features[5])
    #authors.append(features[6])
    total_word_count.append(features[6])
    #parsed_date.append(features[8])
    weekday.append(features[7])
    day.append(features[8])
    month.append(features[9])
    year.append(features[10])
    time.append(features[11])
    time_interval.append(features[12])
    title_words_count.append(features[13])
    title_digits_count.append(features[14])
    sub_title_count.append(features[15])
    title_question_mark.append(features[16])

d = {
     '#link':link_count,
     '#img':img_count,
     '#iframe':iframe_count,
     '#quote':quote_count,
     #'tags':tags,
     'categories':categories,
     #'authors':authors,
     '#total word':total_word_count,
     #'date':parsed_date,
     'weekday':weekday,
     #'day':day,
     #'month':month,
     'year':year,
     #'time':time,
     'time interval(4hr)':time_interval,
     '#title word':title_words_count,
     '#title digits':title_digits_count,
     '#sub-title':sub_title_count,
     'If title contains "?"':title_question_mark,
     'popularity':dd['Popularity'] #
    }

CPU times: user 27min 29s, sys: 23.1 s, total: 27min 52s
Wall time: 28min 22s


In [84]:
df = DataFrame(data=d)
'''
X = df.drop('popularity', 1)
y = df['popularity']

# split X into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

print('#Training data points: %d' % X_train.shape[0])
print('#Testing data points: %d' % X_test.shape[0])
print('Class labels:', np.unique(y))
'''
display(df.head(5))
print(df.shape)

Unnamed: 0,#iframe,#img,#link,#quote,#sub-title,#title digits,#title word,#total word,"If title contains ""?""",categories,popularity,time interval(4hr),weekday,year
0,0,1,22,0,0,0,8,351,False,world,-1,4,Wed,2013
1,0,2,18,0,0,0,12,205,False,tech,1,6,Thu,2013
2,25,2,11,0,1,1,12,655,False,entertainment,1,5,Wed,2014
3,21,1,13,0,1,0,5,179,False,watercooler,-1,6,Fri,2013
4,1,52,16,1,1,0,10,994,False,entertainment,-1,1,Thu,2014


(27643, 14)


In [85]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

# encode categorical features
catego_features = ['If title contains "?"','categories','weekday','year']

catego_le = LabelEncoder()

# transform categorical values into numerical values
# be careful that '?' will also be encoded
# we have to replace it to NaN in numerical
num_values = []
for i in catego_features:
    df[i] = catego_le.fit_transform(df[i].values)
    classes_list = catego_le.classes_.tolist()
    
    # store the total number of values
    num_values.append(len(classes_list))
    
    # replace '?' with 'NaN'
    if '?' in classes_list:
        idx = classes_list.index('?')
        df[i] = df[i].replace(idx, np.nan)

display(df.head(5))
print(df.shape)

Unnamed: 0,#iframe,#img,#link,#quote,#sub-title,#title digits,#title word,#total word,"If title contains ""?""",categories,popularity,time interval(4hr),weekday,year
0,0,1,22,0,0,0,8,351,0,32,-1,4,6,0
1,0,2,18,0,0,0,12,205,0,28,1,6,4,0
2,25,2,11,0,1,1,12,655,0,7,1,5,6,1
3,21,1,13,0,1,0,5,179,0,31,-1,6,0,0
4,1,52,16,1,1,0,10,994,0,7,-1,1,4,1


(27643, 14)


In [95]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# randomly sample 1000 examples
#df_small = df_train.sample(n=1000, random_state=0)

names = ['LogisticRegression', 
         'DecisionTreeClassifier',
         'RandomForestClassifier',
         'Perception',
         'SGDClassifier',
         'KNN',
         'SVC'
        ]

# LogisticRegression
pipe1 = Pipeline([('clf', LogisticRegression())])
# DecisionTreeClassifier
pipe2 = Pipeline([('clf', DecisionTreeClassifier())])
# RandomForest
pipe3 = Pipeline([('clf', RandomForestClassifier())])
# Perceptron
pipe4 = Pipeline([('clf', Perceptron())])
# SGDClassifier
pipe5 = Pipeline([('clf', SGDClassifier())])
# KNN
pipe6 = Pipeline([('clf', KNeighborsClassifier())])
# SVC
pipe7 = Pipeline([('clf', SVC())])

# CV
print('[auc (10-fold cv)]')
for name, clf in zip(names, [pipe1, pipe2, pipe3, pipe4, pipe5, pipe6, pipe7]):
    scores = cross_val_score(estimator=clf, X=df.drop('popularity', 1), y=df['popularity'], \
                         cv=10, scoring='roc_auc')
    print('%s: %.3f (+/-%.3f)' % (name, scores.mean(), scores.std()))

[auc (10-fold cv)]
LogisticRegression: 0.541 (+/-0.011)
DecisionTreeClassifier: 0.510 (+/-0.009)
RandomForestClassifier: 0.534 (+/-0.009)
Perception: 0.504 (+/-0.020)
SGDClassifier: 0.506 (+/-0.016)
KNN: 0.509 (+/-0.009)
SVC: 0.513 (+/-0.009)


In [16]:
%%time
def tfidf_generator(df):
    tfidf = TfidfVectorizer(ngram_range=(1,1),
                        preprocessor=preprocessor,
                        tokenizer=tokenizer_stem_nostop)
    doc = []
    for i in range(df.shape[0]):
        try:
            doc.append(df_train[df_train.Id == i].loc[i,'Page content'])
        except:
            pass
    
    tfidf.fit(doc)
    doc_tfidf = tfidf.transform(doc).toarray()
    return pd.DataFrame(doc_tfidf)

#df_small_tfidf = tfidf_generator(df_small)

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 7.87 µs


In [46]:
%%time
# import optimized pickle written in C for serializing and 
# de-serializing a Python object
import _pickle as pkl
import sys
sys.setrecursionlimit(10000)

df = DataFrame(data=d)

# dump to disk
pkl.dump(df, open('outputs/df.pkl', 'wb'))

# load from disk
df = pkl.load(open('outputs/df.pkl', 'rb'))

CPU times: user 39.4 s, sys: 19.5 s, total: 58.9 s
Wall time: 1min 2s
