In [1]:
# Hyperparameters
debug = False # when True, only the first 100 rows of the training CSV are kept
MIN = 1 # lower bound of the n-gram range given to TfidfVectorizer
MAX = 1 # upper bound of the n-gram range given to TfidfVectorizer
MAX_DF = 0.6 # max_df given to TfidfVectorizer
HASH_POWER = 10 # hash to 2**HASH_POWER features

In [2]:
%matplotlib inline

import numpy as np
import scipy as sp
import pandas as pd
import re
import nltk

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split

## 一. 資料前處理
首先先引入dataset

In [3]:
# Load the raw training set (columns shown in the output: Id, Popularity, Page content)
df = pd.read_csv('./input/train.csv')
if debug:
    df = df.iloc[:100] # debug: keep only the first 100 rows for a quick run
print(df.head(5))

   Id  Popularity                                       Page content
0   0          -1  <html><head><div class="article-info"> <span c...
1   1           1  <html><head><div class="article-info"><span cl...
2   2           1  <html><head><div class="article-info"><span cl...
3   3          -1  <html><head><div class="article-info"><span cl...
4   4          -1  <html><head><div class="article-info"><span cl...


### 1.1 清掉所有的html tag

In [4]:
def preprocessor(text):
    """Strip HTML markup and normalise a document, keeping emoticons.

    Args:
        text: Raw document string (HTML or plain text).

    Returns:
        The lower-cased text with every run of non-word characters collapsed
        to a single space, followed by a space and the emoticons found in the
        original text (space-separated, with the '-' nose removed).
    """
    # remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # regex for matching emoticons, keep emoticons, ex: :), :-P, :-D
    # Raw string: '\)' and '\(' are regex escapes, not valid Python string
    # escapes, so a non-raw literal triggers an invalid-escape warning.
    emoticon_pattern = r'(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(emoticon_pattern, text)
    text = re.sub(emoticon_pattern, '', text)

    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    return cleaned + ' ' + ' '.join(emoticons).replace('-', '')

### 1.2 定義tokenize+波特詞幹還原演算法+刪除停用字

In [5]:
nltk.download('stopwords')
stop = stopwords.words('english')

# Built once instead of on every tokenizer call: the tokenizer runs once per
# document, so per-call construction and list scans add up.
_STOP_SET = frozenset(stop)           # O(1) membership instead of a list scan
_PORTER = PorterStemmer()
_WHITESPACE_RE = re.compile(r'\s+')   # raw string: '\s' is not a valid str escape
_LEADING_ALPHA_RE = re.compile(r'[a-zA-Z]+')


def tokenizer_stem_nostop(text):
    """Split text on whitespace, drop stop words, and Porter-stem the rest.

    Args:
        text: Document string; leading/trailing whitespace is stripped first.

    Returns:
        List of stemmed tokens. A token is kept only if it is not in the NLTK
        English stop-word list and starts with at least one ASCII letter
        (``re.match`` anchors at the start of the token).
    """
    return [_PORTER.stem(w) for w in _WHITESPACE_RE.split(text.strip())
            if w not in _STOP_SET and _LEADING_ALPHA_RE.match(w)]

print(tokenizer_stem_nostop('runners like running and thus they run')) # Test if it works

['runner', 'like', 'run', 'thu', 'run']


[nltk_data] Downloading package stopwords to /home/benny/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 1.3 html語法分析

In [6]:
def fetch_datetime(text=[]):
    """Extract a short day token from the first <time> tag of each page.

    Args:
        text: Iterable of HTML strings.

    Returns:
        List with one string per page: a space followed by the first three
        characters of the tag's ``datetime`` attribute (e.g. 'Wed' in the
        sample output), or the placeholder token when the page has no <time>
        tag or the tag has no ``datetime`` attribute.
    """
    day = []
    for tx in text:
        time_tag = BeautifulSoup(tx, "lxml").time
        # `.time` is None when the page contains no <time> element; calling
        # .has_attr on it would raise AttributeError instead of falling back.
        if time_tag is not None and time_tag.has_attr('datetime'):
            date = time_tag.attrs['datetime']
            day.append(' ' + date[0:3])
        else:
            # NOTE(review): placeholder is emitted as a feature value; left
            # unchanged so previously generated feature files stay comparable.
            day.append(' fuckday')
    return day

def fetch_channel(text=[]):
    """Return the ``data-channel`` attribute of the first <article> tag of each page.

    Args:
        text: Iterable of HTML strings.

    Returns:
        List of channel values, one per page, in input order.
    """
    return [BeautifulSoup(page, "lxml").article['data-channel'] for page in text]

def fetch_img_count(text=[]):
    """Count the <img> tags in each page.

    Args:
        text: Iterable of HTML strings.

    Returns:
        List of integer counts, one per page, in input order.
    """
    return [len(BeautifulSoup(page, "lxml").find_all('img')) for page in text]

def fetch_topics(text=[]):
    """Collect the link texts inside the first <footer> of each page.

    Args:
        text: Iterable of HTML strings.

    Returns:
        List with one string per page: the texts of all <a> tags in the
        footer joined by single spaces. Pages without a <footer>, or whose
        footer has no links, yield an empty string.
    """
    topics = []
    for tx in text:
        footer = BeautifulSoup(tx, "lxml").footer
        # `.footer` is None when the page has no <footer>; treat that the
        # same as a footer with no links rather than raising AttributeError.
        links = footer.find_all('a') if footer is not None else []
        topics.append(' '.join(link.get_text() for link in links))
    return topics

def fetch_authors(text=[]):
    """Extract the author string from the first <span> of each page.

    Args:
        text: Iterable of HTML strings.

    Returns:
        List with one string per page: the texts of all <a> tags inside the
        first <span>, concatenated, or 'NaN' when the page has no <span> or
        the span contains no links.
    """
    authors = []
    for tx in text:
        first_span = BeautifulSoup(tx, "lxml").span
        if first_span is not None:
            # find_all is the current bs4 name (findAll is a legacy alias) and
            # matches the other fetch_* helpers in this notebook.
            names = [link.get_text() for link in first_span.find_all('a')]
        else:
            names = []
        # NOTE(review): multiple links are joined with no separator, as in the
        # original; confirm whether co-authors should be delimited.
        authors.append(''.join(names) if names else 'NaN')
    return authors

def fetch_titles(text=[]):
    """Return the text of the first <h1> of each page.

    Args:
        text: Iterable of HTML strings.

    Returns:
        List with one string per page: the <h1> text, or 'NaN' when the page
        has no <h1> tag.
    """
    titles = []
    for page in text:
        heading = BeautifulSoup(page, "lxml").h1
        titles.append('NaN' if heading is None else heading.get_text())
    return titles

def fetch_social_media_count(text=[]):
    """Count embedded social-media iframes in each page.

    An <iframe> is counted when its ``src`` attribute contains one of the
    known media host names. The previous implementation compared
    ``str.find(...)`` against None; ``find`` returns -1 on a miss, so that
    test was always true and every iframe was counted. Counts produced by
    this version can therefore be lower than earlier cached features.

    Args:
        text: Iterable of HTML strings.

    Returns:
        List of integer counts, one per page, in input order.
    """
    media_hosts = ("youtube", "instagram", "vine")  # apply new media here
    count = []
    for tx in text:
        soup = BeautifulSoup(tx, "lxml")
        c = 0
        for frame in soup("iframe"):
            src = frame.get('src')
            # Skip iframes with no src (frame.get returns None for them).
            if src and any(host in src for host in media_hosts):
                c += 1
        count.append(c)
    return count

這裡以下不必重跑

In [7]:
# Fetch features from the raw HTML; this may take around 15 minutes
raw_pages = df['Page content']
topic_batch = fetch_topics(raw_pages)
channel_batch = fetch_channel(raw_pages)
weekday_batch = fetch_datetime(raw_pages)
author_batch = fetch_authors(raw_pages)
img_batch = fetch_img_count(raw_pages)
title_batch = fetch_titles(raw_pages)
media_batch = fetch_social_media_count(raw_pages)
# Drop the extra reference so the raw HTML column is not kept alive once
# df['Page content'] is replaced by its preprocessed version.
del raw_pages

In [8]:
# Re-construct the training data: one column per extracted feature list,
# alongside the Id and Popularity columns taken from df
df_feature = pd.DataFrame({'Id':df.Id[:],
                           'Popularity':df.Popularity[:],
                           'topic':topic_batch,
                           'channel':channel_batch,
                           'weekday':weekday_batch,
                           'author':author_batch,
                           'img count':img_batch,
                           'title':title_batch,
                           'media count': media_batch})

In [9]:
df_feature = df_feature.drop(['Id', 'Popularity'], axis=1)

In [10]:
display(df_feature.head(5))

Unnamed: 0,topic,channel,weekday,author,img count,title,media count
0,Asteroid Asteroids challenge Earth Space U.S. ...,world,Wed,,1,NASA's Grand Challenge: Stop Asteroids From De...,0
1,Apps and Software Google open source opn pledg...,tech,Thu,Christina Warren,2,Google's New Open Source Patent Pledge: We Won...,0
2,Entertainment NFL NFL Draft Sports Television,entertainment,Wed,Sam Laird,2,Ballin': 2014 NFL Draft Picks Get to Choose Th...,25
3,Sports Video Videos Watercooler,watercooler,Fri,Sam Laird,1,Cameraperson Fails Deliver Slapstick Laughs,21
4,Entertainment instagram instagram video NFL Sp...,entertainment,Thu,Connor Finnegan,52,NFL Star Helps Young Fan Prove Friendship With...,1


In [11]:
df = df.join(df_feature, lsuffix='_caller', rsuffix='_other')

In [12]:
df['Page content'] = df['Page content'].apply(preprocessor) # 此步驟約要花五分鐘

In [13]:
df.head(5)

Unnamed: 0,Id,Popularity,Page content,topic,channel,weekday,author,img count,title,media count
0,0,-1,clara moskowitz for space com 2013 06 19 15 0...,Asteroid Asteroids challenge Earth Space U.S. ...,world,Wed,,1,NASA's Grand Challenge: Stop Asteroids From De...,0
1,1,1,by christina warren2013 03 28 17 40 55 utcgoog...,Apps and Software Google open source opn pledg...,tech,Thu,Christina Warren,2,Google's New Open Source Patent Pledge: We Won...,0
2,2,1,by sam laird2014 05 07 19 15 20 utcballin 2014...,Entertainment NFL NFL Draft Sports Television,entertainment,Wed,Sam Laird,2,Ballin': 2014 NFL Draft Picks Get to Choose Th...,25
3,3,-1,by sam laird2013 10 11 02 26 50 utccameraperso...,Sports Video Videos Watercooler,watercooler,Fri,Sam Laird,1,Cameraperson Fails Deliver Slapstick Laughs,21
4,4,-1,by connor finnegan2014 04 17 03 31 43 utcnfl s...,Entertainment instagram instagram video NFL Sp...,entertainment,Thu,Connor Finnegan,52,NFL Star Helps Young Fan Prove Friendship With...,1


可以丟下去的有channel，weekday，img count，media count

In [14]:
del df_feature

In [30]:
# index=False: otherwise the row index is written as an unnamed first column
# and comes back as a spurious 'Unnamed: 0' column when the file is re-read.
df.to_csv('./input/input_feature.csv', index=False)

## 二. 特徵選擇

### 2.1 找出頻率最高的詞 (不需要重跑，不然要跑10多分鐘)

In [16]:
# Bag-of-words counter that reuses the notebook's preprocessor and tokenizer
count = CountVectorizer(ngram_range=(1, 1), # unigrams only; same values as MIN/MAX in the hyperparameter cell
                        preprocessor=preprocessor,
                        tokenizer=tokenizer_stem_nostop)
# Smoke-test fit on two toy documents.
# NOTE(review): the next cell calls fit_transform on the real corpus, which refits the vocabulary.
count.fit(["YEAH TIGER", "FIBER WIPER"]) # need to fit something first



CountVectorizer(preprocessor=<function preprocessor at 0x00000149A614A8B0>,
                tokenizer=<function tokenizer_stem_nostop at 0x00000149A1AE6940>)

In [17]:
doc = df['Page content']
# Keep the count matrix sparse: column sums do not need a dense
# (n_documents x vocabulary) array, which is very large for this corpus.
doc_bag = count.fit_transform(doc)

print("[most frequent vocabularies]")
# Summing a sparse matrix over axis 0 gives a 1 x V result; flatten it to 1-D.
bag_cnts = np.asarray(doc_bag.sum(axis=0)).ravel()
top = 10
# inverse_transform on an all-ones row yields every term in column order
vocab = count.inverse_transform(np.ones((1, bag_cnts.shape[0])))[0]
# argsort is ascending, so reverse it to get the largest counts first
top_idx = bag_cnts.argsort()[::-1][:top]
for tok, v in zip(vocab[top_idx], bag_cnts[top_idx]):
    print('{}: {}'.format(tok, v))

[most frequent vocabularies]
imag: 116996
also: 51161
new: 44159
one: 42492
video: 41798
see: 38955
like: 36858
time: 35997
use: 33510
app: 32685


### 2.2 TF-IDF

In [7]:
df = pd.read_csv('./input/input_feature.csv')

利用前面所定義的前處理方法產生tf-idf向量

In [8]:
doc = df['Page content']
tfidf = TfidfVectorizer(ngram_range=(MIN, MAX), 
                        preprocessor=preprocessor, 
                        tokenizer=tokenizer_stem_nostop, 
                        max_df=MAX_DF, 
                        min_df=0.0001)
# fit_transform learns the vocabulary/idf and builds the matrix in a single
# pass; fit() followed by transform() preprocesses and tokenizes every
# document twice for the same result.
doc_tfidf = tfidf.fit_transform(doc).toarray()



In [9]:
print(doc_tfidf.shape)

(27643, 43634)


接著調查idf分數以及tf-idf值最大的10個單字

In [10]:
top = 10
# get idf score of vocabularies
idf = tfidf.idf_
# Resolve the vocabulary once (the original rebuilt it on every loop
# iteration). Newer scikit-learn only provides get_feature_names_out;
# older releases only provide get_feature_names.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = np.asarray(tfidf.get_feature_names_out())
else:
    feature_names = np.asarray(tfidf.get_feature_names())

print('[vocabularies with smallest idf scores]')
sorted_idx = idf.argsort()

for i in sorted_idx[:top]:
    print('%s: %.2f' %(feature_names[i], idf[i]))


tfidf_sum = np.sum(doc_tfidf, axis=0)
print("\n[vocabularies with highest tf-idf scores]")
# argsort is ascending, so reverse it to get the largest sums first
top_idx = tfidf_sum.argsort()[::-1][:top]
for tok, v in zip(feature_names[top_idx], tfidf_sum[top_idx]):
    print('{}: {}'.format(tok, v))

[vocabularies with smallest idf scores]
new: 1.56
like: 1.60
time: 1.61
make: 1.72
year: 1.73
world: 1.75
use: 1.77
get: 1.79
first: 1.85
take: 1.90

[vocabularies with highest tf-idf scores]
video: 733.5061786531768
app: 602.2534993946733
new: 500.9036433140395
googl: 455.2371136820567
game: 442.6232487233518
twitter: 414.0212743230195
facebook: 410.4951735218714
compani: 403.128044066731
appl: 401.1395615393847
time: 400.9980301873785


### 2.3 Feature Hashing (我記憶體很夠，應該不會使用這個方法)

In [11]:
# hash words to 2**HASH_POWER buckets (1024 with HASH_POWER = 10)
hashvec = HashingVectorizer(n_features=2**HASH_POWER,
                            preprocessor=preprocessor,
                            tokenizer=tokenizer_stem_nostop)

# no .fit needed for HashingVectorizer, since it's defined by the hash function
# transform sentences to vectors of dimension 2**HASH_POWER
doc_hash = hashvec.transform(["YEAH TIGER", "FIBER WIPER"]) # test on two toy documents
print(doc_hash.shape)

(2, 1024)


### 2.4 One hot Encoding

In [12]:
from sklearn.preprocessing import OneHotEncoder


def _fit_one_hot(column):
    """Fit a fresh OneHotEncoder on one DataFrame column.

    Prints the shape of the encoded matrix and returns
    ``(fitted encoder, dense one-hot array)``.
    """
    encoder = OneHotEncoder(handle_unknown='ignore')
    encoded = encoder.fit_transform(column.values.reshape(-1,1)).toarray()
    print(encoded.shape)
    return encoder, encoded


# One encoder per categorical feature, in the order: channel, weekday, author
channel_ohe, channel_str = _fit_one_hot(df['channel'])
weekday_ohe, weekday_str = _fit_one_hot(df['weekday'])
author_ohe, author_str = _fit_one_hot(df['author'])

(27643, 33)
(27643, 7)
(27643, 428)


把所有特徵給組合起來

In [13]:
# Reshape the two count columns into single-column 2-D arrays so they can be
# concatenated with the other feature matrices along axis 1
img_count = df['img count'].values.reshape(-1,1)
media_count = df['media count'].values.reshape(-1,1)
# Target labels as a NumPy array
df_y_train = df['Popularity'].to_numpy()
# The DataFrame is no longer referenced after this point in the notebook
del df

In [None]:
# Stack every feature block column-wise into one design matrix.
# The result of np.concatenate is a NumPy array, not a DataFrame.
df_X_train = np.concatenate([channel_str,
                             weekday_str, 
                             author_str, 
                             img_count,
                             media_count,
                             doc_tfidf], axis=1)

存成csv檔案就不用再重跑之前的code了

In [None]:
#df_X_train = pd.read_csv('./input/X_train.csv')
#df_y_train = pd.read_csv('./input/y_train.csv')

In [None]:
# df_X_train and df_y_train are NumPy arrays (np.concatenate / to_numpy) and
# have no .to_csv method, so wrap them in pandas objects before writing.
# index=False keeps the row index out of the files.
pd.DataFrame(df_X_train).to_csv('./input/X_train.csv', index=False)
pd.Series(df_y_train, name='Popularity').to_csv('./input/y_train.csv', index=False)

生成training set和testing set

In [23]:
# Fixed random_state so the hold-out split (and the reported accuracy) is
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(df_X_train, df_y_train, test_size=0.2, random_state=1)

In [None]:
del df_X_train
del df_y_train

## 三. 模型訓練

In [24]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [25]:
# Standardise the features, then fit an L2-penalised logistic regression
pipe_lr = make_pipeline(
    StandardScaler(),
    LogisticRegression(
        penalty='l2',
        C=0.08,
        random_state=1,
        solver='lbfgs',
        multi_class='ovr',
        verbose=1,
    ),
)

In [26]:
pipe_lr.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s finished


Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(C=0.08, multi_class='ovr', random_state=1,
                                    verbose=1))])

In [27]:
y_pred = pipe_lr.predict(X_test)
print('Test Accuracy: %.3f' % pipe_lr.score(X_test, y_test))
# Cross-validate the pipeline defined in this notebook on the training split.
# The original line referenced `clf` and `df_small`, which are never defined
# here, and its format string had three placeholders for two values.
scores = cross_val_score(estimator=pipe_lr, X=X_train, y=y_train, cv=10, scoring='roc_auc')
print('AUC score: %.3f (+/-%.3f)' % (scores.mean(), scores.std()))

Test Accuracy: 0.558


## 四. 參數調整

## 五. 結果預測