In [1]:
# Hyperparameters
debug = False
MIN = 1 # n-gram
MAX = 1 # n-gram
MAX_DF = 0.6
HASH_POWER = 10 # hash to 2**HASH_POWER features

In [2]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import pandas as pd
import re
import nltk
import os
import xgboost as xgb
import datetime

from xgboost import XGBClassifier
from tqdm import tqdm
from bs4 import BeautifulSoup
from summa import keywords
from summa.summarizer import summarize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from time import strptime
from cup01 import *

In [3]:
if not os.path.exists("output/") : os.mkdir("output/")

## 一. 資料前處理
首先先引入dataset

### 1.1 清掉所有的html tag

In [4]:
def preprocessor(text):
    # remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()
    
    # regex for matching emoticons, keep emoticons, ex: :), :-P, :-D
    r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(r, text)
    text = re.sub(r, '', text)
    
    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
    return text

### 1.2 定義tokenize+波特詞幹還原演算法+刪除停用字

In [5]:
nltk.download('stopwords')
stop = stopwords.words('english')
stop = stop + extra_stopwords()

def tokenizer_stem_nostop(text):
    porter = PorterStemmer()
    return [porter.stem(w) for w in re.split('\s+', text.strip()) \
            if w not in stop and re.match('[a-zA-Z]+', w)]

print(tokenizer_stem_nostop('runners like running and thus they run')) # Test if it works

['runner', 'like', 'run', 'thu', 'run']


[nltk_data] Downloading package stopwords to /home/benny/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 1.3 分析文章基本性質
例如作者,圖片數量等等

In [6]:
df = pd.read_csv('./input/train.csv')
if debug:
    df = df.iloc[:100] # debug

In [7]:
df.head(5)

Unnamed: 0,Id,Popularity,Page content
0,0,-1,"<html><head><div class=""article-info""> <span c..."
1,1,1,"<html><head><div class=""article-info""><span cl..."
2,2,1,"<html><head><div class=""article-info""><span cl..."
3,3,-1,"<html><head><div class=""article-info""><span cl..."
4,4,-1,"<html><head><div class=""article-info""><span cl..."


In [8]:
df_train_contents = df['Page content'].values.tolist()

In [9]:
days, pub_days, channels, img_counts, topics, authors, titles, social_media_counts, \
contents, num_hrefs, num_self_hrefs \
= get_all_datas(df_train_contents)

In [10]:
rate_positive_words, rate_negative_words, avg_positive_polarity, min_positive_polarity, max_positive_polarity, \
avg_negative_polarity, min_negative_polarity, max_negative_polarity \
= get_word_sentiment_features(contents)

In [11]:
n_tokens_titles, n_tokens_contents, n_unique_tokens, n_non_stop_words, n_non_stop_unique_tokens \
= get_some_n_features(titles, contents)

In [12]:
global_sentiment_polarity, global_subjectivity, title_subjectivity_list, title_sentiment_polarity_list, \
abs_title_subjectivity, abs_title_sentiment_polarity \
= get_sentiment_features(titles, contents)

combine the properties

In [13]:
df = pd.DataFrame({'Page content':df_train_contents,
                   'Id':df.Id[:],
                   'Popularity':df.Popularity[:],
                   'topic':topics,
                   'channel':channels,
                   'weekday':days,
                   'pub_date' : pub_days,
                   'author':authors,
                   'img count':img_counts,
                   'title':titles,
                   'content':contents,
                   'media count': social_media_counts,
                   'n_tokens_title' : n_tokens_titles,
                   'n_tokens_content': n_tokens_contents,
                   'n_unique_tokens' : n_unique_tokens,
                   'n_non_stop_words': n_non_stop_words,
                   'n_non_stop_unique_tokens': n_non_stop_unique_tokens,
                   'num_hrefs' : num_hrefs,
                   'num_self_hrefs' : num_self_hrefs,
                   'global_sentiment_polarity' : global_sentiment_polarity,
                   'global_subjectivity' : global_subjectivity,
                   'title_subjectivity' : title_subjectivity_list,
                   'title_sentiment_polarity' : title_sentiment_polarity_list,
                   'abs_title_subjectivity' : abs_title_subjectivity,
                   'abs_title_sentiment_polarity' : abs_title_sentiment_polarity,
                   'rate_positive_words' : rate_positive_words,
                   'rate_negative_words' : rate_negative_words,
                   'avg_positive_polarity' : avg_positive_polarity,
                   'min_positive_polarity' : min_positive_polarity,
                   'max_positive_polarity' : max_positive_polarity,
                   'avg_negative_polarity' : avg_negative_polarity,
                   'min_negative_polarity' : min_negative_polarity,
                   'max_negative_polarity' : max_negative_polarity})

### 1.4 時間

In [14]:
df['day_of_month'] = df['pub_date'].apply(lambda x: int(x.split()[1]))
df['month'] = df['pub_date'].apply(lambda x: strptime(x.split()[2], '%b').tm_mon)
df['hour'] = df['pub_date'].apply(lambda x: strptime(x.split()[4], '%X')[3])

In [15]:
del df_train_contents, df['pub_date']

### 1.5 產生關鍵字

In [16]:
tqdm.pandas()
df['Page content'] = df['Page content'].progress_apply(preprocessor) # 此步驟約要花五分鐘

100%|████████████████████████████████████| 27643/27643 [01:10<00:00, 389.72it/s]


In [17]:
tqdm.pandas()
df['keywords'] = df['Page content'].progress_apply(tokenizer_stem_nostop)

100%|████████████████████████████████████| 27643/27643 [01:52<00:00, 246.39it/s]


In [18]:
df['keywords'] = df['keywords'].progress_apply(lambda x: ' '.join(x))

100%|█████████████████████████████████| 27643/27643 [00:00<00:00, 236658.27it/s]


In [None]:
tqdm.pandas()
df['keywords'] = df['keywords'].progress_apply(lambda x: keywords.keywords(x).replace('\n', ' '))

 77%|████████████████████████████▌        | 21315/27643 [14:39<25:32,  4.13it/s]

In [None]:
df.head(5)

In [None]:
df.to_csv('./input/input_feature.csv')

## 二. 特徵選擇

In [None]:
df = pd.read_csv('./input/input_feature.csv')

### 2.1 找出頻率最高的詞

In [16]:
count = CountVectorizer(ngram_range=(1, 1), # (MIN, MAX)
                        preprocessor=preprocessor,
                        tokenizer=tokenizer_stem_nostop)
count.fit(["YEAH TIGER", "FIBER WIPER"]) # need to fit something first



CountVectorizer(preprocessor=<function preprocessor at 0x00000149A614A8B0>,
                tokenizer=<function tokenizer_stem_nostop at 0x00000149A1AE6940>)

In [17]:
doc = df['Page content']
doc_bag = count.fit_transform(doc).toarray()

print("[most frequent vocabularies]")
bag_cnts = np.sum(doc_bag, axis=0)
top = 10
# [::-1] reverses a list since sort is in ascending order
for tok, v in zip(count.inverse_transform(np.ones(bag_cnts.shape[0]).reshape(1, -1))[0][bag_cnts.argsort()[::-1][:top]], 
                  np.sort(bag_cnts)[::-1][:top]):
    print('{}: {}'.format(tok, v))

[most frequent vocabularies]
imag: 116996
also: 51161
new: 44159
one: 42492
video: 41798
see: 38955
like: 36858
time: 35997
use: 33510
app: 32685


### 2.2 基於整個文本的TF-IDF

利用前面所定義的前處理方法產生tf-idf向量

In [7]:
doc = df['Page content']
tfidf = TfidfVectorizer(ngram_range=(MIN, MAX), 
                        preprocessor=preprocessor, 
                        tokenizer=tokenizer_stem_nostop, 
                        max_df=MAX_DF, 
                        min_df=0.0001)
tfidf.fit(doc)
doc_tfidf = tfidf.transform(doc).toarray()



In [8]:
print(doc_tfidf.shape)

(27643, 43634)


In [14]:
np.savetxt("tfidf.csv", doc_tfidf, delimiter=",")

接著調查idf分數以及tf-idf值最大的10個單字

In [9]:
top = 10
# get idf score of vocabularies
idf = tfidf.idf_
print('[vocabularies with smallest idf scores]')
sorted_idx = idf.argsort()

for i in range(top):
    print('%s: %.2f' %(tfidf.get_feature_names()[sorted_idx[i]], idf[sorted_idx[i]]))


tfidf_sum = np.sum(doc_tfidf, axis=0)
print("\n[vocabularies with highest tf-idf scores]")
for tok, v in zip(tfidf.inverse_transform(np.ones(tfidf_sum.shape[0]).reshape(1, -1))[0][tfidf_sum.argsort()[::-1]][:top], \
                        np.sort(tfidf_sum)[::-1][:top]):
    print('{}: {}'.format(tok, v))

[vocabularies with smallest idf scores]
new: 1.56
like: 1.60
time: 1.61
make: 1.72
year: 1.73
world: 1.75
use: 1.77
get: 1.79
first: 1.85
take: 1.90

[vocabularies with highest tf-idf scores]
video: 733.5061786531768
app: 602.2534993946733
new: 500.9036433140395
googl: 455.2371136820567
game: 442.6232487233518
twitter: 414.0212743230195
facebook: 410.4951735218714
compani: 403.128044066731
appl: 401.1395615393847
time: 400.9980301873785


In [None]:
del doc_tfidf

### 2.3 基於關鍵字的tf-idf

In [27]:
doc = df['keywords']
tfidf = TfidfVectorizer(ngram_range=(1, 1), 
                        preprocessor=preprocessor, 
                        tokenizer=tokenizer_stem_nostop)
tfidf.fit(doc)
doc_tfidf = tfidf.transform(doc).toarray()



In [28]:
print(doc_tfidf.shape)

(27643, 39431)


In [29]:
top = 10
# get idf score of vocabularies
idf = tfidf.idf_
print('[vocabularies with smallest idf scores]')
sorted_idx = idf.argsort()

for i in range(top):
    print('%s: %.2f' %(tfidf.get_feature_names()[sorted_idx[i]], idf[sorted_idx[i]]))


tfidf_sum = np.sum(doc_tfidf, axis=0)
print("\n[vocabularies with highest tf-idf scores]")
for tok, v in zip(tfidf.inverse_transform(np.ones(tfidf_sum.shape[0]).reshape(1, -1))[0][tfidf_sum.argsort()[::-1]][:top], \
                        np.sort(tfidf_sum)[::-1][:top]):
    print('{}: {}'.format(tok, v))

[vocabularies with smallest idf scores]
imag: 1.94
new: 2.23
like: 2.35
time: 2.47
use: 2.55
video: 2.59
make: 2.61
year: 2.66
said: 2.70
compani: 2.79

[vocabularies with highest tf-idf scores]
imag: 639.4314030667334
new: 543.6068855046963
video: 487.298715875056
like: 460.2245497275674
time: 431.4077628796134
use: 427.29808775659825
compani: 402.0364173369534
make: 393.66282963562537
year: 391.85262234595297
said: 379.9360365379158


In [None]:
del doc_tfidf

### 2.4 Feature Hashing
hash words to 1024 or 2048 buckets

In [None]:
doc = df['keywords']
hashvec = HashingVectorizer(n_features=2**HASH_POWER)

no .fit needed for HashingVectorizer, since it's defined by the hash function. transform sentences to vectors of dimension 1024

In [None]:
doc_hash = hashvec.transform(doc).toarray()
print(doc_hash.shape)

In [None]:
doc = df['title']
hashvec = HashingVectorizer(n_features=2**HASH_POWER)

In [None]:
doc_hash_title = hashvec.transform(doc).toarray()
print(doc_hash_title.shape)

### 2.5 One hot Encoding

In [None]:
# channel
channel_ohe = OneHotEncoder(handle_unknown='ignore')
channel_str = channel_ohe.fit_transform(df['channel'].values.reshape(-1,1)).toarray()
print(channel_str.shape)

# weekday
weekday_ohe = OneHotEncoder(handle_unknown='ignore')
weekday_str = weekday_ohe.fit_transform(df['weekday'].values.reshape(-1,1)).toarray()
print(weekday_str.shape)

# ohe author
author_ohe = OneHotEncoder(handle_unknown='ignore')
author_str = author_ohe.fit_transform(df['author'].values.reshape(-1,1)).toarray()
print(author_str.shape)

把所有特徵給組合起來

In [None]:
img_count = df['img count'].values.reshape(-1,1)
media_count = df['media count'].values.reshape(-1,1)

In [None]:
def flatten(t):
    return [item for sublist in t for item in sublist]

df_X_train = []
for i in tqdm(range(len(channel_str))):
    temp = []
    temp.append(img_count[i])
    temp.append(media_count[i])
    temp.append(channel_str[i])
    temp.append(weekday_str[i])
    temp.append(author_str[i])
    temp.append(doc_hash[i])
    temp.append(doc_hash_title[i])
    temp = flatten(temp)
    temp.append(df['day_of_month'][i])
    temp.append(df['month'][i])
    temp.append(df['hour'][i])
    temp.append(df['n_tokens_title'][i])
    temp.append(df['n_tokens_content'][i])
    temp.append(df['n_unique_tokens'][i])
    temp.append(df['n_non_stop_words'][i])
    temp.append(df['n_non_stop_unique_tokens'][i])
    temp.append(df['num_hrefs'][i])
    temp.append(df['num_self_hrefs'][i])
    temp.append(df['global_sentiment_polarity'][i])
    temp.append(df['global_subjectivity'][i])
    temp.append(df['title_subjectivity'][i])
    temp.append(df['title_sentiment_polarity'][i])
    temp.append(df['abs_title_subjectivity'][i])
    temp.append(df['abs_title_sentiment_polarity'][i])
    temp.append(df['rate_positive_words'][i])
    temp.append(df['rate_negative_words'][i])
    temp.append(df['avg_positive_polarity'][i])
    temp.append(df['min_positive_polarity'][i])
    temp.append(df['max_positive_polarity'][i])
    temp.append(df['avg_negative_polarity'][i])
    temp.append(df['min_negative_polarity'][i])
    temp.append(df['max_negative_polarity'][i])
    df_X_train.append(temp)
    del temp

df_y_train = df['Popularity'].to_numpy()

In [None]:
del df, doc_hash, img_count, media_count, channel_str, weekday_str, author_str

In [None]:
print(len(df_X_train),len(df_X_train[0]))

存成csv檔案就不用再重跑之前的code了

In [None]:
pd.DataFrame(df_X_train).to_csv("./input/X_train.csv", index=False, header=False)
pd.DataFrame(df_y_train).to_csv("./input/y_train.csv", index=False, header=False)

生成training set和testing set

## 三. 模型訓練
直接從這裡開始跑

In [6]:
df_X_train = pd.read_csv('./input/X_train.csv', header=None).values
df_y_train = pd.read_csv('./input/y_train.csv', header=None).values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_X_train, df_y_train, test_size=0.2)

In [None]:
del df_X_train
del df_y_train

### 3.1 繪製解釋變異數圖形

In [None]:
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

In [None]:
cov_mat = np.cov(X_train_std.T) # 計算特徵值
eigen_vals, _ = np.linalg.eig(cov_mat)

In [None]:
# 把explained variance(lambda i / lambda和)由大排到小
tot = sum(eigen_vals)
var_exp = np.array([(i / tot) for i in sorted(eigen_vals, reverse=True)])
cum_var_exp = np.cumsum(var_exp) # 計算解釋變異數

# 繪圖
plt.bar(range(len(eigen_vals)), var_exp, alpha=0.5, align='center',
        label='individual explained variance')
plt.step(range(len(eigen_vals)), cum_var_exp, where='mid',
         label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal component index')
plt.legend(loc='best')
plt.tight_layout()
# plt.savefig('images/05_02.png', dpi=300)
plt.show()

In [None]:
del X_train_std, X_test_std

### 3.2 Logistic Regression

In [38]:
pipe_lr2 = make_pipeline(StandardScaler(),
                         #PCA(n_components=1200),
                         LogisticRegression(penalty='l2',
                                            C=1e-5, 
                                            random_state=1, 
                                            solver='lbfgs', 
                                            multi_class='ovr', 
                                            verbose=0))

In [39]:
pipe_lr2.fit(X_train, y_train.ravel())

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(C=1e-05, multi_class='ovr',
                                    random_state=1))])

In [40]:
y_pred = pipe_lr2.predict(X_test)
print('Test Accuracy: %.3f' % pipe_lr2.score(X_test, y_test.ravel()))
scores = cross_val_score(estimator=pipe_lr2, 
                         X=X_test, 
                         y=y_test.ravel(), 
                         cv=10, 
                         verbose = 0,
                         scoring='roc_auc')
print('AUC score: %.3f (+/-%.3f)' % (scores.mean(), scores.std()))

Test Accuracy: 0.600
AUC score: nan (+/-nan)


Traceback (most recent call last):
  File "/home/benny/anaconda3/envs/tf/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 687, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/benny/anaconda3/envs/tf/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 87, in __call__
    score = scorer._score(cached_call, estimator,
  File "/home/benny/anaconda3/envs/tf/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 362, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "/home/benny/anaconda3/envs/tf/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/benny/anaconda3/envs/tf/lib/python3.8/site-packages/sklearn/metrics/_ranking.py", line 542, in roc_auc_score
    return _average_binary_score(partial(_binary_roc_auc_score,
  File "/home/benny/anaconda3/envs/tf/lib/python3.8/site-packages/sklearn/metrics/_base.py", line 77, in _ave

In [17]:
del pipe_lr2

### 3.3 xgboost

In [65]:
X_train_xgb = np.array(X_train)
y_train_xgb = [0 if x==[-1] else x for x in y_train]
X_test_xgb = np.array(X_test)
#y_test_xgb = [0 if x==[-1] else x for x in y_test]
d_train = xgb.DMatrix(X_train_xgb, y_train_xgb)
d_test = xgb.DMatrix(X_test_xgb)
xgb_params = {'eta': 0.05, 
              'max_depth': 4, 
              'subsample': 0.7 ,
              'colsample_bytree': 0.7,
              'min_child_weight' : 4,
              'objective': 'binary:logistic', 
              'eval_metric': 'auc',
              'alpha': 0.005,
             }
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_params = xgb_model.get_xgb_params()

## 四. 參數調整

### 4.1 Logistic Regression

In [49]:
pipe_lr = make_pipeline(StandardScaler(),
                        #PCA(n_components=1000),
                        LogisticRegression(random_state=0))

In [50]:
import warnings
warnings.filterwarnings("ignore")

param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0] 
param_grid = [{'logisticregression__C': param_range, 
               'logisticregression__penalty': ['l2'],
               'logisticregression__class_weight': ['balanced', None],
               'logisticregression__solver': ['saga', 'lbfgs']}]

# 使用網格搜尋法
gs = GridSearchCV(estimator=pipe_lr, 
                  param_grid=param_grid, 
                  scoring='roc_auc', 
                  cv=10,
                  n_jobs=-1)
gs = gs.fit(X_train, y_train.ravel())

# 輸出結果
print(gs.best_score_) 
print(gs.best_params_) # 最佳參數組合

0.70625
{'logisticregression__C': 0.01, 'logisticregression__class_weight': 'balanced', 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'saga'}


In [51]:
clf = gs.best_estimator_ # 取得剛剛算出的最佳參數
clf.fit(X_train, y_train) # 用這個參數train出model

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(C=0.01, class_weight='balanced',
                                    random_state=0, solver='saga'))])

### 4.2 XGB

In [66]:
cvresult = xgb.cv(xgb_params, 
                  d_train, 
                  num_boost_round=1000, 
                  verbose_eval=10, 
                  nfold=5, 
                  metrics=['auc'],
                  early_stopping_rounds=50, 
                  stratified=True)
xgb_model.set_params(n_estimators=cvresult.shape[0])

XGBClassifier(alpha=0.005, base_score=None, booster=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.7, eta=0.05, eval_metric='auc', gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=4,
              min_child_weight=4, missing=nan, monotone_constraints=None,
              n_estimators=2, n_jobs=None, num_parallel_tree=None,
              random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=0.7, tree_method=None,
              validate_parameters=None, verbosity=None)

In [67]:
xgb_model.fit(X_train_xgb, y_train_xgb, eval_metric='auc', verbose=True)

XGBClassifier(alpha=0.005, base_score=0.5, booster='gbtree',
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.7,
              eta=0.05, eval_metric='auc', gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.0500000007, max_delta_step=0, max_depth=4,
              min_child_weight=4, missing=nan, monotone_constraints='()',
              n_estimators=2, n_jobs=12, num_parallel_tree=1, random_state=0,
              reg_alpha=0.00499999989, reg_lambda=1, scale_pos_weight=1,
              subsample=0.7, tree_method='exact', validate_parameters=1,
              verbosity=None)

## 五. 結果預測

In [55]:
y_pred = clf.predict(X_test)
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))
scores = cross_val_score(estimator=clf, 
                         X=X_test, 
                         y=y_test, 
                         cv=10, 
                         verbose = 0,
                         scoring='roc_auc')
print('AUC score: %.3f (+/-%.3f)' % (scores.mean(), scores.std()))

Test Accuracy: 0.650
AUC score: nan (+/-nan)


In [71]:
y_pred = xgb_model.predict_proba(X_test_xgb)[:,1]
print('Test Accuracy: %.3f' % xgb_model.score(X_test_xgb, y_test))
scores = cross_val_score(estimator=xgb_model, 
                         X=X_test, 
                         y=y_test, 
                         cv=10, 
                         verbose = 0,
                         scoring='roc_auc')
print('AUC score: %.3f (+/-%.3f)' % (scores.mean(), scores.std()))

Test Accuracy: 0.200
AUC score: nan (+/-nan)
