In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the labelled training set; later cells read its 'Popularity' and
# 'Page content' columns.
df = pd.read_csv('./datasets/train.csv')
# Plain-text preview of the first five rows.
print(df.head(5))

### Read and Split Data

In [None]:
# Show the label and the raw page content stored in the first training row.
print("Label:\n", df.loc[0, 'Popularity'])
print("Content:\n",df.loc[0,'Page content'])

In [None]:
# Inputs are the raw page contents, targets are the popularity labels.
X = df['Page content'].to_numpy()
y = df['Popularity'].to_numpy()

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Sanity check: sizes of the training split and its first example.
print(X_train.shape)
print(y_train.shape)
print("Label:\n", y_train[0])
print("Content:\n", X_train[0])

### Feature Engineering

In [None]:
import re
from bs4 import BeautifulSoup

def preprocessor(text):
    """Turn an HTML string into lowercase words followed by its emoticons.

    The markup is stripped, emoticons such as ``:)``, ``:-P`` or ``:-D`` are
    pulled out, every run of non-word characters in the remaining text is
    collapsed to a single space, and the emoticons (without the ``-`` nose)
    are appended after one separating space.

    Parameters
    ----------
    text : str
        Raw HTML (or plain text).

    Returns
    -------
    str
        The cleaned, lowercased text plus the trailing emoticons.
    """
    # Keep only the text nodes of the document.
    plain = BeautifulSoup(text, 'html.parser').get_text()

    # Collect the emoticons first and cut them out, so the punctuation
    # clean-up below cannot destroy them.
    emoticon_re = re.compile(r'(?::|;|=|X)(?:-)?(?:\)|\(|D|P)')
    found = emoticon_re.findall(plain)
    plain = emoticon_re.sub('', plain)

    # Lowercase and squash non-word runs into single spaces.
    words = re.sub(r'[\W]+', ' ', plain.lower())
    # Emoticons go at the end, with the '-' nose removed.
    tail = ' '.join(found).replace('-', '')
    return words + ' ' + tail

In [None]:
# Parse the first training document so its structure can be explored by hand.
soup = BeautifulSoup(X_train[0], 'html.parser')
print(soup.prettify())
# Look up the individual fields: headline, the digits:digits:digits time
# string inside <time>, the article's data-channel attribute and the author.
print("Title: ", soup.find("h1", {"class": "title"}).text)
print("Time: ", re.search(r'(\d+:\d+:\d+)', soup.time.text).group(1))
print("Data Channel: ", soup.find("article").get("data-channel"))
print("Author: ", soup.find("span", {"class": "author_name"}).text.replace("By ", ""))
# Alternative lookups, kept disabled for reference:
# print("Author: ", soup.find("span").text)
# print("Author: ", soup.find("a").text)
# print("Data Labels: ", soup.find('div', {'class': 'content-mash-video'}).get('data-labels'))
# print("Data Title: ", soup.find("div", {"class": 'content-mash-video'}).get('data-title'))
# Topic list from the footer, with the " Topics: " label removed and split on commas.
print(soup.find("footer", {"class": 'article-topics'}).text.replace(" Topics: ", "").split(","))

In [None]:
from collections import Counter
from string import punctuation
def _count_tags(section, *names):
    """Return how many tags named in ``names`` occur inside ``section`` (0 if it is None)."""
    if section is None:
        return 0
    return sum(len(section.find_all(name)) for name in names)


def feature_selection(data):
    """Extract a metadata dict from every HTML document in ``data``.

    NOTE: a later cell re-defines ``feature_selection`` with a version that
    takes a single document and returns a string; this list-based version is
    the one in effect only until that cell runs.

    Parameters
    ----------
    data : iterable of str
        Raw HTML documents.

    Returns
    -------
    list of dict
        One dict per document with the keys ``"Title"``, ``"Author"``,
        ``"Time"``, ``"Channel"`` and ``"Topics"``. A value is ``None`` when
        the corresponding element is missing from the page; in that case
        ``"<key>: <index>"`` is printed so the document can be inspected.

    Side effects
    ------------
    Prints the total word count, media count and link count of every document
    (these are diagnostics only; they are not stored in the returned dicts).
    """
    feature_list = []
    for idx, html_text in enumerate(data):
        soup = BeautifulSoup(html_text, 'html.parser')
        feature_map = {}

        title_tag = soup.find("h1", {"class": "title"})
        feature_map["Title"] = title_tag.text if title_tag is not None else None

        # The author markup varies between pages: try the dedicated span
        # first, then fall back to the first <span>, then the first <a>.
        author = soup.find("span", {"class": "author_name"})  # default format
        if author is not None:
            feature_map["Author"] = author.text.replace("By ", "")
        elif soup.find("span") is not None:
            feature_map["Author"] = soup.find("span").text
        elif soup.find("a") is not None:
            feature_map["Author"] = soup.find("a").text
        else:
            # Previously the key was left unset here, which made the None
            # check further down raise KeyError.
            feature_map["Author"] = None
            print(idx)  # flag the document with an unknown author format

        # A missing <time> tag or a non-matching time string yields None
        # instead of an AttributeError.
        time_tag = soup.time
        time_match = re.search(r'(\d+:\d+:\d+)', time_tag.text) if time_tag is not None else None
        feature_map["Time"] = time_match.group(1) if time_match is not None else None

        article = soup.find("article")
        feature_map["Channel"] = article.get("data-channel") if article is not None else None

        footer = soup.find("footer", {"class": 'article-topics'})
        feature_map["Topics"] = (
            footer.text.replace(" Topics: ", "").split(",") if footer is not None else None
        )

        # Word count over the text of every <p> and every <div>.
        # NOTE: find_all() also returns nested tags, so text inside nested
        # <div>s (or a <p> inside a <div>) is counted more than once.
        text_p = (''.join(s.find_all(string=True)) for s in soup.find_all('p'))
        c_p = Counter(x.rstrip(punctuation).lower() for y in text_p for x in y.split())
        text_div = (''.join(s.find_all(string=True)) for s in soup.find_all('div'))
        c_div = Counter(x.rstrip(punctuation).lower() for y in text_div for x in y.split())
        total = c_div + c_p
        print("Total words: ", sum(total.values()))

        section = soup.find("section", {"class": "article-content"})

        # Video + image count (0 when the article-content section is missing).
        img_count = _count_tags(section, "img", "picture", "figure")
        video_count = _count_tags(section, "video", "iframe")
        media_count = img_count + video_count
        print("Media count: ", media_count)

        # "Appealing" count: links plus <strong> tags (printed under the
        # "Link count" label).
        appealing_count = _count_tags(section, "a", "strong")
        print("Link count: ", appealing_count)

        # Report every field that could not be extracted.
        for key in ("Title", "Author", "Time", "Channel", "Topics"):
            if feature_map[key] is None:
                print(f"{key}: {idx}")
        feature_list.append(feature_map)
    return feature_list

print(feature_selection(X_valid)[0])

In [None]:
def feature_selection(text):
    """Condense one HTML article into a space-separated metadata string.

    The result is the concatenation, in this order, of the title, the author,
    the ``digits:digits:digits`` time string found in the ``<time>`` tag, the
    article's ``data-channel`` attribute and the footer topics (with the
    " Topics: " label and the commas removed). Each part except the topics is
    followed by a single space. Parts whose element is missing from the page
    are skipped rather than raising, so one malformed document cannot abort a
    whole vectorizer pass.

    Parameters
    ----------
    text : str
        Raw HTML of one article.

    Returns
    -------
    str
        The metadata string (empty if none of the elements are present).
    """
    feature_str = ""
    soup = BeautifulSoup(text, 'html.parser')

    title = soup.find("h1", {"class": "title"})
    if title is not None:
        feature_str += title.text + " "

    # The author markup varies between pages: try the dedicated span first,
    # then fall back to the first <span>, then the first <a>.
    author = soup.find("span", {"class": "author_name"})  # default format
    if author is not None:
        feature_str += author.text.replace("By ", "") + " "
    elif soup.find("span") is not None:
        feature_str += soup.find("span").text + " "
    elif soup.find("a") is not None:
        feature_str += soup.find("a").text + " "

    # Search once; tolerate both a missing <time> tag and a non-matching text.
    time_tag = soup.time
    time_match = re.search(r'(\d+:\d+:\d+)', time_tag.text) if time_tag is not None else None
    if time_match is not None:
        feature_str += time_match.group(1) + " "

    # .get() returns None when the attribute is absent; None + " " would raise.
    article = soup.find("article")
    channel = article.get("data-channel") if article is not None else None
    if channel is not None:
        feature_str += channel + " "

    footer = soup.find("footer", {"class": 'article-topics'})
    if footer is not None:
        feature_str += footer.text.replace(" Topics: ", "").replace(",", "")

    return feature_str

print(feature_selection(X_valid[0]))

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# nltk.download('stopwords')
stop = stopwords.words('english')
# Frozen copy for constant-time membership tests; `stop` itself stays a list.
_STOP_WORDS = frozenset(stop)
# One shared stemmer instead of building a new one for every document.
_PORTER = PorterStemmer()


def tokenizer_stem_nostop(text):
    """Split ``text`` on whitespace, drop stop words and stem what is left.

    A token is kept only if it starts with an ASCII letter and its lowercase
    form is not an English stop word. The stop-word test is case-insensitive
    because the text handed to this tokenizer is not lowercased beforehand;
    with a case-sensitive test a capitalised stop word such as "The" would
    slip through and come out of the stemmer as the token "the".

    Parameters
    ----------
    text : str
        Whitespace-separated text.

    Returns
    -------
    list of str
        The Porter stems of the surviving tokens, in their original order.
    """
    return [_PORTER.stem(w) for w in re.split(r'\s+', text.strip())
            if w.lower() not in _STOP_WORDS and re.match('[a-zA-Z]+', w)]

print(tokenizer_stem_nostop(feature_selection(X_valid[0])))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams. The custom preprocessor reduces each HTML
# page to its metadata string and the custom tokenizer stems it and removes
# stop words.
tfidf = TfidfVectorizer(ngram_range=(1,2),
                        preprocessor=feature_selection,
                        tokenizer=tokenizer_stem_nostop)

tfidf.fit(X_train)

top = 10
# get idf score of vocabularies
idf = tfidf.idf_
print('[vocabularies with smallest idf scores]')
sorted_idx = idf.argsort()

# When sklearn version <= 0.24.x, should use get_feature_names()
# When sklearn version >= 1.0.x, should use get_feature_names_out()
# Fetch the vocabulary once: every call builds the full array of terms again.
feature_names = tfidf.get_feature_names_out()
# Slicing (rather than range(top)) also copes with a vocabulary smaller than `top`.
for term_idx in sorted_idx[:top]:
    print('%s: %.2f' %(feature_names[term_idx], idf[term_idx]))


# doc_tfidf = tfidf.transform(X_train).toarray()
# tfidf_sum = np.sum(doc_tfidf, axis=0)
# print("\n[vocabularies with highest tf-idf scores]")
# for tok, v in zip(tfidf.inverse_transform(np.ones((1, tfidf_sum.shape[0])))[0][tfidf_sum.argsort()[::-1]][:top], \
#                         np.sort(tfidf_sum)[::-1][:top]):
#     print('{}: {}'.format(tok, v))

### Model Training

In [None]:
import math

def random_mini_batches(X, Y, mini_batch_size = 64, seed=None):
    """Shuffle ``X`` and ``Y`` with the same permutation and cut them into batches.

    Parameters
    ----------
    X, Y : numpy.ndarray
        Arrays with the same length along the first axis; row ``i`` of ``X``
        belongs to row ``i`` of ``Y``.
    mini_batch_size : int, default 64
        Number of examples per batch. The last batch is smaller when the
        number of examples is not a multiple of this size.
    seed : int or None, default None
        ``None`` draws the permutation from NumPy's global random state (the
        original behaviour). Any other value seeds a private generator so the
        batches are reproducible.

    Returns
    -------
    list of (numpy.ndarray, numpy.ndarray)
        ``(batch_X, batch_Y)`` tuples that together cover every example
        exactly once. Empty when there are no examples.

    Raises
    ------
    ValueError
        If ``mini_batch_size`` is smaller than 1 or ``X`` and ``Y`` disagree
        on the number of examples.
    """
    if mini_batch_size < 1:
        raise ValueError(f"mini_batch_size must be >= 1, got {mini_batch_size}")

    m = X.shape[0]  # number of training examples
    if Y.shape[0] != m:
        raise ValueError(f"X has {m} examples but Y has {Y.shape[0]}")

    rng = np.random if seed is None else np.random.RandomState(seed)
    permutation = rng.permutation(m)
    shuffled_X = X[permutation]
    shuffled_Y = Y[permutation]

    # Stepping by the batch size yields the complete batches and, because
    # slicing clips at the end of the array, the shorter final one as well.
    return [
        (shuffled_X[start:start + mini_batch_size],
         shuffled_Y[start:start + mini_batch_size])
        for start in range(0, m, mini_batch_size)
    ]


In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score

# SGD classifier with the logistic loss, so that predict_proba() is available.
# The cell below trains it incrementally with partial_fit().
# NOTE(review): scikit-learn documents max_iter and tol as affecting fit()
# only, not partial_fit() - confirm they are meant to have an effect here.
clf = SGDClassifier(loss='log_loss', alpha=0.01, max_iter=100, tol=1e-3)

In [None]:
# Per-batch AUC history, one entry per training step.
train_auc, val_auc = [], []
iters = 3
batch_size = 512
classes = np.array([-1, 1])

for i in range(iters):
    print(f"epoch: {i+1}/{iters}")
    # Both splits are reshuffled every epoch; validation batches are 20% of
    # the training batch size.
    train_batches = random_mini_batches(X_train, y_train, batch_size)
    valid_batches = random_mini_batches(X_valid, y_valid, int(batch_size * 0.2))
    seen = 0  # training examples consumed so far in this epoch
    for idx, (x_batch, y_batch) in enumerate(train_batches):
        # Count real examples: the last batch is usually smaller than
        # batch_size, so (idx+1)*batch_size would overshoot the dataset size.
        seen += len(y_batch)
        x_batch = tfidf.transform(x_batch)
        clf.partial_fit(x_batch, y_batch, classes=classes)
        train_score = roc_auc_score(y_batch, clf.predict_proba(x_batch)[:,1])
        train_auc.append(train_score)
        print(f'[{seen}/{X_train.shape[0]}]')
        print(f'Train score: {train_score}')

        # Score one validation batch per training step. The modulo wraps
        # around instead of raising IndexError if there are ever fewer
        # validation batches than training batches.
        x_batch, y_batch = valid_batches[idx % len(valid_batches)]
        valid_score = roc_auc_score(y_batch, clf.predict_proba(tfidf.transform(x_batch))[:,1])
        val_auc.append(valid_score)
        print(f'Valid score: {valid_score}')

In [None]:
import os
# Create the output folder on first use (this cell itself writes nothing to it).
if not os.path.exists('output'):
    os.mkdir('output')

import matplotlib.pyplot as plt

# Training and validation AUC per training step, on a shared 1-based batch axis.
batch_axis = range(1, len(train_auc)+1)
fig, ax = plt.subplots()
ax.plot(batch_axis, train_auc, color='blue', label='Train auc')
ax.plot(batch_axis, val_auc, color='red', label='Val auc')
ax.legend(loc="best")
ax.set_xlabel('#Batches')
ax.set_ylabel('Auc')
fig.tight_layout()
plt.show()

In [None]:
# AUC of the TF-IDF model on the whole validation split (column 1 of
# predict_proba is the probability of the second entry in clf.classes_).
valid_score = roc_auc_score(y_valid, clf.predict_proba(tfidf.transform(X_valid))[:,1])
print(f'Valid score: {valid_score}')

In [None]:
print(clf.predict_proba(tfidf.transform(X_train[:10]))[:10,1])

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score

# loss='log_loss' gives logistic regression, which is what provides
# predict_proba(). Older scikit-learn releases spell this loss 'log'.
# NOTE(review): the rename to 'log_loss' appears to date from release 1.1
# rather than 1.0 - confirm against the installed version.
# This rebinds `clf`, replacing the classifier trained on TF-IDF features above.
clf = SGDClassifier(loss='log_loss', max_iter=100, tol=1e-3)
# Hashed bag-of-words with 2**20 buckets, sharing the preprocessor and
# tokenizer used for TF-IDF. It is only ever used through transform() below.
hashvec = HashingVectorizer(n_features=2**20,
                            preprocessor=feature_selection, tokenizer=tokenizer_stem_nostop)

In [None]:
batch_size = 512
classes = np.array([-1, 1])
# Per-batch AUC history, one entry per training step.
train_auc, val_auc = [], []
# we use one batch for training and another for validation in each iteration
iters = 3

for i in range(iters):
    print(f"epoch: {i+1}/{iters}")
    # Both splits are reshuffled every epoch; validation batches are 20% of
    # the training batch size.
    train_batches = random_mini_batches(X_train, y_train, batch_size)
    valid_batches = random_mini_batches(X_valid, y_valid, int(batch_size * 0.2))
    seen = 0  # training examples consumed so far in this epoch
    for idx, (x_batch, y_batch) in enumerate(train_batches):
        # Count real examples: the last batch is usually smaller than
        # batch_size, so (idx+1)*batch_size would overshoot the dataset size.
        seen += len(y_batch)
        x_batch = hashvec.transform(x_batch)
        clf.partial_fit(x_batch, y_batch, classes=classes)
        train_score = roc_auc_score(y_batch, clf.predict_proba(x_batch)[:,1])
        train_auc.append(train_score)
        print(f'[{seen}/{X_train.shape[0]}]')
        print(f'Train score: {train_score}')

        # Score one validation batch per training step. The modulo wraps
        # around instead of raising IndexError if there are ever fewer
        # validation batches than training batches.
        x_batch, y_batch = valid_batches[idx % len(valid_batches)]
        valid_score = roc_auc_score(y_batch, clf.predict_proba(hashvec.transform(x_batch))[:,1])
        val_auc.append(valid_score)
        print(f'Valid score: {valid_score}')

In [None]:
import os
# Create the output folder on first use (this cell itself writes nothing to it).
if not os.path.exists('output'):
    os.mkdir('output')

import matplotlib.pyplot as plt

# AUC history of the hashing-vectorizer model, on a shared 1-based batch axis.
batch_axis = range(1, len(train_auc)+1)
fig, ax = plt.subplots()
ax.plot(batch_axis, train_auc, color='blue', label='Train auc')
ax.plot(batch_axis, val_auc, color='red', label='Val auc')
ax.legend(loc="best")
ax.set_xlabel('#Batches')
ax.set_ylabel('Auc')
fig.tight_layout()
plt.show()

In [None]:
# AUC of the hashing-vectorizer model on the whole validation split.
valid_score = roc_auc_score(y_valid, clf.predict_proba(hashvec.transform(X_valid))[:,1])
print(f'Valid score: {valid_score}')

In [None]:
print(clf.predict_proba(hashvec.transform(X_train[:10]))[:10,1])

### Prediction

In [None]:
# Load the test set. This rebinds `df`, so the training frame loaded at the
# top of the notebook is no longer reachable under that name.
df = pd.read_csv('./datasets/test.csv')
print("Content:\n", df.loc[0])

# Row identifiers for the submission and raw pages to be scored.
Id = df['Id'].to_numpy()
X_test = df['Page content'].to_numpy()

print(X_test.shape)
print("\nId:\n", Id[0])
print("Content:\n", X_test[0])

In [None]:
# Predicted probability of the second entry in clf.classes_ for every test page.
# The probabilities are kept at full precision: the notebook evaluates with
# ROC AUC, which depends only on the ranking of the scores, and rounding to
# one decimal would merge them into at most 11 distinct values and create
# ties that can only lower the AUC.
y_pred = clf.predict_proba(hashvec.transform(X_test))[:,1]
print(y_pred.shape)
print(y_pred[0])

In [None]:
# Pair every test Id with its predicted popularity score.
output_data = dict(Id=Id, Popularity=y_pred)
output_dataframe = pd.DataFrame(output_data)
print(output_dataframe)

# Write the submission file without the DataFrame's row index.
output_dataframe.to_csv(
    "./datasets/y_pred.csv",
    index=False,
)