In [31]:
import pandas as pd
import numpy as np
import _pickle as pkl
from sklearn.model_selection import train_test_split
import re
import os
from bs4 import BeautifulSoup
from collections import Counter
from string import punctuation
from scipy.sparse import hstack
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk import pos_tag, word_tokenize

# Test and Train

In [32]:
df = pd.read_csv('./datasets/train.csv')
X = df.loc[:, 'Page content'].to_numpy()
y = df.loc[:,'Popularity'].to_numpy()

### Tokenizer

In [33]:
POSITIVE_WORDS = os.path.join(os.getcwd(), 'datasets', 'positive-words.txt')
NEGATIVE_WORDS = os.path.join(os.getcwd(), 'datasets', 'negative-words.txt')
pos_words = []
neg_words = []


for line in open(POSITIVE_WORDS, 'r').readlines()[35:]:
    word = line.rstrip()
    pos_words.append(word)

for line in open(NEGATIVE_WORDS, 'r').readlines()[35:]:
    word = line.rstrip()
    neg_words.append(word)

In [34]:
stop = stopwords.words('english')

def tokenizer_stem_neg(text):
    stemmer = LancasterStemmer()
    return [stemmer.stem(w) for w in re.split(r'\s+', text.strip()) \
            if w in neg_words and re.match('[a-zA-Z]+', w)]

def tokenizer_stem_pos(text):
    stemmer = LancasterStemmer()
    return [stemmer.stem(w) for w in re.split(r'\s+', text.strip()) \
            if w in pos_words and re.match('[a-zA-Z]+', w)]
def tokenizer_stem_nostop(text):
    porter = PorterStemmer()
    return [porter.stem(w) for w in re.split(r'\s+', text.strip()) \
            if w not in stop and re.match('[a-zA-Z]+', w)]

### Out of mem 

In [35]:
def get_stream(path, size):
    for chunk in pd.read_csv(path, chunksize=size):
        yield chunk
batch_size = 1024
iters = int((27643+batch_size-1)/(batch_size*2))

In [36]:
stream = get_stream(path='./datasets/train.csv', size=batch_size)
idx = 0
X_train, y_train = None, None
batch_X, batch_y = None, None
for z in range(iters):
    batch = next(stream)
    if(idx==0):
        X_train= batch['Page content']
        y_train = batch['Popularity']
        idx+=1
    else:
        X_train = np.concatenate((X_train, batch['Page content']))
        y_train = np.concatenate((y_train, batch['Popularity']))
    batch = next(stream)

### Features Selection


In [37]:
def feature_selection_part1(data):
    feature_str = ""
    soup = BeautifulSoup(data, 'html.parser')

    # Title
    feature_str += soup.find("h1", {"class": "title"}).text + " "

    # # Channel
    # feature_str += soup.find("article").get("data-channel") + " "

    # # Related Topics
    # feature_str += soup.find("footer", {"class": 'article-topics'}).text.replace(" Topics: ", "").replace(",", "")
    
    section = soup.find("section", {"class": "article-content"})
    paragraph = section.find_all("p")
    first = 0
    last =""
    last_2=""
    for tag in paragraph:
        if(not first):
            feature_str += tag.text + " "
            first = 1
        else:
            last_2=last
            last = tag.text
    if(last_2):feature_str += last_2 + " "
    feature_str = re.sub(r'[.:\']', '', feature_str.lower())
    
    return feature_str
print(feature_selection_part1(X[0]))

nasas grand challenge stop asteroids from destroying earth there may be killer asteroids headed for earth, and nasa has decided to do something about it the space agency announced a new "grand challenge" on june 18 to find all dangerous space rocks and figure out how to stop them from destroying our planet see also how it works nasa asteroid-captureresponses to the request for information, which also seeks ideas for detecting and mitigating asteroid threats, are due july 18the asteroid-retrieval mission, designed to provide the first deep-space mission for astronauts flying on nasas space launch system rocket and orion space capsule under development, has come under fire from lawmakers who would prefer that nasa return to the moona draft nasa authorization bill from the house space subcommittee, which is currently in debate, would cancel the mission and steer the agency toward other projects that bill will be discussed during a hearing wednesday, june 19 at 10 am edtsee also how it wor

In [39]:
import math

author_score =dict()
author_num = dict()
channel_score = dict()
channel_num = dict()
topic_score = dict()
topic_num = dict()
avg_author = 0
avg_channel = 0
avg_topic = 0

def feature_selection_part2(data, istraining):
    X = []
    idx=0
    global author_score
    global author_num 
    global channel_score 
    global channel_num 
    global topic_score 
    global topic_num 
    global avg_author 
    global avg_channel 
    global avg_topic 
    
    if(istraining):
        for html in data:
            soup = BeautifulSoup(html, 'html.parser')
            channel = soup.find("article").get("data-channel")
            if channel in channel_score:
                channel_score[channel] += 1 if int(y[idx])==1 else  0
                channel_num[channel] += 1
            else : 
                channel_score[channel] = 0
                channel_score[channel] += 1 if int(y[idx])==1 else  0
                channel_num[channel] = 1
            
            topics = soup.find("footer", {"class": 'article-topics'}).text.replace(" Topics: ", "").split(",")
            for topic in topics:
                if topic in topic_score:
                    topic_score[topic] += 1 if int(y[idx])==1 else  0
                    topic_num[topic] += 1
                else : 
                    topic_score[topic] = 0
                    topic_score[topic] += 1 if int(y[idx])==1 else  0
                    topic_num[topic] = 1


            # Author
            author_re = r'(?:By\s|by\s)?([a-zA-Z]+(\s[A-Z][a-z]+)*)'
            if soup.head.find("span") == None:
                continue
            else:
                author = re.search(author_re, soup.head.find("span").text).group(1)
                if author in author_score:
                    author_score[author] += 1 if int(y[idx])==1 else  0
                    author_num[author] += 1
                else : 
                    author_score[author] = 0
                    author_num[author] = 1
                    author_score[author] += 1 if int(y[idx])==1 else  0
            idx+=1
        total_channel = 0
        total_topic = 0
        total_author = 0
        for html in data:
            for c,s in channel_score.items():
                avg_channel+=s
                total_channel+=channel_num[c]
            avg_channel = avg_channel/total_channel

            for c,s in topic_score.items():
                avg_topic+=s
                total_topic+=topic_num[c]
            avg_topic = avg_topic/total_topic

            for c,s in author_score.items():
                avg_author+=s
                total_author+=author_num[c]
            avg_author = avg_author/total_author

    for html in data:
        soup = BeautifulSoup(html, 'html.parser')
        feature_list = []
        # Author score
        author_re = r'(?:By\s|by\s)?([a-zA-Z]+(\s[A-Z][a-z]+)*)'
        if soup.head.find("span") == None:
            feature_list.append(avg_author)
        else:
            author = re.search(author_re, soup.head.find("span").text).group(1)
            if author in author_score:
                feature_list.append(author_score[author]/author_num[author])
            else:
                feature_list.append(avg_author)
        # Channel score
        channel = soup.find("article").get("data-channel")
        if channel in channel_score:
            feature_list.append(channel_score[channel]/channel_num[channel])
        else:
            feature_list.append(avg_channel)
        # Topic score
        topics = soup.find("footer", {"class": 'article-topics'}).text.replace(" Topics: ", "").split(",")
        total_score = 0
        order = 0 
        order_denominator = 0
        for i in range(len(topics)):
            order_denominator += math.exp(-0.5*i)

        for topic in topics:
            order_coef = math.exp(-0.5*order)/order_denominator
            if topic in topic_score:
                total_score+=(topic_score[topic]/topic_num[topic])*order_coef
            else:
                total_score+=avg_topic*order_coef
            order+=1
        
        feature_list.append(total_score) 

        # word count of title
        feature_list.append(len(soup.find("h1", {"class": "title"}).text))
        # average word length and unique word rate
        words = re.findall(r'\w+', soup.get_text().lower())
        if words:
            total_words = len(words)
            unique_words = set(words)
            unique_word_count = len(unique_words)
            total_length = sum(len(word) for word in words)
            unique_rate = unique_word_count / total_words
            average_length = total_length / len(words)
            feature_list.append(unique_rate)
            feature_list.append(average_length)
        
        # Time
        if soup.time.text == None or soup.time.text == "":
            feature_list.append(0)
            feature_list.append(0)
        else:
            month = int(re.search(r'(\d+-\d+-\d+)', soup.time.text).group(1).split("-")[1])
            feature_list.append(month)
            hour = int(re.search(r'(\d+:\d+:\d+)', soup.time.text).group(1)[:5].split(":")[0])
            minute = int(re.search(r'(\d+:\d+:\d+)', soup.time.text).group(1)[:5].split(":")[1])
            feature_list.append(hour * 60 + minute)

        # Weekend
        if soup.time.text == None or soup.time.text == "":
            feature_list.append(0)
        else:
            feature_list.append(1 if soup.time.get("datetime")[:3] in ["Sat", "Sun"] else 0)

        # Word Count
        text_p = (''.join(s.findAll(string=True))for s in soup.findAll('p'))
        c_p = Counter((x.rstrip(punctuation).lower() for y in text_p for x in y.split()))
        text_div = (''.join(s.findAll(string=True))for s in soup.findAll('div'))
        c_div = Counter((x.rstrip(punctuation).lower() for y in text_div for x in y.split()))
        total = c_div + c_p
        feature_list.append(len(list(total.elements())))

        section = soup.find("section", {"class": "article-content"})

        # Video + Image count
        img_count = len(section.find_all("img")) + len(section.find_all("picture")) + len(section.find_all("figure"))
        video_count = len(section.find_all("video")) + len(section.find_all("iframe"))
        media_count = img_count + video_count
        feature_list.append(media_count)

        # Appealing count
        link_count = len(section.find_all("a"))
        strong_count = len(section.find_all("strong"))
        appealing_count = link_count + strong_count
        feature_list.append(appealing_count)

        # POS & NEG count
        paragraph = section.find_all("p")
        pos_count = 0
        neg_count = 0
        q_count = 0
        ex_count = 0
        for tag in paragraph:
            pos_count += len(tokenizer_stem_pos(tag.text))
            neg_count += len(tokenizer_stem_neg(tag.text))
            if tag.text.find("?") != -1:
                q_count += tag.text.find("?")
            if tag.text.find("!") != -1:
                ex_count += tag.text.find("!")
        feature_list.append(pos_count)    
        feature_list.append(neg_count)
        feature_list.append(q_count)
        feature_list.append(ex_count)

        X.append(feature_list)
    return np.array(X)
feature_selection_part2(X[0:10],True)

UnboundLocalError: cannot access local variable 'avg_channel' where it is not associated with a value

In [None]:
def feature_selection_part3(data):
    feature_str = ""
    soup = BeautifulSoup(data, 'html.parser')
    
    
    feature_str = re.sub(r'[.:\',$()`]', '', feature_str.lower())
    return feature_str

print(feature_selection_part3(X[0]))
    




In [None]:
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier

### Random + GBC

#### training vectorizer

In [None]:
X_train, X_valid, Y_train, Y_valid = train_test_split(X, y, test_size=.2)
print('Begin vectorizer Training...')

tfidf = TfidfVectorizer(ngram_range=(1,2),
                        preprocessor=feature_selection_part1,
                        tokenizer=tokenizer_stem_nostop)
tfidf.fit(X)

# tfidf_type = TfidfVectorizer(ngram_range=(1,2),
#                         preprocessor=feature_selection_part3,
#                         tokenizer=tokenizer_stem_nostop)
# tfidf_type.fit(X)

Begin vectorizer Training...




KeyboardInterrupt: 

#### training model

In [None]:
# print('Begin Type Training...')
# type_model = SGDClassifier(loss='log_loss', max_iter=200)
# x_train_type = tfidf_type.transform(X_train)
# y_train = LabelEncoder().fit_transform(Y_train)
# type_model.fit(x_train_type, y_train)

print('Begin Word Training...')
word_model = RandomForestClassifier(n_estimator=100, max_features=0.4, max_depth=5, min_samples_leaf=5)
x_train_word = tfidf.transform(X_train)
y_train = LabelEncoder().fit_transform(Y_train)
word_model.fit(x_train_word, y_train)

print('Begin Stats Training...')
stats_model = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
x_train_stats = feature_selection_part2(X_train, True)
stats_model.fit(x_train_stats, y_train)

#### evaluation

In [None]:
print('Begin Evaluation...')
y_valid = LabelEncoder().fit_transform(Y_valid)

# y_pred_type_train = type_model.predict_proba(x_train_type)[:,1]
# y_pred_type_valid = type_model.predict_proba(tfidf_type.transform(X_valid))[:,1]

y_pred_word_train = word_model.predict_proba(x_train_word)[:,1]
y_pred_word_valid = word_model.predict_proba(tfidf.transform(X_valid))[:,1]

y_pred_stats_train = stats_model.predict_proba(x_train_stats)[:,1]
y_pred_stats_valid = stats_model.predict_proba(feature_selection_part2(X_valid, False))[:,1]

# print("Type Model")
# print(f'Train score: {roc_auc_score(y_train, y_pred_type_train)}')
# print(f'Valid score: {roc_auc_score(y_valid, y_pred_type_valid)}')

print("Word Model")
print(f'Train score: {roc_auc_score(y_train, y_pred_word_train)}')
print(f'Valid score: {roc_auc_score(y_valid, y_pred_word_valid)}')

print("Stats Model")
print(f'Train score: {roc_auc_score(y_train, y_pred_stats_train)}')
print(f'Valid score: {roc_auc_score(y_valid, y_pred_stats_valid)}')

#### find best para

In [None]:
train_score = []
valid_score = []
best_coef = []
best_train = 0
best_valid = 0
coef_ = np.linspace(0,.5,21)
print('Find best para...')
for i in coef_:
    # for j in coef_:
    # train_score = roc_auc_score(y_train, y_pred_word_train*(i) + y_pred_stats_train*(1-i) + y_pred_type_train*(1-i-j))
    # valid_score = roc_auc_score(y_valid, y_pred_word_valid*(i) + y_pred_stats_valid*(1-i) + y_pred_type_valid*(1-i-j))
    train_score = roc_auc_score(y_train, y_pred_word_train*(i) + y_pred_stats_train*(1-i))
    valid_score = roc_auc_score(y_valid, y_pred_word_valid*(i) + y_pred_stats_valid*(1-i))
    if valid_score > best_valid:
        best_valid = valid_score
        best_train = train_score
        best_coef = [i,1-i]
        # best_coef = [i,j,1-i-j]

print(f'Train score: {best_train}')
print(f'Valid score: {best_valid}')

### back up model

In [None]:
pkl.dump(word_model, open("output/backup_word.pickle", "wb"))
pkl.dump(stats_model, open("output/backup_stats.pickle", "wb"))
# pkl.dump(type_model, open('output/backup_topic.pickle', "wb"))
pkl.dump(tfidf, open('output/tfidf_part1.pickle', "wb"))
# pkl.dump(tfidf_type, open('output/tfidf_first_paragraph.pickle', "wb"))

### Prediction

In [None]:
df = pd.read_csv('./datasets/test.csv')
print("Content:\n",df.loc[0])

Id = df.loc[:, 'Id'].to_numpy()
X_test = df.loc[:, 'Page content'].to_numpy()
print(X_test.shape)
print("\nId:\n", Id[0])
print("Content:\n", X_test[0])

In [None]:
# x_test_type = tfidf_type.transform(X_test)
x_test_word = tfidf.transform(X_test)
x_test_stats = feature_selection_part2(X_test)

# y_pred_type_test = type_model.predict_proba(x_test_type)[:,1]
y_pred_word_test = word_model.predict_proba(x_test_word)[:,1]
y_pred_stats_test = stats_model.predict_proba(x_test_stats)[:,1]
y_pred = np.around(y_pred_word_test*(best_coef[0]) + y_pred_stats_test*(best_coef[1]), decimals=3)

In [None]:
output_data = {'Id': Id, 'Popularity': y_pred}
output_dataframe = pd.DataFrame(output_data)
print(output_dataframe)

output_dataframe.to_csv("./datasets/y_pred.csv", index=None)