In [40]:
from datetime import datetime
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup

In [41]:
loc_prefix = '.'

def save_to(df, file_name):
    """Write ``df`` as CSV, without its index, to ``<loc_prefix>/data/<file_name>``.

    Args:
        df: Object exposing ``to_csv`` (a pandas DataFrame in this notebook).
        file_name: Path relative to the ``data`` directory under ``loc_prefix``.
    """
    target_path = loc_prefix + '/data/' + file_name
    df.to_csv(target_path, index=False)
def read_csv(file_name):
    """Load ``<loc_prefix>/data/<file_name>`` with ``pd.read_csv`` and return the frame.

    Args:
        file_name: Path relative to the ``data`` directory under ``loc_prefix``.
    """
    source_path = loc_prefix + '/data/' + file_name
    return pd.read_csv(source_path)

def read_train_test(rel_path_train, rel_path_test):
    """Read the train and test CSV files and preview each of them.

    For every file, in train-then-test order, the relative path and the
    frame's shape are printed and the first five rows are displayed.

    Args:
        rel_path_train: Path of the training CSV, as accepted by ``read_csv``.
        rel_path_test: Path of the test CSV, as accepted by ``read_csv``.

    Returns:
        A ``(train_frame, test_frame)`` tuple.
    """
    loaded = []
    for rel_path in (rel_path_train, rel_path_test):
        frame = read_csv(rel_path)
        print(f'{rel_path} {frame.shape}')
        display(frame.head(5))
        loaded.append(frame)

    train_frame, test_frame = loaded
    return train_frame, test_frame

In [42]:
# Load the train and test splits. read_train_test prints each frame's shape
# and displays its first five rows; paths are resolved under <loc_prefix>/data/.
df_train, df_test = read_train_test('train/train.csv', 'test/test.csv')

train/train.csv (27643, 3)


Unnamed: 0,Id,Popularity,Page content
0,0,-1,"<html><head><div class=""article-info""> <span c..."
1,1,1,"<html><head><div class=""article-info""><span cl..."
2,2,1,"<html><head><div class=""article-info""><span cl..."
3,3,-1,"<html><head><div class=""article-info""><span cl..."
4,4,-1,"<html><head><div class=""article-info""><span cl..."


test/test.csv (11847, 2)


Unnamed: 0,Id,Page content
0,27643,"<html><head><div class=""article-info""><span cl..."
1,27644,"<html><head><div class=""article-info""><span cl..."
2,27645,"<html><head><div class=""article-info""><span cl..."
3,27646,"<html><head><div class=""article-info""><span cl..."
4,27647,"<html><head><div class=""article-info""><span cl..."


In [43]:
# df_train = df_train[:100]

In [44]:
def preprocessor(text):
    """Strip HTML from ``text`` and normalise it while keeping emoticons.

    Steps:
      1. Drop the markup with BeautifulSoup and keep only the text.
      2. Pull out emoticons such as ``:)``, ``:-P`` or ``=D`` and remove them
         from the text so the punctuation clean-up does not destroy them.
      3. Lower-case the text and collapse every run of non-word characters
         into a single space.
      4. Append the emoticons (space separated, with the ``-`` "nose"
         removed) after the cleaned text.

    Args:
        text: Raw HTML (or plain text) string.

    Returns:
        The cleaned string, followed by one space and the collected emoticons.
    """
    # Raw string literals: '\)', '\(' and '\W' are invalid escape sequences in
    # ordinary string literals and trigger a SyntaxWarning on recent Pythons.
    emoticon_pattern = r'(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    non_word_pattern = r'[\W]+'

    # remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # collect the emoticons first, then cut them out of the text
    emoticons = re.findall(emoticon_pattern, text)
    text = re.sub(emoticon_pattern, '', text)

    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-', '') removes the nose of emoticons
    cleaned = re.sub(non_word_pattern, ' ', text.lower())
    return cleaned + ' ' + ' '.join(emoticons).replace('-', '')

In [45]:
# Names of the columns that receive the parsed publication time.
_TIME_COLUMNS = ['year', 'month', 'day', 'hour', 'min', 'sec']
# Format of the ``datetime`` attribute on the article's <time> tag.
_TIME_FORMAT = '%a, %d %b %Y %H:%M:%S %z'


def _extract_author(soup):
    """Return the second-to-last '/'-separated part of the '.author_name' link's href, or None."""
    author_tag = soup.select_one('.author_name')
    link = author_tag.a if author_tag is not None else None
    href = link.get('href') if link is not None else None
    if not href:
        return None
    parts = href.split('/')
    # A href without at least one '/' has no second-to-last component.
    return parts[-2] if len(parts) >= 2 else None


def _extract_time(soup):
    """Return [year, month, day, hour, minute, second] from <time datetime=...>, or six zeros."""
    time_tag = soup.time
    if time_tag is None or not time_tag.has_attr('datetime'):
        return [0] * len(_TIME_COLUMNS)
    try:
        parsed = datetime.strptime(time_tag['datetime'], _TIME_FORMAT)
    except (TypeError, ValueError):
        # strptime raises (it never returns None) on a missing/malformed value;
        # fall back to the same all-zero marker used when the tag is absent.
        return [0] * len(_TIME_COLUMNS)
    return [parsed.year, parsed.month, parsed.day,
            parsed.hour, parsed.minute, parsed.second]


def capture_attribute(df):
    """Parse the HTML in ``df['Page content']`` into per-article feature columns.

    The input frame is left untouched; a copy with the extra columns is
    returned. Progress is printed every 1000 rows, followed by the number of
    distinct topics and the shape of the resulting frame.

    Added columns:
        data-channel     ``data-channel`` attribute of <article>, or the string 'None'
        title            text of ``h1.title`` ('' when missing)
        raw_content      text of ``.article-content`` ('' when missing)
        content_length   number of pieces in ``raw_content.split(' ')``
        avg_word_length  mean length of the non-empty pieces (0.0 if there are none)
        topics           lower-cased text of the <footer> links, comma separated
        author           second-to-last path part of the ``.author_name`` link, or None
        year, month, day, hour, min, sec
                         parsed from the <time datetime=...> attribute; all 0
                         when the tag/attribute is missing or cannot be parsed

    Args:
        df: DataFrame with a 'Page content' column holding one HTML document per row.

    Returns:
        A new DataFrame: a copy of ``df`` plus the columns listed above.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    df_new = df.copy()

    channels = []
    titles = []
    raw_contents = []
    content_lens = []
    avg_word_lens = []
    topics = []
    unique_topics = set()
    authors = []
    times = []

    for count, row in enumerate(df_new['Page content']):
        soup = BeautifulSoup(row, 'html.parser')

        # data-channel
        article = soup.article
        if article is not None:
            channels.append(article.get('data-channel', 'None'))
        else:
            channels.append('None')

        # title
        title_tag = soup.select_one('h1.title')
        titles.append(title_tag.text if title_tag is not None else '')

        # raw content
        content_tag = soup.select_one('.article-content')
        content = content_tag.text if content_tag is not None else ''
        raw_contents.append(content)

        # content length: every piece of the split counts, including empty ones
        pieces = content.split(' ')
        content_lens.append(len(pieces))

        # avg word length over non-empty pieces; the epsilon avoids a division by zero
        word_lens = [len(word) for word in pieces if len(word) > 0]
        avg_word_lens.append(sum(word_lens) / (len(word_lens) + 1e-9))

        # topics
        footer = soup.footer
        if footer is not None:
            topic_arr = [t.get_text().lower() for t in footer.find_all('a')]
        else:
            topic_arr = []
        topics.append(','.join(topic_arr))
        unique_topics.update(topic_arr)

        # author and publication time
        authors.append(_extract_author(soup))
        times.append(_extract_time(soup))

        if count % 1000 == 0:
            print(f'Parse Progress: {count} th')

    print(f'Unique Topics {len(unique_topics)}')

    df_new['data-channel'] = channels
    df_new['title'] = titles
    df_new['raw_content'] = raw_contents
    df_new['content_length'] = content_lens
    df_new['avg_word_length'] = avg_word_lens
    df_new['topics'] = topics
    df_new['author'] = authors
    # Assign the time parts one column at a time; this also works for an empty frame.
    for position, column in enumerate(_TIME_COLUMNS):
        df_new[column] = [parts[position] for parts in times]

    print(f'Converted DF Shape: {df_new.shape}')

    return df_new

In [46]:
# Extract the per-article feature columns for both splits; each name is
# rebound to the frame returned by capture_attribute.
print(f'df_train {df_train.shape}')
df_train = capture_attribute(df_train)
print(f'df_test {df_test.shape}')
df_test = capture_attribute(df_test)

df_train (27643, 3)
Parse Progress: 0 th
Parse Progress: 1000 th
Parse Progress: 2000 th
Parse Progress: 3000 th
Parse Progress: 4000 th
Parse Progress: 5000 th
Parse Progress: 6000 th
Parse Progress: 7000 th
Parse Progress: 8000 th
Parse Progress: 9000 th
Parse Progress: 10000 th
Parse Progress: 11000 th
Parse Progress: 12000 th
Parse Progress: 13000 th
Parse Progress: 14000 th
Parse Progress: 15000 th
Parse Progress: 16000 th
Parse Progress: 17000 th
Parse Progress: 18000 th
Parse Progress: 19000 th
Parse Progress: 20000 th
Parse Progress: 21000 th
Parse Progress: 22000 th
Parse Progress: 23000 th
Parse Progress: 24000 th
Parse Progress: 25000 th
Parse Progress: 26000 th
Parse Progress: 27000 th
Unique Topics 14012
Converted DF Shape: (27643, 16)
df_test (11847, 2)
Parse Progress: 0 th
Parse Progress: 1000 th
Parse Progress: 2000 th
Parse Progress: 3000 th
Parse Progress: 4000 th
Parse Progress: 5000 th
Parse Progress: 6000 th
Parse Progress: 7000 th
Parse Progress: 8000 th
Parse Pro

In [47]:
# Inspect the frames returned by capture_attribute: shape plus the first five rows.
print(f'df_train {df_train.shape}')
display(df_train.head(5))
print(f'df_test {df_test.shape}')
display(df_test.head(5))

df_train (27643, 16)


Unnamed: 0,Id,Popularity,Page content,data-channel,title,raw_content,content_length,avg_word_length,topics,author,year,month,day,hour,min,sec
0,0,-1,"<html><head><div class=""article-info""> <span c...",world,NASA's Grand Challenge: Stop Asteroids From De...,There may be killer asteroids headed for Eart...,583,5.214905,"asteroid,asteroids,challenge,earth,space,u.s.,...",,2013,6,19,15,4,30
1,1,1,"<html><head><div class=""article-info""><span cl...",tech,Google's New Open Source Patent Pledge: We Won...,Google took a stand of sorts against patent-l...,309,5.032787,"apps and software,google,open source,opn pledg...",christina,2013,3,28,17,40,55
2,2,1,"<html><head><div class=""article-info""><span cl...",entertainment,Ballin': 2014 NFL Draft Picks Get to Choose Th...,You've spend countless hours training to be a...,1360,4.750225,"entertainment,nfl,nfl draft,sports,television",sam-laird,2014,5,7,19,15,20
3,3,-1,"<html><head><div class=""article-info""><span cl...",watercooler,Cameraperson Fails Deliver Slapstick Laughs,Tired of the same old sports fails and ne...,476,4.841727,"sports,video,videos,watercooler",sam-laird,2013,10,11,2,26,50
4,4,-1,"<html><head><div class=""article-info""><span cl...",entertainment,NFL Star Helps Young Fan Prove Friendship With...,"At 6-foot-5 and 298 pounds, All-Pro NFL star ...",1937,5.08965,"entertainment,instagram,instagram video,nfl,sp...",connor-finnegan,2014,4,17,3,31,43


df_test (11847, 15)


Unnamed: 0,Id,Page content,data-channel,title,raw_content,content_length,avg_word_length,topics,author,year,month,day,hour,min,sec
0,27643,"<html><head><div class=""article-info""><span cl...",entertainment,Soccer Star Gets Twitter Death Threats After T...,Note to humanity: One Direction fandom ai...,622,5.220114,"entertainment,music,one direction,soccer,sports",sam-laird,2013,9,9,19,47,2
1,27644,"<html><head><div class=""article-info""><span cl...",tech,Google Glass Gets an Accessory Store,Shortly after announcing a hardware upgrade f...,149,4.739437,"gadgets,glass,google,google glass,google glass...",stan-schroeder,2013,10,31,9,25,2
2,27645,"<html><head><div class=""article-info""><span cl...",business,OUYA Gaming Console Already Sold Out on Amazon,"Well, that was quick. Just hours after going ...",168,5.018293,"amazon,amazon kindle,business,gaming",todd-wasserman,2013,6,25,12,54,54
3,27646,"<html><head><div class=""article-info""><span cl...",film,'Between Two Ferns' Mocks Oscar Nominees,Between Two Ferns: Oscar Buzz Edition Part 1...,162,5.581699,"between two ferns,movies,the oscars,oscars 201...",neha-prakash,2013,2,13,3,30,21
4,27647,"<html><head><div class=""article-info""><span cl...",entertainment,'American Sniper' Trailer: Looks Like Eastwood...,Ever since The Hurt Locker it seems like ...,225,5.041096,"american sniper,awards,bradley cooper,clint ea...",josh-dickey,2014,10,3,1,34,54


In [48]:
# np_t = np.array(times)
# print(np_t)
# print(np_t.shape)
# for i in range(np_t.shape[0]):
#     np_t[i] = np.array(np_t[i])
#     if np_t[i].shape[0] != 6:
#         print("ERR")
#         df.loc[i, ['year', 'month', 'day', 'hour', 'min', 'sec']] = np.array([0, 0, 0, 0, 0, 0])
#     else:
#         df.loc[i, ['year', 'month', 'day', 'hour', 'min', 'sec']] = np_t[i]

# print(np_t.shape)
# print(np_t)

In [49]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Download the NLTK 'stopwords' and 'wordnet' packages, then load the English
# stopword list; `stop` is read by both tokenizers defined in this cell.
nltk.download('stopwords')
nltk.download('wordnet')
stop = stopwords.words('english')

def tokenizer_stem_nostop(text):
    """Split ``text`` on whitespace, drop stopwords, and Porter-stem the rest.

    A token is kept only when it is not in the module-level ``stop`` list and
    it starts with an ASCII letter (``re.match`` anchors at the token start).

    Args:
        text: String to tokenize; leading/trailing whitespace is ignored.

    Returns:
        List of stemmed tokens, in their original order.
    """
    porter = PorterStemmer()
    # Raw strings: '\s' is an invalid escape sequence in an ordinary literal.
    return [porter.stem(w) for w in re.split(r'\s+', text.strip())
            if w not in stop and re.match(r'[a-zA-Z]+', w)]

def tokenizer_lemm_nostop(text):
    """Split ``text`` on whitespace, drop stopwords, and WordNet-lemmatize the rest.

    A token is kept only when it is not in the module-level ``stop`` list and
    it starts with an ASCII letter (``re.match`` anchors at the token start).

    Args:
        text: String to tokenize; leading/trailing whitespace is ignored.

    Returns:
        List of lemmatized tokens, in their original order.
    """
    lemm = WordNetLemmatizer()
    # Raw strings: '\s' is an invalid escape sequence in an ordinary literal.
    return [lemm.lemmatize(w) for w in re.split(r'\s+', text.strip())
            if w not in stop and re.match(r'[a-zA-Z]+', w)]

# Run both tokenizers on the same sentence to compare stemming with lemmatization.
print(tokenizer_stem_nostop('runners like running and thus they run'))
print(tokenizer_lemm_nostop('runners like running and thus they run'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pp20/pp20s02/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/pp20/pp20s02/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['runner', 'like', 'run', 'thu', 'run']
['runner', 'like', 'running', 'thus', 'run']


In [50]:
# stemmed_nostop_page_data = []
# for page in cleaned_page_data:
#     stemmed_nostop_page_data.append(tokenizer_stem_nostop(page))

In [51]:
# Write both feature frames as CSV (no index column) under <loc_prefix>/data/.
save_to(df_test, 'test/feats_test.csv')
save_to(df_train, 'train/feats_train.csv')