In [1]:
import pandas as pd
import numpy as np

In [2]:
import urllib
from bs4 import BeautifulSoup
import webbrowser

In [3]:
import gensim 
from gensim.parsing.preprocessing import preprocess_string, remove_stopwords, stem, strip_non_alphanum, strip_numeric, strip_tags, strip_punctuation, strip_short
CUSTOM_FILTERS = [lambda x: x.lower(), remove_stopwords, 
                  stem, strip_non_alphanum, strip_numeric, 
                  strip_tags, strip_punctuation, strip_short]

In [4]:
from scipy import spatial

In [6]:
!cp ~/Library/Application\ Support/Google/Chrome/Default/History ./ # Google Chrome history database location on Mac OS X
## please change the location according to your OS (Linux/Winwin/MAC)

In [7]:
# sqlite query to extract the browsing data
!mv ./chrome_history.txt ./chrome_history_temp.txt
!sqlite3 History "select datetime(last_visit_time/1000000-11644473600,'unixepoch','localtime'),url from  urls where last_visit_time > 0 order by last_visit_time desc" >> ./chrome_history.txt

In [76]:
# re-format the data

# Open file
with open('chrome_history.txt') as f:
    content = f.readlines()
# Strip whitespace then split on first occurrence of pipe character
raw_data = [line.split('|', 1) for line in [x.strip() for x in content]]

In [10]:
data = pd.DataFrame(raw_data, columns=['datetime', 'url'])
data['pages_desc'] = data.url

In [12]:
data.datetime = pd.to_datetime(data.datetime.values)

In [14]:
# sparse the URLs
from urllib.parse import urlparse
netloc_parser = lambda u: urlparse(u).netloc
data.url = data.url.apply(netloc_parser)

In [16]:
# extract the website of interests and the category 
# (works for m.phys.org, change according to the targeted website, 
# e.g. have a look to the page source from Chrome)
# You should have some historic on 'm.phys.org' for it to work
key_website = 'm.phys.org'
bool_website = data.pages_desc.str.contains(key_website)
key_pages = '/news/'
bool_pages = data.pages_desc.str.contains(key_pages)
data[bool_website&bool_pages].head(3)

Unnamed: 0,datetime,url,pages_desc
145,2018-05-21 18:10:30,m.phys.org,https://m.phys.org/news/2018-05-weve-nucleolus...
146,2018-05-21 18:10:30,m.phys.org,https://m.phys.org/news/2018-05-magnonic-inter...
147,2018-05-21 18:10:30,m.phys.org,https://m.phys.org/news/2018-05-chemistry-smar...


In [17]:
pages_list = data[bool_website&bool_pages].pages_desc.values.tolist()

In [18]:
pages_list[:3]

['https://m.phys.org/news/2018-05-weve-nucleolus-left-school.html',
 'https://m.phys.org/news/2018-05-magnonic-interferometer-paves-energy-efficient-devices.html',
 'https://m.phys.org/news/2018-05-chemistry-smart-drugs-smarter.html']

In [19]:
# extract the topics only (title here)
topics_list = list()
for weblink in pages_list:
    topics_list.append(str(BeautifulSoup(urllib.request.urlopen(weblink),"html5lib").title.string))

In [20]:
topics_list[:3]

["What we've learned about the nucleolus since you left school",
 'Magnonic interferometer paves way toward energy-efficient information processing devices',
 'Researchers develop new chemistry to make smart drugs smarter']

In [21]:
print(len(topics_list))

79


In [22]:
# load word2vec model, here GoogleNews is used
# warning: *.bin can be heavy for your RAM (tried on 8GB RAM without any problem or drag)
model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)

In [23]:
# define the topics representation from their components (words)

index2word_set = set(model.index2word)

# sentence/topic representation through averaging of the words representation
def avg_feature_vector(words, model=model, num_features=300, index2word_set=index2word_set):
    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    for word in words:
        if word in index2word_set:
            n_words += 1
            feature_vec = np.add(feature_vec, model.get_vector(word))
    if (n_words > 0):
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec

In [24]:
# pre-process the topics list for avoiding unuseful words/particles
preprocess_topics_list = list()
for itopics in topics_list:
    preprocess_topics_list.append(preprocess_string(itopics, CUSTOM_FILTERS))
preprocess_topics_list[:3]

[['learn', 'nucleolu', 'left', 'school'],
 ['magnon',
  'interferomet',
  'pave',
  'wai',
  'energy',
  'effici',
  'inform',
  'process',
  'devic'],
 ['research', 'develop', 'new', 'chemistri', 'smart', 'drug', 'smarter']]

In [25]:
topics_list_prev = list()
with open('./topics_list.txt','r') as fp:
    for itopics in fp:
        topics_list_prev.append(itopics.split())

In [26]:
len(topics_list_prev)

79

In [27]:
# merge the new topics with previous for uniqueness check and saving
topics_list = [list(i) for i in set(tuple(i) for i in (preprocess_topics_list+topics_list_prev))]

In [28]:
len(topics_list)

79

In [29]:
topics_list[:3]

[['amateur', 'mathematician', 'partial', 'solv', 'year', 'old', 'problem'],
 ['evolut', 'language', 'there', 'app'],
 ['artifici', 'intellig', 'acceler', 'discoveri', 'metal', 'glass']]

In [30]:
# save/re-write in memory a topics list
!rm -f ./topics_list.txt
with open('./topics_list.txt','w') as fp: 
    for itopics in topics_list:
        fp.write(' '.join(itopics)+'\n')

In [67]:
# extract topics title, URLs
topics_list_new = list()
page_titles_list = list()
page_urls_list = list()
max_page_number = 1
for page_number in range(1,max_page_number+1):
    page_titles = BeautifulSoup(urllib.request.urlopen('http://m.phys.org/page{}.html'.format(page_number)),
                                "html5lib").find_all('h3', {"class": "ui-li-heading"})
    page_urls = BeautifulSoup(urllib.request.urlopen('http://m.phys.org/page{}.html'.format(page_number)),
                                "html5lib").find_all('a', {"class": "ui-link-inherit list-img"}, href=True)
    for page_title in page_titles:
        page_titles_list.append(str(page_title.text))
        topics_list_new.append(preprocess_string(str(page_title.text), CUSTOM_FILTERS))
    for c, page_url in enumerate(page_urls):
        page_urls_list.append('<a href="{}">{}</a>'.format(page_url['href'], page_titles[c]))

In [68]:
# compute the distances between past and new topics in the space of representation

score_list = list()

s1_afv = list()
for topic in topics_list:
    s1_afv.append(avg_feature_vector(topic))
    
s2_afv = list()
for topic in topics_list_new:    
    s2_afv.append(avg_feature_vector(topic))
    
for vec2 in s2_afv:
    score_list_tmp = list()
    for vec1 in s1_afv:
        if np.sum(np.power(vec2,2)) != 0. and np.sum(np.power(vec1,2)) != 0.:
            sim = 1-spatial.distance.cosine(np.array(vec1), np.array(vec2)) # distance/similarity between articles' topics
        else:
            sim = 0.
        score_list_tmp.append(sim)
    score_list.append(np.max(score_list_tmp))

In [69]:
# save to a dataframe
importance_article = pd.DataFrame(
                                 {
                                     'page_titles': page_titles_list,
                                     'score': score_list, 
                                     'page_urls': page_urls_list
                                 }
                                 )

In [70]:
importance_article = importance_article.sort_values(by=['score'], ascending=False) # rank by the score
importance_article = importance_article[importance_article.score<1.] # avoid already read articles
importance_article.head(5)

Unnamed: 0,page_titles,page_urls,score
4,Nanoparticles could offer a new way to help eradicate polio worldwide,"<a href=""https://m.phys.org/news/2018-05-nanoparticles-eradicate-polio-worldwide.html""><h3 class=""ui-li-heading"">Nanoparticles could offer a new way to help eradicate polio worldwide</h3></a>",0.620211
5,Major fossil study sheds new light on emergence of early animal life 540 million years ago,"<a href=""https://m.phys.org/news/2018-05-major-fossil-emergence-early-animal.html""><h3 class=""ui-li-heading"">Major fossil study sheds new light on emergence of early animal life 540 million years ago</h3></a>",0.556796
2,A better way to control crystal vibrations,"<a href=""https://m.phys.org/news/2018-05-crystal-vibrations.html""><h3 class=""ui-li-heading"">A better way to control crystal vibrations</h3></a>",0.552698
6,Chemists synthesize millions of proteins not found in nature,"<a href=""https://m.phys.org/news/2018-05-chemists-millions-proteins-nature.html""><h3 class=""ui-li-heading"">Chemists synthesize millions of proteins not found in nature</h3></a>",0.520514
0,"Humans account for little next to plants, worms, bugs","<a href=""https://m.phys.org/news/2018-05-humans-account-worms-bugs.html""><h3 class=""ui-li-heading"">Humans account for little next to plants, worms, bugs</h3></a>",0.508767


In [75]:
# display the dataframe as an html page with clickable hyperlinks
old_width = pd.get_option('display.max_colwidth')
pd.set_option('display.max_colwidth', -1) # avoid the truncation of the urls
importance_article.drop(['page_titles'],axis=1).set_index('score').to_html('new_topics_list.html', escape=False)
webbrowser.open_new('file:///Users/<user_name>/<working_directory>/new_topics_list.html') # change for your OS
pd.set_option('display.max_colwidth', old_width)

## the result is not beautiful but useful. Please, feel free to fork it, change it to your tastes
## I know there are redundancies in the code, but I'm sure you'll simply improve it ;) 
## Have fun and a sane newsfeed!