In [4]:
import hashlib
import pickle
import time

import feedparser
import gensim
import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk import word_tokenize
from nltk.corpus import stopwords
from pyemd import emd
from scipy.cluster.hierarchy import fclusterdata
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from sklearn.cluster import DBSCAN, AffinityPropagation, AgglomerativeClustering
from sklearn.exceptions import NotFittedError
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier

stop_words = stopwords.words('english')

#from itertools import combinations
#from tqdm import tqdm_notebook
#from scipy.stats import skew, kurtosis
#from sklearn.preprocessing import StandardScaler
#from sklearn.decomposition import PCA

In [2]:
# RSS feed URLs passed to RSS_Feeds below; one URL per BBC News section.
bbc_rss = ['http://feeds.bbci.co.uk/news/rss.xml', 
           'http://feeds.bbci.co.uk/news/world/rss.xml', 
           'http://feeds.bbci.co.uk/news/uk/rss.xml', 
           'http://feeds.bbci.co.uk/news/business/rss.xml', 
           'http://feeds.bbci.co.uk/news/politics/rss.xml', 
           'http://feeds.bbci.co.uk/news/health/rss.xml', 
           'http://feeds.bbci.co.uk/news/education/rss.xml', 
           'http://feeds.bbci.co.uk/news/science_and_environment/rss.xml', 
           'http://feeds.bbci.co.uk/news/technology/rss.xml', 
           'http://feeds.bbci.co.uk/news/entertainment_and_arts/rss.xml']

In [3]:
class Embeddings:
    """Provides a word-embedding model, either unpickled from disk or built from raw vectors.

    Parameters
    ----------
    create : bool
        When equal to ``False`` (the default) the model is unpickled from
        ``ser_model_path``; otherwise it is built with ``model_fun``.
    ser_model_path : str
        Path of the pickled model read by ``load_model``.
    embeddings : str
        Path handed to ``model_fun`` as its first argument.
    model_fun : callable
        Loader called as ``model_fun(embeddings, binary=binary)``.
    binary : bool
        Forwarded to ``model_fun``.
    norm : bool
        When true, ``create_model`` calls ``init_sims(replace=True)`` on the
        freshly loaded model.
    """

    def __init__(self, create=False, ser_model_path='W2VModel',
                 embeddings='GoogleNews-vectors-negative300.bin.gz',
                 model_fun=gensim.models.KeyedVectors.load_word2vec_format, binary=True, norm=True):
        self.norm = norm
        self.binary = binary
        self.model_fun = model_fun
        self.embeddings = embeddings
        self.ser_model = ser_model_path

        # Equality with False (not truthiness) selects the pickle path.
        build = self.load_model if create == False else self.create_model
        self.model = build()

    def create_model(self):
        """Load the raw embeddings through ``model_fun`` and optionally normalise them."""
        vectors = self.model_fun(self.embeddings, binary=self.binary)
        if not self.norm:
            return vectors
        # NOTE(review): presumably gensim's in-place normalisation - confirm
        # against the installed gensim version.
        vectors.init_sims(replace=True)
        return vectors

    def load_model(self):
        """Unpickle and return the model stored at ``self.ser_model``."""
        # NOTE(review): pickle.load can execute arbitrary code; only point
        # ser_model_path at files you created yourself.
        with open(self.ser_model, 'rb') as handle:
            return pickle.load(handle)

In [180]:
class News_Vectorizer:
    
    def __init__(self, model, news=None):
        self.news = news #array of strings
        self.model = model #Word2Vec model
        if self.news is not None:
            self.news_vectors = self.news2vec(self.news) #vector representations
        else:
            self.news_vectors = None
        self.cos_dist = None #cosine distance matrix
        self.wm_dist = None #wmd-matrix
    
    def wmd(self, q1, q2):
        q1 = str(q1).lower().split()
        q2 = str(q2).lower().split()
        q1 = [w for w in q1 if w not in stop_words]
        q2 = [w for w in q2 if w not in stop_words]
        return self.model.wmdistance(q1, q2)
    
    def sent2vec(self, s):
        words = str(s).lower()
        words = word_tokenize(words)
        words = [w for w in words if not w in stop_words]
        words = [w for w in words if w.isalpha()]
        M = []
        for w in words:
            try:
                M.append(self.model[w])
            except:
                continue
        M = np.array(M)
        v = M.sum(axis=0)
        return v / np.sqrt((v ** 2).sum())
    
    def news2vec(self, news):
        # update self.news, self.news_vectors
        news_vectors = np.array([self.sent2vec(text) for text in news])
        self.news = news
        self.news_vectors = news_vectors
        #if self.news is None:
        #    self.news = news
        #if self.news_vectors is None:
        #    self.news_vectors = news_vectors
        return news_vectors
    
    def dist_vec(self, news_item, news=None, metric='cosine'):
        #computes distances between given item and news (or self.news)
        news_item = self.sent2vec(news_item)
        if news is not None:
            news = self.news2vec(news)
        else:
            news = self.news_vectors
        if news is None:
            return 'no news to compute distances'
        if metric == 'cosine':
            dist_vec = np.array([cosine(news_item, i) for i in news])
        elif metric == 'wmd':
            dist_vec = np.array([self.wmd(news_item, i) for i in news])
        return dist_vec
    
    #TODO: triangle matrix -> optimize
    def cosine_matrix(self): 
        cdist = np.zeros((len(self.news_vectors), len(self.news_vectors)))
        for n, i in enumerate(self.news_vectors):
            for m, j in enumerate(self.news_vectors):
                cdist[n, m] = cosine(i, j)
        self.cos_dist = cdist
        return cdist
    
    def wmd_matrix(self): #list (news)
        wmdist = np.zeros((len(self.news), len(self.news)))
        for n, i in enumerate(self.news):
            for m, j in enumerate(self.news):
                wmdist[n, m] = self.wmd(i, j)
        self.wm_dist = wmdist
        return wmdist

In [42]:
class RSS_Feeds:
    """Downloads RSS feeds and exposes their entries as DataFrames.

    On construction the feeds in ``urls`` are parsed, flattened into
    ``df_news`` (one row per feed entry) and de-duplicated by text into
    ``df_unique_news``.

    Entries or feeds that lack a field no longer abort the whole load.
    Missing strings become ``''`` and missing parsed dates become ``()``.
    """

    def __init__(self, urls):
        self.urls = urls
        self.feeds = self.get_feeds()
        self.df_news = self.create_df()
        self.df_unique_news = self.create_unique()

    def get_feeds(self):
        """Parse every URL in ``self.urls`` with ``feedparser.parse``."""
        return [feedparser.parse(feed) for feed in self.urls]

    def get_category(self, feed):
        """Return the feed title, used as the category (``''`` if absent)."""
        # sources may have different category names - agg categories?
        return feed.feed.get('title', '')

    def get_title_summary(self, feed, sep='. '):
        """Return ``(titles, summaries, joined)`` lists for the entries of ``feed``.

        ``joined`` holds ``title + sep + summary`` per entry.
        """
        titles = [entry.get('title', '') for entry in feed['entries']]
        summaries = [entry.get('summary', '') for entry in feed['entries']]
        title_summary = [t + sep + s for t, s in zip(titles, summaries)]
        return titles, summaries, title_summary

    @staticmethod
    def _parsed_slice(entry, start, stop):
        """Slice ``entry['published_parsed']``; ``()`` when it is missing or empty."""
        parsed = entry.get('published_parsed')
        return parsed[start:stop] if parsed else ()

    def get_date(self, feed):
        """(year, month, day) for each entry in ``feed``."""
        return [self._parsed_slice(entry, 0, 3) for entry in feed['entries']]

    def get_time(self, feed):
        """(hour, min, sec) for each entry in ``feed``."""
        return [self._parsed_slice(entry, 3, 6) for entry in feed['entries']]

    def get_datetime_nparsed(self, feed):
        """Unparsed ``published`` string for each entry in ``feed``."""
        return [entry.get('published', '') for entry in feed['entries']]

    def get_link(self, feed):
        """Link for each entry in ``feed``."""
        return [entry.get('link', '') for entry in feed['entries']]

    def str2hash(self, s):
        """Hex MD5 digest of ``s``; used as the news ID."""
        return hashlib.md5(s.encode()).hexdigest()

    def create_df(self):
        """Flatten all feeds into one DataFrame, store it in ``self.df_news`` and return it.

        Columns: news, category, title, summary, link, date, time, datetime, ID.
        """
        news, title, summary, category, pdate, ptime, fdatetime, links = ([] for _ in range(8))
        for feed in self.feeds:
            titles, summaries, texts = self.get_title_summary(feed)
            news.extend(texts)
            title.extend(titles)
            summary.extend(summaries)
            pdate.extend(self.get_date(feed))
            ptime.extend(self.get_time(feed))
            fdatetime.extend(self.get_datetime_nparsed(feed))
            links.extend(self.get_link(feed))
            # one category label per entry of this feed
            category.extend([self.get_category(feed)] * len(texts))
        df_news = pd.DataFrame({'news': news,
                                'category': category,
                                'title': title,
                                'summary': summary,
                                'link': links,
                                'date': pdate,
                                'time': ptime,
                                'datetime': fdatetime})
        df_news['ID'] = df_news.news.apply(self.str2hash)
        self.df_news = df_news
        return df_news

    def create_unique(self):
        """Collapse ``self.df_news`` to one row per distinct news text.

        Categories are collected into a list; the other columns are reduced
        with ``np.unique``. The result is stored in ``self.df_unique_news``.
        """
        aggregation = {'category': list,
                       'title': np.unique,
                       'summary': np.unique,
                       'link': np.unique,
                       'date': np.unique,
                       'time': np.unique,
                       'datetime': np.unique,
                       'ID': np.unique}
        df_unique_news = self.df_news.groupby('news').agg(aggregation).reset_index()
        self.df_unique_news = df_unique_news
        return df_unique_news

    def get_unique_news(self):
        """Return the distinct news texts as a numpy array."""
        return self.df_unique_news.news.values

In [7]:
#TODO: Weights!
#TODO: not only labels but also distances - to suggest the most interesting items?
class Aggregator:
    """Clusters viewed news vectors and classifies new ones into those clusters.

    Parameters
    ----------
    clusterizer
        Object with ``fit_predict(data)`` returning one label per row.
    classifier
        Object with ``fit(X, y)`` and ``predict(X)``.
    labeled_data
        Already clustered vectors (ndarray), or None.
    labels
        Cluster numbers of ``labeled_data``, or None.
    clust_weights
        DataFrame with columns ``['clust', 'weight']``, or None.
    """

    def __init__(self, clusterizer, classifier, labeled_data=None, labels=None, clust_weights=None):
        self.clusterizer = clusterizer
        self.classifier = classifier
        self.labeled_data = labeled_data  # already clustered viewed, vector representations as ndarray
        self.labels = labels  # clust nums of labeled_data, ndarray
        self.clust_weights = clust_weights  # DataFrame, colnames=['clust', 'weight']

    def clusterize(self, data):
        """Run ``clusterizer.fit_predict`` and return ``(data, labels)``."""
        labels = self.clusterizer.fit_predict(data)
        return data, labels

    def classify(self, new_data):
        """Predict cluster labels for ``new_data``.

        For a single sample reshape the ``sent2vec`` output to ``(1, 300)``
        first. If the classifier raises ``NotFittedError``, the ``repr`` of
        that error is returned instead of labels.
        """
        try:
            return self.classifier.predict(new_data)
        except NotFittedError as e:
            return repr(e)

    def fit_classifier(self):
        """Fit the classifier on the stored labelled data and return it."""
        self.classifier.fit(self.labeled_data, self.labels)
        return self.classifier

    def prep_data(self, new_data=None):
        """Stack the stored labelled data and ``new_data`` row-wise.

        Each input goes through ``pd.DataFrame``. Inputs that are None, or
        that pandas cannot turn into a frame, are left out. Returns the
        stacked values as an ndarray, or None when nothing usable remains.
        """
        frames = []
        for part in (self.labeled_data, new_data):
            if part is None:
                continue
            try:
                frames.append(pd.DataFrame(part))
            except (ValueError, TypeError):
                # not table-like; best effort - ignore this part
                continue
        if not frames:
            return None
        try:
            return pd.concat(frames).values
        except (ValueError, TypeError):
            return None

    def update_weights(self):
        """Recompute cluster weights as the share of labels in each cluster.

        The weights sum to 1 (required in News_Finder). The result is stored
        in ``self.clust_weights`` and returned.
        """
        unique, counts = np.unique(self.labels, return_counts=True)
        weights = counts / counts.sum()  # smth like this
        self.clust_weights = pd.DataFrame({'clust': unique.astype(int), 'weight': weights})
        return self.clust_weights

    def update_aggregator(self, new_data):
        """Re-cluster stored plus new data, refit the classifier, refresh weights.

        Returns ``'updated'``, or ``'no data'`` when there is nothing to cluster.
        """
        data = self.prep_data(new_data=new_data)
        if data is None:
            return 'no data'
        self.labeled_data, self.labels = self.clusterize(data)
        self.fit_classifier()
        self.update_weights()
        return 'updated'
        

In [177]:
class News_Finder:
    """Selects news to show: per category, similar to an item, or by cluster weights.

    Parameters
    ----------
    df_news
        DataFrame with non-viewed news items.
    news_vectorizer
        News_Vectorizer instance used for vectors and distances.
    """

    def __init__(self, df_news, news_vectorizer):
        self.df_news = df_news
        self.df_unique_news = self.create_unique()
        self.news_vectorizer = news_vectorizer

    def update_news(self, df_news):
        """Replace the stored news and rebuild the de-duplicated frame."""
        self.df_news = df_news
        self.df_unique_news = self.create_unique()
        return 'updated'

    def create_unique(self):
        """Return ``self.df_news`` collapsed to one row per ``ID``.

        Categories are gathered into a list; every other column is reduced
        with ``np.unique``.
        """
        reducers = {'news': np.unique,
                    'category': list,
                    'title': np.unique,
                    'summary': np.unique,
                    'link': np.unique,
                    'date': np.unique,
                    'time': np.unique,
                    'datetime': np.unique}
        return self.df_news.groupby('ID').agg(reducers).reset_index()

    def get_from_categories(self, n=5):
        """Return the first ``n`` rows of each category in ``self.df_news``."""
        by_category = self.df_news.groupby('category')
        return by_category.head(n)

    def get_similar(self, news_item, metric='cosine', n=5):
        """Return the ``n`` unique news closest to ``news_item``.

        ``news_item`` must expose ``.ID`` and ``.news``; the item itself is
        excluded by ID. A ``dist`` column holds the computed distances.
        """
        candidates = self.df_unique_news.query('ID != @news_item.ID').copy()
        candidates['dist'] = self.news_vectorizer.dist_vec(
            news_item.news, candidates.news.values, metric=metric)
        return candidates.nsmallest(n, 'dist')

    def get_interesting(self, aggregator, n_min=20):
        """Pick news per cluster in proportion to the aggregator's cluster weights.

        ``aggregator`` is a fitted Aggregator. From each cluster the first
        ``ceil(weight * n_min)`` items are taken, so at most that many per
        cluster are returned.
        """
        # TODO: if there are not enough news in clusters (for weights), return another news?
        # TODO: filter 'outlier' cluster
        # ?TODO: distances to choose the most relevant items in cluster
        pool = self.df_unique_news.copy()
        vectors = self.news_vectorizer.news2vec(pool.news.values)
        weights = aggregator.clust_weights
        pool['label'] = aggregator.classify(vectors)
        quota = np.ceil((aggregator.clust_weights.weight * n_min)).astype(int)
        picked = [pool.query('label == @cluster').head(n)
                  for cluster, n in zip(weights.clust, quota)]
        return pd.concat(picked)

In [103]:
#TODO: delete, (prepare?)
class Data_Manager:
    """Keeps named data objects in memory and loads/saves them from/to disk.

    Parameters
    ----------
    path_dict : dict, optional
        ``{'csv': {obj: path}, 'serialized': {obj: path}}``. When given, the
        listed objects are loaded on construction; otherwise the store
        starts empty.
    """

    def __init__(self, path_dict=None):
        self.path_dict = path_dict
        self.data_dict = self.load_data() if self.path_dict is not None else {}

    def _section(self, key):
        """Return the ``{obj: path}`` mapping under ``key``, or ``{}`` (with a message) if unusable."""
        try:
            return dict(self.path_dict[key])
        except (KeyError, TypeError, ValueError):
            print('something is not ok with "{}" key or it does not exist'.format(key))
            return {}

    def load_data(self):
        """Load every object listed in ``self.path_dict`` and return them as a dict.

        Loading is best effort: a file that fails is reported and skipped,
        and the remaining files are still loaded.
        """
        data_dict = {}
        for obj_name, path in self._section('csv').items():
            try:
                data_dict[obj_name] = pd.read_csv(path, index_col=0)
            except Exception as err:
                print('could not load csv object "{}" from {}: {!r}'.format(obj_name, path, err))
        for obj_name, path in self._section('serialized').items():
            try:
                # NOTE(review): pickle.load can execute arbitrary code; only
                # load files written by save_model of this project.
                with open(path, 'rb') as file:
                    data_dict[obj_name] = pickle.load(file)
            except Exception as err:
                print('could not load serialized object "{}" from {}: {!r}'.format(obj_name, path, err))
        return data_dict

    def delete_old(self, n_recent):
        # del old data, except n_recent (not implemented yet)
        pass

    def prep_data(self):
        # maybe some data manipulations (not implemented yet)
        pass

    def get_data_item(self, obj_name):
        """Return the stored object, or the string ``'Does not exist'``."""
        return self.data_dict.get(obj_name, 'Does not exist')

    def update_data_item(self, obj_name, new_data):
        """Append to or replace a stored object; returns a status string.

        DataFrames are extended with ``pd.concat`` and ndarrays with
        ``np.vstack``. Any other existing object is replaced. Unknown names
        are created.
        """
        # Test membership, not `data != 'Does not exist'`: that comparison is
        # element-wise for DataFrames/ndarrays and raises inside `if`.
        if obj_name not in self.data_dict:
            self.data_dict[obj_name] = new_data
            return 'upd: obj = new_data (obj did not exist yet)'

        data = self.data_dict[obj_name]
        if isinstance(data, pd.DataFrame):
            combine = pd.concat
        elif isinstance(data, np.ndarray):
            combine = np.vstack
        else:
            self.data_dict[obj_name] = new_data
            return 'upd: obj = new_data (not an array or DF)'

        try:
            self.data_dict[obj_name] = combine([data, new_data])
        except Exception:
            # incompatible shapes/types: keep the old object untouched
            return 'could not update'
        return 'updated'

    def save_model(self, data_items='all'):
        """Write stored objects to disk and return ``'saved'``.

        ``data_items`` is ``'all'`` or a list of keys of ``data_dict``.
        DataFrames go to ``<name>.csv``; other objects are pickled to
        ``<name>``.
        """
        if isinstance(data_items, str) and data_items == 'all':
            data_items = list(self.data_dict.keys())
        for obj_name in data_items:
            data = self.data_dict[obj_name]
            if isinstance(data, pd.DataFrame):
                data.to_csv(obj_name + '.csv')
            elif isinstance(data, np.ndarray):
                # NOTE(review): ndarrays are NOT written to disk (unchanged
                # behaviour) - decide on a format before relying on this.
                pass
            else:
                with open(obj_name, 'wb') as file:
                    pickle.dump(data, file)
        return 'saved'

In [99]:
NV = News_Vectorizer(emb.model)

In [100]:
NV.sent2vec('some another sentence to check functionality').shape

(1, 300)

In [181]:
nf = News_Finder(RSS_Feeds(bbc_rss).df_news, News_Vectorizer(emb.model))

In [183]:
nf.get_interesting(aggregator=agg).shape

(231, 300)


(24, 10)

In [186]:
nf.get_similar(viewed.iloc[5,:])

Unnamed: 0,ID,news,category,title,summary,link,date,time,datetime,dist
103,6c2120fc61d2e7af113243c7318a98e9,Carrickfergus: Glen Quinn death may be linked ...,[BBC News - UK],Carrickfergus: Glen Quinn death may be linked ...,The man in his 40s who was found dead in a hou...,https://www.bbc.co.uk/news/uk-northern-ireland...,"(2020, 1, 6)","(11, 51, 41)","Mon, 06 Jan 2020 11:51:41 GMT",0.390763
29,16812d9b25f78b8eb9f9fb82ac130d81,Llanelli deaths: Gary Williams stabbed more th...,[BBC News - UK],Llanelli deaths: Gary Williams stabbed more th...,Gary Williams was found in the same house as J...,https://www.bbc.co.uk/news/uk-wales-51008201,"(2020, 1, 6)","(11, 45, 23)","Mon, 06 Jan 2020 11:45:23 GMT",0.41015
100,638b7feeb816fd935d6ff1073303c970,Ayia Napa: Raab urges Cyprus to 'do the right ...,[BBC News - UK Politics],Ayia Napa: Raab urges Cyprus to 'do the right ...,The foreign secretary says his priority is to ...,https://www.bbc.co.uk/news/uk-50998866,"(2020, 1, 5)","(11, 37, 18)","Sun, 05 Jan 2020 11:37:18 GMT",0.416625
105,700000852b5434e8178145f68806983c,Duffield stabbings: Estranged wife double murd...,[BBC News - UK],Duffield stabbings: Estranged wife double murd...,Helen Hancock and Martin Griffiths were found ...,https://www.bbc.co.uk/news/uk-england-derbyshi...,"(2020, 1, 6)","(12, 50, 54)","Mon, 06 Jan 2020 12:50:54 GMT",0.434796
80,5099789a6ed6326a9afb2020809aff32,"Katherine Jenkins mugging: Girl, 15, pleads gu...","[BBC News - Home, BBC News - UK, BBC News - En...","Katherine Jenkins mugging: Girl, 15, pleads gu...",The Welsh mezzo-soprano was attacked on her wa...,https://www.bbc.co.uk/news/entertainment-arts-...,"(2020, 1, 6)","(14, 36, 0)","Mon, 06 Jan 2020 14:36:00 GMT",0.437851


In [189]:
viewed.iloc[5,:]

news        Sunken chest syndrome: 'I'm being strangled in...
category                                        BBC News - UK
link        https://www.bbc.co.uk/news/uk-england-devon-50...
date                                             (2020, 1, 5)
time                                               (0, 1, 15)
datetime                        Sun, 05 Jan 2020 00:01:15 GMT
ID                           e93f24321fa5c27e2da2a1dce5880b2c
label                                                       3
Name: 81, dtype: object

In [90]:
nf.update_news(RSS_Feeds(bbc_rss).df_news)

'updated df_news'

In [101]:
# News_Finder exposes get_similar(news_item, metric='cosine', n=5) and uses
# its own vectorizer; there is no `similar` method.
nf.get_similar(df_news.iloc[0, :])

Unnamed: 0,ID,news,category,title,summary,link,date,time,datetime,dist
232,ff625e6352a576d18ff6e3b74fba4772,Trump: US killed Soleimani to 'stop a war' wit...,[BBC News - World],Trump: US killed Soleimani to 'stop a war' wit...,US President Trump gives a statement after ord...,https://www.bbc.co.uk/news/world-us-canada-509...,"(2020, 1, 3)","(20, 36, 2)","Fri, 03 Jan 2020 20:36:02 GMT",0.260237
47,2ac8479941adf0e08bd422461ba37488,The Papers: White House threat and PM's Iran '...,"[BBC News - Home, BBC News - UK]",The Papers: White House threat and PM's Iran '...,Monday's papers report Boris Johnson's respons...,https://www.bbc.co.uk/news/blogs-the-papers-51...,"(2020, 1, 6)","(5, 31, 25)","Mon, 06 Jan 2020 05:31:25 GMT",0.344058
26,139c10e05a94c3fe747ea8d9e15c54ef,Qasem Soleimani: Dominic Raab says he found ou...,"[BBC News - UK, BBC News - UK Politics]",Qasem Soleimani: Dominic Raab says he found ou...,The UK was not warned about US plans to assass...,https://www.bbc.co.uk/news/uk-politics-50998327,"(2020, 1, 5)","(10, 51, 10)","Sun, 05 Jan 2020 10:51:10 GMT",0.363699
18,0ea768c5fb8b354e583f8f622b4643cd,Qasem Soleimani: Boris Johnson not told about ...,[BBC News - UK Politics],Qasem Soleimani: Boris Johnson not told about ...,The UK prime minister was not told in advance ...,https://www.bbc.co.uk/news/uk-politics-50981719,"(2020, 1, 3)","(18, 36, 42)","Fri, 03 Jan 2020 18:36:42 GMT",0.365677
153,a90599fc7f044e800c2e54ec0de674b2,Trump threatens Iraq with sanctions if US troo...,[BBC News - World],Trump threatens Iraq with sanctions if US troo...,"The US president warns Iraq of sanctions ""like...",https://www.bbc.co.uk/news/world-middle-east-5...,"(2020, 1, 6)","(8, 14, 0)","Mon, 06 Jan 2020 08:14:00 GMT",0.384127


In [8]:
rss = RSS_Feeds(bbc_rss)

In [48]:
nitem = rss.df_news.iloc[0,:]

In [50]:
nitem.ID

'0c286cdcba4aa9236cd3f53a92ee998e'

In [51]:
df_news = rss.df_news

In [55]:
df_news.shape

(285, 9)

In [66]:
u.shape

(221, 9)

In [63]:
nv = News_Vectorizer(emb.model)

In [65]:
nv.dist_vec(nitem.news, u.news.values).shape

(221,)

In [9]:
rss.df_news.head()

Unnamed: 0,news,category,link,date,time,datetime,ID
0,Soleimani: Huge crowds pack Tehran for command...,BBC News - Home,https://www.bbc.co.uk/news/world-middle-east-5...,"(2020, 1, 6)","(9, 15, 55)","Mon, 06 Jan 2020 09:15:55 GMT",0c286cdcba4aa9236cd3f53a92ee998e
1,Qasem Soleimani: Boris Johnson and European le...,BBC News - Home,https://www.bbc.co.uk/news/uk-51004218,"(2020, 1, 6)","(9, 10, 11)","Mon, 06 Jan 2020 09:10:11 GMT",869ca611e8bca9be85f962c4b4ae800d
2,Golden Globes 2020: British stars have golden ...,BBC News - Home,https://www.bbc.co.uk/news/entertainment-arts-...,"(2020, 1, 6)","(10, 46, 33)","Mon, 06 Jan 2020 10:46:33 GMT",c8fe7150ff26a9694a0088591a7217d9
3,FTSE chief executives 'earn average salary wit...,BBC News - Home,https://www.bbc.co.uk/news/business-51000217,"(2020, 1, 6)","(0, 1, 2)","Mon, 06 Jan 2020 00:01:02 GMT",2f6690ebc6a03a99e8506dafb60326aa
4,New car registrations at lowest level since 20...,BBC News - Home,https://www.bbc.co.uk/news/business-50985412,"(2020, 1, 6)","(9, 20, 26)","Mon, 06 Jan 2020 09:20:26 GMT",fc4662a7a224c31ba1ae54efb5a3820e


In [10]:
emb = Embeddings()

In [24]:
nv = News_Vectorizer(emb.model)

In [28]:
nv.news2vec(rss.get_unique_news())

array([[-0.02345757,  0.02462658,  0.04443118, ..., -0.11020525,
         0.04113361,  0.03566668],
       [ 0.07758831,  0.0578831 ,  0.01188256, ..., -0.04717001,
         0.08506233, -0.01283889],
       [ 0.0482512 , -0.00875419,  0.0113892 , ...,  0.00814444,
         0.03839172, -0.09693049],
       ...,
       [ 0.06636593,  0.08467283, -0.05472225, ..., -0.04520165,
         0.0150347 ,  0.01927825],
       [ 0.02927748,  0.02805086, -0.05400386, ..., -0.02469139,
         0.07916237, -0.03276701],
       [ 0.08161105,  0.04468193,  0.07157842, ..., -0.02819903,
         0.02868958,  0.04304279]], dtype=float32)

In [29]:
nv.news_vectors.shape

(10, 300)

In [104]:
manager = Data_Manager()

In [105]:
manager = Data_Manager(path_dict={'csv':{'news':'news.csv', 'viewed':'viewed.csv'}, 'serialized':{'classifier':'classifier', 'clusterizer':'clusterizer'}})

In [110]:
manager.get_data_item('classifier')

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='cosine',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='distance')

In [115]:
new_data = nv.news2vec(rss.get_unique_news())

In [117]:
labeled_data = nv.news2vec(manager.get_data_item('viewed').news.values)

In [119]:
labels = manager.get_data_item('viewed').label

In [123]:
labels

0      2
1      1
2      1
48     2
49     4
81     3
110    1
111    1
149    1
150    1
193    0
195    5
213    1
214    0
215    4
232    6
233    0
234    1
259    4
260    4
281    0
282    3
283    0
Name: label, dtype: int64

In [124]:
agg = Aggregator(manager.get_data_item('clusterizer'),
                 manager.get_data_item('classifier'), labeled_data, labels)

In [125]:
agg.update_weights()

Unnamed: 0,clust,weight
0,0,0.217391
1,1,0.347826
2,2,0.086957
3,3,0.086957
4,4,0.173913
5,5,0.043478
6,6,0.043478


In [153]:
weights = agg.clust_weights

In [166]:
n_from_clust = np.ceil((agg.clust_weights.weight*20)).astype(int)

In [147]:
for i, j in zip(agg.clust_weights.clust, n_from_clust):
    print(i, j)

0 4
1 2
2 2
3 1
4 2
5 1
6 1


In [167]:
dflist = []
for cluster, n in zip(weights.clust, n_from_clust):
    dflist.append(viewed.query('label == @cluster').head(n))

In [170]:
viewed.shape

(23, 8)

In [169]:
pd.concat(dflist).shape

(19, 8)

In [150]:
viewed = manager.get_data_item('viewed')

Unnamed: 0,news,category,link,date,time,datetime,ID,label
0,Soleimani assassination: Mourners flood the st...,BBC News - Home,https://www.bbc.co.uk/news/world-middle-east-5...,"(2020, 1, 5)","(9, 15, 6)","Sun, 05 Jan 2020 09:15:06 GMT",92b52890a83cf10f253652b35a697c9e,2
1,Qasem Soleimani: Raab urges Iran to take diplo...,BBC News - Home,https://www.bbc.co.uk/news/uk-politics-50996630,"(2020, 1, 5)","(10, 30, 15)","Sun, 05 Jan 2020 10:30:15 GMT",e20e8caf41be90713ac71dd95bfb9b87,1
2,"HS2 costs out of control, says review's deputy...",BBC News - Home,https://www.bbc.co.uk/news/business-50995116,"(2020, 1, 5)","(10, 2, 23)","Sun, 05 Jan 2020 10:02:23 GMT",4d0d1202cdfc4539394138a76d7100a1,1
48,Qasem Soleimani: Mourning begins in Iran. The ...,BBC News - World,https://www.bbc.co.uk/news/world-middle-east-5...,"(2020, 1, 5)","(8, 25, 15)","Sun, 05 Jan 2020 08:25:15 GMT",b5f8537288db3951cc22f3cbac30d8aa,2
49,Australia bushfires: Fundraiser reaches A$20m ...,BBC News - World,https://www.bbc.co.uk/news/world-australia-509...,"(2020, 1, 5)","(7, 24, 8)","Sun, 05 Jan 2020 07:24:08 GMT",b8c7c52984d793c0ab9b94b75e872365,4
81,Sunken chest syndrome: 'I'm being strangled in...,BBC News - UK,https://www.bbc.co.uk/news/uk-england-devon-50...,"(2020, 1, 5)","(0, 1, 15)","Sun, 05 Jan 2020 00:01:15 GMT",e93f24321fa5c27e2da2a1dce5880b2c,3
193,'Outdated' IT leaves NHS staff with 15 differe...,BBC News - Health,https://www.bbc.co.uk/news/health-50972123,"(2020, 1, 4)","(0, 55, 57)","Sat, 04 Jan 2020 00:55:57 GMT",86c0f0b33e76adfe1b16a8c43fa67392,0
195,Welsh hospitals: Pest control called for rats ...,BBC News - Health,https://www.bbc.co.uk/news/uk-wales-50737114,"(2020, 1, 5)","(1, 37, 36)","Sun, 05 Jan 2020 01:37:36 GMT",59e61dd3214326a7d6fa3a155f4e6468,5
214,'Confusion' over Welsh free childcare scheme. ...,BBC News - Family & Education,https://www.bbc.co.uk/news/uk-wales-politics-5...,"(2020, 1, 5)","(1, 30, 35)","Sun, 05 Jan 2020 01:30:35 GMT",0d1dd500cc332d542401d14edcd8acc5,0
215,Violinist Nicola Benedetti launches music work...,BBC News - Family & Education,https://www.bbc.co.uk/news/uk-scotland-glasgow...,"(2020, 1, 4)","(16, 26, 36)","Sat, 04 Jan 2020 16:26:36 GMT",4cf5cd61a4618f70f88db44fcc666b85,4


In [128]:
agg.update_aggregator(new_data)

'updated'

In [133]:
np.unique(agg.labels)

array([0, 1, 2, 3, 4, 5, 6], dtype=int64)

In [252]:
manager.update_data_item('news', rss.df_news)

'upd: obj = new_data (obj did not exist yet)'

In [254]:
manager.get_data_item('news').head()

Unnamed: 0,news,category,link,date,time,datetime,ID
0,Qasem Soleimani: PM 'will not lament' Iranian ...,BBC News - Home,https://www.bbc.co.uk/news/uk-51001236,"(2020, 1, 5)","(19, 5, 52)","Sun, 05 Jan 2020 19:05:52 GMT",c6feecfbdefd901ce4b7b404f50afa9c
1,Iran rolls back nuclear deal commitments. Iran...,BBC News - Home,https://www.bbc.co.uk/news/world-middle-east-5...,"(2020, 1, 5)","(19, 8, 1)","Sun, 05 Jan 2020 19:08:01 GMT",11ce682d0bf7650eef652a43d4fdf2da
2,Qasem Soleimani: Mourning begins in Iran. The ...,BBC News - Home,https://www.bbc.co.uk/news/world-middle-east-5...,"(2020, 1, 5)","(8, 25, 15)","Sun, 05 Jan 2020 08:25:15 GMT",b5f8537288db3951cc22f3cbac30d8aa
3,Labour leadership: Contenders set out stalls o...,BBC News - Home,https://www.bbc.co.uk/news/uk-politics-50996799,"(2020, 1, 5)","(15, 59, 9)","Sun, 05 Jan 2020 15:59:09 GMT",0ea3549d868732a9fd67f59f374bed69
4,Finsbury Park stabbing: Manhunt as killed Deli...,BBC News - Home,https://www.bbc.co.uk/news/uk-england-london-5...,"(2020, 1, 5)","(15, 36, 53)","Sun, 05 Jan 2020 15:36:53 GMT",3511d139f23538e5b9fada328ccbbb3f


In [255]:
manager.save_model()

'saved'

In [274]:
aggregator = Aggregator(agg_clust, knn)
# News_Vectorizer is defined as (model, news=None): passing the texts
# positionally together with model=... would give `model` twice.
nv = News_Vectorizer(emb.model, news=df_viewed.news.values)

In [275]:
aggregator.update_aggregator(nv.news_vectors)

'updated'

In [276]:
aggregator.labels

array([2, 1, 1, 2, 4, 3, 1, 1, 1, 1, 0, 5, 1, 0, 4, 6, 0, 1, 4, 4, 0, 3,
       0], dtype=int64)

In [277]:
df_viewed['label'] = aggregator.labels

In [279]:
manager.update_data_item('viewed', df_viewed)

'upd: obj = new_data (obj did not exist yet)'

In [280]:
manager.update_data_item('classifier', aggregator.classifier)

'upd: obj = new_data (not an array or DF)'

In [281]:
manager.update_data_item('clusterizer', aggregator.clusterizer)

'upd: obj = new_data (obj did not exist yet)'

In [283]:
manager.update_data_item('weights')

dict_keys(['news', 'classifier', 'viewed', 'clusterizer'])

In [284]:
manager.save_model()

'saved'

In [286]:
manager.data_dict.keys()

dict_keys(['news', 'viewed', 'classifier', 'clusterizer'])

In [43]:
def from_categories(df_news, n=5):
    """Return the first ``n`` rows of every ``category`` group of ``df_news``."""
    per_category = df_news.groupby('category')
    return per_category.head(n)

In [143]:
def show_news(df_chosen_news):
    """Print, category by category, the news/link/datetime/ID columns of the given frame."""
    shown_columns = ['news', 'link', 'datetime', 'ID']
    for cat in df_chosen_news.category.unique():
        print('Category: ' + cat)
        print(df_chosen_news.query('category == @cat')[shown_columns])

In [74]:
show_news(from_categories(rss.df_news))

Category: BBC News - Home
                                                news  \
0  Trump says US ready to strike 52 Iranian sites...   
1  Qasem Soleimani: Royal Navy to protect UK ship...   
2  HS2 costs out of control, says review's deputy...   
3  Australia bushfires: Fundraiser reaches A$20m ...   
4  Sunken chest syndrome: 'I'm being strangled in...   

                                                link  \
0  https://www.bbc.co.uk/news/world-middle-east-5...   
1    https://www.bbc.co.uk/news/uk-politics-50996630   
2       https://www.bbc.co.uk/news/business-50995116   
3  https://www.bbc.co.uk/news/world-australia-509...   
4  https://www.bbc.co.uk/news/uk-england-devon-50...   

                        datetime  
0  Sun, 05 Jan 2020 06:38:58 GMT  
1  Sun, 05 Jan 2020 09:04:02 GMT  
2  Sun, 05 Jan 2020 00:42:42 GMT  
3  Sun, 05 Jan 2020 07:24:08 GMT  
4  Sun, 05 Jan 2020 00:01:15 GMT  
Category: BBC News - World
                                                 news  \
47  Tr

In [156]:
def append_viewed_news(news_id, category, df_news, df_viewed):
    """Add the news row matching ``news_id`` and ``category`` to the viewed frame.

    If the ID is already among the viewed news, ``df_viewed`` is returned
    unchanged; otherwise a new concatenated frame is returned.
    """
    if news_id in df_viewed.ID.values:
        return df_viewed
    matching = df_news.query('ID == @news_id and category == @category')
    return pd.concat([df_viewed, matching])

In [207]:
def subset_not_viewed(df_news, df_viewed):
    """Return the rows of ``df_news`` whose ``ID`` does not occur in ``df_viewed.ID``."""
    already_viewed = df_news.ID.isin(df_viewed.ID.values)
    return df_news[~already_viewed]

In [167]:
sample = from_categories(rss.df_news, 3)[['ID', 'category']]
df_viewed = pd.DataFrame(columns=rss.df_news.columns)

In [172]:
sample.ID.nunique()

23

In [173]:
emb = Embeddings()

In [177]:
agg_clust = AgglomerativeClustering(n_clusters=7, affinity='cosine', linkage='complete')

In [178]:
knn = KNeighborsClassifier(n_neighbors=3, weights='distance', metric='cosine')

In [182]:
aggregator = Aggregator(clusterizer=agg_clust, classifier=knn)

In [183]:
# News_Vectorizer is defined as (model, news=None): the model comes first.
nv = News_Vectorizer(emb.model, df_viewed.news.values)

In [184]:
aggregator.update_aggregator(nv.news_vectors)

'updated'

In [192]:
news_to_class = [nv.sent2vec(i) for i in rss.df_news.groupby('category').head(10).iloc[7:9,:].news.values]

In [193]:
aggregator.classify(news_to_class)

array([4, 3], dtype=int64)

In [196]:
aggregator.labeled_data.shape

(23, 300)

In [158]:
rss.df_news.query('ID == @nid')

Unnamed: 0,news,category,link,date,time,datetime,ID
0,Soleimani assassination: Mourners flood the st...,BBC News - Home,https://www.bbc.co.uk/news/world-middle-east-5...,"(2020, 1, 5)","(9, 15, 6)","Sun, 05 Jan 2020 09:15:06 GMT",92b52890a83cf10f253652b35a697c9e
47,Soleimani assassination: Mourners flood the st...,BBC News - World,https://www.bbc.co.uk/news/world-middle-east-5...,"(2020, 1, 5)","(9, 15, 6)","Sun, 05 Jan 2020 09:15:06 GMT",92b52890a83cf10f253652b35a697c9e


In [160]:
df_viewed = append_viewed_news(nid, 'BBC News - World', rss.df_news, df_viewed)

In [33]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
iris = load_iris()

In [171]:
sent = nv.sent2vec('Fresh Cambridge Analytica leak shows global manipulation is out of control. Company’s work in 68 countries laid bare with release of more than 100,000 documents')