In [1]:
import feedparser
import pandas as pd
import gensim
from gensim.models import Word2Vec
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk import word_tokenize
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
stop_words = stopwords.words('english')
from pyemd import emd
from scipy.cluster.hierarchy import fclusterdata
from sklearn.cluster import DBSCAN, AffinityPropagation, AgglomerativeClustering
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
import pickle
import time
import hashlib
from sklearn.exceptions import NotFittedError

#from itertools import combinations
#from tqdm import tqdm_notebook
#from scipy.stats import skew, kurtosis
#from sklearn.preprocessing import StandardScaler
#from sklearn.decomposition import PCA

In [2]:
# BBC News RSS endpoints: the front-page feed first, then one feed per section.
# Every URL shares the same prefix and ends in 'rss.xml'; only the section path differs.
bbc_rss = ['http://feeds.bbci.co.uk/news/' + section_path + 'rss.xml'
           for section_path in ('',
                                'world/',
                                'uk/',
                                'business/',
                                'politics/',
                                'health/',
                                'education/',
                                'science_and_environment/',
                                'technology/',
                                'entertainment_and_arts/')]

In [58]:
class Embeddings:
    """Holder for a word-embedding model that is either unpickled or built from raw vectors.

    Args:
        create: when equal to ``False`` (the default) the model is unpickled from
            ``ser_model_path``; any other value builds it via ``model_fun``.
        ser_model_path: path of the pickled model read by ``load_model``.
        embeddings: path handed to ``model_fun`` by ``create_model``.
        model_fun: callable invoked as ``model_fun(embeddings, binary=binary)``.
        binary: forwarded to ``model_fun`` as the ``binary`` keyword.
        norm: when truthy, ``init_sims(replace=True)`` is called on the freshly built
            model (presumably normalising the vectors in place; confirm against
            the installed gensim version).

    The resulting model is available as ``self.model``.
    """

    def __init__(self, create=False, ser_model_path='W2VModel',
                 embeddings='GoogleNews-vectors-negative300.bin.gz',
                 model_fun=gensim.models.KeyedVectors.load_word2vec_format, binary=True, norm=True):
        self.ser_model, self.embeddings = ser_model_path, embeddings
        self.model_fun, self.binary, self.norm = model_fun, binary, norm

        # Only a value equal to False selects the pickled model; everything else rebuilds it.
        obtain_model = self.load_model if create == False else self.create_model
        self.model = obtain_model()

    def create_model(self):
        """Build the model with ``model_fun`` and optionally call ``init_sims`` on it."""
        vectors = self.model_fun(self.embeddings, binary=self.binary)
        if not self.norm:
            return vectors
        vectors.init_sims(replace=True)
        return vectors

    def load_model(self):
        """Unpickle and return the model stored at ``self.ser_model``.

        NOTE(security): ``pickle.load`` can execute arbitrary code contained in the
        file, so only point ``ser_model_path`` at files from a trusted source.
        """
        with open(self.ser_model, 'rb') as handle:
            return pickle.load(handle)

In [57]:
# ? TODO: dist_matrix(x,y) from two news vectors (if we want some classification based only on distances, + use wmd!)
class News_Vectorizer:
    """Vectorise news texts with a word-embedding model and measure distances between them.

    A text is represented by the sum of the embeddings of its alphabetic,
    non-stop-word tokens, scaled to unit Euclidean length.

    Attributes:
        news: the texts most recently passed to ``news2vec`` (or ``None``).
        model: embedding model; must support ``model[word]`` and ``wmdistance``.
        news_vectors: array with one vector per entry of ``news`` (or ``None``).
        cos_dist: last matrix returned by ``cosine_matrix`` (or ``None``).
        wm_dist: last matrix returned by ``wmd_matrix`` (or ``None``).
    """

    def __init__(self, model, news=None):
        self.news = news  # array of strings
        self.model = model  # Word2Vec model
        if self.news is not None:
            self.news_vectors = self.news2vec(self.news)  # vector representations
        else:
            self.news_vectors = None
        self.cos_dist = None  # cosine distance matrix
        self.wm_dist = None  # wmd-matrix

    def wmd(self, q1, q2):
        """Return ``model.wmdistance`` between two raw texts.

        Both texts are lower-cased, split on whitespace and stripped of stop words
        before being handed to the model.
        """
        q1 = [w for w in str(q1).lower().split() if w not in stop_words]
        q2 = [w for w in str(q2).lower().split() if w not in stop_words]
        return self.model.wmdistance(q1, q2)

    def sent2vec(self, s):
        """Return the unit-length sum of the word vectors of ``s``.

        Tokens that are stop words, non-alphabetic, or missing from the model are
        ignored. When no token contributes a vector (or the sum has zero length),
        a zero vector is returned instead of NaN so that callers can still stack
        the results into a rectangular array. Its length is the model's
        ``vector_size`` when that attribute exists, otherwise 0.
        """
        words = word_tokenize(str(s).lower())
        words = [w for w in words if w not in stop_words and w.isalpha()]
        found = []
        for w in words:
            try:
                found.append(self.model[w])
            except KeyError:
                # out-of-vocabulary token: skip it
                continue
        if not found:
            return np.zeros(getattr(self.model, 'vector_size', 0))
        v = np.array(found).sum(axis=0)
        length = np.sqrt((v ** 2).sum())
        if length == 0:
            # avoid 0/0 -> NaN
            return v
        return v / length

    def news2vec(self, news):
        """Vectorise every text in ``news``.

        Side effect: replaces ``self.news`` and ``self.news_vectors`` with the new
        texts and their vectors. Returns the array of vectors.
        """
        news_vectors = np.array([self.sent2vec(text) for text in news])
        self.news = news
        self.news_vectors = news_vectors
        return news_vectors

    def dist_vec(self, news_item, news=None, metric='cosine'):
        """Compute distances between one text and a collection of texts.

        Args:
            news_item: raw text of the reference item.
            news: texts to compare against. When given they are vectorised with
                ``news2vec`` (which also updates ``self.news`` / ``self.news_vectors``);
                when ``None`` the previously stored texts/vectors are used.
            metric: ``'cosine'`` (distance between sentence vectors) or ``'wmd'``
                (word mover's distance between the raw texts).

        Returns:
            A 1-D array with one distance per text, or the string
            ``'no news to compute distances'`` when there is nothing to compare with.

        Raises:
            ValueError: if ``metric`` is neither ``'cosine'`` nor ``'wmd'``.
        """
        if metric not in ('cosine', 'wmd'):
            raise ValueError("unknown metric %r; expected 'cosine' or 'wmd'" % (metric,))
        if news is not None:
            texts = news
            vectors = self.news2vec(news)
        else:
            texts = self.news
            vectors = self.news_vectors
        if vectors is None:
            return 'no news to compute distances'
        if metric == 'cosine':
            item_vector = self.sent2vec(news_item)
            return np.array([cosine(item_vector, vec) for vec in vectors])
        # metric == 'wmd': word mover's distance works on the words themselves,
        # so it must receive the raw texts rather than their embedding vectors.
        if texts is None:
            return 'no news to compute distances'
        return np.array([self.wmd(news_item, text) for text in texts])

    def cosine_matrix(self):
        """Return (and cache in ``self.cos_dist``) all pairwise cosine distances of ``self.news_vectors``."""
        cdist = np.zeros((len(self.news_vectors), len(self.news_vectors)))
        for n, i in enumerate(self.news_vectors):
            for m, j in enumerate(self.news_vectors):
                cdist[n, m] = cosine(i, j)
        self.cos_dist = cdist
        return cdist

    def wmd_matrix(self):  # list (news)
        """Return (and cache in ``self.wm_dist``) all pairwise word mover's distances of ``self.news``."""
        wmdist = np.zeros((len(self.news), len(self.news)))
        for n, i in enumerate(self.news):
            for m, j in enumerate(self.news):
                wmdist[n, m] = self.wmd(i, j)
        self.wm_dist = wmdist
        return wmdist

In [56]:
class RSS_Feeds:
    """Fetch a list of RSS feeds and expose their entries as pandas DataFrames.

    On construction all ``urls`` are parsed with ``feedparser`` and two frames are built:
    ``df_news`` (one row per feed entry) and ``df_unique_news`` (one row per
    distinct news text, with the categories it appeared in collected into a list).
    """

    def __init__(self, urls):
        self.urls = urls
        self.feeds = self.get_feeds()
        self.df_news = self.create_df()
        self.df_unique_news = self.create_unique()

    def get_feeds(self):
        """Parse every URL in ``self.urls`` and return the parsed feeds in the same order."""
        parsed_feeds = []
        for url in self.urls:
            parsed_feeds.append(feedparser.parse(url))
        return parsed_feeds

    def get_category(self, feed):
        """Return the feed's title (used as category name), or '' when it has none."""
        # sources may have different category names - agg categories?
        feed_header = feed.feed
        return feed_header.get('title', '')

    def get_title_summary(self, feed, sep='. '):
        """Return (titles, summaries, title + sep + summary) lists for the feed's entries."""
        entries = feed['entries']
        titles = [item['title'] for item in entries]
        summaries = [item['summary'] for item in entries]
        title_summary = [head + sep + body for head, body in zip(titles, summaries)]
        return titles, summaries, title_summary

    def get_date(self, feed):
        """Return the first three fields of 'published_parsed' (year, month, day) per entry."""
        dates = []
        for item in feed['entries']:
            dates.append(item['published_parsed'][:3])
        return dates

    def get_time(self, feed):
        """Return fields 3..5 of 'published_parsed' (hour, min, sec) per entry."""
        times = []
        for item in feed['entries']:
            times.append(item['published_parsed'][3:6])
        return times

    def get_datetime_nparsed(self, feed):
        """Return the unparsed 'published' string of each entry."""
        stamps = []
        for item in feed['entries']:
            stamps.append(item['published'])
        return stamps

    def get_link(self, feed):
        """Return the 'link' of each entry."""
        urls = []
        for item in feed['entries']:
            urls.append(item['link'])
        return urls

    def str2hash(self, s):
        """Return the hexadecimal MD5 digest of ``s`` (encoded with the default codec)."""
        hasher = hashlib.md5()
        hasher.update(s.encode())
        return hasher.hexdigest()

    def create_df(self):
        """Build, store in ``self.df_news`` and return the per-entry frame.

        Columns: news, category, title, summary, link, date, time, datetime, ID,
        where ID is the MD5 hash of the 'news' text (title + '. ' + summary).
        """
        # insertion order of this dict fixes the column order of the frame
        columns = {'news': [], 'category': [], 'title': [], 'summary': [],
                   'link': [], 'date': [], 'time': [], 'datetime': []}
        for feed in self.feeds:
            feed_category = self.get_category(feed)
            feed_titles, feed_summaries, feed_texts = self.get_title_summary(feed)
            feed_dates = self.get_date(feed)
            feed_times = self.get_time(feed)
            feed_stamps = self.get_datetime_nparsed(feed)
            feed_links = self.get_link(feed)

            columns['news'].extend(feed_texts)
            columns['title'].extend(feed_titles)
            columns['summary'].extend(feed_summaries)
            columns['date'].extend(feed_dates)
            columns['time'].extend(feed_times)
            columns['datetime'].extend(feed_stamps)
            columns['link'].extend(feed_links)
            # repeat the feed's category once per entry
            columns['category'].extend(np.resize([feed_category], len(feed_texts)))

        df_news = pd.DataFrame(columns)
        df_news['ID'] = df_news['news'].apply(self.str2hash)
        self.df_news = df_news
        return df_news

    def create_unique(self):
        """Group ``self.df_news`` by news text; store and return the result.

        'category' values are collected into a list, every other column is
        aggregated with ``np.unique``.
        """
        aggregations = {'category': list}
        for column in ('title', 'summary', 'link', 'date', 'time', 'datetime', 'ID'):
            aggregations[column] = np.unique
        df_unique_news = self.df_news.groupby('news').agg(aggregations).reset_index()
        self.df_unique_news = df_unique_news
        return df_unique_news

    def get_unique_news(self):
        """Return the distinct news texts as an array."""
        return self.df_unique_news['news'].values

In [55]:
#TODO: Weights!
#TODO: Classifier, Clusterizer -> with outlier clusters
#TODO: number of clusters - determine dynamically? Or choose another clusterizer...
# ? TODO: not only labels but also distances - to suggest the most interesting items?
class Aggregator:
    """Couple a clusterer and a classifier over the vectors of already viewed news.

    Attributes:
        clusterizer: object exposing ``fit_predict(data)``.
        classifier: object exposing ``fit(X, y)`` and ``predict(X)``.
        labeled_data: vectors of the already clustered (viewed) news, ndarray or ``None``.
        labels: cluster number of each row of ``labeled_data``, ndarray or ``None``.
        clust_weights: DataFrame with columns ``['clust', 'weight']`` or ``None``.
    """

    def __init__(self, clusterizer, classifier, labeled_data=None, labels=None, clust_weights=None):
        self.clusterizer = clusterizer
        self.classifier = classifier
        self.labeled_data = labeled_data  # already clustered viewed, vector representations as ndarray
        self.labels = labels  # clust nums of labeled_data, ndarray
        self.clust_weights = clust_weights  # DataFrame, colnames=['clust', 'weight']

    def clusterize(self, data):
        """Cluster ``data`` with ``clusterizer.fit_predict`` and return ``(data, labels)``."""
        labels = self.clusterizer.fit_predict(data)
        return data, labels

    def classify(self, new_data):  # if one sample: reshape sent2vec output to (1, 300)
        """Predict cluster labels for ``new_data``.

        Returns the classifier's prediction, or the ``repr`` of the
        ``NotFittedError`` (a string) when the classifier has not been fitted yet.
        """
        try:
            return self.classifier.predict(new_data)
        except NotFittedError as e:
            return(repr(e))

    def fit_classifier(self):
        """Fit the classifier on ``labeled_data`` / ``labels`` and return it."""
        self.classifier.fit(self.labeled_data, self.labels)
        return self.classifier

    def prep_data(self, new_data=None):
        """Stack ``labeled_data`` and ``new_data`` row-wise into one array.

        Parts that are ``None`` or cannot be turned into a DataFrame are skipped
        (best effort). Returns ``None`` when nothing usable remains or the
        concatenation fails.
        """
        if self.labeled_data is None and new_data is None:
            return None
        frames = []
        for part in (self.labeled_data, new_data):
            if part is None:
                continue
            try:
                frames.append(pd.DataFrame(part))
            except Exception:
                # best effort: ignore a part that is not tabular
                continue
        try:
            return pd.concat(frames).values
        except Exception:
            # e.g. no usable part left
            return None

    def update_weights(self):  # sum weights = 1 required in News_Finder
        """Recompute ``clust_weights`` as each cluster's share of ``labels`` and return it."""
        unique, counts = np.unique(self.labels, return_counts=True)
        self.clust_weights = pd.DataFrame({'clust': unique.astype(int),
                                           'weight': counts / counts.sum()})
        return self.clust_weights

    def update_aggregator(self, new_data):
        """Re-cluster all known data plus ``new_data``, refit the classifier and update the weights.

        Returns ``'updated'`` on success or ``'no data'`` when there is nothing to cluster.
        Note that every row is re-clustered, so labels of earlier rows may change.
        """
        data = self.prep_data(new_data=new_data)
        if data is None:
            return 'no data'
        self.labeled_data, self.labels = self.clusterize(data)
        self.fit_classifier()
        self.update_weights()
        return 'updated'
        

In [7]:
class News_Finder():
    """Select news to show: top items per category, items similar to one, or 'interesting' items.

    Args:
        df_news: DataFrame with the not-yet-viewed news items.
        news_vectorizer: ``News_Vectorizer`` instance used for vectors and distances.
    """

    def __init__(self, df_news, news_vectorizer):
        self.df_news = df_news
        self.df_unique_news = self.create_unique()
        self.news_vectorizer = news_vectorizer

    def update_news(self, df_news):
        """Replace the news frame, rebuild the de-duplicated frame and return 'updated'."""
        self.df_news = df_news
        self.df_unique_news = self.create_unique()
        return 'updated'

    def create_unique(self):
        """Return ``self.df_news`` grouped by 'ID'.

        'category' values are collected into a list; every other column is
        aggregated with ``np.unique``. 'ID' becomes a regular column again.
        """
        # key order determines the column order of the result
        aggregations = {column: np.unique
                        for column in ('news', 'category', 'title', 'summary',
                                       'link', 'date', 'time', 'datetime')}
        aggregations['category'] = list
        grouped = self.df_news.groupby('ID').agg(aggregations)
        return grouped.reset_index()

    def get_from_categories(self, n=5):
        """Return the first ``n`` rows of every category of ``self.df_news``."""
        per_category = self.df_news.groupby('category')
        return per_category.head(n)

    def get_similar(self, news_item, metric='cosine', n=5):
        """Return the ``n`` unique news items closest to ``news_item`` (itself excluded).

        The distance to ``news_item`` is added as column 'dist'.
        """
        all_news = self.df_unique_news.query('ID != @news_item.ID').copy()
        all_news['dist'] = self.news_vectorizer.dist_vec(news_item.news,
                                                         all_news.news.values,
                                                         metric=metric)
        return all_news.nsmallest(n, 'dist')

    def get_interesting(self, aggregator, n=20):
        """Pick news per predicted cluster, proportionally to the aggregator's cluster weights.

        ``aggregator`` must be a fitted ``Aggregator``. For each cluster,
        ``ceil(weight * n)`` items carrying that predicted label are taken, so the
        total may exceed ``n``. The predicted label is added as column 'label'.
        """
        # TODO: return n news
        # TODO: if there are not enough news in clusters (for weights), return another news?
        candidates = self.df_unique_news.copy()
        vectors = self.news_vectorizer.news2vec(candidates.news.values)
        cluster_weights = aggregator.clust_weights
        candidates['label'] = aggregator.classify(vectors)
        # TODO: filter 'outlier' cluster
        # ?TODO: distances to choose the most relevant items in cluster
        quotas = np.ceil((aggregator.clust_weights.weight * n)).astype(int)
        picked = []
        for cluster_id, quota in zip(cluster_weights.clust, quotas):
            in_cluster = candidates.query('label == @cluster_id')
            picked.append(in_cluster.head(quota))
        return pd.concat(picked)

In [60]:
class Data_Manager:
    """Keep named data items (DataFrames, arrays, models) in memory and persist them to disk.

    Args:
        path_dict: optional ``{'csv': {obj: path}, 'serialized': {obj: path}}``.
            When given, the listed items are loaded into ``self.data_dict``.
    """

    def __init__(self, path_dict=None):  # path_dict {'csv':{obj:path}, 'serialized':{obj:path}}
        self.path_dict = path_dict
        if self.path_dict is not None:
            self.data_dict = self.load_data()
        else:
            self.data_dict = {}

    def load_data(self):
        """Load all items named in ``self.path_dict`` and return them as a dict.

        CSV items are read with ``pd.read_csv(path, index_col=0)``; serialized
        items are unpickled. Each group is best effort: on the first failure a
        message is printed and the remaining items of that group are skipped.

        NOTE(security): ``pickle.load`` can execute arbitrary code contained in
        the file; only load files written by a trusted source.
        """
        data_dict = {}
        try:
            for obj_name, path in self.path_dict['csv'].items():
                data_dict[obj_name] = pd.read_csv(path, index_col=0)
        except Exception:
            print('something is not ok with "csv" key or it does not exist')
        try:
            for obj_name, path in self.path_dict['serialized'].items():
                with open(path, 'rb') as file:
                    data_dict[obj_name] = pickle.load(file)
        except Exception:
            print('something is not ok with "serialized" key or it does not exist')
        return data_dict

    def delete_old(self, obj_name, n_recent=100):
        """Trim the DataFrame stored under ``obj_name`` to its last ``n_recent`` rows.

        Returns a status string; nothing happens for non-DataFrame items or
        frames that already have at most ``n_recent`` rows.
        """
        data = self.get_data_item(obj_name)
        if isinstance(data, pd.DataFrame) and data.shape[0] > n_recent:
            self.update_data_item(obj_name, data.tail(n_recent), concat=False)
            return('old entries removed')
        return('not enough entries to delete or is not DF')

    def get_data_item(self, obj_name):
        """Return the item stored under ``obj_name`` or the string 'Does not exist'."""
        return self.data_dict.get(obj_name, 'Does not exist')

    def update_data_item(self, obj_name, new_data, concat=False):
        """Store ``new_data`` under ``obj_name``, replacing or appending.

        Args:
            obj_name: key in ``self.data_dict``.
            new_data: the data to store or append.
            concat: falsy -> replace the stored item; truthy -> append to an
                existing DataFrame (``pd.concat``) or ndarray (``np.vstack``).
                Items of other types, or missing items, are simply replaced.

        Returns:
            A status string describing what was done.
        """
        if not concat:
            self.data_dict[obj_name] = new_data
            return('upd: set data_dict[obj] = new_data')
        if obj_name not in self.data_dict:
            self.data_dict[obj_name] = new_data
            return 'upd: obj = new_data (obj did not exist yet)'
        data = self.data_dict[obj_name]
        if isinstance(data, pd.DataFrame):
            try:
                self.data_dict[obj_name] = pd.concat([data, new_data], sort=False)
                return 'updated'
            except Exception:
                return 'could not update'
        if isinstance(data, np.ndarray):
            try:
                self.data_dict[obj_name] = np.vstack([data, new_data])
                return 'updated'
            except Exception:
                return 'could not update'
        self.data_dict[obj_name] = new_data
        return 'upd: obj = new_data (not an array or DF)'

    def save_model(self, data_items='all'):  # data_items: 'all' or list of keys for data_dict
        """Write the selected items to the current working directory and return 'saved'.

        DataFrames are written to ``<obj_name>.csv``; everything else, including
        ndarrays, is pickled to a file named ``<obj_name>``. ndarrays used to be
        skipped silently; pickling them means the 'serialized' branch of
        ``load_data`` can restore them.
        """
        if isinstance(data_items, str) and data_items == 'all':
            data_items = list(self.data_dict.keys())
        for obj_name in data_items:
            data = self.data_dict[obj_name]
            if isinstance(data, pd.DataFrame):
                data.to_csv(obj_name + '.csv')
            else:
                with open(obj_name, 'wb') as file:
                    pickle.dump(data, file)
        return 'saved'

In [9]:
#TODO: print it readable...
def show_news(df_chosen_news):
    """Print the chosen news grouped by category.

    For every distinct value of the 'category' column (in order of first
    appearance) a header is printed, followed by the 'news', 'link', 'datetime'
    and 'ID' columns of the rows in that category.
    """
    shown_columns = ['news', 'link', 'datetime', 'ID']
    for cat in df_chosen_news['category'].unique():
        print('***', 'Category: ' + cat, '\n***')
        in_category = df_chosen_news['category'] == cat
        print(df_chosen_news.loc[in_category, shown_columns], '\n')
        
#TODO: show interesting news (user preferencies)

In [10]:
#def append_viewed_news(news_id, category, df_news, df_viewed):
#    if news_id not in df_viewed.ID.values:
#        df_viewed = pd.concat([df_viewed, df_news.query('ID == @news_id and category == @category')])
#    return df_viewed

In [11]:
#def subset_not_viewed(df_news, df_viewed):
#    df_news = df_news.query('ID not in @df_viewed.ID.values')
#    return df_news

In [13]:
# Embedding model shared by every News_Vectorizer below. With the default create=False,
# Embeddings unpickles it from its default path 'W2VModel'.
model = Embeddings().model

In [15]:
# Clusterer passed to Aggregator by the `init` command: 5 clusters, cosine affinity, complete linkage.
# NOTE(review): newer scikit-learn releases renamed the `affinity` keyword to `metric`;
# confirm against the installed version.
agg_clust = AgglomerativeClustering(n_clusters=5, affinity='cosine', linkage='complete')

In [16]:
# Classifier passed to Aggregator by the `init` command: 3 nearest neighbours,
# distance-weighted votes, cosine metric.
knn = KNeighborsClassifier(n_neighbors=3, weights='distance', metric='cosine')

In [14]:
# Number of rows in 'df_viewed' at which the interactive loop re-clusters and refits the aggregator.
THRESHOLD = 20

`while`-Schleife für eine sehr grobe Demonstration, wie das Ganze mit dem Benutzer interagiert.  

_Eingabe - Ergebnis_
* `exit` - beendet die Schleife  
* `init` - startet ein neues, leeres System, das noch über keine benutzerspezifischen Daten verfügt, zeigt Nachrichten in Kategorien an
* `start` - liest bereits vorhandene Daten und Modelle ein und startet das System, zeigt Nachrichten in Kategorien sowie in einer weiteren Kategorie die für den Benutzer interessanten Meldungen an
* `upd` - holt erneut Nachrichten aus den RSS-Feeds und zeigt sie an
* `save` - speichert Daten und Modelle
* Nachrichten-IDs (Hash, z.B. `af930491116a582f5569e5df1fd9250f`), mit Leerzeichen getrennt, werden als IDs bereits gelesener Nachrichten interpretiert - als ob der Benutzer den Link angeklickt und das System die entsprechende Nachrichten-ID erhalten hätte - diese Nachrichten werden als "viewed" gespeichert und nicht mehr angezeigt.

_Weitere Anmerkungen über Funktionsweise_

* Wenn die Anzahl der gelesenen Nachrichten den definierten `THRESHOLD` erreicht, werden diese für das Update des Modells verwendet.
* Beim Start werden alte Einträge in der "viewed"-Tabelle (im Code: `df_labeled`) gelöscht, bis auf die letzten `n_recent` Einträge.
* ... sicher viel vergessen


In [61]:
#TODO: while-loop logic -> to managing class for integration with GUI
# ? TODO: with Data_Manager: check if data_item is ok
# Interactive demo loop. Each line read from the user is one command:
#   exit  - leave the loop
#   init  - start a fresh system without user-specific data
#   start - load previously saved data/models and start the system
#   upd   - fetch the RSS feeds again
#   save  - persist data and models
# Anything else is treated as a whitespace-separated list of viewed news IDs.
# `init` or `start` must be issued first; the other commands use the objects they create.
while True:
    u_input = input()
    if u_input == 'exit':
        break
    elif u_input == 'init':
        #init new system
        #model = Embeddings().model
        DM = Data_Manager()
        AGG = Aggregator(agg_clust, knn)
        RSS = RSS_Feeds(bbc_rss)
        NVec = News_Vectorizer(model=model)
        
        DM.update_data_item('df_news', RSS.df_news, concat=False)
        DM.update_data_item('df_viewed', pd.DataFrame(columns=RSS.df_news.columns), concat=False) # DF for viewed news
        # Empty frame for labeled news. Without it, `upd` and the THRESHOLD branch below
        # would receive the 'Does not exist' placeholder string and fail on `.ID` / `.drop`.
        DM.update_data_item('df_labeled',
                            pd.DataFrame(columns=list(RSS.df_news.columns) + ['label']),
                            concat=False)
        
        NFind = News_Finder(DM.get_data_item('df_news'), News_Vectorizer(model=model))
        chosen_news = NFind.get_from_categories()
        show_news(df_chosen_news=chosen_news)

    elif u_input == 'start':
        # load data and start already existing system
        #model = Embeddings().model
        path_dict = {'csv':{'df_viewed':'df_viewed.csv', 
                            'df_labeled':'df_labeled.csv', 
                            'clust_weights':'clust_weights.csv'}, 
                     'serialized':{'classifier':'classifier', 
                                   'clusterizer':'clusterizer'}}
        DM = Data_Manager(path_dict=path_dict)
        # keep only the most recent labeled entries (delete_old's default n_recent)
        DM.delete_old('df_labeled')
        NVec = News_Vectorizer(model=model)
        AGG = Aggregator(clusterizer=DM.get_data_item('clusterizer'), 
                         classifier=DM.get_data_item('classifier'), 
                         labeled_data=NVec.news2vec(DM.get_data_item('df_labeled').news.values), 
                         labels=DM.get_data_item('df_labeled').label.values, 
                         clust_weights=DM.get_data_item('clust_weights'))
        RSS = RSS_Feeds(bbc_rss)
        
        # filter already viewed news
        viewed_id = np.hstack([DM.get_data_item('df_labeled').ID.values, DM.get_data_item('df_viewed').ID.values])
        DM.update_data_item('df_news', RSS.df_news.query('ID not in @viewed_id'), concat=False)
        
        #show news from categories
        NFind = News_Finder(DM.get_data_item('df_news'), News_Vectorizer(model=model))
        chosen_news = NFind.get_from_categories()
        show_news(df_chosen_news=chosen_news)
        
        #show "interesting"
        if AGG.labels is not None:
            print('Interesting news (based on viewed):')
            print(NFind.get_interesting(AGG, n=10)[['title', 'summary', 'link', 'ID']])        

    elif u_input == 'upd':
        # update news
        RSS = RSS_Feeds(bbc_rss)
        # check in viewed and labeled
        # filter already viewed news
        viewed_id = np.hstack([DM.get_data_item('df_labeled').ID.values, DM.get_data_item('df_viewed').ID.values])
        DM.update_data_item('df_news', RSS.df_news.query('ID not in @viewed_id'), concat=False)
        
        #show news from categories
        NFind.update_news(DM.get_data_item('df_news'))
        chosen_news = NFind.get_from_categories()
        show_news(df_chosen_news=chosen_news)
        
        #show "interesting"
        if AGG.labels is not None:
            print('Interesting news (based on viewed):')
            print(NFind.get_interesting(AGG, n=10)[['title', 'summary', 'link', 'ID']])     
    
    elif u_input == 'save':
        # save model and data
        DM.save_model()
        
    else: #viewed news id input
        # get news ids and put unique viewed news into df_viewed
        n_id = u_input.strip().split()
        viewed = NFind.df_unique_news.query('ID in @n_id')
        NFind.update_news(NFind.df_news.query('ID not in @n_id')) #filter already viewed and update data
        DM.update_data_item('df_viewed', viewed, concat=True)
        DM.update_data_item('df_news', NFind.df_news, concat=False)
        chosen_news = NFind.get_from_categories()
        
        #show news -> TODO: print it readable!
        show_news(df_chosen_news=chosen_news)
        if AGG.labels is not None:
            print('Interesting news (based on viewed):')
            print(NFind.get_interesting(AGG, n=10)[['title', 'summary', 'link', 'ID']])
        
        if DM.get_data_item('df_viewed').shape[0] >= THRESHOLD:
            #update aggregator
            data = DM.get_data_item('df_viewed')
            colnames = data.columns
            AGG.update_aggregator(NVec.news2vec(data.news))
            
            #update df_labeled 
            # previously labeled rows first, then the newly viewed ones: the same order in
            # which Aggregator.prep_data stacks the vectors, so AGG.labels lines up row by row
            data = pd.concat([DM.get_data_item('df_labeled').drop('label', axis=1), data])
            data['label'] = AGG.labels
            DM.update_data_item('df_labeled', data, concat=False)
            
            #update df_viewed (empty)
            DM.update_data_item('df_viewed', pd.DataFrame(columns=colnames), concat=False)
            
            #update data_items: classifier, clusterizer etc
            DM.update_data_item('classifier', AGG.classifier, concat=False)
            DM.update_data_item('clusterizer', AGG.clusterizer, concat=False)
            DM.update_data_item('clust_weights', AGG.clust_weights, concat=False)
            
            # find and show interesting news -> TODO: print it readable!
            # in GUI: update only "custom" category
            print('Interesting news (based on viewed):')
            print(NFind.get_interesting(AGG, n=10)[['title', 'summary', 'link', 'ID']])     
            
        

start
*** Category: BBC News - Home 
***
                                                news  \
0  Iran plane downing: Second day of protests tur...   
4  Philippines volcano: Thousands evacuated as Ta...   
5  Cheryl Grimmer: Missing toddler police offer A...   
6  Tributes pour in as Oman mourns Sultan Qaboos....   
7  Australia bushfires: The race to save animal c...   

                                                link  \
0  https://www.bbc.co.uk/news/world-middle-east-5...   
4     https://www.bbc.co.uk/news/world-asia-51083515   
5  https://www.bbc.co.uk/news/uk-england-bristol-...   
6  https://www.bbc.co.uk/news/world-middle-east-5...   
7  https://www.bbc.co.uk/news/world-australia-510...   

                        datetime                                ID  
0  Sun, 12 Jan 2020 16:59:08 GMT  77d5833dcba26526c0bed8609b9f1990  
4  Sun, 12 Jan 2020 16:49:24 GMT  8243c0032c56a4bfbdc234be49a646b1  
5  Sun, 12 Jan 2020 08:38:56 GMT  e14f5efa42e83e5a5b8d09ce17f2bf14  
6  Sun, 1

                                                title  \
1   CES 2020: Juno 'reverse microwave oven' cools ...   
3   My Money: 'I made it through the day without s...   
6   New MPs attend Prime Minister's Questions for ...   
8   Tafida Raqeeb: Brain-damaged girl in High Cour...   
9   SpaceX sends 60 more Starlink satellites into ...   
0               What 2020 holds for Scottish politics   
5               Australia to cull thousands of camels   
7   HRW chief 'denied entry to Hong Kong' ahead of...   
22   A chatbot pulled me out of a 'really dark place'   
32  Healthy habits 'deliver extra disease-free dec...   
4   Stolen Colchester KFC Colonel Sanders found un...   
16  Bang Bang: The artist who’s tattooed LeBron Ja...   
2      Primary league tables: How did your school do?   

                                              summary  \
1   The device rapidly chills packaged drinks mean...   
3   As part of a new BBC blog series, Chelsea Thom...   
6   New MPs describe what the 

In [62]:
# Inspection: length of the item currently stored under 'df_labeled' in the data manager.
len(DM.get_data_item('df_labeled'))

20

In [41]:
# Inspection: IDs of the per-category selection that belong to "BBC News - Business", joined
# with spaces. The loop above treats such a space-separated string as a list of viewed IDs.
' '.join(NFind.get_from_categories().query('category == "BBC News - Business"').ID.values)

'3585244cb51c18395102d277995bde90 218e22ae8fee387e5b5199938a4eaa5c 3229c6e66bdabcd6645acb186c4d5e01 58febab89aff2617305ab1bc329d2d26 70c47f22cc967bf3a6302f8be7da851d'

In [29]:
# Inspection: length of the item currently stored under 'df_viewed' in the data manager.
len(DM.get_data_item('df_viewed'))

20

In [24]:
# Inspection: IDs at positions 10..29 of the per-category selection, joined with spaces.
# The slice is taken from the non-deduplicated frame, so an ID can occur more than once.
' '.join(NFind.get_from_categories().ID.values[10:30])

'af930491116a582f5569e5df1fd9250f daf237c88ea2cd161fabeb02f04cb9f6 8bf8ab43d48448a8f294ba8d3ad8d00c 9d36a6adf59d46dce15844c7366e34ab 73f03216d09b697af9dabcb7cdc7f222 5f0295a9d9421268dc31a89b84047141 9d36a6adf59d46dce15844c7366e34ab dfe802773e865a7b6002352f3c41ee66 0af77bc0017ec66debff7783bcd1166b fdd698f45fef40c5002439dba714b43e 73f03216d09b697af9dabcb7cdc7f222 792eabe28e2d3d2e3b6127413165e3f3 27baa71e817fc082546db9fe181edced 8bf8ab43d48448a8f294ba8d3ad8d00c daf237c88ea2cd161fabeb02f04cb9f6 3070b24edf667588bc4091e3fd274f3d 411decd4351af93672c72621f354e930 9628c5cea266eb72de56e8c437ded681 cfb662f1c8a7dfcad437fcb420baa275 1f2c8e55da9a453736afb196f2386730'

In [41]:
# Inspection: current cluster weights of the aggregator (columns 'clust' and 'weight').
AGG.clust_weights

Unnamed: 0,clust,weight
0,0,0.333333
1,1,0.166667
2,2,0.166667
3,3,0.125
4,4,0.208333


In [85]:
# Inspection: full frame of "interesting" news for n=10, including the predicted 'label' column.
NFind.get_interesting(AGG, n=10)

Unnamed: 0,ID,news,category,title,summary,link,date,time,datetime,label
3,03f7c7322f658b7373eb795d639af623,The island struggling to secure its future. Af...,[BBC News - Business],The island struggling to secure its future,"After being hit by a hurricane, Puerto Rico is...",https://www.bbc.co.uk/news/business-50458311,"(2019, 12, 26)","(0, 37, 2)","Thu, 26 Dec 2019 00:37:02 GMT",0
4,0501d6d8261498d19f006fe38ed971d2,FA Cup: Culture secretary calls for FA to reco...,[BBC News - UK Politics],FA Cup: Culture secretary calls for FA to reco...,The government has called on the FA to immedia...,https://www.bbc.co.uk/sport/football/51028507,"(2020, 1, 8)","(15, 24, 32)","Wed, 08 Jan 2020 15:24:32 GMT",0
6,07f5a02b7f4395d08fd476e7804191ba,New MPs attend Prime Minister's Questions for ...,"[BBC News - Home, BBC News - UK, BBC News - UK...",New MPs attend Prime Minister's Questions for ...,New MPs describe what the first Prime Minister...,https://www.bbc.co.uk/news/uk-politics-51042210,"(2020, 1, 8)","(23, 56, 41)","Wed, 08 Jan 2020 23:56:41 GMT",0
8,08de3d64d4e0b6db9085bffab8587218,SpaceX sends 60 more Starlink satellites into ...,[BBC News - Science & Environment],SpaceX sends 60 more Starlink satellites into ...,The California company's latest mission makes ...,https://www.bbc.co.uk/news/science-environment...,"(2020, 1, 7)","(3, 28, 56)","Tue, 07 Jan 2020 03:28:56 GMT",0
0,005503512f38f130303cb133d656203b,What 2020 holds for Scottish politics. After a...,[BBC News - UK Politics],What 2020 holds for Scottish politics,"After a tumultuous 2019, what does the new yea...",https://www.bbc.co.uk/news/uk-scotland-scotlan...,"(2020, 1, 7)","(0, 55, 44)","Tue, 07 Jan 2020 00:55:44 GMT",1
2,0398993ce0ce2269038657b076b0c940,Celeste got fired over her love of music - now...,"[BBC News - Home, BBC News - UK, BBC News - En...",Celeste got fired over her love of music - now...,Celeste got fired over her love of music. Now ...,https://www.bbc.co.uk/news/entertainment-arts-...,"(2020, 1, 9)","(0, 3, 28)","Thu, 09 Jan 2020 00:03:28 GMT",1
12,0e38a6df45dde1773239fe834d9c6543,Iran plane crash: Tributes to three British na...,[BBC News - UK],Iran plane crash: Tributes to three British na...,Two engineers and the owner of a dry cleaners ...,https://www.bbc.co.uk/news/uk-51032651,"(2020, 1, 8)","(23, 58, 35)","Wed, 08 Jan 2020 23:58:35 GMT",2
24,1a65089f35462a69d8fec6e5010ad67f,Soleimani attack: What does international law ...,[BBC News - World],Soleimani attack: What does international law ...,The US says it acted to prevent future Iranian...,https://www.bbc.co.uk/news/world-51007961,"(2020, 1, 7)","(13, 58, 28)","Tue, 07 Jan 2020 13:58:28 GMT",2
9,0ad9372e7bffa7290cccab430d76a7df,'Some men think women shouldn't be in the gym'...,[BBC News - Health],'Some men think women shouldn't be in the gym',"The world's second ""naturally"" strongest woman...",https://www.bbc.co.uk/news/uk-england-nottingh...,"(2020, 1, 4)","(0, 2, 8)","Sat, 04 Jan 2020 00:02:08 GMT",3
10,0c46f58913ea8a891fe8d66f2dfe96f5,NHS pressures 'put medical breakthroughs at ri...,[BBC News - Health],NHS pressures 'put medical breakthroughs at risk',The proportion of senior doctors involved in l...,https://www.bbc.co.uk/news/health-51011461,"(2020, 1, 8)","(6, 0, 39)","Wed, 08 Jan 2020 06:00:39 GMT",3
