In [1]:
import hashlib
import pickle
import time

import feedparser
import gensim
import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk import word_tokenize
from nltk.corpus import stopwords
from pyemd import emd
from scipy.cluster.hierarchy import fclusterdata
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from sklearn.cluster import DBSCAN, AffinityPropagation, AgglomerativeClustering
from sklearn.exceptions import NotFittedError
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier

stop_words = stopwords.words('english')

#from itertools import combinations
#from tqdm import tqdm_notebook
#from scipy.stats import skew, kurtosis
#from sklearn.preprocessing import StandardScaler
#from sklearn.decomposition import PCA

In [2]:
# BBC News RSS endpoints: the front-page feed (empty section) followed by one feed per section.
bbc_rss = ['http://feeds.bbci.co.uk/news/' + section + 'rss.xml'
           for section in ('',
                           'world/',
                           'uk/',
                           'business/',
                           'politics/',
                           'health/',
                           'education/',
                           'science_and_environment/',
                           'technology/',
                           'entertainment_and_arts/')]

In [3]:
class Embeddings:
    """Holds a word-embedding model, either unpickled from disk or built from raw vectors.

    Parameters
    ----------
    create : when equal to False (the default) the model is unpickled from
        `ser_model_path`; otherwise it is built by `create_model`.
    ser_model_path : path of the pickled model read by `load_model`.
    embeddings : value passed to `model_fun` as its first positional argument.
    model_fun : callable invoked as ``model_fun(embeddings, binary=binary)``.
    binary : forwarded to `model_fun` as the `binary` keyword.
    norm : when truthy, `create_model` calls ``init_sims(replace=True)`` on the
        freshly built model.

    The resulting model is stored in ``self.model``.
    """

    def __init__(self, create=False, ser_model_path='W2VModel',
                 embeddings='GoogleNews-vectors-negative300.bin.gz',
                 model_fun=gensim.models.KeyedVectors.load_word2vec_format, binary=True, norm=True):
        self.ser_model = ser_model_path
        self.embeddings = embeddings
        self.model_fun = model_fun
        self.binary = binary
        self.norm = norm

        # Pick the construction route first, then run it once.
        build = self.load_model if create == False else self.create_model
        self.model = build()

    def create_model(self):
        """Build the model with `model_fun` and optionally normalise it in place."""
        built = self.model_fun(self.embeddings, binary=self.binary)
        if self.norm:
            built.init_sims(replace=True)
        return built

    def load_model(self):
        """Unpickle and return the model stored at `self.ser_model`."""
        # NOTE: pickle can execute arbitrary code while loading; only point
        # ser_model_path at files you created yourself.
        with open(self.ser_model, 'rb') as handle:
            return pickle.load(handle)

In [4]:
#TODO: call news_vectors(news), no action in init
class News_Vectorizer:
    """Turns news texts into sentence vectors and pairwise distance matrices.

    Parameters
    ----------
    news : array of strings to vectorise.
    model : word-embedding model. It must support ``model[word]`` lookup
        (raising KeyError for unknown words), expose ``vector_size`` and
        provide ``wmdistance(tokens_a, tokens_b)``.

    Attributes
    ----------
    news_vectors : array with one sentence vector per news text, computed in
        ``__init__``.
    cos_dist : cosine distance matrix, filled by `cosine_matrix` (None before).
    wm_dist : word mover's distance matrix, filled by `wmd_matrix` (None before).
    """

    def __init__(self, news, model):
        self.news = news  # array of strings
        self.model = model  # Word2Vec model
        self.news_vectors = self.news2vec()  # vector representations
        self.cos_dist = None  # cosine distance matrix
        self.wm_dist = None  # wmd-matrix

    def wmd(self, q1, q2):
        """Return the model's word mover's distance between two texts.

        Both texts are lower-cased, split on whitespace and stripped of
        English stop words before being handed to ``model.wmdistance``.
        """
        q1 = str(q1).lower().split()
        q2 = str(q2).lower().split()
        q1 = [w for w in q1 if w not in stop_words]
        q2 = [w for w in q2 if w not in stop_words]
        return self.model.wmdistance(q1, q2)

    def sent2vec(self, s):
        """Return the L2-normalised sum of the word vectors of text `s`.

        The text is lower-cased and tokenised; stop words, non-alphabetic
        tokens and words missing from the model are ignored. When nothing
        usable is left (or the summed vector has zero length) a zero vector
        of ``model.vector_size`` is returned instead of NaN, so every result
        has the same shape and `news2vec` always yields a rectangular array.
        """
        words = word_tokenize(str(s).lower())
        words = [w for w in words if w not in stop_words and w.isalpha()]
        vectors = []
        for w in words:
            try:
                vectors.append(self.model[w])
            except KeyError:
                # Out-of-vocabulary word: it contributes nothing to the sum.
                continue
        if not vectors:
            return np.zeros(self.model.vector_size)
        v = np.array(vectors).sum(axis=0)
        length = np.sqrt((v ** 2).sum())
        if length == 0:
            # Word vectors cancelled out exactly; dividing would give NaN.
            return v
        return v / length

    def news2vec(self):
        """Return an array holding the sentence vector of every text in `self.news`."""
        return np.array([self.sent2vec(text) for text in self.news])

    def cosine_matrix(self):
        """Compute, store in `self.cos_dist` and return the pairwise cosine distances.

        Cosine distance is symmetric, so only the upper triangle (diagonal
        included) is evaluated and each value is mirrored.
        """
        size = len(self.news_vectors)
        cdist = np.zeros((size, size))
        for n in range(size):
            for m in range(n, size):
                distance = cosine(self.news_vectors[n], self.news_vectors[m])
                cdist[n, m] = distance
                cdist[m, n] = distance
        self.cos_dist = cdist
        return cdist

    def wmd_matrix(self):  # list (news)
        """Compute, store in `self.wm_dist` and return the pairwise WMD matrix.

        Every ordered pair of texts in `self.news` is evaluated with `wmd`.
        """
        wmdist = np.zeros((len(self.news), len(self.news)))
        for n, i in enumerate(self.news):
            for m, j in enumerate(self.news):
                wmdist[n, m] = self.wmd(i, j)
        self.wm_dist = wmdist
        return wmdist

In [132]:
class RSS_Feeds:
    """Downloads a list of RSS feeds and flattens their entries into DataFrames.

    On construction every URL is parsed with feedparser, all entries are
    collected into `df_news` (one row per entry per feed) and `df_unique_news`
    (one row per distinct news text with the list of categories it appeared in).
    """

    def __init__(self, urls):
        self.urls = urls
        self.feeds = self.get_feeds()
        self.df_news = self.create_df()
        self.df_unique_news = self.df_unique()

    def get_feeds(self):
        """Parse every URL in `self.urls` and return the parsed feeds as a list."""
        parsed = []
        for url in self.urls:
            parsed.append(feedparser.parse(url))
        return parsed

    def get_category(self, feed):
        """Return the feed's own title, which is used as the category name."""
        # sources may have different category names - agg categories?
        return feed.feed['title']

    #TODO: return (title, summary) - for user representation
    def get_title_summary(self, feed, sep='. '):
        """Return 'title<sep>summary' for each entry in the feed."""
        joined = []
        for entry in feed['entries']:
            joined.append(entry['title'] + sep + entry['summary'])
        return joined

    def get_date(self, feed):
        """Return the first three items of 'published_parsed' (year, month, day) per entry."""
        dates = []
        for entry in feed['entries']:
            dates.append(entry['published_parsed'][:3])
        return dates

    def get_time(self, feed):
        """Return items 3..5 of 'published_parsed' (hour, min, sec) per entry."""
        times = []
        for entry in feed['entries']:
            times.append(entry['published_parsed'][3:6])
        return times

    def get_datetime_nparsed(self, feed):
        """Return the unparsed 'published' string of each entry."""
        stamps = []
        for entry in feed['entries']:
            stamps.append(entry['published'])
        return stamps

    def get_link(self, feed):
        """Return the 'link' of each entry."""
        urls = []
        for entry in feed['entries']:
            urls.append(entry['link'])
        return urls

    def str2hash(self, s):
        """Return the hexadecimal MD5 digest of string `s`."""
        digest = hashlib.md5(s.encode())
        return digest.hexdigest()

    def create_df(self):
        """Build, store in `self.df_news` and return the table of all feed entries.

        Columns: news, category, link, date, time, datetime and ID, where ID is
        the MD5 digest of the news text.
        """
        columns = {'news': [],
                   'category': [],
                   'link': [],
                   'date': [],
                   'time': [],
                   'datetime': []}
        for feed in self.feeds:
            # Read everything from the feed first, then append to the columns.
            feed_category = self.get_category(feed)
            feed_texts = self.get_title_summary(feed)
            feed_dates = self.get_date(feed)
            feed_times = self.get_time(feed)
            feed_stamps = self.get_datetime_nparsed(feed)
            feed_links = self.get_link(feed)

            # One category label per entry of this feed.
            repeated_category = np.resize([feed_category], len(feed_texts))

            columns['news'].extend(feed_texts)
            columns['date'].extend(feed_dates)
            columns['time'].extend(feed_times)
            columns['datetime'].extend(feed_stamps)
            columns['link'].extend(feed_links)
            columns['category'].extend(repeated_category)

        table = pd.DataFrame(columns)
        table['ID'] = table.news.apply(self.str2hash)
        self.df_news = table
        return table

    def df_unique(self):
        """Build, store and return one row per distinct news text with its categories as a list."""
        by_text = self.df_news.groupby('news').agg({'category': list})
        deduplicated = by_text.reset_index()
        self.df_unique_news = deduplicated
        return deduplicated

    def get_unique_news(self):
        """Return the distinct news texts as an array."""
        unique_table = self.df_unique_news
        return unique_table.news.values

In [299]:
#TODO: Weights!
class Aggregator:
    """Clusters vectors, trains a classifier on the clusters and tracks cluster weights.

    Parameters
    ----------
    clusterizer : object with ``fit_predict(data)`` returning one label per row.
    classifier : object with ``fit(X, y)`` and ``predict(X)``.
    labeled_data : already clustered vectors, or None.
    labels : cluster numbers of `labeled_data`, or None.
    clust_weights : cluster weights as [[label, weight], ...], or None;
        `update_weights` replaces it with a DataFrame with columns
        'clust' and 'weight'.
    """

    def __init__(self, clusterizer, classifier, labeled_data=None, labels=None, clust_weights=None):
        self.clusterizer = clusterizer
        self.classifier = classifier
        self.labeled_data = labeled_data  # clustered
        self.labels = labels  # clust nums of labeled_data
        self.clust_weights = clust_weights  # [[label, weight],...]

    def clusterize(self, data):
        """Run the clusterizer on `data` and return ``(data, labels)``."""
        labels = self.clusterizer.fit_predict(data)
        return data, labels

    def classify(self, new_data):  # if one sample: reshape to (1, 300)
        """Return the classifier's predictions for `new_data`.

        If the classifier has not been fitted yet, the repr of the raised
        NotFittedError is returned instead of propagating the exception.
        """
        try:
            return self.classifier.predict(new_data)
        except NotFittedError as e:
            return repr(e)

    def fit_classifier(self):
        """Fit the classifier on the stored labeled data and return it."""
        self.classifier.fit(self.labeled_data, self.labels)
        return self.classifier

    def prep_data(self, new_data=None):
        """Stack the stored labeled data and `new_data` row-wise.

        Returns the combined values as an array, or None when there is nothing
        to combine or the inputs cannot be combined.
        """
        if self.labeled_data is None and new_data is None:
            return None
        frames = []
        for part in (self.labeled_data, new_data):
            if part is None:
                # A missing side simply contributes no rows.
                continue
            try:
                frames.append(pd.DataFrame(part))
            except Exception:
                # Input that cannot be tabulated is skipped (best effort).
                continue
        if not frames:
            return None
        try:
            return pd.concat(frames).values
        except Exception:
            return None

    def update_weights(self):
        """Recompute cluster weights as each label's share of `self.labels`.

        Stores and returns a DataFrame with columns 'clust' (int label) and
        'weight' (fraction of samples carrying that label).
        """
        unique, counts = np.unique(self.labels, return_counts=True)
        weights = counts / counts.sum()  # smth like this
        self.clust_weights = pd.DataFrame({'clust': unique.astype(int), 'weight': weights})
        return self.clust_weights

    def update_aggregator(self, new_data):
        """Add `new_data`, re-cluster everything, refit the classifier and refresh weights.

        Returns 'no data' when there is nothing to cluster, otherwise 'updated'.
        """
        data = self.prep_data(new_data=new_data)
        if data is None:
            return 'no data'
        self.labeled_data, self.labels = self.clusterize(data)
        self.fit_classifier()
        self.update_weights()
        return 'updated'
        

In [7]:
class News_Finder:
    """Selects news rows from a news DataFrame that has a 'category' column."""

    def __init__(self, df_news):
        self.df_news = df_news

    def from_categories(self, n=5):
        """Return the first `n` rows of every category."""
        per_category = self.df_news.groupby('category')
        return per_category.head(n)

    def similar(self, news_item, n=5):
        """Placeholder: should return the n news most similar to `news_item`.

        Not implemented yet; returns None.
        """
        return None

    def preferred(self, clust_weights, n=20):
        """Placeholder: should return n news chosen from the user's preferences.

        Not implemented yet; returns None.
        """
        return None

In [259]:
#TODO: delete, (prepare?)
class Data_Manager:
    """Keeps named data objects in memory and loads/saves them from/to disk.

    Parameters
    ----------
    path_dict : optional mapping ``{'csv': {obj: path}, 'serialized': {obj: path}}``.
        When given, the listed files are loaded into `data_dict` on construction;
        otherwise `data_dict` starts empty.
    """

    def __init__(self, path_dict=None):  # path_dict {'csv':{obj:path}, 'serialized':{obj:path}}
        self.path_dict = path_dict
        if self.path_dict is not None:
            self.data_dict = self.load_data()
        else:
            self.data_dict = {}

    def load_data(self):
        """Load every object listed in `path_dict` and return them as a dict.

        'csv' entries are read with ``pd.read_csv(path, index_col=0)``,
        'serialized' entries are unpickled. A failure in either group prints a
        message and stops loading that group; objects loaded so far are kept.
        """
        data_dict = {}
        try:
            for obj_name, path in self.path_dict['csv'].items():
                data_dict[obj_name] = pd.read_csv(path, index_col=0)
        except Exception:
            print('something is not ok with "csv" key or it does not exist')
        try:
            for obj_name, path in self.path_dict['serialized'].items():
                with open(path, 'rb') as file:
                    # NOTE: pickle can execute arbitrary code while loading;
                    # only list files that this notebook wrote itself.
                    data_dict[obj_name] = pickle.load(file)
        except Exception:
            print('something is not ok with "serialized" key or it does not exist')
        return data_dict

    def delete_old(self, n_recent):
        """Placeholder: should delete old data except the `n_recent` newest. Not implemented."""
        pass

    def prep_data(self):
        """Placeholder for data manipulations. Not implemented."""
        pass

    def get_data_item(self, obj_name):
        """Return the stored object, or the string 'Does not exist' when absent."""
        return self.data_dict.get(obj_name, 'Does not exist')

    def update_data_item(self, obj_name, new_data):
        """Add `new_data` to the object stored under `obj_name` and report what happened.

        * unknown name: `new_data` is stored as is;
        * stored DataFrame: `new_data` is appended with ``pd.concat``;
        * stored ndarray: `new_data` is appended with ``np.vstack``;
        * anything else: the stored object is replaced by `new_data`.

        Returns a status string; 'could not update' means the append failed and
        the stored object was left unchanged.
        """
        # Test membership directly: comparing a stored DataFrame/ndarray with a
        # sentinel string is element-wise and cannot be used as a condition.
        if obj_name not in self.data_dict:
            self.data_dict[obj_name] = new_data
            return 'upd: obj = new_data (obj did not exist yet)'

        data = self.data_dict[obj_name]
        if isinstance(data, pd.DataFrame):
            try:
                combined = pd.concat([data, new_data])
            except Exception:
                return 'could not update'
            self.data_dict[obj_name] = combined
            return 'updated'
        if isinstance(data, np.ndarray):
            try:
                combined = np.vstack([data, new_data])
            except Exception:
                return 'could not update'
            self.data_dict[obj_name] = combined
            return 'updated'

        self.data_dict[obj_name] = new_data
        return 'upd: obj = new_data (not an array or DF)'

    def save_model(self, data_items='all'):  # data_items: 'all' or list of keys for data_dict
        """Write the selected objects to the working directory and return 'saved'.

        DataFrames go to '<obj_name>.csv'; other objects are pickled to a file
        named '<obj_name>'.
        """
        if isinstance(data_items, str) and data_items == 'all':
            data_items = list(self.data_dict.keys())
        for obj_name in data_items:
            data = self.data_dict[obj_name]
            if isinstance(data, pd.DataFrame):
                data.to_csv(obj_name + '.csv')
            elif isinstance(data, np.ndarray):
                # NOTE(review): ndarrays are currently NOT written anywhere even
                # though 'saved' is returned — TODO decide on a format (csv?).
                pass
            else:
                with open(obj_name, 'wb') as file:
                    pickle.dump(data, file)
        return 'saved'

In [247]:
rss = RSS_Feeds(bbc_rss)

In [249]:
rss.df_news.head()

Unnamed: 0,news,category,link,date,time,datetime,ID
0,Qasem Soleimani: PM 'will not lament' Iranian ...,BBC News - Home,https://www.bbc.co.uk/news/uk-51001236,"(2020, 1, 5)","(19, 5, 52)","Sun, 05 Jan 2020 19:05:52 GMT",c6feecfbdefd901ce4b7b404f50afa9c
1,Iran rolls back nuclear deal commitments. Iran...,BBC News - Home,https://www.bbc.co.uk/news/world-middle-east-5...,"(2020, 1, 5)","(19, 8, 1)","Sun, 05 Jan 2020 19:08:01 GMT",11ce682d0bf7650eef652a43d4fdf2da
2,Qasem Soleimani: Mourning begins in Iran. The ...,BBC News - Home,https://www.bbc.co.uk/news/world-middle-east-5...,"(2020, 1, 5)","(8, 25, 15)","Sun, 05 Jan 2020 08:25:15 GMT",b5f8537288db3951cc22f3cbac30d8aa
3,Labour leadership: Contenders set out stalls o...,BBC News - Home,https://www.bbc.co.uk/news/uk-politics-50996799,"(2020, 1, 5)","(15, 59, 9)","Sun, 05 Jan 2020 15:59:09 GMT",0ea3549d868732a9fd67f59f374bed69
4,Finsbury Park stabbing: Manhunt as killed Deli...,BBC News - Home,https://www.bbc.co.uk/news/uk-england-london-5...,"(2020, 1, 5)","(15, 36, 53)","Sun, 05 Jan 2020 15:36:53 GMT",3511d139f23538e5b9fada328ccbbb3f


In [251]:
manager = Data_Manager()

In [252]:
manager.update_data_item('news', rss.df_news)

'upd: obj = new_data (obj did not exist yet)'

In [254]:
manager.get_data_item('news').head()

Unnamed: 0,news,category,link,date,time,datetime,ID
0,Qasem Soleimani: PM 'will not lament' Iranian ...,BBC News - Home,https://www.bbc.co.uk/news/uk-51001236,"(2020, 1, 5)","(19, 5, 52)","Sun, 05 Jan 2020 19:05:52 GMT",c6feecfbdefd901ce4b7b404f50afa9c
1,Iran rolls back nuclear deal commitments. Iran...,BBC News - Home,https://www.bbc.co.uk/news/world-middle-east-5...,"(2020, 1, 5)","(19, 8, 1)","Sun, 05 Jan 2020 19:08:01 GMT",11ce682d0bf7650eef652a43d4fdf2da
2,Qasem Soleimani: Mourning begins in Iran. The ...,BBC News - Home,https://www.bbc.co.uk/news/world-middle-east-5...,"(2020, 1, 5)","(8, 25, 15)","Sun, 05 Jan 2020 08:25:15 GMT",b5f8537288db3951cc22f3cbac30d8aa
3,Labour leadership: Contenders set out stalls o...,BBC News - Home,https://www.bbc.co.uk/news/uk-politics-50996799,"(2020, 1, 5)","(15, 59, 9)","Sun, 05 Jan 2020 15:59:09 GMT",0ea3549d868732a9fd67f59f374bed69
4,Finsbury Park stabbing: Manhunt as killed Deli...,BBC News - Home,https://www.bbc.co.uk/news/uk-england-london-5...,"(2020, 1, 5)","(15, 36, 53)","Sun, 05 Jan 2020 15:36:53 GMT",3511d139f23538e5b9fada328ccbbb3f


In [255]:
manager.save_model()

'saved'

In [274]:
# NOTE(review): agg_clust, knn, df_viewed and emb are only assigned in cells that
# appear further down in this notebook (In [177], In [178], In [167], In [173]),
# so this cell relies on out-of-order execution and will not run top to bottom
# on a fresh kernel — TODO move the defining cells above this one.
aggregator = Aggregator(agg_clust, knn)
nv = News_Vectorizer(df_viewed.news.values, model=emb.model)

In [275]:
aggregator.update_aggregator(nv.news_vectors)

'updated'

In [276]:
aggregator.labels

array([2, 1, 1, 2, 4, 3, 1, 1, 1, 1, 0, 5, 1, 0, 4, 6, 0, 1, 4, 4, 0, 3,
       0], dtype=int64)

In [277]:
df_viewed['label'] = aggregator.labels

In [279]:
manager.update_data_item('viewed', df_viewed)

'upd: obj = new_data (obj did not exist yet)'

In [280]:
manager.update_data_item('classifier', aggregator.classifier)

'upd: obj = new_data (not an array or DF)'

In [281]:
manager.update_data_item('clusterizer', aggregator.clusterizer)

'upd: obj = new_data (obj did not exist yet)'

In [283]:
# NOTE(review): Data_Manager.update_data_item(obj_name, new_data) has no default
# for new_data, so this call raises TypeError as written. The dict_keys output
# recorded below cannot come from this call (it looks like the result of
# manager.data_dict.keys()) — TODO restore the intended cell content.
manager.update_data_item('weights')

dict_keys(['news', 'classifier', 'viewed', 'clusterizer'])

In [284]:
manager.save_model()

'saved'

In [285]:
manager = Data_Manager(path_dict={'csv':{'news':'news.csv', 'viewed':'viewed.csv'}, 'serialized':{'classifier':'classifier', 'clusterizer':'clusterizer'}})

In [286]:
manager.data_dict.keys()

dict_keys(['news', 'viewed', 'classifier', 'clusterizer'])

In [43]:
def from_categories(df_news, n=5):
    """Return the first `n` rows of each 'category' group of `df_news`."""
    per_category = df_news.groupby('category')
    return per_category.head(n)

In [143]:
def show_news(df_chosen_news):
    """Print the chosen news grouped by category.

    For every distinct value of the 'category' column a header line is printed,
    followed by the 'news', 'link', 'datetime' and 'ID' columns of the rows
    belonging to that category.
    """
    shown_columns = ['news', 'link', 'datetime', 'ID']
    categories = df_chosen_news.category.unique()
    for cat in categories:
        print('Category: ' + cat)
        # `cat` is referenced from inside the query string via @cat.
        rows_of_category = df_chosen_news.query('category == @cat')
        print(rows_of_category[shown_columns])

In [74]:
show_news(from_categories(rss.df_news))

Category: BBC News - Home
                                                news  \
0  Trump says US ready to strike 52 Iranian sites...   
1  Qasem Soleimani: Royal Navy to protect UK ship...   
2  HS2 costs out of control, says review's deputy...   
3  Australia bushfires: Fundraiser reaches A$20m ...   
4  Sunken chest syndrome: 'I'm being strangled in...   

                                                link  \
0  https://www.bbc.co.uk/news/world-middle-east-5...   
1    https://www.bbc.co.uk/news/uk-politics-50996630   
2       https://www.bbc.co.uk/news/business-50995116   
3  https://www.bbc.co.uk/news/world-australia-509...   
4  https://www.bbc.co.uk/news/uk-england-devon-50...   

                        datetime  
0  Sun, 05 Jan 2020 06:38:58 GMT  
1  Sun, 05 Jan 2020 09:04:02 GMT  
2  Sun, 05 Jan 2020 00:42:42 GMT  
3  Sun, 05 Jan 2020 07:24:08 GMT  
4  Sun, 05 Jan 2020 00:01:15 GMT  
Category: BBC News - World
                                                 news  \
47  Tr

In [156]:
def append_viewed_news(news_id, category, df_news, df_viewed):
    """Return `df_viewed` extended with the news item the user has just viewed.

    If `news_id` is already present in ``df_viewed.ID`` the frame is returned
    unchanged; otherwise the rows of `df_news` matching both `news_id` and
    `category` are appended.
    """
    already_viewed = news_id in df_viewed.ID.values
    if already_viewed:
        return df_viewed
    # news_id and category are referenced from inside the query string.
    matching_rows = df_news.query('ID == @news_id and category == @category')
    return pd.concat([df_viewed, matching_rows])

In [207]:
def subset_not_viewed(df_news, df_viewed):
    """Return the rows of `df_news` whose ID does not occur in ``df_viewed.ID``."""
    # df_viewed is referenced from inside the query string via @df_viewed.
    return df_news.query('ID not in @df_viewed.ID.values')

In [167]:
sample = from_categories(rss.df_news, 3)[['ID', 'category']]
df_viewed = pd.DataFrame(columns=rss.df_news.columns)

In [172]:
sample.ID.nunique()

23

In [173]:
emb = Embeddings()

In [177]:
agg_clust = AgglomerativeClustering(n_clusters=7, affinity='cosine', linkage='complete')

In [178]:
knn = KNeighborsClassifier(n_neighbors=3, weights='distance', metric='cosine')

In [182]:
aggregator = Aggregator(clusterizer=agg_clust, classifier=knn)

In [183]:
nv = News_Vectorizer(df_viewed.news.values, emb.model)

In [184]:
aggregator.update_aggregator(nv.news_vectors)

'updated'

In [192]:
news_to_class = [nv.sent2vec(i) for i in rss.df_news.groupby('category').head(10).iloc[7:9,:].news.values]

In [193]:
aggregator.classify(news_to_class)

array([4, 3], dtype=int64)

In [196]:
aggregator.labeled_data.shape

(23, 300)

In [158]:
# NOTE(review): `nid` is not assigned anywhere in the visible notebook —
# presumably a news ID picked by hand from an earlier listing; TODO define it
# in a cell above so this lookup runs on a fresh kernel.
rss.df_news.query('ID == @nid')

Unnamed: 0,news,category,link,date,time,datetime,ID
0,Soleimani assassination: Mourners flood the st...,BBC News - Home,https://www.bbc.co.uk/news/world-middle-east-5...,"(2020, 1, 5)","(9, 15, 6)","Sun, 05 Jan 2020 09:15:06 GMT",92b52890a83cf10f253652b35a697c9e
47,Soleimani assassination: Mourners flood the st...,BBC News - World,https://www.bbc.co.uk/news/world-middle-east-5...,"(2020, 1, 5)","(9, 15, 6)","Sun, 05 Jan 2020 09:15:06 GMT",92b52890a83cf10f253652b35a697c9e


In [160]:
df_viewed = append_viewed_news(nid, 'BBC News - World', rss.df_news, df_viewed)

In [33]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
iris = load_iris()

In [171]:
sent = nv.sent2vec('Fresh Cambridge Analytica leak shows global manipulation is out of control. Company’s work in 68 countries laid bare with release of more than 100,000 documents')