In [8]:
import feedparser
import numpy as np
import pandas as pd
import hashlib

In [2]:
bbc_rss = ['http://feeds.bbci.co.uk/news/rss.xml', 
           'http://feeds.bbci.co.uk/news/world/rss.xml', 
           'http://feeds.bbci.co.uk/news/uk/rss.xml', 
           'http://feeds.bbci.co.uk/news/business/rss.xml', 
           'http://feeds.bbci.co.uk/news/politics/rss.xml', 
           'http://feeds.bbci.co.uk/news/health/rss.xml', 
           'http://feeds.bbci.co.uk/news/education/rss.xml', 
           'http://feeds.bbci.co.uk/news/science_and_environment/rss.xml', 
           'http://feeds.bbci.co.uk/news/technology/rss.xml', 
           'http://feeds.bbci.co.uk/news/entertainment_and_arts/rss.xml']

In [3]:
class RSS_Feeds:
    
    def __init__(self, urls):
        self.urls = urls
        self.feeds = self.get_feeds()
        self.df_news = self.create_df()
        self.df_unique_news = self.create_unique()
        
    def get_feeds(self):
        return [feedparser.parse(feed) for feed in self.urls]
    
    def get_category(self, feed):
        # sources may have different category names - agg categories?
        return feed.feed.get('title', '')

    def get_title_summary(self, feed, sep='. '): #get and join title and summary for each entry in feed
        titles = [entry['title'] for entry in feed['entries']]
        summaries = [entry['summary'] for entry in feed['entries']]
        title_summary = [entry['title'] + sep + entry['summary'] for entry in feed['entries']]
        return titles, summaries, title_summary
    
    def get_date(self, feed): #(year, month, day) for each entry in feed
        return([entry['published_parsed'][:3] for entry in feed['entries']])
    
    def get_time(self, feed): #(hour, min, sec) for each entry in feed
        return([entry['published_parsed'][3:6] for entry in feed['entries']])
    
    def get_datetime_nparsed(self, feed): #not parsed date and time for each entry in feed
        return([entry['published'] for entry in feed['entries']])
    
    def get_link(self, feed): # link for each entry in feed
        return([entry['link'] for entry in feed['entries']])
    
    def str2hash(self, s):
        return hashlib.md5(s.encode()).hexdigest()
    
    def create_df(self): 
        news, title, summary, category, pdate, ptime, fdatetime, links  = [], [], [], [], [], [], [], []
        for feed in self.feeds:
            cat = self.get_category(feed)
            titles, summaries, texts = self.get_title_summary(feed)
            d_ymd, t_hms = self.get_date(feed), self.get_time(feed)
            fdt = self.get_datetime_nparsed(feed)
            news_links = self.get_link(feed)
            
            cat = np.resize([cat], len(texts))
            news.extend(texts)
            title.extend(titles)
            summary.extend(summaries)
            pdate.extend(d_ymd)
            ptime.extend(t_hms)
            fdatetime.extend(fdt)
            links.extend(news_links)
            category.extend(cat)
        df_news = pd.DataFrame({'news':news, 
                                'category':category,
                                'title':title, 
                                'summary':summary,
                                'link':links,
                                'date':pdate, 
                                'time':ptime, 
                                'datetime':fdatetime})
        df_news['ID'] = df_news.news.apply(self.str2hash)
        self.df_news = df_news
        return df_news
    
    def create_unique(self):
        df_unique_news = self.df_news.groupby('news').agg({'category':list, 
                                                           'title': np.unique, 
                                                           'summary': np.unique, 
                                                           'link': np.unique, 
                                                           'date': np.unique, 
                                                           'time': np.unique, 
                                                           'datetime': np.unique, 
                                                           'ID': np.unique})
        df_unique_news.reset_index(inplace=True)
        self.df_unique_news = df_unique_news
        return df_unique_news
    
    def get_unique_news(self):
        return self.df_unique_news.news.values

### Dummy-Communicator für die GUI Entwicklung


In [21]:
class Communicator:
    '''Application start -> initialize Communicator instance and call `start()`
    to start an existing system or create a new system and get DataFrames, e.g.:
    
        CI = Communicator()
        news_in_categories, interesting_news = CI.start()
        display(news_in_categories)
    
    Then use `handle_input()` to process user input and get system output, e.g.:
    
        news_in_categories, interesting_news = CI.handle_input(u_input='upd')
    
    
    '''
    
    def __init__(self): # Application start -> initialize Communicator instance 
        # call `start` to start an existing system or create a new system
        self.RSS = RSS_Feeds(bbc_rss)
        #self.start()
    
    def create_new_system(self):
        # create a new system without any already existing data
        pass
    
    def load_data_models(self):
        pass
        
    def select_news(self):
        categories = self.RSS.df_news.groupby('category').head(10)
        interesting = self.RSS.df_unique_news.sample(n=np.random.randint(15, 25), random_state=42)
        return categories, interesting
    
    def start(self):
        #try to load data
        #if ok -> start existing
        #if smth is not ok -> call `create_new_system`
        # return DataFrames
        c, i = self.select_news()
        return c, i


    def handle_input(self, u_input): # the main method that GUI has to call
        ''' Call this method with `u_input` argument to communicate with the system.
            It takes a string `u_input` and returns an appropriate output.
            
            Interactions as `u_input` -> `method output`:
            
            * 'upd' -> two pandas DataFrame objects (news in categories, interesting news)
            * 'exit' -> boolean: True, if data and models have been successfully saved, False otherwise
            * 'viewed' + ' ' + ID (e.g. 'viewed 005503512f38f130303cb133d656203b') -> two pandas DataFrame objects (news in categories, interesting news)
            * 'similar' + ' ' + ID (e.g. 'similar dc313bbf1bfca18d28e95862e972822f') -> pandas DataFrame with nearest news            
        '''
        # parse GUI input
        # call appropriate methods
        # return system output to GUI
        instruction = u_input.split()[0]
        if instruction == 'upd':
            #update news
            self.RSS = RSS_Feeds(bbc_rss)
            c, i = self.select_news()
            return c, i
        elif instruction == 'exit':
            #save
            return True
        elif instruction == 'viewed':
            #get news id and process viewed...
            n_id = u_input.split()[1:] # list, even if it consists of only one ID (generally)
            c, i = self.select_news()
            return c, i
        elif instruction == 'similar':
            #get id and return similar
            n_id = u_input.split()[1] # string
            return self.RSS.df_unique_news.sample(n=5, random_state=111)
        else:
            return('unknown input')
        
    
    def update_news(self):
        # download news from rss feeds
        # return DataFrames
        pass
    
    def save(self):
        # save data and models
        # return ok or error
        pass
    
    def handle_viewed(self):
        # get news ID, check if already in viewed, remove from all news,.....
        # if n(viewed) >= THRESHOLD -> update model -> save mpdel
        # return DataFrames
        pass
    
    def find_similar(self, n):
        # get and return the n nearest news to the given news item
        pass

In [9]:
CI = Communicator()

In [10]:
from_categories, interesting = CI.start()

In [12]:
from_categories.head(3)

Unnamed: 0,news,category,title,summary,link,date,time,datetime,ID
0,"Brexit: 'No alignment' with EU on regulation, ...",BBC News - Home,"Brexit: 'No alignment' with EU on regulation, ...",Sajid Javid warns businesses UK and EU regulat...,https://www.bbc.co.uk/news/uk-politics-51157933,"(2020, 1, 18)","(11, 25, 39)","Sat, 18 Jan 2020 11:25:39 GMT",51f007e7fca0248885436004338df61b
1,New Chinese virus 'will have infected hundreds...,BBC News - Home,New Chinese virus 'will have infected hundreds',"The virus emerged in December, two people have...",https://www.bbc.co.uk/news/health-51148303,"(2020, 1, 18)","(8, 50, 9)","Sat, 18 Jan 2020 08:50:09 GMT",39e612824fbd70b00d011c8c2b9286a4
2,Newport Pagnell: Nine-hour search as boy vanis...,BBC News - Home,Newport Pagnell: Nine-hour search as boy vanis...,"More than 1,000 people searched through the ni...",https://www.bbc.co.uk/news/uk-england-beds-buc...,"(2020, 1, 18)","(11, 50, 39)","Sat, 18 Jan 2020 11:50:39 GMT",e99f8c1dd748052cf20359d8a6324bb5


In [23]:
c, i = CI.handle_input('upd')

In [24]:
c, i = CI.handle_input('viewed 51f007e7fca0248885436004338df61b')

In [25]:
s = CI.handle_input('similar 51f007e7fca0248885436004338df61b')

In [26]:
CI.handle_input('exit')

True

In [27]:
help(Communicator)

Help on class Communicator in module __main__:

class Communicator(builtins.object)
 |  Application start -> initialize Communicator instance and call `start()`
 |  to start an existing system or create a new system and get DataFrames, e.g.:
 |  
 |      CI = Communicator()
 |      news_in_categories, interesting_news = CI.start()
 |      display(news_in_categories)
 |  
 |  Then use `handle_input()` to process user input and get system output, e.g.:
 |  
 |      news_in_categories, interesting_news = CI.handle_input(u_input='upd')
 |  
 |  Methods defined here:
 |  
 |  __init__(self)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  create_new_system(self)
 |  
 |  find_similar(self, n)
 |  
 |  handle_input(self, u_input)
 |      Call this method with `u_input` argument to communicate with the system.
 |      It takes a string `u_input` and returns an appropriate output.
 |      
 |      Interactions as `u_input` -> `method output`:
 |      
 |      * '

In [22]:
help(Communicator.handle_input)

Help on function handle_input in module __main__:

handle_input(self, u_input)
    Call this method with `u_input` argument to communicate with the system.
    It takes a string `u_input` and returns an appropriate output.
    
    Interactions as `u_input` -> `method output`:
    
    * 'upd' -> two pandas DataFrame objects (news in categories, interesting news)
    * 'exit' -> boolean: True, if data and models have been successfully saved, False otherwise
    * 'viewed' + ' ' + ID (e.g. 'viewed 005503512f38f130303cb133d656203b') -> two pandas DataFrame objects (news in categories, interesting news)
    * 'similar' + ' ' + ID (e.g. 'similar dc313bbf1bfca18d28e95862e972822f') -> pandas DataFrame with nearest news

