In [66]:
import os
import pickle
import requests
from collections import Counter
from datetime import datetime, timedelta
from functools import wraps
from requests import ConnectionError

from IPython.display import clear_output
from hackernews import HackerNews, InvalidItemID
from readability import Document

In [2]:
# Default folder in which HNData stores (and reads back) the pickled posts.
DATA_FOLDER = 'data'

In [3]:
# The Hacker News API raises ConnectionError very often. The code below works around it by retrying the failed call.

def decorate_all_methods(decorator):
    """
    Build a class decorator that applies `decorator` to the callables of a class

    Only attributes defined directly on the class are touched; inherited ones
    are left alone. The class is modified in place and returned.

    Args:
        decorator (callable): function decorator to apply to every callable attribute

    Returns:
        callable: class decorator taking a class and returning the same class
    """
    def decorate(cls):
        # Snapshot the namespace: attributes are replaced while we walk it.
        for attr_name, attr in list(vars(cls).items()):
            # Read the raw descriptors from the class namespace instead of using
            # getattr(): getattr() unwraps staticmethod/classmethod, and storing
            # the decorated result as a plain function would then receive an
            # unexpected extra `self` when called on an instance.
            if isinstance(attr, staticmethod):
                setattr(cls, attr_name, staticmethod(decorator(attr.__func__)))
            elif isinstance(attr, classmethod):
                setattr(cls, attr_name, classmethod(decorator(attr.__func__)))
            elif callable(attr):
                setattr(cls, attr_name, decorator(attr))
        return cls
    return decorate


def try_to_reconnect(func, *, initial_delay=0.5, max_delay=30.0, max_retries=None):
    """
    Decorator that calls `func` again whenever it raises ConnectionError

    Note:
        In this notebook `ConnectionError` is the name imported from `requests`,
        not the builtin of the same name.

    Args:
        func (callable): function to protect
        initial_delay (float): seconds to wait after the first failure; the wait
            doubles after every further failure. Use 0 to retry immediately.
        max_delay (float): upper bound for the wait between two attempts
        max_retries (int): number of retries before the error is re-raised;
            None (default) keeps retrying forever

    Returns:
        callable: wrapper with the same signature and result as `func`
    """
    import time  # local import: keeps this block independent of the imports cell

    @wraps(func)
    def wrapper(*args, **kwargs):
        delay = initial_delay
        failures = 0
        while True:
            try:
                return func(*args, **kwargs)
            except ConnectionError:
                failures += 1
                if max_retries is not None and failures > max_retries:
                    raise
                # Back off instead of spinning: a tight loop burns CPU and
                # floods the API for as long as the connection is down.
                if delay > 0:
                    time.sleep(delay)
                delay = min(delay * 2, max_delay)
    return wrapper

# Patch the HackerNews class in place: every callable defined on it is wrapped
# with try_to_reconnect, so API calls are retried on ConnectionError.
HackerNews = decorate_all_methods(try_to_reconnect)(HackerNews)

In [70]:
def mkdir_if_not_exists(folder_path):
    """
    Create `folder_path` unless it is already a directory

    Missing parent directories are created as well. Doing the check and the
    creation in a single call avoids failing when the folder appears between
    the two steps.

    Args:
        folder_path (str): directory to create

    Raises:
        FileExistsError: if `folder_path` exists but is not a directory
    """
    os.makedirs(folder_path, exist_ok=True)


class HNUser:
    """
    Base class for everything that talks to the HackerNews API

    Attributes:
        * hn_api (HackerNews): API client stored on the class
    """
    # Created once, when this class body runs; subclasses and their instances
    # all reach the same client through `hn_api`.
    hn_api = HackerNews()
        
        
class ExtendedItem(HNUser):
    """
    Copy of an API item that guarantees `kids` is never None

    Note:
        Has all item's attrs
    """

    def __init__(self, item):
        # Take over every instance attribute of the wrapped item.
        item_attrs = item.__dict__
        self.__dict__.update(item_attrs)
        # A `kids` value of None is replaced by an empty list so callers can
        # always iterate over it.
        if self.kids is None:
            self.kids = []
        

class Comment(ExtendedItem):
    """
    A comment together with the whole tree of replies below it

    Note:
        Has all ExtendedItem's attrs

    Attributes:
        * subcomments (list): of Comments
    """

    def __init__(self, item):
        super().__init__(item)
        # Building a Comment immediately fetches its replies (recursively).
        self.load_subcomments()

    def load_subcomments(self):
        """
        Fetch every id listed in `kids` and store the results in `subcomments`

        Ids rejected by the API with InvalidItemID are skipped.
        """
        self.subcomments = replies = []
        for reply_id in self.kids:
            try:
                reply_item = self.hn_api.get_item(reply_id)
            except InvalidItemID:
                pass
            else:
                replies.append(Comment(reply_item))
        
        
class Post(ExtendedItem):
    """
    A top-level item together with its full comment tree

    Note:
        Has all ExtendedItem's attrs

    Attributes:
        * comments (list): of Comments
    """

    def __init__(self, item):
        super().__init__(item)
        # Building a Post immediately fetches all of its comments.
        self.load_comments()

    def load_comments(self):
        """
        Fetch every id listed in `kids` and store the results in `comments`

        Ids rejected by the API with InvalidItemID are skipped.
        """
        self.comments = fetched = []
        for kid_id in self.kids:
            try:
                kid_item = self.hn_api.get_item(kid_id)
            except InvalidItemID:
                pass
            else:
                fetched.append(Comment(kid_item))
            
    

class HNData(HNUser):
    """
    HackerNews downloader

    Notes:
        Uses an unofficial Python wrapper for the HN API (https://github.com/HackerNews/API)
        Every post is stored as one pickle file named after the post's item_id.

    Args:
        folder: for data storage
    """

    def __init__(self, folder=DATA_FOLDER):
        self.folder = folder

    def download_last(self, last_days=182, types=('story', ), start_from_id=None):
        """
        Download all needed data from HackerNews

        Walks the item ids downwards from `start_from_id`. Every item whose type
        is in `types` is saved together with its comments. The walk stops at the
        first item of a wanted type that is older than `last_days` days.

        Args:
            last_days (int): how many days back to download.
            types (iterable): list of types ("job", "story", "comment", "poll", "pollopt").
            start_from_id (int): id to start from; the newest item when None.
        """
        if start_from_id is None:
            start_from_id = self.hn_api.get_max_item()
        saved = 0
        stat = Counter()

        # One reference time for the whole run, so the cut-off date does not
        # move while a long download is in progress.
        started_at = datetime.now()
        last_date = started_at

        # Stop value 0 so that item 1 is visited as well.
        for item_number in range(start_from_id, 0, -1):
            clear_output()
            print('{}/{} watched | {} saved\nlast_date: {}'.format(
                item_number, start_from_id, saved,
                last_date
            ))
            print(stat)

            try:
                api_item = self.hn_api.get_item(item_number)
            except InvalidItemID:
                continue

            item = ExtendedItem(api_item)
            stat[item.item_type] += 1

            if not self.type_from_list(item, types):
                continue
            if not self.is_last_n_days(item, last_days, from_time=started_at):
                # Ids are walked newest-first: the first wanted item that is
                # too old ends the download.
                break

            post = Post(item)
            self.save_post(post)
            saved += 1
            last_date = post.submission_time

        print('Done!')

    def add_external_to_posts(self, start_from=None):
        """
        Fetch the page each saved post links to and store it as `post.external`

        Args:
            start_from: item_id of the last post that is already processed.
                Posts up to and including it are skipped; processing resumes
                with the post after it. None processes every post.
        """
        # Saving overwrites existing files, so the count stays the same for the
        # whole loop; listing the folder once avoids re-sorting it per post.
        total = self.get_files_number()

        for number, post in enumerate(self.iterate_all_posts()):
            clear_output()
            print('{}/{}'.format(number, total), post.item_id)
            if start_from is not None:
                if str(post.item_id) == str(start_from):
                    start_from = None
                continue
            post.external = self._fetch_external(post.url)
            self.save_post(post)

        print('Done!')

    def _fetch_external(self, url):
        """Return the readability summary of the page at `url`, or None on any failure"""
        if url is None:
            return None
        try:
            response = requests.get(url, timeout=5)
            return Document(response.text).summary()
        except Exception:
            # Best effort: a page that cannot be fetched or parsed is recorded
            # as missing. `Exception` (not a bare except) lets KeyboardInterrupt
            # through, so a long run can still be interrupted.
            return None

    def save_post(self, post):
        """Pickle `post` into the data folder, in a file named after its item_id"""
        # Create the data folder on first use instead of failing in open().
        os.makedirs(self.folder, exist_ok=True)
        with open(os.path.join(self.folder, str(post.item_id)), 'wb') as file:
            pickle.dump(post, file)

    def load_post(self, file):
        """
        Load one pickled post

        Args:
            file (str): file name inside the data folder
        """
        # NOTE: unpickling can execute arbitrary code; only load files that
        # were written by save_post.
        with open(os.path.join(self.folder, file), 'rb') as handle:
            return pickle.load(handle)

    def get_files(self):
        """Return the sorted names of all entries in the data folder"""
        return sorted(os.listdir(self.folder))

    def get_files_number(self):
        """Return the number of entries in the data folder"""
        return len(self.get_files())

    def iterate_all_posts(self):
        """Yield every saved post, in the order given by get_files"""
        for file in self.get_files():
            yield self.load_post(file)

    def is_last_n_days(self, item, last_days, from_time=None):
        """
        Tell whether `item` was submitted within the last `last_days` days

        Args:
            item: object whose `submission_time` can be compared with a datetime
            last_days (int): size of the time window in days
            from_time (datetime): end of the time window; the current time,
                taken when the method is called, if None

        Returns:
            bool
        """
        if from_time is None:
            from_time = datetime.now()
        return item.submission_time > from_time - timedelta(days=last_days)

    def type_from_list(self, item, types):
        """Tell whether the item's `item_type` is one of `types`"""
        return item.item_type in types


In [71]:
# Downloader that keeps its files in the default DATA_FOLDER.
hn_data = HNData()

In [None]:
# Walk item ids downwards from 15361009, using the default look-back window and types.
hn_data.download_last(start_from_id=15361009)

15361009/15367531 watched | 876 saved
last_date: 2017-09-29 00:46:04
Counter({'comment': 5644, 'story': 876, 'job': 2})


In [72]:
# For every saved post, fetch the linked page and store its summary as `post.external`.
hn_data.add_external_to_posts()

5726/5727 15404208


In [81]:
# Load one saved post; the file name is the post's item_id.
post = hn_data.load_post('15384351')

In [83]:
# Show the URL the loaded post links to.
post.url

'https://blog.merovius.de/2017/09/12/diminishing-returns-of-static-typing.html'