# <center> Hacker News Pipeline 
    
The data we will use comes from a [Hacker News(HN)](https://news.ycombinator.com/) API that returns JSON data of the top stories in 2014. To make things easier, data have been already downloaded a list of JSON posts to a file called 'hn_stories_2014.json'. The goal will be to find the top 100 keywords of Hacker News Posts in 2014. 

## Import Pipeline module 

In [1]:
from pipeline import Pipeline 
pipeline = Pipeline()

## Loading the JSON Data

Because JSON files resemble a key-value dictionary, the goal is to parse the JSON file into a Python dict object. 

In [2]:
import json

@pipeline.task() 
def file_to_json() : 
    with open('hn_stories_2014.json', 'r') as f : 
        data = json.load(f)
        stories = data['stories']
    return stories

## Filtering the Stories 

We can filter for popular stories by ensuring they are links (not Ask HN posts), have a good number of points, and have some comments. 

In [3]:
@pipeline.task(depends_on = file_to_json) 
def filter_stories(stories) : 
    def check_popular(story) : 
        return story['points'] > 50 and story['num_comments'] > 1 and not story['title'].startswith('Ask HN')
    return (story for story in stories if check_popular(story)) 

## Convert to CSV

with a reduced set of stories, it's time to write these dict objects to a CSV file. The purpose of translating the dictionaries to a CSV is that we want to have a consistenet data format when running the last summarizations. 

In [4]:
import io
from pipeline import build_csv
from datetime import datetime 

@pipeline.task(depends_on = filter_stories)
def json_to_csv(stories) :
    rows = [] 
    for story in stories : 
        rows.append((story['objectID'], datetime.strptime(story['created_at'], "%Y-%m-%dT%H:%M:%SZ"), 
                     story['url'], story['points'], story['title']))
    return bulid_csv(rows, header = ['objecID', 'created_at', 'url', 'points', 'title'], file = io.StringIO())

## Extract Title Columns

Using the CSV file format we created, we can now extract the title column. Once we have extracted the title of each popular most, we can then run the next word frequency task. To extract the titles, we'll follow the steps following : 

1. Import csv, and create a csv.reader() object from the file object.
2. Find the index of the title in the header.
3. Iterate the through the reaer, and return eac hitem from the reader in the corresponding title index position.

In [5]:
import csv

@pipeline.task(depends_on = json_to_csv)
def extract_titles(csv_file) : 
    reader = csv.reader(csv_file)
    header = next(reader)
    idx = header.index('title')
    return (row[idx] for row in reader)

## Clean the Titles 

Because we're trying to create a word frequency model of words from Hacker News titles, we need a way to create a consistent set of wrods to use. For example, words like Google, google, Google?, and google., all means the same keyword: gooogle. If we were to split hte title into words, however, they would all be lumped into different categories. 

In [6]:
@pipeline.task(depends_on = extract_titles)
def clean_titles(titles) : 
    for title in titles : 
        title = title.lower() 
        title = ''.join(char for char in title if char not in string.punctuation)
        yield title

## Create the Word Frequency Dictionary 

With a cleaned title, we can now build the word frequency dictionary. A word frequency dictionary are key value pairs that connects a word to the number of times it is used in a text. 

Furthermore, to find actual keywords, we should enforce the word frequency dictionary to not include stop words. Stop words are words that occur frequently in language like "the", "or", etc., and are commonly rejected in keyword searches.

In [7]:
@pipeline.task(depends_on = clean_titles)
def build_keyword_dictionary(cleaned_titles) : 
    words_freq = {} 
    for title in cleaned_titles : 
        for word in title.split(" "): 
            if word and word not in stop_words : 
                if word not in words_freq : 
                    words_freq[word] = 1
                words_freq[word] += 1
    return words_freq

## Sort the Top Words 

The toal is to output a list of tuples with (word, frequency) as the entries sorted from most used, to leas most used.

In [9]:
@pipeline.task(depends_on = build_keyword_dictionary)
def top100_words(words_freq):
    top100_freq = [(word, freq) for word, freq in sorted(words_freq.items(), key = lambda x: x[1], reverse = True)]
    return words_freq[:100]

## Full Pipeline 

In [27]:
import io
import csv 
import json
import string 
from datetime import datetime 

from pipeline import build_csv, Pipeline 
from stop_words import stop_words 

pipeline = Pipeline()

@pipeline.task() 
def file_to_json() : 
    with open('hn_stories_2014.json', 'r') as f : 
        data = json.load(f)
        stories = data['stories']
    return stories

@pipeline.task(depends_on = file_to_json) 
def filter_stories(stories) : 
    def check_popular(story) : 
        return story['points'] > 50 and story['num_comments'] > 1 and not story['title'].startswith('Ask HN')
    return (story for story in stories if check_popular(story)) 

@pipeline.task(depends_on = filter_stories)
def json_to_csv(stories) :
    rows = [] 
    for story in stories : 
        rows.append((story['objectID'], datetime.strptime(story['created_at'], "%Y-%m-%dT%H:%M:%SZ"), story['url'], story['points'], story['title']))
    return build_csv(rows, header = ['objecID', 'created_at', 'url', 'points', 'title'], file = io.StringIO())

@pipeline.task(depends_on = json_to_csv)
def extract_titles(csv_file) : 
    reader = csv.reader(csv_file)
    header = next(reader)
    idx = header.index('title')
    return (row[idx] for row in reader)

@pipeline.task(depends_on = extract_titles)
def clean_titles(titles) : 
    for title in titles : 
        title = title.lower() 
        title = ''.join(char for char in title if char not in string.punctuation)
        yield title
    
@pipeline.task(depends_on = clean_titles)
def build_keyword_dictionary(cleaned_titles) : 
    words_freq = {} 
    for title in cleaned_titles : 
        for word in title.split(" "): 
            if word and word not in stop_words : 
                if word not in words_freq : 
                    words_freq[word] = 1
                words_freq[word] += 1
    return words_freq

@pipeline.task(depends_on = build_keyword_dictionary)
def top100_words(words_freq):
    top100_freq = [(word, freq) for word, freq in sorted(words_freq.items(), key = lambda x: x[1], reverse = True)]
    return top100_freq[:100]

result = pipeline.run()
for word in result[top100_words] : 
    print(word)

('new', 186)
('google', 168)
('bitcoin', 102)
('open', 93)
('programming', 91)
('web', 89)
('data', 86)
('video', 80)
('python', 76)
('code', 73)
('facebook', 72)
('released', 72)
('using', 71)
('2013', 66)
('javascript', 66)
('free', 65)
('source', 65)
('game', 64)
('internet', 63)
('microsoft', 60)
('c', 60)
('linux', 59)
('app', 58)
('pdf', 56)
('work', 55)
('language', 55)
('software', 53)
('2014', 53)
('startup', 52)
('apple', 51)
('use', 51)
('make', 51)
('time', 49)
('yc', 49)
('security', 49)
('nsa', 46)
('github', 46)
('windows', 45)
('world', 42)
('way', 42)
('like', 42)
('1', 41)
('project', 41)
('computer', 41)
('heartbleed', 41)
('git', 38)
('users', 38)
('dont', 38)
('design', 38)
('ios', 38)
('developer', 37)
('os', 37)
('twitter', 37)
('ceo', 37)
('vs', 37)
('life', 37)
('big', 36)
('day', 36)
('android', 35)
('online', 35)
('years', 34)
('simple', 34)
('court', 34)
('guide', 33)
('learning', 33)
('mt', 33)
('api', 33)
('says', 33)
('apps', 33)
('browser', 33)
('server'