### The goal of this project will be to find the ""top 100 keywords of Hacker News".
### Because Hacker News is the most popular technology social media site, this will give us an understanding of the most talked about tech topics.

In [1]:
# importing libraries
import nltk
from nltk.corpus import stopwords
import concurrent.futures
import json
import requests
import csv
import itertools
from datetime import datetime
import time
import string
import re

In [2]:
# Extending stopwords
stop_words = stopwords.words('english')
a = ['us','new','could','want','know','use','used','may','hn','using','say','says']
stop_words.extend(a)

In [3]:
# Creating Pipeline class
class Pipeline:
    def __init__(self):
        self.tasks = []
     
    """this method is used for inserting all the tasks in a tasks list."""
    def task(self, depends_on=None):
        idx = 0
        if depends_on:
            idx = self.tasks.index(depends_on) + 1
        def inner(f):
            self.tasks.insert(idx, f)
            return f
        return inner
    
    """this method is used to run the pipeline. Which takes the input from dependable function output."""
    def run(self, input_=None):
        if input_ is None:
            input_ = []
        output = input_
        for task in self.tasks:
            output = task(output) if output else task()
        return output

In [4]:
"""This function is used to dump the data into a csv format"""
def build_csv(lines,file,header=None):
    if header:
        lines = itertools.chain([header],lines)
    writer = csv.writer(file, delimiter=',')
    writer.writerows(lines)

    file.seek(0)
    return file

In [5]:
# Creating pipeline object
pipeline = Pipeline()

@pipeline.task()
def extract_top_stories():
    
    """this function is used to filter out the story on certain parameters"""
    def process_story(item_id, base_url):
        resp = session.get(f"{base_url}item/{item_id}.json")
        story = resp.json()
        
        # Filtering stories
        try:
            if story['descendants']>1 and story['score']>50 and not story['title'].startswith('Ask HN'):
                return [story['id'], datetime.fromtimestamp(story['time']).strftime('%d/%m/%Y'), 
                        story['url'], story['score'], story['title']]
        except Exception as e:
            pass

    # Creating a session object
    session = requests.Session()
    
    base_url = "https://hacker-news.firebaseio.com/v0/"
    top_stories_url = f"{base_url}/topstories.json"
    response = session.get(top_stories_url)
    
    # Extracting item_ids of top stories
    item_ids = response.json()

    # Use a ThreadPoolExecutor to process the stories concurrently
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
        
        # Submit a task for each item id
        futures = [executor.submit(process_story, item_id, base_url) for item_id in item_ids]

        # Iterate through the completed futures and append the results to the lines list
        lines = [future.result() for future in concurrent.futures.as_completed(futures) if future.result() is not None]

    # Close the session
    session.close()

    return lines

In [6]:
@pipeline.task(depends_on=extract_top_stories)
def to_csv(lines):
    file = open('temporary.csv', 'w+')
    csv_file = build_csv(lines, file, header =['id', 'time', 'url','score', 'title'])
    return csv_file

In [7]:
"""this function is used to extract tht title from each story"""
@pipeline.task(depends_on=to_csv)
def extract_titles(csv_file):
    reader = csv.reader(csv_file)
    header = next(reader)
    idx = header.index('title')
    
    return (line[idx] for line in reader)

In [8]:
"""cleanig the title by eliminating any special characters(string.punctuation)"""
@pipeline.task(depends_on=extract_titles)
def clean_title(titles):
    for title in titles:
        clean_title = title.lower()
        clean_title = ''.join(c for c in clean_title if c not in string.punctuation + '–')
        yield clean_title

In [9]:
"""this function is used to build the dictionary of word which are not in stop_words 
and does not starts with any digits"""
@pipeline.task(depends_on=clean_title)
def build_keyword_dictionary(titles):
    word_freq = {}
    for title in titles:
        for word in title.split(' '):
            if word and word not in stop_words and not re.match(r"^\d", word):
                if word not in word_freq:
                    word_freq[word] = 1
                word_freq[word] += 1
    return word_freq

In [10]:
"""this func is used to sorted the word on the basis of their occurences in story titles in decreasing order"""
@pipeline.task(depends_on=build_keyword_dictionary)
def top_keywords(word_freq):
    freq_tuple = [(word, word_freq[word])
        for word in sorted(word_freq, key=word_freq.get, reverse=True)]
    return freq_tuple[:100]

In [12]:
output = pipeline.run()
print(f'{output=}')

output=[('lastpass', 9), ('video', 7), ('users', 6), ('code', 6), ('christmas', 6), ('money', 6), ('microsoft', 5), ('software', 5), ('time', 5), ('ruby', 5), ('life', 5), ('nix', 5), ('app', 5), ('year', 5), ('pdf', 4), ('study', 4), ('ai', 4), ('windows', 4), ('introduction', 4), ('apple', 4), ('released', 4), ('winter', 4), ('search', 4), ('world', 4), ('development', 4), ('protocol', 4), ('law', 4), ('computer', 4), ('data', 4), ('linux', 4), ('machine', 4), ('vaults', 4), ('print', 4), ('tiktok', 4), ('government', 4), ('learning', 4), ('gui', 3), ('os', 3), ('storm', 3), ('save', 3), ('bees', 3), ('writing', 3), ('power', 3), ('state', 3), ('tools', 3), ('games', 3), ('website', 3), ('people', 3), ('finally', 3), ('call', 3), ('soaring', 3), ('see', 3), ('pi', 3), ('deep', 3), ('brain', 3), ('assembly', 3), ('fans', 3), ('nixos', 3), ('pay', 3), ('seven', 3), ('mystery', 3), ('reverse', 3), ('engineering', 3), ('another', 3), ('source', 3), ('hypercard', 3), ('engineer', 3), ('go