# Exploring Hacker News Posts

In [75]:
from csv import reader
import pprint

# Load every row of the Hacker News dataset (header row included) into a
# list of lists of strings.
# newline='' hands newline handling to the csv module, as its documentation
# requires, so quoted fields containing line breaks are parsed correctly.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used - confirm the file's encoding if this must run on other machines.
with open('hacker_news.csv', newline='') as hacker_news:
    # Call the imported `reader` directly instead of rebinding the name to a
    # reader instance, which would shadow the csv function for later cells.
    hn = list(reader(hacker_news))

print(hn[:5])

[['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at'], ['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52'], ['10975351', 'How to Use Open Source and Shut the Fuck Up at the Same Time', 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/', '39', '10', 'josep2', '1/26/2016 19:30'], ['11964716', "Florida DJs May Face Felony for April Fools' Water Joke", 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/', '2', '1', 'vezycash', '6/23/2016 22:20'], ['11919867', 'Technology ventures: From Idea to Enterprise', 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429', '3', '1', 'hswarna', '6/17/2016 0:01']]


In [76]:
# Split the column names off the front of the dataset; `hn` is rebound to the
# data rows only.
# Caution: `hn` is reassigned from itself, so running this cell a second time
# without reloading the file also discards the first data row.
headers, hn = hn[0], hn[1:]
print(hn[:5])

[['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52'], ['10975351', 'How to Use Open Source and Shut the Fuck Up at the Same Time', 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/', '39', '10', 'josep2', '1/26/2016 19:30'], ['11964716', "Florida DJs May Face Felony for April Fools' Water Joke", 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/', '2', '1', 'vezycash', '6/23/2016 22:20'], ['11919867', 'Technology ventures: From Idea to Enterprise', 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429', '3', '1', 'hswarna', '6/17/2016 0:01'], ['10301696', 'Note by Note: The Making of Steinway L1037 (2007)', 'http://www.nytimes.com/2007/11/07/movies/07stein.html?_r=0', '8', '2', 'walterbell', '9/30/2015 4:12']]


We're only concerned with post titles beginning with Ask HN or Show HN, so we'll create new lists of lists containing just the data for those titles. All remaining posts are collected in a third list.

In [77]:
# Partition the posts by title prefix (case-insensitive):
#   ask_posts   - titles starting with "ask hn"
#   show_posts  - titles starting with "show hn"
#   other_posts - everything else
ask_posts, show_posts, other_posts = [], [], []

for post in hn:
    lowered_title = post[1].lower()  # index 1 is the title column
    if lowered_title.startswith('ask hn'):
        bucket = ask_posts
    elif lowered_title.startswith('show hn'):
        bucket = show_posts
    else:
        bucket = other_posts
    bucket.append(post)

print(f'Number of posts in Ask posts: {len(ask_posts)}')
print(f'Number of posts in Show posts: {len(show_posts)}')
print(f'Number of posts in Other posts: {len(other_posts)}')

Number of posts in Ask posts: 1744
Number of posts in Show posts: 1162
Number of posts in Other posts: 17194


Below we check whether posts starting with Ask HN or posts starting with Show HN receive more comments on average.

In [78]:
def summarize_comments(posts):
    """Return ``(total, average)`` comment counts for a list of post rows.

    Args:
        posts: list of rows; each row holds the comment count as a numeric
            string at index 4 (the ``num_comments`` column).

    Returns:
        A tuple ``(total, average)``. For an empty list the result is
        ``(0, 0)`` rather than a ZeroDivisionError.

    Raises:
        ValueError: if a comment count cannot be converted to ``int``.
    """
    total = sum(int(post[4]) for post in posts)
    average = total / len(posts) if posts else 0
    return total, average


total_ask_comments, avg_ask_comments = summarize_comments(ask_posts)
total_show_comments, avg_show_comments = summarize_comments(show_posts)

# Round to whole comments for display.
avg_ask_comments = round(avg_ask_comments)
avg_show_comments = round(avg_show_comments)

print(f'The average number of comments on "ASK" posts: {avg_ask_comments}')
print(f'The average number of comments on "SHOW" posts: {avg_show_comments}')
    

The average number of comments on "ASK" posts:14
The average numbeer of comments on "SHOW" posts: 10


We can see from the analysis that posts starting with 'Ask hn' receive more comments on average than posts starting with 'Show hn'.

Since posts starting with 'Ask hn' are likely to receive more comments, we will focus our remaining analysis on these posts only

In [79]:
import datetime as dt

# Pair each Ask HN post's creation timestamp (column 6) with its comment
# count (column 4).
result_list = [[row[6], int(row[4])] for row in ask_posts]

# Both dictionaries are keyed by the zero-padded hour of day as a string
# (the '%H' format, e.g. '09', '15').
counts_by_hour = {}    # hour -> number of posts created in that hour
comments_by_hour = {}  # hour -> total comments on those posts

for created_at, num_comments in result_list:
    # Timestamps look like '8/4/2016 11:52'.
    date_time = dt.datetime.strptime(created_at, "%m/%d/%Y %H:%M")
    time_hour = date_time.strftime('%H')

    counts_by_hour[time_hour] = counts_by_hour.get(time_hour, 0) + 1
    comments_by_hour[time_hour] = comments_by_hour.get(time_hour, 0) + num_comments

# The first message previously had a stray ')' after the dictionary, which
# was printed literally.
print(f'Number of posts created in each hour of the day\n{counts_by_hour}')
print('\n')
print(f'Number of comments on posts by hour created\n{comments_by_hour}')


Number of posts created in each hour of the day
{'09': 45, '13': 85, '10': 59, '14': 107, '16': 108, '23': 68, '12': 73, '17': 100, '15': 116, '21': 109, '20': 80, '02': 58, '18': 109, '03': 54, '05': 46, '19': 110, '01': 60, '22': 71, '08': 48, '04': 47, '00': 55, '06': 44, '07': 34, '11': 58})


Number of comments on posts by hour created
{'09': 251, '13': 1253, '10': 793, '14': 1416, '16': 1814, '23': 543, '12': 687, '17': 1146, '15': 4477, '21': 1745, '20': 1722, '02': 1381, '18': 1439, '03': 421, '05': 464, '19': 1188, '01': 683, '22': 479, '08': 492, '04': 337, '00': 447, '06': 397, '07': 267, '11': 641}


We have created two dictionaries, counts_by_hour and comments_by_hour.

We will now use these two dictionaries to calculate the average number of comments for posts created during each hour of the day

The result will be a 2d list in which the first element of each list is the hour and the second element is the average number of comments per post

In [80]:
import pprint

# Build [hour, average comments per post] pairs by dividing each hour's
# comment total by the number of posts created in that hour.
avg_by_hour = [
    [hour, total_comments / counts_by_hour[hour]]
    for hour, total_comments in comments_by_hour.items()
]

pprint.pprint(avg_by_hour)

[['09', 5.5777777777777775],
 ['13', 14.741176470588234],
 ['10', 13.440677966101696],
 ['14', 13.233644859813085],
 ['16', 16.796296296296298],
 ['23', 7.985294117647059],
 ['12', 9.41095890410959],
 ['17', 11.46],
 ['15', 38.5948275862069],
 ['21', 16.009174311926607],
 ['20', 21.525],
 ['02', 23.810344827586206],
 ['18', 13.20183486238532],
 ['03', 7.796296296296297],
 ['05', 10.08695652173913],
 ['19', 10.8],
 ['01', 11.383333333333333],
 ['22', 6.746478873239437],
 ['08', 10.25],
 ['04', 7.170212765957447],
 ['00', 8.127272727272727],
 ['06', 9.022727272727273],
 ['07', 7.852941176470588],
 ['11', 11.051724137931034]]


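We will create a new list in which the two elements of each inner list are swapped, so that the average comes first and sorting the list orders it by average number of comments.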

In [81]:
# Flip each [hour, average] pair to [average, hour].
swap_avg_by_hour = [[average, hour] for hour, average in avg_by_hour]
print(swap_avg_by_hour)


[[5.5777777777777775, '09'], [14.741176470588234, '13'], [13.440677966101696, '10'], [13.233644859813085, '14'], [16.796296296296298, '16'], [7.985294117647059, '23'], [9.41095890410959, '12'], [11.46, '17'], [38.5948275862069, '15'], [16.009174311926607, '21'], [21.525, '20'], [23.810344827586206, '02'], [13.20183486238532, '18'], [7.796296296296297, '03'], [10.08695652173913, '05'], [10.8, '19'], [11.383333333333333, '01'], [6.746478873239437, '22'], [10.25, '08'], [7.170212765957447, '04'], [8.127272727272727, '00'], [9.022727272727273, '06'], [7.852941176470588, '07'], [11.051724137931034, '11']]


In [82]:
# Sort the [average, hour] pairs from highest to lowest average.
sorted_swap = sorted(swap_avg_by_hour, reverse=True)

print('Top 5 Hours for Ask Posts Comments')

# Show the five hours with the most comments per post, average to 2 decimals.
for avg, hour in sorted_swap[:5]:
    print(f'{hour}:00 {avg:.2f}')

Top 5 Hours for Ask Posts Comments
15:00 38.59
02:00 23.81
20:00 21.52
16:00 16.80
21:00 16.01
