# Guided Project: Exploring Hacker News Posts

This guided project brings the following skills together for some real-world practice:

- How to work with strings
- Object-oriented programming
- Dates and times

In [1]:
from csv import reader

#Reading the hacker_news.csv file in as a list of lists
opened_file = open('hacker_news.csv')
read_file = reader(opened_file)
hn = list(read_file)


In [2]:
#Display the first five rows of
for row in hn[:5]:
    print(row)

['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at']
['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52']
['10975351', 'How to Use Open Source and Shut the Fuck Up at the Same Time', 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/', '39', '10', 'josep2', '1/26/2016 19:30']
['11964716', "Florida DJs May Face Felony for April Fools' Water Joke", 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/', '2', '1', 'vezycash', '6/23/2016 22:20']
['11919867', 'Technology ventures: From Idea to Enterprise', 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429', '3', '1', 'hswarna', '6/17/2016 0:01']


In [3]:
#Extract the first row of data, and assign it to the variable headers
headers = hn[0]

#Remove the first row from hn
del hn[0]

In [4]:
#Display headers
print(headers)

['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at']


In [5]:
#Display the first five rows of, to verify header deletion
for row in hn[:5]:
    print(row)

['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52']
['10975351', 'How to Use Open Source and Shut the Fuck Up at the Same Time', 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/', '39', '10', 'josep2', '1/26/2016 19:30']
['11964716', "Florida DJs May Face Felony for April Fools' Water Joke", 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/', '2', '1', 'vezycash', '6/23/2016 22:20']
['11919867', 'Technology ventures: From Idea to Enterprise', 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429', '3', '1', 'hswarna', '6/17/2016 0:01']
['10301696', 'Note by Note: The Making of Steinway L1037 (2007)', 'http://www.nytimes.com/2007/11/07/movies/07stein.html?_r=0', '8', '2', 'walterbell', '9/30/2015 4:12']


### Finding the posts that begin with either Ask HN or Show HN

In [6]:
ask_posts = [] 
show_posts = [] 
other_posts = []

for row in hn:
    title = row[1]
    if title.lower().startswith('ask hn'):
        ask_posts.append(row)
    elif title.lower().startswith('show hn'):
        show_posts.append(row)
    else:
        other_posts.append(row)

In [7]:
print('Ask posts:', len(ask_posts))
print('Show posts:', len(show_posts))
print('Other posts:',len(show_posts))

Ask posts: 1744
Show posts: 1162
Other posts: 1162


### Determine if ask posts or show posts receive more comments on average

In [8]:
# Compute the average number of comments on ask posts
total_ask_comments = 0

for row in ask_posts:
    total_ask_comments += int(row[4])
    
avg_ask_comments = total_ask_comments/len(ask_posts)
print(avg_ask_comments)

14.038417431192661


In [9]:
# Compute the average number of comments on show posts
total_show_comments = 0

for row in show_posts:
    total_show_comments += int(row[4])
    
avg_show_comments = total_show_comments/len(show_posts)
print(avg_show_comments)

10.31669535283993


Ask posts receive more comments on average than show posts

### Checking if ask posts created at a certain time are more likely to attract comments

Step 1: Calculating the number of ask posts and comments by hour created

In [10]:
# Creating a list with created_at and num_comments

import datetime as dt

result_list = []

for row in ask_posts:
    created_at = row[6]
    num_comments = int(row[4])
    result_list.append([created_at, num_comments])


In [22]:
# created two dictionaries:
# counts_by_hour: contains the number of posts created during each hour.
# comments_by_hour: contains the corresponding number of comments for each hour.

counts_by_hour = {}
comments_by_hour = {}

for row in result_list:
    comment_date = dt.datetime.strptime(row[0], "%m/%d/%Y %H:%M")
    comment_hour = comment_date.hour
    comment = row[1]
    if comment_hour not in counts_by_hour:
        counts_by_hour[comment_hour] = 1
        comments_by_hour[comment_hour] = comment
    else:
        counts_by_hour[comment_hour] += 1
        comments_by_hour[comment_hour] += comment

In [23]:
# Step 2: Calculating the average number of comments

avg_by_hour = []

for hour in counts_by_hour:
    avg_by_hour.append([hour, comments_by_hour[hour]/counts_by_hour[hour]])

In [27]:
for row in avg_by_hour:
    print(row)

[9, 5.5777777777777775]
[13, 14.741176470588234]
[10, 13.440677966101696]
[14, 13.233644859813085]
[16, 16.796296296296298]
[23, 7.985294117647059]
[12, 9.41095890410959]
[17, 11.46]
[15, 38.5948275862069]
[21, 16.009174311926607]
[20, 21.525]
[2, 23.810344827586206]
[18, 13.20183486238532]
[3, 7.796296296296297]
[5, 10.08695652173913]
[19, 10.8]
[1, 11.383333333333333]
[22, 6.746478873239437]
[8, 10.25]
[4, 7.170212765957447]
[0, 8.127272727272727]
[6, 9.022727272727273]
[7, 7.852941176470588]
[11, 11.051724137931034]


### Sorting the list of lists and printing the five highest values

In [29]:
#Creating a list equals to avg_by_hour with swapped columns 

swap_avg_by_hour = []

for row in avg_by_hour:
    swap_avg_by_hour.append(list([row[1], row[0]]))

print(swap_avg_by_hour)

[[5.5777777777777775, 9], [14.741176470588234, 13], [13.440677966101696, 10], [13.233644859813085, 14], [16.796296296296298, 16], [7.985294117647059, 23], [9.41095890410959, 12], [11.46, 17], [38.5948275862069, 15], [16.009174311926607, 21], [21.525, 20], [23.810344827586206, 2], [13.20183486238532, 18], [7.796296296296297, 3], [10.08695652173913, 5], [10.8, 19], [11.383333333333333, 1], [6.746478873239437, 22], [10.25, 8], [7.170212765957447, 4], [8.127272727272727, 0], [9.022727272727273, 6], [7.852941176470588, 7], [11.051724137931034, 11]]


In [31]:
sorted_swap = sorted(swap_avg_by_hour, reverse=True)

In [47]:
print("Top 5 Hours for Ask Posts Comments")

text = "{hour} {average:.2f} average comments per post"

for row in sorted_swap[:6]:
    str_hour = str(row[1])
    hour = dt.datetime.strptime(str_hour, "%H")
    formated_hour = dt.datetime.strftime(hour, "%H:%M")
    average = row[0]
    print(text.format(hour=formated_hour, average=average))



Top 5 Hours for Ask Posts Comments
15:00 38.59 average comments per post
02:00 23.81 average comments per post
20:00 21.52 average comments per post
16:00 16.80 average comments per post
21:00 16.01 average comments per post
13:00 14.74 average comments per post
