Exploring Hacker News Posts

In [1]:
from csv import reader

In [2]:
def reading(filename):
    open_file = open(filename,encoding='utf8')
    read_file = reader(open_file)
    return list(read_file)

def manage_column(dataset):
    return dataset[0],dataset[1:]

def explore_data(dataset,col_name,data_range=3):
    print("Column name")
    print(col_name)
    print("Dataset")
    for i in range(data_range):
        print(dataset[i])
    print(f"Number of rows {len(dataset)} Number of Column {len(col_name)}")

In [3]:
hn = reading("HN_posts_year_to_Sep_26_2016.csv")

In [4]:
hn_col,hn_data = manage_column(hn)

In [5]:
explore_data(hn_data,hn_col,data_range=5)

Column name
['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at']
Dataset
['12579008', 'You have two days to comment if you want stem cells to be classified as your own', 'http://www.regulations.gov/document?D=FDA-2015-D-3719-0018', '1', '0', 'altstar', '9/26/2016 3:26']
['12579005', 'SQLAR  the SQLite Archiver', 'https://www.sqlite.org/sqlar/doc/trunk/README.md', '1', '0', 'blacksqr', '9/26/2016 3:24']
['12578997', 'What if we just printed a flatscreen television on the side of our boxes?', 'https://medium.com/vanmoof/our-secrets-out-f21c1f03fdc8#.ietxmez43', '1', '0', 'pavel_lishin', '9/26/2016 3:19']
['12578989', 'algorithmic music', 'http://cacm.acm.org/magazines/2011/7/109891-algorithmic-composition/fulltext', '1', '0', 'poindontcare', '9/26/2016 3:16']
['12578979', 'How the Data Vault Enables the Next-Gen Data Warehouse and Data Lake', 'https://www.talend.com/blog/2016/05/12/talend-and-Â\x93the-data-vaultÂ\x94', '1', '0', 'markgainor1', '9/26/2016 3:14']
Nu

Removing Rows with zero comment

In [6]:
def zero_remove(dataset,com_index):
    list_wo_0 = []
    for row in dataset:
        zero_com = row[com_index]
        if zero_com == "0":
            pass
        else:
            list_wo_0.append(row)
    return list_wo_0

In [7]:
hn_data = zero_remove(hn_data,4)

In [8]:
explore_data(hn_data,hn_col,data_range=10)

Column name
['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at']
Dataset
['12578975', 'Saving the Hassle of Shopping', 'https://blog.menswr.com/2016/09/07/whats-new-with-your-style-feed/', '1', '1', 'bdoux', '9/26/2016 3:13']
['12578908', 'Ask HN: What TLD do you use for local development?', '', '4', '7', 'Sevrene', '9/26/2016 2:53']
['12578822', 'Amazons Algorithms Dont Find You the Best Deals', 'https://www.technologyreview.com/s/602442/amazons-algorithms-dont-find-you-the-best-deals/', '1', '1', 'yarapavan', '9/26/2016 2:26']
['12578694', 'Emergency dose of epinephrine that does not cost an arm and a leg', 'http://m.imgur.com/gallery/th6Ua', '2', '1', 'dredmorbius', '9/26/2016 1:54']
['12578624', 'Phone Makers Could Cut Off Drivers. So Why Dont They?', 'http://www.nytimes.com/2016/09/25/technology/phone-makers-could-cut-off-drivers-so-why-dont-they.html', '4', '1', 'danso', '9/26/2016 1:37']
['12578556', 'OpenMW, Open Source Elderscrolls III: Morrowind Reimpl

Extracting Ask HN and Show HN Posts

In [9]:
def post_classification(dataset,title_index):
    ask_posts = []
    show_posts = []
    other_posts = []

    for row in dataset:
        title = row[title_index]
        if "ask hn" in title.lower():
            ask_posts.append(row)
        elif "show hn" in title.lower():
            show_posts.append(row)
        else:
            other_posts.append(row)
    return ask_posts,show_posts,other_posts

In [10]:
ask_posts,show_posts,other_posts = post_classification(dataset=hn_data,title_index=1)

In [11]:
len(ask_posts)

6918

In [12]:
len(show_posts)

5068

In [13]:
len(other_posts)

68415

Number of Comments

In [14]:
def comment_count(dataset,comment_index):
    total_com = 0
    for row in dataset:
        comment = int(row[comment_index])
        total_com += comment
    return total_com

In [15]:
ask_comment = comment_count(ask_posts,4)
ask_comment

95000

In [16]:
show_comment = comment_count(show_posts,4) 
show_comment

49690

In [17]:
other_comment = comment_count(other_posts,4)
other_comment

1768071

Number of Ask Posts and Comments by Hour Created

In [18]:
import datetime as dt

In [19]:
def date_transform(dataset,create_index,comment_index):
    result_list = []
    counts_by_hour = {}
    comments_by_hour = {}
    time_format = '%m/%d/%Y %H:%M'
    for row in dataset:
        create_at = row[create_index]
        comment = int(row[comment_index])
        result_list.append([create_at,comment])
    for row in result_list:
        date = dt.datetime.strptime(row[0],time_format)
        hour = date.strftime("%H")
        if hour not in counts_by_hour:
            counts_by_hour[hour] = 1
            comments_by_hour[hour] = row[1]
        else:
            counts_by_hour[hour] += 1
            comments_by_hour[hour] += row[1]
    return counts_by_hour,comments_by_hour

In [20]:
counts_by_hour,comments_by_hour = date_transform(ask_posts,6,4)

In [21]:
counts_by_hour

{'02': 227,
 '01': 223,
 '22': 287,
 '21': 407,
 '19': 421,
 '17': 404,
 '15': 468,
 '14': 378,
 '13': 326,
 '11': 251,
 '10': 219,
 '09': 176,
 '07': 157,
 '03': 213,
 '16': 415,
 '08': 190,
 '00': 231,
 '23': 278,
 '20': 393,
 '18': 452,
 '12': 274,
 '04': 186,
 '06': 176,
 '05': 166}

In [22]:
comments_by_hour

{'02': 2996,
 '01': 2089,
 '22': 3372,
 '21': 4500,
 '19': 3955,
 '17': 5547,
 '15': 18526,
 '14': 4972,
 '13': 7245,
 '11': 2797,
 '10': 3013,
 '09': 1477,
 '07': 1585,
 '03': 2159,
 '16': 4466,
 '08': 2362,
 '00': 2277,
 '23': 2300,
 '20': 4463,
 '18': 4877,
 '12': 4234,
 '04': 2360,
 '06': 1587,
 '05': 1841}

 Average Number of Comments for Ask HN Posts by Hour

In [23]:
def average(ditc_1,dict_2):
    average_dict = {}
    for key in ditc_1:
        average_val = ditc_1[key] / dict_2[key]
        average_dict[key] = round(average_val,2)
    return average_dict

In [24]:
avg_by_hour = average(comments_by_hour,counts_by_hour)

In [25]:
avg_by_hour

{'02': 13.2,
 '01': 9.37,
 '22': 11.75,
 '21': 11.06,
 '19': 9.39,
 '17': 13.73,
 '15': 39.59,
 '14': 13.15,
 '13': 22.22,
 '11': 11.14,
 '10': 13.76,
 '09': 8.39,
 '07': 10.1,
 '03': 10.14,
 '16': 10.76,
 '08': 12.43,
 '00': 9.86,
 '23': 8.27,
 '20': 11.36,
 '18': 10.79,
 '12': 15.45,
 '04': 12.69,
 '06': 9.02,
 '05': 11.09}

Sorting

In [26]:
def display_table(dataset):
    display_list = []
    for key in dataset:
        display_list.append((round(dataset[key],2),key))

    return sorted(display_list,reverse= True)

In [27]:
display_table(avg_by_hour)

[(39.59, '15'),
 (22.22, '13'),
 (15.45, '12'),
 (13.76, '10'),
 (13.73, '17'),
 (13.2, '02'),
 (13.15, '14'),
 (12.69, '04'),
 (12.43, '08'),
 (11.75, '22'),
 (11.36, '20'),
 (11.14, '11'),
 (11.09, '05'),
 (11.06, '21'),
 (10.79, '18'),
 (10.76, '16'),
 (10.14, '03'),
 (10.1, '07'),
 (9.86, '00'),
 (9.39, '19'),
 (9.37, '01'),
 (9.02, '06'),
 (8.39, '09'),
 (8.27, '23')]