In [1]:
#Intro
import pandas as pd
import re

# Load the Hacker News stories and keep the title column at hand for the
# regex exercises that follow.
hn = pd.read_csv("hacker_news.csv")
titles = hn["title"]
'''
id: The unique identifier from Hacker News for the story
title: The title of the story
url: The URL that the stories links to, if the story has a URL
num_points: The number of points the story acquired, calculated as the total number of upvotes minus the total number of downvotes
num_comments: The number of comments that were made on the story
author: The username of the person who submitted the story
created_at: The date and time at which the story was submitted
'''

# Count the titles that mention "sql" in any letter case: the re.IGNORECASE
# flag (alias re.I) makes the match case-insensitive, and summing the boolean
# result counts the matching titles.
pattern = r"sql"
sql_counts = titles.str.contains(pat=pattern, flags=re.IGNORECASE).sum()

In [2]:
#Capture groups, str.extract() and a pivot table

#Keep only the stories whose title contains "SQL" directly preceded by at
#least one word character (any letter case). .copy() gives an independent
#frame, so the column added below is not written into a slice of hn.
hn_sql = hn[hn['title'].str.contains(r"\w+SQL", flags=re.I)].copy()

#Same text as the filter above, now inside a capture group so it can be extracted
pattern = r"(\w+sql)"

#New column "flavor": the captured SQL name, lowercased so that different
#capitalisations of one name fall into the same group.
#expand=False makes str.extract return a Series instead of a one-column
#DataFrame, so a plain column is assigned and .str.lower() chains directly.
hn_sql['flavor'] = (
    hn_sql['title']
    .str.extract(pattern, flags=re.I, expand=False)
    .str.lower()
)

#Pivot table indexed by flavor; the value is the mean of num_comments per flavor
sql_pivot = hn_sql.pivot_table(index='flavor', values='num_comments', aggfunc='mean')

In [3]:
# "Python" or "python", a single space, then a captured run of digits and dots
pattern = r"[Pp]ython ([\d\.]+)"

# Pull the captured version out of every title.
# expand=False returns a Series rather than a one-column DataFrame, which is
# what value_counts() is called on below.
py_versions = titles.str.extract(pat=pattern, expand=False)

# Frequency table of the extracted versions: version string -> occurrences
py_versions_freq = dict(py_versions.value_counts())

In [4]:
#Negative set
def first_10_matches(pattern):
    """
    Find the story titles that match a regular expression.

    Searches the notebook-level ``titles`` Series with ``str.contains`` and
    keeps at most the first ten matching titles, in their original order.

    pattern: regular expression handed to ``titles.str.contains``.
    Returns a Series of the matching titles (original index labels kept).
    """
    is_match = titles.str.contains(pattern)
    return titles[is_match].head(10)

#Negative set: \b[Cc]\b is a standalone C or c, and [^.+] requires the next
#character to be anything other than . or + (the set must consume one
#character, so a C at the very end of a title cannot match)
pattern = r"\b[Cc]\b[^.+]"
first_ten = first_10_matches(pattern=pattern)
first_ten

365                      The new C standards are worth it
444           Moz raises $10m Series C from Foundry Group
521          Fuchsia: Micro kernel written in C by Google
1307            Show HN: Yupp, yet another C preprocessor
1326                     The C standard formalized in Coq
1365                          GNU C Library 2.23 released
1429    Cysignals: signal handling (SIGINT, SIGSEGV, )...
1620                        SDCC  Small Device C Compiler
1949    Rewriting a Ruby C Extension in Rust: How a Na...
2195    MyHTML  HTML Parser on Pure C with POSIX Threa...
Name: title, dtype: object

In [5]:
#Lookarounds: check the text before and after a match without consuming it
#(?<!Series\s)  negative lookbehind: not preceded by "Series" and a whitespace character
#\b[Cc]\b       a lone C or c with a word boundary on each side, i.e. not part of another word
#(?![\+\.])     negative lookahead: not followed by a + or . character
pattern = r"(?<!Series\s)\b[Cc]\b(?![\+\.])"

#Number of titles that contain at least one such match
c_mentions = titles.str.contains(pat=pattern).sum()
    

In [None]:
#Backreference: (\w+) captures a word, \s is one whitespace character and \1
#must repeat exactly what the group captured; the \b anchors on both ends keep
#the match from starting or ending inside a longer word
pattern = r"\b(\w+)\s\1\b"

#Titles in which a word is immediately followed by the same word
repeated_words = titles.loc[titles.str.contains(pat=pattern)]

In [6]:
#Replacing all variations with a string
email_variations = pd.Series(['email', 'Email', 'e Mail',
                        'e mail', 'E-mail', 'e-mail',
                        'eMail', 'E-Mail', 'EMAIL'])

#Pattern: "e", an optional hyphen or whitespace character, then "mail";
#re.I makes the match case-insensitive
pattern = r"e[\-\s]?mail"

#Replace every variation above with "email" and assign the result to email_uniform.
#regex=True is required: since pandas 2.0 str.replace defaults to regex=False,
#which treats the pattern as literal text and would leave every value unchanged.
email_uniform = email_variations.str.replace(pattern, 'email', flags=re.I, regex=True)

#Same pattern, applied to the titles to normalise every variation of "email".
#regex=True is required: since pandas 2.0 str.replace defaults to regex=False,
#which would treat the pattern as literal text and replace nothing.
titles_clean = titles.str.replace(pattern, 'email', flags=re.I, regex=True)

In [8]:
#EXTRACTING DOMAINS FROM A LIST OF URLS

#Sample URLs covering mixed-case protocols, hyphenated hosts, and hosts that
#are followed by a path, by a query string, or by nothing at all
test_urls = pd.Series(
    [
        'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429',
        'http://www.interactivedynamicvideo.com/',
        'http://www.nytimes.com/2007/11/07/movies/07stein.html?_r=0',
        'http://evonomics.com/advertising-cannot-maintain-internet-heres-solution/',
        'HTTPS://github.com/keppel/pinn',
        'Http://phys.org/news/2015-09-scale-solar-youve.html',
        'https://iot.seeed.cc',
        'http://www.bfilipek.com/2016/04/custom-deleters-for-c-smart-pointers.html',
        'http://beta.crowdfireapp.com/?beta=agnipath',
        'https://www.valid.ly?param',
        'http://css-cursor.techstream.org',
    ]
)

#Capture only the domain of a URL: "http" with an optional "s" and "://" match
#the protocol, then the group takes one or more word characters, hyphens or dots.
#The "/" that may follow the domain needs no explicit match because it is not
#in the character set, so the group stops in front of it.
pattern = r"https?://([\w\-\.]+)"

#Domain of every sample URL; re.IGNORECASE also accepts HTTPS:// and Http://.
#With the default expand=True the result is a one-column DataFrame.
test_urls_clean = test_urls.str.extract(pat=pattern, flags=re.IGNORECASE)

#Run the same domain pattern over the 'url' column of hn and assign to domains.
#expand=False makes str.extract return a Series rather than a one-column
#DataFrame, which is what value_counts() is called on below.
domains = hn["url"].str.extract(pat=pattern, flags=re.IGNORECASE, expand=False)

#Frequency table of the domain names, limited to the five most common
top_domains = domains.value_counts().iloc[:5]


In [10]:
#EXTRACTING URL PARTS USING MULTIPLE CAPTURE GROUPS
# `test_urls` is available from the previous screen
#The URL is split into three capture groups:
#  1. the protocol text, everything up to ://
#  2. the domain, after :// and not including any /
#  3. the page path, from after the / to the end of the string

#Group 1 captures "http" plus an optional "s"; group 2 captures one or more
#word characters, dots or hyphens; an optional "/" is skipped; group 3 captures
#zero or more non-newline characters
pattern = r"(https?)://([\w\.\-]+)/?(.*)"

#Split the sample URLs first, then the 'url' column of hn
test_url_parts = test_urls.str.extract(pat=pattern, flags=re.IGNORECASE)

url_parts = hn["url"].str.extract(pat=pattern, flags=re.IGNORECASE)
url_parts

Unnamed: 0,0,1,2
0,http,www.interactivedynamicvideo.com,
1,http,www.thewire.com,entertainment/2013/04/florida-djs-april-fools-...
2,https,www.amazon.com,Technology-Ventures-Enterprise-Thomas-Byers/dp...
3,http,www.nytimes.com,2007/11/07/movies/07stein.html?_r=0
4,http,arstechnica.com,business/2015/10/comcast-and-other-isps-boost-...
...,...,...,...
20094,https,puri.sm,philosophy/how-purism-avoids-intels-active-man...
20095,https,medium.com,@zreitano/the-yc-application-broken-down-and-t...
20096,http,blog.darknedgy.net,technology/2016/01/01/0/
20097,https,medium.com,@benjiwheeler/how-product-hunt-really-works-d8...


In [None]:
#NAMED CAPTURE GROUPS TO EXTRACT DATA
#Label the three capture groups with (?P<name>...):
#  protocol - "http" with an optional "s"
#  domain   - one or more word characters, dots or hyphens
#  path     - everything after the optional "/" that follows the domain
#The domain group must not begin with a bare "." wildcard: that would let the
#domain start with any character (even "/") and reject one-character hosts.
pattern = r"(?P<protocol>https?)://(?P<domain>[\w\.\-]+)/?(?P<path>.*)"
#Apply the named-group pattern to the 'url' column of hn; with named groups
#str.extract uses the group names as the column names of the result
url_parts = hn["url"].str.extract(pat=pattern, flags=re.IGNORECASE)