# 1. Introduction

In [3]:
import re
import warnings

import pandas as pd

# Load the Hacker News stories and keep a handle on the title column,
# which the rest of the notebook searches with regular expressions.
hn = pd.read_csv("hacker_news.csv")
titles = hn["title"]

# Count the titles that mention "sql" in any letter case.
pattern = r"sql"
mentions_sql = titles.str.contains(pattern, flags=re.IGNORECASE)
sql_counts = mentions_sql.sum()
print(sql_counts)

108


# 2. Capture Groups

![](https://s3.amazonaws.com/dq-content/369/single_capture_group.svg)

In [11]:
# Rows whose title contains at least one word character followed by "sql"
# (any letter case). Copy so the new column is added to an independent frame.
names_sql_flavor = hn["title"].str.contains(r"\w+SQL", flags=re.IGNORECASE)
hn_sql = hn[names_sql_flavor].copy()

# Capture the flavor name from each title, then lower-case it so that
# differently-cased spellings of the same flavor fall into one group.
hn_sql["flavor"] = hn_sql["title"].str.extract(r"(\w+sql)", flags=re.IGNORECASE)
hn_sql["flavor"] = hn_sql["flavor"].str.lower()

# Aggregate num_comments per flavor (pivot_table's default aggregation).
sql_pivot = hn_sql.pivot_table(index="flavor", values="num_comments")

print(hn_sql)

print(sql_pivot)

             id  ...      flavor
142    10957172  ...  postgresql
221    11544342  ...      memsql
882    10413272  ...  postgresql
1160   10546681  ...       nosql
1197   11583183  ...  postgresql
1370   10532855  ...       nosql
2430   12300670  ...       mysql
2432   10361294  ...       nosql
4546   11437660  ...  postgresql
4568   10725042  ...       nosql
4616   10674187  ...  postgresql
4944   10519135  ...       nosql
5398   12430768  ...  postgresql
5523   11174174  ...       mysql
5654   11170360  ...    sparksql
5738   10484824  ...       mysql
5844   11984351  ...       nosql
6523   12576116  ...  postgresql
6532   10469304  ...       mysql
7050   12329499  ...       mysql
7245   12142364  ...  postgresql
7571   12576002  ...  postgresql
8371   11588305  ...  postgresql
8823   10761955  ...  postgresql
9643   11353322  ...  postgresql
10238  10204844  ...  postgresql
10264  11183348  ...  postgresql
10478  11458621  ...       nosql
10851  11927626  ...       nosql
11793  112

# 3. Using Capture Groups to Extract Data 

In [20]:
# "Python" or "python", a space, then a captured run of digits and dots.
pattern = r"[Pp]ython ([\d.]+)"

# expand=False yields a Series holding the single capture group
# (NaN where the title does not match).
py_versions = titles.str.extract(pattern, expand=False)

# Frequency of each captured version string, as a plain dictionary.
version_counts = py_versions.value_counts()
py_versions_freq = dict(version_counts)

print(py_versions, py_versions_freq)

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
        ... 
20094    NaN
20095    NaN
20096    NaN
20097    NaN
20098    NaN
Name: title, Length: 20099, dtype: object {'3': 10, '3.5': 3, '2': 3, '3.6': 2, '4': 1, '3.5.0': 1, '1.5': 1, '2.7': 1, '8': 1}


# 4. Counting Mentions of the 'C' Language

In [23]:
def first_10_matches(pattern, series=None):
    """
    Return the first 10 entries that match the provided regular expression.

    Parameters
    ----------
    pattern : str
        Regular expression handed to ``Series.str.contains``.
    series : pandas.Series of str, optional
        Strings to search. When omitted, the notebook-level ``titles``
        Series is searched, which is the original behavior.

    Returns
    -------
    pandas.Series
        At most 10 matching entries, in their original order and with
        their original index labels.
    """
    if series is None:
        series = titles
    # na=False treats missing values as non-matches; without it the mask
    # would contain NaN and could not be used for boolean indexing.
    is_match = series.str.contains(pattern, na=False)
    return series[is_match].head(10)

# A standalone "C" or "c" followed by one character that is neither
# "+" nor "." (the trailing class consumes a character, so it must exist).
pattern = r"\b[Cc]\b[^\+\.]"
first_ten = first_10_matches(pattern)
print(first_ten)

365                      The new C standards are worth it
444           Moz raises $10m Series C from Foundry Group
521          Fuchsia: Micro kernel written in C by Google
1307            Show HN: Yupp, yet another C preprocessor
1326                     The C standard formalized in Coq
1365                          GNU C Library 2.23 released
1429    Cysignals: signal handling (SIGINT, SIGSEGV, )...
1620                        SDCC  Small Device C Compiler
1949    Rewriting a Ruby C Extension in Rust: How a Na...
2195    MyHTML  HTML Parser on Pure C with POSIX Threa...
Name: title, dtype: object


# 5. Using Lookarounds to Control Matches Based on Surrounding Text

![](https://s3.amazonaws.com/dq-content/369/lookarounds.svg)

In [41]:
# A standalone "C"/"c" that is not preceded by "Series" plus one whitespace
# character (negative lookbehind) and not followed by "." or "+"
# (negative lookahead).
pattern = r"(?<!Series\s)\b[Cc]\b(?![.\+])"

# Build the boolean mask once and use it for both the listing and the count.
is_c_mention = titles.str.contains(pattern)

print(titles[is_c_mention])

c_mentions = is_c_mention.sum()
print(c_mentions)

365                       The new C standards are worth it
521           Fuchsia: Micro kernel written in C by Google
1307             Show HN: Yupp, yet another C preprocessor
1326                      The C standard formalized in Coq
1365                           GNU C Library 2.23 released
                               ...                        
18543                 C-style for loops removed from Swift
18549            Show HN: An awesome C library for Windows
18649                 Python vs. C/C++ in embedded systems
19151                      Ask HN: How to learn C in 2016?
19933    Lightweight C library to parse NMEA 0183 sente...
Name: title, Length: 102, dtype: object
102


# 6. Backreferences: Using Capture Groups in a Regex Pattern

![](https://s3.amazonaws.com/dq-content/369/backreference_syntax_1.svg)

In [45]:
# A whole word, one whitespace character, then the same word again:
# the backreference \1 must repeat exactly what group 1 captured.
pattern = r"\b(\w+)\s\1\b"

# str.contains emits a UserWarning for any pattern that has a capture
# group. The group is required here for the backreference, so the warning
# is silenced for this one call instead of leaking into the cell output.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", UserWarning)
    has_repeated_word = titles.str.contains(pattern)

repeated_words = titles[has_repeated_word]

print(repeated_words)

3102                  Silicon Valley Has a Problem Problem
3176                Wire Wire: A West African Cyber Threat
3178                         Flexbox Cheatsheet Cheatsheet
4797                            The Mindset Mindset (2015)
7276     Valentine's Day Special: Bye Bye Tinder, Flirt...
10371    Mcdonalds copying cyriak  cows cows cows in th...
11575                                    Bang Bang Control
11901          Cordless Telephones: Bye Bye Privacy (1991)
12697          Solving the the Monty-Hall-Problem in Swift
15049    Bye Bye Webrtc2SIP: WebRTC with Asterisk and A...
15839          Intellij-Rust Rust Plugin for IntelliJ IDEA
Name: title, dtype: object


  return func(self, *args, **kwargs)


# 7. Substituting Regular Expression Matches

In [56]:
email_variations = pd.Series(['email', 'Email', 'e Mail',
                        'e mail', 'E-mail', 'e-mail',
                        'eMail', 'E-Mail', 'EMAIL'])

# "e" at a word boundary, an optional space or hyphen, then "mail".
# The final "l" is mandatory: making it optional would also rewrite text
# such as "e main" into "emailn".
pattern = r"\be[\s-]?mail"

print(email_variations)

# regex=True is explicit because Series.str.replace treats the pattern as a
# literal string by default since pandas 2.0, in which case nothing here
# would be replaced.
email_uniform = email_variations.str.replace(pattern, "email", flags=re.I, regex=True)

print(email_uniform)

# Normalize every spelling matched by `pattern` to "email" in the story
# titles. regex=True is explicit because Series.str.replace treats the
# pattern as a literal string by default since pandas 2.0.
titles_clean = titles.str.replace(pattern, "email", flags=re.I, regex=True)

print(titles_clean)

0     email
1     Email
2    e Mail
3    e mail
4    E-mail
5    e-mail
6     eMail
7    E-Mail
8     EMAIL
dtype: object
0    email
1    email
2    email
3    email
4    email
5    email
6    email
7    email
8    email
dtype: object
0                                Interactive Dynamic Video
1        Florida DJs May Face Felony for April Fools' W...
2             Technology ventures: From Idea to Enterprise
3        Note by Note: The Making of Steinway L1037 (2007)
4        Title II kills investment? Comcast and other I...
                               ...                        
20094    How Purism Avoids Intels Active Management Tec...
20095            YC Application Translated and Broken Down
20096    Microkernels are slow and Elvis didn't do no d...
20097                        How Product Hunt really works
20098    RoboBrowser: Your friendly neighborhood web sc...
Name: title, Length: 20099, dtype: object


# 8. Extracting Domains from URLs

In [None]:
# Sample URLs covering both schemes, mixed-case schemes, optional "www.",
# subdomains, hyphenated hosts, and hosts followed directly by a path or
# a query string.
test_urls = pd.Series(
    [
        "https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429",
        "http://www.interactivedynamicvideo.com/",
        "http://www.nytimes.com/2007/11/07/movies/07stein.html?_r=0",
        "http://evonomics.com/advertising-cannot-maintain-internet-heres-solution/",
        "HTTPS://github.com/keppel/pinn",
        "Http://phys.org/news/2015-09-scale-solar-youve.html",
        "https://iot.seeed.cc",
        "http://www.bfilipek.com/2016/04/custom-deleters-for-c-smart-pointers.html",
        "http://beta.crowdfireapp.com/?beta=agnipath",
        "https://www.valid.ly?param",
        "http://css-cursor.techstream.org",
    ]
)

# Group 1 captures the scheme ("http://" or "https://").
# NOTE(review): the second capture group is empty, so it looks like a
# placeholder for the domain part - confirm before relying on this pattern.
pattern = r"(https?://)()"