# Regular expression REGEX
## Introduction

In [1]:
string = ['data science', 'big data']
regex = 'data'
# our regex correspond to our list elements

## Generic characters

In [2]:
# . (dot)
string = ['but', 'batte', 'robotique']
regex = 'b.t'

## look for start and end of string

In [3]:
# ^ -> beginning

# $ -> end

In [5]:
# '^a' -> match all string starting with 'a'

# 'a$' -> match all string ending with 'a'

In [6]:
# Combine with generic characters !
string = ['il est sur le feu', 'fou']
bad_string = ['un feu de paille']
regex = 'f.u$'

# Dataset top 1000 questions AskReddit in 2015

In [11]:
import csv 

file = csv.reader(open('askreddit-2015.csv', 'r', encoding = 'utf-8'))
posts = list(file)

In [12]:
print(posts[0:5])

[['Title', 'Score', 'Time', 'Gold', 'NumComs'], ['What\'s your internet "white whale", something you\'ve been searching for years to find with no luck?', '11510', '1433213314.0', '1', '26195'], ["What's your favorite video that is 10 seconds or less?", '8656', '1434205517.0', '4', '8479'], ['What are some interesting tests you can take to find out about yourself?', '8480', '1443409636.0', '1', '4055'], ["PhD's of Reddit. What is a dumbed down summary of your thesis?", '7927', '1440188623.0', '0', '13201']]


In [13]:
posts = posts[1:]

In [14]:
for post in posts[:10]:
    print(post)

['What\'s your internet "white whale", something you\'ve been searching for years to find with no luck?', '11510', '1433213314.0', '1', '26195']
["What's your favorite video that is 10 seconds or less?", '8656', '1434205517.0', '4', '8479']
['What are some interesting tests you can take to find out about yourself?', '8480', '1443409636.0', '1', '4055']
["PhD's of Reddit. What is a dumbed down summary of your thesis?", '7927', '1440188623.0', '0', '13201']
['What is cool to be good at, yet uncool to be REALLY good at?', '7711', '1440082910.0', '0', '20325']
['[Serious] Redditors currently in a relationship, besides dinner and a movie, what are your favorite activities for date night?', '7598', '1439993280.0', '2', '5389']
["Parents of Reddit, what's something that your kid has done that you pretended to be angry about but secretly impressed or amused you?", '7553', '1439161809.0', '0', '11520']
['What is a good subreddit to binge read the All Time Top Posts of?', '7498', '1438822288.0',

## Find corresponding items with re() module

In [15]:
# re.search(regex, string)
import re

if re.search('baton', 'kung fu') is not None:
    print('found!')
else:
    print('no match!')

no match!


In [16]:
if re.search('f.', 'kung fu') is not None:
    print('found!')
else:
    print('no match!')

found!


In [19]:
# 'Of reddit'
of_reddit_count = 0
for post in posts:
    if re.search('of Reddit', post[0]) is not None:
        of_reddit_count +=1
        
print(of_reddit_count)    

76


## Match various letters !

In [20]:
# example
regex = '[slm]ac'
string = ['sac', 'lac', 'mac']

In [21]:
of_reddit_count = 0

for post in posts:
    regex = 'of [Rr]eddit'
    if re.search(regex, post[0]) is not None:
        of_reddit_count += 1
        
print(of_reddit_count)

102


## Ignore specific characters 

In [25]:
# [Serious] -> to say they look for serious answer on reddit
regex = "[Serious]" # doesn't work, found all accurences
# to ignore specific -> \

regex = '\.$'

# Count all [Serious tags]

serious_count = 0
for post in posts:
    regex = '\[Serious\]'
    if re.search(regex, post[0]) is not None:
        serious_count +=1
        
print(serious_count)

69


## Improve our regex

In [28]:
serious_count = 0
for post in posts:
    regex = '[\[\(][Ss]erious[\]\)]' # \ before all specific character except [] used to allow different options
    # works for (serious)--(Serious)--[Serious]--[serious]
    if re.search(regex, post[0]) is not None:
        serious_count +=1
        
print(serious_count)

80


## Combine multiple Regex

In [40]:
# use of | like in other language

serious_start_count = 0
serious_end_count = 0
serious_count_final = 0

for post in posts:
    regex_start = '^[\[\(][Ss]erious[\]\)]'
    regex_end = '[\[\(][Ss]erious[\]\)]$'
    final_regex = '^[\[\(][Ss]erious[\]\)]|[\[\(][Ss]erious[\]\)]$'
    if re.search(regex_start, post[0]) is not None:
        serious_start_count += 1
    if re.search(regex_end, post[0]) is not None:
        serious_end_count +=1
    if re.search(final_regex, post[0]) is not None:
        serious_count_final +=1
    
    

In [41]:
print(serious_count_final)
print(serious_end_count)
print(serious_start_count)

80
11
69


## Modify strings with regex

In [46]:
# sub()

re.sub('yo', 'hello', 'hi world !')

# re.sub(found, change, string)

'hi world !'

In [49]:
new_posts = []

for post in posts:
    new_post = re.sub('[\[\(][Ss]erious[\]\)]', '[Serious]', post[0])
    new_posts.append(new_post)
    
for post in new_posts:   
    if re.search('\[Serious\]', post) is not None :
        print(post)



[Serious] Redditors currently in a relationship, besides dinner and a movie, what are your favorite activities for date night?
[Serious] At the end of a job interview when they ask me, "So, do you have any questions?", what are some genuinely good questions to ask?
What is that one trick that "they" really don't want you to know? [Serious]
[Serious] Redditors who want Trump to become president, why?
[Serious] People of Reddit who have HIV/AIDS, when did you realize something was wrong?
[Serious] What are some great apps for smartphones that people might not be aware of?
[Serious] people who have twin siblings but are not a twin themselves, how is your relationship with your twin brothers or sisters?
[Serious] Therapists/psychiatrists of reddit: what is something that most people think they are alone in experiencing/feeling/thinking?
[Serious] What's the creepiest TRUE story that happened to you or someone you know?
[Serious] Redditors who want Bernie Sanders to become president, why?
[

# Match years with regex

In [50]:
# [0-9] -> between 0 and 9
# same as [a-z] for letters
# [0-2] -> between 0 and 2

# for a year -> [0-2][0-9]{3} ({x} x the number of repetition)
# same as [0-2][0-9][0-9][0-9]

In [53]:
year_string = []

for string in year_string:
    if re.search('[0-2][0-9]{3}', string) is not None:
        # year in 4 digits found

SyntaxError: unexpected EOF while parsing (<ipython-input-53-c60eac5cbce1>, line 5)

## Extract all years

In [54]:
# findall()

re.findall('[a-z]', 'abc123')

['a', 'b', 'c']

In [57]:
string = " We're in 2018, one year past 2017, one year before 2019"

years = re.findall('[0-2][0-9]{3}', string)
print(years)

['2018', '2017', '2019']
