# Les expressions régulières (regex)

## Introduction

In [1]:
# A plain pattern matches any string that contains it as a substring:
# "data" is found in both sample strings below.
regex = "data"
strings = [
    "data science",
    "big data",
]

## Caractères génériques

In [2]:
# "." is a wildcard standing for any single character, so "b.t" is found in
# "but", in "batte" (b-a-t) and in "robotique" (b-o-t).
regex = "b.t"
strings = [
    "but",
    "batte",
    "robotique",
]

## Rechercher le début et la fin d'une chaine de caractères

In [3]:
# ^ début d'une chaine de caractères
# $ fin d'une chaine de caractères

# ^a matche avec toutes les chaines de caractères qui commencent par a
# a$ matche avec toutes les chaines de caractères qui se terminent par a

In [4]:
# "f.u$" requires "f", any single character, then "u" at the very end of the
# string.
regex = "f.u$"

# Both of these end with such a sequence ("feu" / "fou") ...
strings = [
    "il est sur le feu",
    "fou",
]
# ... whereas here "feu" sits in the middle of the string, so "$" prevents a match.
bad_string = "un feu de paille"

## Introduction au dataset

In [5]:
import csv

# Read the whole CSV file into memory as a list of rows (each row is a list
# of strings). The context manager closes the file handle as soon as the rows
# have been read, instead of leaving it open for the rest of the session.
# newline="" is the mode the csv module documents for files handed to
# csv.reader, so that newlines embedded in quoted fields are kept intact.
with open("askreddit_2015.csv", encoding="utf-8", newline="") as f:
    csvreader = csv.reader(f)
    posts = list(csvreader)

# Preview the first five rows (the first one is the header row).
posts[0:5]

[['Title', 'Score', 'Time', 'Gold', 'NumComs'],
 ['What\'s your internet "white whale", something you\'ve been searching for years to find with no luck?',
  '11510',
  '1433213314.0',
  '1',
  '26195'],
 ["What's your favorite video that is 10 seconds or less?",
  '8656',
  '1434205517.0',
  '4',
  '8479'],
 ['What are some interesting tests you can take to find out about yourself?',
  '8480',
  '1443409636.0',
  '1',
  '4055'],
 ["PhD's of Reddit. What is a dumbed down summary of your thesis?",
  '7927',
  '1440188623.0',
  '0',
  '13201']]

In [6]:
# Drop the first row (the CSV header) so that only post rows remain.
# NOTE: running this statement again removes one more row each time.
posts = posts[1:]

In [7]:
# Show (at most) the first ten posts, one row per line.
head = posts[:10]
for post in head:
    print(post)

['What\'s your internet "white whale", something you\'ve been searching for years to find with no luck?', '11510', '1433213314.0', '1', '26195']
["What's your favorite video that is 10 seconds or less?", '8656', '1434205517.0', '4', '8479']
['What are some interesting tests you can take to find out about yourself?', '8480', '1443409636.0', '1', '4055']
["PhD's of Reddit. What is a dumbed down summary of your thesis?", '7927', '1440188623.0', '0', '13201']
['What is cool to be good at, yet uncool to be REALLY good at?', '7711', '1440082910.0', '0', '20325']
['[Serious] Redditors currently in a relationship, besides dinner and a movie, what are your favorite activities for date night?', '7598', '1439993280.0', '2', '5389']
["Parents of Reddit, what's something that your kid has done that you pretended to be angry about but secretly impressed or amused you?", '7553', '1439161809.0', '0', '11520']
['What is a good subreddit to binge read the All Time Top Posts of?', '7498', '1438822288.0',

## Compter les valeurs avec le module re

In [8]:
import re

# re.search(pattern, string) scans the string for the first place where the
# pattern matches and returns a match object, or None when nothing matches.
# Here "f." (an "f" followed by any character) is found in "kung fu".
found = re.search("f.", "kung fu") is not None
print("Trouvé" if found else "Aucune correspondance")

Trouvé


### Training

In [9]:
# "of Reddit"

import re

# Count the rows whose title (first column) contains the exact text
# "of Reddit" anywhere in the string.
of_reddit_count = sum(
    1 for row in posts if re.search("of Reddit", row[0]) is not None
)
print(of_reddit_count)
        

76


## Crochets pour matcher avec plusieurs lettres

In [10]:
# Square brackets define a character class: "[slm]" stands for exactly one
# character among "s", "l" and "m", so the pattern matches all three strings.
strings = [
    "sac",
    "lac",
    "mac",
]
regex = "[slm]ac"

### Training

In [11]:
# "of Reddit"

import re

# Count the rows whose title (first column) contains "of Reddit" or
# "of reddit": the class "[Rr]" accepts either capitalisation of the "r".
matching_rows = [
    row for row in posts if re.search("of [Rr]eddit", row[0]) is not None
]
of_reddit_count = len(matching_rows)
print(of_reddit_count)

102


## Ignorer des caractères spéciaux

In [12]:
# [Serious]

# Without escaping, the brackets open a character class: this pattern matches
# any single character among "S", "e", "r", "i", "o", "u" and "s", not the
# literal text "[Serious]".
regex = "[Serious]"

# A backslash placed right before a special character makes it literal.
# The pattern is written as a raw string so the backslash reaches the regex
# engine as-is; in a normal string literal "\." is an invalid escape sequence
# that recent Python versions warn about.
regex = r"\.$"  # "." is no longer the wildcard here but a literal dot (at the end of the string)

### Training

In [13]:
# Count the rows whose title (first column) contains the literal tag "[Serious]".

import re

serious_count = 0

for row in posts:
    # The brackets are escaped so they match literally instead of opening a
    # character class. Raw string: "\[" is not a valid escape sequence in a
    # normal string literal and triggers a warning on recent Python versions.
    if re.search(r"\[Serious\]", row[0]) is not None:
        serious_count += 1
print(serious_count)

69


## Améliorer notre Regex

In [14]:
# Count the rows whose title contains any variant of the tag:
# (Serious) (serious) [Serious] [serious]

import re

# "[\(\[]" accepts an opening parenthesis or bracket, "[Ss]" either
# capitalisation, "[\]\)]" a closing bracket or parenthesis. Note that the
# two delimiter classes are independent, so a mismatched pair such as
# "(Serious]" is accepted as well.
# Raw string: "\(" and "\[" are not valid escape sequences in a normal string
# literal and trigger a warning on recent Python versions.
serious_tag = r"[\(\[][Ss]erious[\]\)]"

serious_count = 0

for row in posts:
    if re.search(serious_tag, row[0]) is not None:
        serious_count += 1
print(serious_count)

80


## Combiner plusieurs regex

In [15]:
# ^ trouver un élément au début d'un texte

# $ trouver un élément à la fin d'un texte

# | combiner 2 éléments d'un regex

### Training

In [20]:
# Count the tag variants (Serious) (serious) [Serious] [serious]:
#   - at the start of the title
#   - at the end of the title
#   - at the start or at the end of the title

import re

# The tag pattern is written once, as a raw string ("\(" and "\[" are not
# valid escape sequences in a normal string literal and trigger a warning on
# recent Python versions), then anchored in three different ways.
serious_tag = r"[\(\[][Ss]erious[\]\)]"
starts_with_tag = re.compile("^" + serious_tag)
ends_with_tag = re.compile(serious_tag + "$")
# "|" has the lowest precedence, so this reads as (^tag) OR (tag$).
at_either_end = re.compile("^" + serious_tag + "|" + serious_tag + "$")

serious_start_count = 0
serious_end_count = 0
serious_count_final = 0

for row in posts:
    title = row[0]
    if starts_with_tag.search(title) is not None:
        serious_start_count += 1
    if ends_with_tag.search(title) is not None:
        serious_end_count += 1
    if at_either_end.search(title) is not None:
        serious_count_final += 1

print(serious_start_count)
print(serious_end_count)
print(serious_count_final)

69
11
80


## Modifier des chaînes de caractères avec regex

In [21]:
# sub() modifier un élément par un autre

In [26]:
# re.sub(pattern, repl, string): `pattern` is the regex that must match,
# `repl` is the text that replaces each match, and `string` is the text in
# which the matches are looked for. The result is a new string.
re.sub(pattern="yo", repl="hello", string="yo world !")

'hello world !'

### Training

In [33]:
# Replace every tag variant (Serious) (serious) [Serious] [serious]
# with the canonical form [Serious].

import re

# Raw string: "\(" and "\[" are not valid escape sequences in a normal string
# literal and trigger a warning on recent Python versions.
serious_tag = r"[\(\[][Ss]erious[\]\)]"

posts_new = []

for row in posts:
    # NOTE: the title is rewritten in place, so the rows of `posts` are
    # updated as well; posts_new holds those same row objects.
    row[0] = re.sub(serious_tag, "[Serious]", row[0])
    posts_new.append(row)

print(posts_new[0:14])

[['What\'s your internet "white whale", something you\'ve been searching for years to find with no luck?', '11510', '1433213314.0', '1', '26195'], ["What's your favorite video that is 10 seconds or less?", '8656', '1434205517.0', '4', '8479'], ['What are some interesting tests you can take to find out about yourself?', '8480', '1443409636.0', '1', '4055'], ["PhD's of Reddit. What is a dumbed down summary of your thesis?", '7927', '1440188623.0', '0', '13201'], ['What is cool to be good at, yet uncool to be REALLY good at?', '7711', '1440082910.0', '0', '20325'], ['[Serious] Redditors currently in a relationship, besides dinner and a movie, what are your favorite activities for date night?', '7598', '1439993280.0', '2', '5389'], ["Parents of Reddit, what's something that your kid has done that you pretended to be angry about but secretly impressed or amused you?", '7553', '1439161809.0', '0', '11520'], ['What is a good subreddit to binge read the All Time Top Posts of?', '7498', '143882

## Matcher les années avec regex

In [34]:
# [0-9]

# [a-z]

# [0-2]

# [1-2][0-9][0-9][0-9]

# [1-2][0-9]{3} ligne équivalente à celle du dessus

In [35]:
# Keep only the strings that contain, anywhere, a digit 1 or 2 followed by
# three more digits (i.e. something that looks like a four-digit year).
year_strings = [
    string
    for string in strings
    if re.search("[1-2][0-9]{3}", string) is not None
]

## Extraire toutes les années

In [36]:
# re.findall returns every non-overlapping match of the pattern as a list of
# strings; in "abc123" only the lowercase letters match "[a-z]".
re.findall(pattern="[a-z]", string="abc123")

['a', 'b', 'c']

### Training

In [39]:
# Sentence from which the year-like numbers are extracted.
year_strings = "On est déjà en 2017, une année de plus que 2016 et de moins que 2018"

# re.findall returns all non-overlapping matches as a list of strings, in the
# order they appear, so no empty list needs to be prepared beforehand.
years = re.findall("[1-2][0-9]{3}", year_strings)

print(years)

['2017', '2016', '2018']
