In [2]:
import numpy as np
import pandas as pd
import re


Python Regular Expression Quick Guide
^ Matches the beginning of a line
$ Matches the end of the line
. Matches any character
\s Matches whitespace
\S Matches any non-whitespace character
* Repeats a character zero or more times
*? Repeats a character zero or more times
(non-greedy)
+ Repeats a character one or more times
+? Repeats a character one or more times
(non-greedy)
[aeiou] Matches a single character in the listed set
[^XYZ] Matches a single character not in the listed set
[a-z0-9] The set of characters can include a range
( Indicates where string extraction is to start
) Indicates where string extraction is to end

### Expressions :
***
`\d`                         Any numeric digit from `0` to `9`.
`\D`                         Matches any character which is not a decimal digit.
                           This is the opposite of `\d`.
`\w`                         Any letter, numeric digit, or the underscore
                           character.  (Think of this as matching
                           "word" characters.)
`\W`                         Any character that is not a letter,
                           numeric digit, or the underscore character.
`\s`                         Any space, tab, or newline character.  (
                           Think of this as matching white-space
                           characters.)
`\S`                         Any character that is not a space, tab,
                           or newline.
***

In [3]:
text = "A78L41K"

In [4]:
num = re.search("\d",text)  #ilk karsilastigi num 7 # basladigi  index 1 bittigi index2
num

<re.Match object; span=(1, 2), match='7'>

In [5]:
num.group()

'7'

In [7]:
num = re.search("\d\d",text)  #iki hane
num

<re.Match object; span=(1, 3), match='78'>

In [9]:
num = re.search("\w",text)
num

<re.Match object; span=(0, 1), match='A'>

In [10]:
num = re.search("\w\w",text)
num

<re.Match object; span=(0, 2), match='A7'>

In [11]:
###

In [12]:
num = re.search("\d\d",text)  
num

<re.Match object; span=(1, 3), match='78'>

In [13]:
text = "8PM19MIN"

In [14]:
nongdigit= re.search("\D",text)
nongdigit

<re.Match object; span=(1, 2), match='P'>

In [15]:
nongdigit= re.search("\D",text)
print(nongdigit.group())

P


In [16]:
text = 'My phone number is 5556667777'
#Burdan sadece telefon numarasini nasil cekecegiz?

In [18]:
telno= re.search("\d\d\d\d\d\d\d\d\d\d",text)
print(telno)

<re.Match object; span=(19, 29), match='5556667777'>


In [22]:
telno= re.search("\d\d\d\d\d\d\d\d\d\d",text)
print(telno.group())

5556667777


In [23]:
text = 'My phone number is 415-555-1212'

In [26]:
telno=re.search("\d""\d""\d"'-'"\d""\d""\d"'-'"\d""\d""\d""\d",text)
print(telno.group(0))

415-555-1212


In [31]:
telno=re.search('(\d\d\d)-(\d\d\d)-(\d\d\d\d)',text)
print(telno.group(0))
print(telno.group(1))
print(telno.group(2))
print(telno.group(3))

415-555-1212
415
555
1212


In [32]:
with open("text.txt","w") as file:
    file.write(text)

In [34]:
with open("text.txt","r") as file:
    txt=file.read()
print(txt)

My phone number is 415-555-1212


###

In [36]:
value = "O 1, t 10, o 100. 100000"
value

'O 1, t 10, o 100. 100000'

In [38]:
output= re.findall("\d{1}",value)   #tek olanlari buluyor
#bir digit olanlari bul
print(output)

['1', '1', '0', '1', '0', '0', '1', '0', '0', '0', '0', '0']


In [39]:
output= re.findall("\d{2}",value)  #2 digit olanlari bul
print(output)

['10', '10', '10', '00', '00']


In [40]:
output= re.findall("\d{1,6}",value)  #6 ya kadar hepsini bul
print(output)

['1', '10', '100', '100000']


In [41]:
phone = "2004-959-559 # This is Phone Number"

In [42]:
###istemedigimiz kisimlari degistirecegiz

In [48]:
output = re.sub("\D",'',phone)  #ilkinde neyi degistireceksin ikincisinde ne ile degistirecen
print(output)

2004959559


In [49]:
output

'2004959559'

In [50]:
output = re.sub("\d",'+',phone)  #ilkinde neyi degistireceksin ikincisinde ne ile degistirecen
print(output)

++++-+++-+++ # This is Phone Number


___
``"[]"``	  A set of characters	``"[a-m]"``
``"\"``	      Signals a special sequence (can also be used to escape special characters)
``"."``	      Any character (except newline character)
``"^"``	      Starts with	``"^hello"``
``"$"``	      Ends with	``"world$"``
``"*"``	      Zero or more occurrences
`"+"`	      One or more occurrences
`"{}"`	  Exactly the specified number of occurrences
`"|"`	      Either or	`"falls|stays"`
`"()"`	  Capture and group
___

In [51]:
txt = "1 person against 100 people"

In [53]:
output=re.findall('\d+',txt) #tum eslenenleri bul
print(output)

['1', '100']


In [55]:
output=re.findall('\d*',txt) #gereksizleri de buldu
print(output)

['1', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '100', '', '', '', '', '', '', '', '']


In [56]:
txt = "Hello World!"

In [59]:
output = re.findall("^H",txt)  #icinde H varmis
print(output)

['H']


In [63]:
a = re.findall('(^\S).*', txt)
a

['H']

In [66]:
out=re.findall('World!$',txt)
print(out)

['World!']


In [67]:
s = pd.Series(['a3', 'b4', 'c5', 'd'])

In [68]:
s.apply(lmabda x : True if re.search("\d",x)else False)

SyntaxError: invalid syntax (Temp/ipykernel_14000/617037327.py, line 1)

In [70]:
s.str.extract("(\d)")

Unnamed: 0,0
0,3.0
1,4.0
2,5.0
3,


In [71]:
s.str.extract("(\w)")

Unnamed: 0,0
0,a
1,b
2,c
3,d


In [72]:
s = pd.Series(['a3aa', 'b4aa', 'c5aa'])

In [75]:
s.str.extract("(\w)(\w)(\w)")  #w digitleri de icerir

Unnamed: 0,0,1,2
0,a,3,a
1,b,4,a
2,c,5,a


In [76]:
s.str.extract("(\w)\d(\w)(\w)") #digit kismini atlattik

Unnamed: 0,0,1,2
0,a,a,a
1,b,a,a
2,c,a,a


In [77]:
s= pd.Series(['40 l/100 km (comb)',
        '38 l/100 km (comb)', '6.4 l/100 km (comb)',
       '8.3 kg/100 km (comb)', '5.1 kg/100 km (comb)',
       '5.4 l/100 km (comb)', '6.7 l/100 km (comb)',
       '6.2 l/100 km (comb)', '7.3 l/100 km (comb)',
       '6.3 l/100 km (comb)', '5.7 l/100 km (comb)',
       '6.1 l/100 km (comb)', '6.8 l/100 km (comb)',
       '7.5 l/100 km (comb)', '7.4 l/100 km (comb)',
       '3.6 kg/100 km (comb)', '0 l/100 km (comb)',
       '7.8 l/100 km (comb)'])

In [78]:
s

0       40 l/100 km (comb)
1       38 l/100 km (comb)
2      6.4 l/100 km (comb)
3     8.3 kg/100 km (comb)
4     5.1 kg/100 km (comb)
5      5.4 l/100 km (comb)
6      6.7 l/100 km (comb)
7      6.2 l/100 km (comb)
8      7.3 l/100 km (comb)
9      6.3 l/100 km (comb)
10     5.7 l/100 km (comb)
11     6.1 l/100 km (comb)
12     6.8 l/100 km (comb)
13     7.5 l/100 km (comb)
14     7.4 l/100 km (comb)
15    3.6 kg/100 km (comb)
16       0 l/100 km (comb)
17     7.8 l/100 km (comb)
dtype: object

In [79]:
# eda ile sadelestirme yapacagiz sonra regex ile istdigimiz dataya ulasacagiz

In [80]:
s.str.extract("(\d\d)") #capture group parantezin icindekileri alir

Unnamed: 0,0
0,40
1,38
2,10
3,10
4,10
5,10
6,10
7,10
8,10
9,10


In [82]:
s.str.extract("(\d\d|\d.\d)") 

Unnamed: 0,0
0,40.0
1,38.0
2,6.4
3,8.3
4,5.1
5,5.4
6,6.7
7,6.2
8,7.3
9,6.3


In [84]:
s.str.extract("(\d\d|\d.\d|\d)") # or dan sonra arada bosluk koyunca hata veriyor

Unnamed: 0,0
0,40.0
1,38.0
2,6.4
3,8.3
4,5.1
5,5.4
6,6.7
7,6.2
8,7.3
9,6.3


In [86]:
result= s.str.extract("(\d\d|\d.d|\d).+(\d\d\d)")
result

Unnamed: 0,0,1
0,40,100
1,38,100
2,6,100
3,8,100
4,5,100
5,5,100
6,6,100
7,6,100
8,7,100
9,6,100


In [91]:
result = s.str.extract("(^\d*.\d*) \w*/(\d*)")
result

Unnamed: 0,0,1
0,40.0,100
1,38.0,100
2,6.4,100
3,8.3,100
4,5.1,100
5,5.4,100
6,6.7,100
7,6.2,100
8,7.3,100
9,6.3,100
