# Iterables: strings, lists, tuples, sets, dictionaries

In [1]:
#an empty string: str() gives the same value as the literal ""
s = str()
#a non-empty string
s = 'cat'

In [2]:
#an empty list: list() gives the same value as the literal []
ls = list()
#a non-empty list
ls = ['cat', 'dog']

In [3]:
#an empty tuple: tuple() gives the same value as the literal ()
tp = tuple()
#a non-empty tuple (the commas make the tuple; the parentheses are optional here)
tp = 'cat', 'dog'

In [4]:
#an empty set must be built with set(); the literal {} creates an empty dictionary
st = set()
#a non-empty set, built here from a list of its elements
st = set(['cat', 'dog'])

In [5]:
#an empty dictionary: dict() gives the same value as the literal {}
dt = dict()
#a non-empty dictionary (keyword arguments become string keys)
dt = dict(cat=4, dog=3)

In [6]:
#list -> set: pass the list to set()
animal_list = ['cat', 'dog']
list2set = set(animal_list)
print(list2set)

#set -> list: pass the set to list() (a set has no defined order)
animal_set = {'cat', 'dog'}
set2list = list(animal_set)
print(set2list)

{'cat', 'dog'}
['cat', 'dog']


In [7]:
#a list of 2-tuples converts to a dictionary: each tuple becomes one (key, value) entry
pairs = [('cat', 4), ('dog', 3)]
list2dt = dict(pairs)
print(list2dt)

{'cat': 4, 'dog': 3}


In [8]:
#The zip function takes two or more iterables (lists, tuples, ...) and pairs up their items position by position.
#In Python 3 it returns an iterator, so wrap it in list(), tuple(), set() or dict() to see the pairs.
l1 = ['cat', 'dog']
l2 = [4, 3]
pairs = list(zip(l1, l2))
print(pairs)
print(tuple(pairs))
print(set(pairs))
dict(pairs) #zipping exactly two iterables gives (key, value) pairs, which dict() accepts

[('cat', 4), ('dog', 3)]
(('cat', 4), ('dog', 3))
{('dog', 3), ('cat', 4)}


{'cat': 4, 'dog': 3}

In [9]:
#Example:

#to produce a list of 2-grams (pairs of neighbouring characters) from a text
t = "this is the sentence"
#2gramlist = list(zip(t, t[1:])) # this is an error because variable names cannot start with a digit
shifted = t[1:] #the same text, starting one character later
gram2list = list(zip(t, shifted))
print(gram2list)

[('t', 'h'), ('h', 'i'), ('i', 's'), ('s', ' '), (' ', 'i'), ('i', 's'), ('s', ' '), (' ', 't'), ('t', 'h'), ('h', 'e'), ('e', ' '), (' ', 's'), ('s', 'e'), ('e', 'n'), ('n', 't'), ('t', 'e'), ('e', 'n'), ('n', 'c'), ('c', 'e')]


In [10]:
#Question: How about producing a list of 3-grams?

#Example Answer:
t = "this is the sentence"
second, third = t[1:], t[2:] #the text shifted by one and by two characters
gram3list = list(zip(t, second, third))
print(gram3list)

[('t', 'h', 'i'), ('h', 'i', 's'), ('i', 's', ' '), ('s', ' ', 'i'), (' ', 'i', 's'), ('i', 's', ' '), ('s', ' ', 't'), (' ', 't', 'h'), ('t', 'h', 'e'), ('h', 'e', ' '), ('e', ' ', 's'), (' ', 's', 'e'), ('s', 'e', 'n'), ('e', 'n', 't'), ('n', 't', 'e'), ('t', 'e', 'n'), ('e', 'n', 'c'), ('n', 'c', 'e')]


In [11]:
#Example:

#Given a list of words, build a new list holding the length of each word

#-without comprehension: start with an empty list and append inside a loop
wordlist = ['cat', 'dog', 'horse']
lengthlist = []
for word in wordlist:
    lengthlist.append(len(word))
print(lengthlist)

#-with comprehension: the same result in a single expression
lengthlist = [len(word) for word in wordlist]
print(lengthlist)

[3, 3, 5]
[3, 3, 5]


In [12]:
#A list comprehension can also filter a list: add an "if" condition at the end

#EXAMPLE
#Given a list of words, keep only the words that are shorter than 5 characters.
wordlist = ['cat', 'dog', 'horse']
filteredlist = [word for word in wordlist if len(word) < 5]
print(filteredlist)

['cat', 'dog']


In [13]:
#EXAMPLE of dictionary comprehension 1

#Given a list of words, build a dictionary that maps each word (key) to its length (value)
wordlist = ['cat', 'dog', 'horse']
word_length_dict = {word: len(word) for word in wordlist}
print(word_length_dict)

{'cat': 3, 'horse': 5, 'dog': 3}


In [14]:
#EXAMPLE of dictionary comprehension 2

#Given a list of words, build a dictionary word -> word length, but only for words with fewer than 5 characters
wordlist = ['cat', 'dog', 'horse']
short_word_length_dict = {word: len(word) for word in wordlist if len(word) < 5}
print(short_word_length_dict)

{'cat': 3, 'dog': 3}


In [15]:
#Example answer:

t = "this is the sentence"
#step-by-step version:
#gram2list = list(zip(t, t[1:]))
#gram2str_list = [x[0]+x[1] for x in gram2list]

#a more compact version: unpack each pair and glue the two characters together
gram2str_list = [first + second for first, second in zip(t, t[1:])]
print(gram2str_list)

['th', 'hi', 'is', 's ', ' i', 'is', 's ', ' t', 'th', 'he', 'e ', ' s', 'se', 'en', 'nt', 'te', 'en', 'nc', 'ce']


In [16]:
#Example answer:

#An earlier variant that first collects all stripped lines:
#lines = [line.strip() for line in open("gettysburg.txt")]
#wordlist = []
#for line in lines:
#    line = line.split()
#    wordlist.extend(line)

#Read the file inside a "with" block so that it is closed as soon as reading is done
wordlist = []
with open("gettysburg.txt") as infile:
    for line in infile:
        #split() without arguments ignores leading/trailing whitespace, including the newline
        wordlist.extend(line.split())

#step-by-step version:
#gram3list = list(zip(wordlist, wordlist[1:], wordlist[2:]))
#gram3str_list = [' '.join(w for w in x) for x in gram3list]

#a more compact version: join each triple of consecutive words with single spaces
gram3str_list = [' '.join(gram) for gram in zip(wordlist, wordlist[1:], wordlist[2:])]
print(gram3str_list)



['four score and', 'score and seven', 'and seven years', 'seven years ago', 'years ago our', 'ago our fathers', 'our fathers brought', 'fathers brought forth', 'brought forth on', 'forth on this', 'on this continent', 'this continent a', 'continent a new', 'a new nation', 'new nation conceived', 'nation conceived in', 'conceived in liberty', 'in liberty and', 'liberty and dedicated', 'and dedicated to', 'dedicated to the', 'to the proposition', 'the proposition that', 'proposition that all', 'that all men', 'all men are', 'men are created', 'are created equal', 'created equal now', 'equal now we', 'now we are', 'we are engaged', 'are engaged in', 'engaged in a', 'in a great', 'a great civil', 'great civil war', 'civil war testing', 'war testing whether', 'testing whether that', 'whether that nation', 'that nation or', 'nation or any', 'or any nation', 'any nation so', 'nation so conceived', 'so conceived and', 'conceived and so', 'and so dedicated', 'so dedicated can', 'dedicated can l

# Convert finite automata into regex

According to Kleene's Theorem, any finite automaton can be expressed by a regular expression, and vice versa.


![title](img/FA1.png)

![title](img/FA2.png)

![title](img/FA3.png)

![title](img/FA4.png)

![title](img/FA5.png)

# Regular expressions in Python and the re module

In [1]:
#Topics in this section:
#raw Python strings: r""
#grouping
#re.findall()
#re.search() -- the main focus
#re.match()
#re.sub()

In [35]:
import re

t = "A fat cat doesn't eat oat but a rat eats bats."
#[force] is a character class: exactly one of f, o, r, c, e, followed by "at"
at_words = re.findall("[force]at", t)
print(at_words)

['fat', 'cat', 'eat', 'oat', 'rat', 'eat']


In [37]:
#EXAMPLE:

#A long string with several Python training courses and their dates, separated by ";"
courses = "Python Training Course for Beginners: 15/Aug/2011 - 19/Aug/2011;Python Training Course Intermediate: 12/Dec/2011 - 16/Dec/2011;Python Text Processing Course: 31/Oct/2011 - 4/Nov/2011"

#Without grouping, re.findall returns each complete matched substring.
whole_entry_pattern = "[^:]*: [^;]*;?"
items = re.findall(whole_entry_pattern, courses)
print(items)

#With two groups, re.findall returns a list of 2-tuples:
#the course name is the first component and the dates are the second component.
grouped_entry_pattern = "([^:]*): ([^;]*;?)"
items = re.findall(grouped_entry_pattern, courses)
print(items)

['Python Training Course for Beginners: 15/Aug/2011 - 19/Aug/2011;', 'Python Training Course Intermediate: 12/Dec/2011 - 16/Dec/2011;', 'Python Text Processing Course: 31/Oct/2011 - 4/Nov/2011']
[('Python Training Course for Beginners', '15/Aug/2011 - 19/Aug/2011;'), ('Python Training Course Intermediate', '12/Dec/2011 - 16/Dec/2011;'), ('Python Text Processing Course', '31/Oct/2011 - 4/Nov/2011')]


In [20]:
import re

#re.search scans the whole string and returns a match object for the first hit
sentence = "A cat and a rat can't be friends."
x = re.search(r"cat", sentence)
print(x)

<_sre.SRE_Match object; span=(2, 5), match='cat'>


In [21]:
#when the pattern occurs nowhere in the string, re.search returns None
sentence = "A cat and a rat can't be friends."
y = re.search(r"cow", sentence)
print(y)

None


In [22]:
#re.search returns None when nothing matches, so the result can drive an if/else directly
if re.search("cat", "A cat and a rat can't be friends.") is None:
    print("No cat has been found.")
else:
    print("Some kind of cat has been found.")

Some kind of cat has been found.


In [23]:
#same check with a word that does not occur in the sentence
if re.search("cow", "A cat and a rat can't be friends.") is None:
    print("No cow has been found.")
else:
    print("Some kind of cow has been found.")

No cow has been found.


In [24]:
#Example Answer:

import re
#open the phone book inside a "with" block so that the file is closed after reading;
#keep only the lines where a "J" is later followed by "Neu"
with open('simpsons_phone_book.txt') as phone_book:
    print([line.strip() for line in phone_book if re.search(r"J.*Neu", line)])

['Jack Neu 555-7666', 'Jeb Neu 555-5543', 'Jennifer Neu 555-3652']


In [25]:
#Extension
#Instead of downloading simpsons_phone_book.txt, we can read the file directly from the website with urlopen from the module urllib.request

import re
from urllib.request import urlopen

#print ([line.strip() for line in urlopen('https://www.python-course.eu/simpsons_phone_book.txt') if re.search(r"J.*Neu", line)])
#this does not work: the lines read from urlopen are bytes, and a str pattern cannot be applied to bytes

with urlopen('https://www.python-course.eu/simpsons_phone_book.txt') as response:
    for raw_line in response:
        #each line arrives as bytes, so decode it as UTF-8 text before searching
        entry = raw_line.decode('utf-8').strip()
        if re.search(r"J.*Neu", entry):
            print(entry)

Jack Neu 555-7666
Jeb Neu 555-5543
Jennifer Neu 555-3652


In [26]:
s = "Customer number: 232454, Date: February 12, 2011"
#group 1 captures a run of digits; group 2 captures everything after the last ": "
customer_pattern = "([0-9]+).*: (.*)"
mo = re.search(customer_pattern, s)
print(mo)

<_sre.SRE_Match object; span=(17, 48), match='232454, Date: February 12, 2011'>


In [27]:
#with no argument, group() returns the whole matched substring (same as group(0))
mo.group()

'232454, Date: February 12, 2011'

In [28]:
mo.group(1)  #capture groups are numbered from 1; group(0) is the whole match
#group(1) gets the substring matched by the first grouping parenthesis
#i.e. ([0-9]+)

'232454'

In [29]:
mo.group(2)
#group(2) gets the substring matched by the second grouping parenthesis
#i.e. (.*)

'February 12, 2011'

In [30]:
#with several group numbers, group() returns a tuple holding one substring per requested group
mo.group(1, 2)

('232454', 'February 12, 2011')

In [31]:
#Example:

#An XML or HTML tag is usually like this <A>B</A>. For example "<composer>Wolfgang Amadeus Mozart</composer>".
#Let's suppose the A's and B's are strings consisting only of letters or spaces.

#Please print out the A and B parts in the string "<composer>Wolfgang Amadeus Mozart</composer>"

In [32]:
#first attempt, which captures only the tag name A:
#r"<([a-zA-Z ]+)>[a-zA-Z ]+</\1>"

#a second pair of parentheses also captures the content B; \1 requires the closing tag to repeat A
xml_text = "<composer>Wolfgang Amadeus Mozart</composer>"
g = re.search(r"<([a-zA-Z ]+)>([a-zA-Z ]+)</\1>", xml_text)
print(g.group(1), g.group(2), sep="\n")

composer
Wolfgang Amadeus Mozart


In [34]:
#Example Answer:

import re
tags = ["<composer>Wolfgang Amadeus Mozart</composer>", "<author>Samuel Beckett</author>", "<city>London</city>"]

#Writes go to a buffer first, so nothing shows up in the output file until the
#file is flushed or closed. The "with" block closes (and therefore flushes) the
#file automatically, even if an error occurs inside the loop.
#To add more lines later, reopen the file in append mode: open("tags_formated.txt", "a")
with open("tags_formated.txt", "w") as fw:
    for tag in tags:
        res = re.search(r"<([a-z]+)>(.*)</\1>", tag) #it works
        #res = re.search("<([a-z]+)>(.*)</\\1>", tag) #it works too: the backslash is escaped
        #res = re.search("<([a-z]+)>(.*)</\1>", tag) #this one does not work: in a regular
                                                     #Python string "\1" is an escape sequence
                                                     #(the character with code 1), not a backreference
                                                     #r"" makes a raw Python string, which keeps the backslash
        newform = res.group(1) + ": " + res.group(2)
        fw.write(newform + "\n")
        print(newform)

composer: Wolfgang Amadeus Mozart
author: Samuel Beckett
city: London


In [40]:
#Example Answer 2:
import re
tags = ["<composer>Wolfgang Amadeus Mozart</composer>", "<author>Samuel Beckett</author>", "<city>London</city>"]
for tag in tags:
    #the pattern has two groups, so findall returns a list of (name, content) tuples
    res = re.findall(r"<([a-z]+)>(.*)</\1>", tag) #it works
    for name, content in res:
        print("{}: {}".format(name, content))


composer: Wolfgang Amadeus Mozart
author: Samuel Beckett
city: London
