In [1]:
import re


In [4]:
# re.match() only succeeds when the pattern matches at the very beginning of the string.
# re.search() succeeds when the pattern matches anywhere in the string.
# Both return a Match object (truthy) on success and None (falsy) otherwise.

text = "This is a good day"

found_anywhere = re.search("good", text)
print("Is a good day" if found_anywhere else "Isn't")

found_at_start = re.match("good", text)
print("Start with good" if found_at_start else "Nope")

Is a good day
Nope


In [5]:
text = "Fran works diligently. Fran gets good grades. Our student Fran is succesful."

# Let's split this on all instances of "Fran". The text starts with "Fran",
# so the first element of the result is an empty string.
re.split("Fran", text)


['',
 ' works diligently. ',
 ' gets good grades. Our student ',
 ' is succesful.']

In [6]:
# findall() returns every non-overlapping match as a list of strings
re.findall("Fran", text)

['Fran', 'Fran', 'Fran']

In [7]:
# The caret anchors the pattern to the beginning of the string; search() returns a Match object
re.search("^Fran", text)

<re.Match object; span=(0, 4), match='Fran'>

# Patterns and Character Classes

In [20]:
grades = "ACAAAABCBCBAA"

# How many B's are in the grade list? findall() returns one list entry per match.
re.findall("B", grades)

['B', 'B', 'B']

In [21]:
# Find every A or B in the list using the set operator [] (one list entry per match)
re.findall("[AB]", grades)

['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'A', 'A']

In [22]:
# Instances where A is followed by a B or C ([B-C] is a range inside the set operator)
re.findall("[A][B-C]", grades)

['AC', 'AB']

In [23]:
# Or operator |: the same search written as two explicit alternatives
re.findall("AB|AC", grades)


['AC', 'AB']

In [24]:
# A caret as the first character inside the set operator negates the set:
# match every character that is not an A
re.findall("[^A]", grades)

['C', 'B', 'C', 'B', 'C', 'B']

In [25]:
# Outside of the set operator, the caret anchors the match to the beginning of the string;
# inside of the set operator, a leading caret negates the set instead.
re.findall("^[^A]", grades)

# It's an empty list because the pattern matches a character at the beginning of the string
# which is not an A, but our string starts with an A.

[]

# Quantifiers

In [26]:
# Quantifiers are the number of times you want a pattern to be matched in order to match.
# The most basic is expressed as exp{m,n} where "exp" is the expression or character we are matching, "m" is the minimum
# number of times you want it to be matched, and "n" is the maximum number of times the item could be matched.

# How many times did we get a streak of back-to-back A's?
re.findall("A{2,20}", grades) # At least 2 A's and max 20 A's

['AAAA', 'AA']

In [27]:
# With single values
re.findall("A{1,1}A{1,1}", grades)

# This is different because the previous pattern, A{2,20}, looks for any run of two to twenty A's in a row,
# so it sees four A's as a single streak.
# This second pattern looks for exactly two A's back to back, so it sees two A's followed immediately by two more A's.


['AA', 'AA', 'AA']

In [30]:
# If you put an extra space in between the braces, you'll get an empty result:
# "{2, 2}" is no longer read as a quantifier, so the braces are matched as literal text
re.findall("A{2, 2}", grades)

[]

In [31]:
# If we don't include a quantifier, then the default is {1,1},
# so "AA" behaves like "A{1,1}A{1,1}"
re.findall("AA", grades)

['AA', 'AA', 'AA']

In [32]:
# If we just include one number in the braces, it's considered to be both "m" and "n"
re.findall("A{2}", grades)

['AA', 'AA', 'AA']

In [33]:
# Decreasing trend in a student's grades: one or more A's, then B's, then C's
re.findall("A{1,20}B{1,20}C{1,20}", grades)

['AAAABC']

In [38]:
# Read the whole grades file into a single string (one "Name: Grade" entry per line)
with open("Data/Grades.txt", "r") as file:
    Grades = file.read()
    
print(Grades)

Ronald Mayr: A
Bell Kassulke: B
Jacqueline Rupp: A 
Alexander Zeller: C
Valentina Denk: C 
Simon Loidl: B 
Elias Jovanovic: B 
Stefanie Weninger: A 
Fabian Peer: C 
Hakim Botros: B
Emilie Lorentsen: B
Herman Karlsen: C
Nathalie Delacruz: C
Casey Hartman: C
Lily Walker : A
Gerard Wang: C
Tony Mcdowell: C
Jake Wood: B
Fatemeh Akhtar: B
Kim Weston: B
Nicholas Beatty: A
Kirsten Williams: C
Vaishali Surana: C
Coby Mccormack: C
Yasmin Dar: B
Romy Donnelly: A
Viswamitra Upandhye: B
Kendrick Hilpert: A
Killian Kaufman: B
Elwood Page: B
Mukti Patel: A
Emily Lesch: C
Elodie Booker: B
Jedd Kim: A
Annabel Davies: A
Adnan Chen: B
Jonathan Berg: C
Hank Spinka: B
Agnes Schneider: C
Kimberly Green: A
Lola-Rose Coates: C
Rose Christiansen: C
Shirley Hintz: C
Hannah Bayer: B


In [42]:
# Every run of 1 to 100 ASCII letters: first names, last names and the grade letters all match
re.findall("[a-zA-Z]{1,100}", Grades)

['Ronald',
 'Mayr',
 'A',
 'Bell',
 'Kassulke',
 'B',
 'Jacqueline',
 'Rupp',
 'A',
 'Alexander',
 'Zeller',
 'C',
 'Valentina',
 'Denk',
 'C',
 'Simon',
 'Loidl',
 'B',
 'Elias',
 'Jovanovic',
 'B',
 'Stefanie',
 'Weninger',
 'A',
 'Fabian',
 'Peer',
 'C',
 'Hakim',
 'Botros',
 'B',
 'Emilie',
 'Lorentsen',
 'B',
 'Herman',
 'Karlsen',
 'C',
 'Nathalie',
 'Delacruz',
 'C',
 'Casey',
 'Hartman',
 'C',
 'Lily',
 'Walker',
 'A',
 'Gerard',
 'Wang',
 'C',
 'Tony',
 'Mcdowell',
 'C',
 'Jake',
 'Wood',
 'B',
 'Fatemeh',
 'Akhtar',
 'B',
 'Kim',
 'Weston',
 'B',
 'Nicholas',
 'Beatty',
 'A',
 'Kirsten',
 'Williams',
 'C',
 'Vaishali',
 'Surana',
 'C',
 'Coby',
 'Mccormack',
 'C',
 'Yasmin',
 'Dar',
 'B',
 'Romy',
 'Donnelly',
 'A',
 'Viswamitra',
 'Upandhye',
 'B',
 'Kendrick',
 'Hilpert',
 'A',
 'Killian',
 'Kaufman',
 'B',
 'Elwood',
 'Page',
 'B',
 'Mukti',
 'Patel',
 'A',
 'Emily',
 'Lesch',
 'C',
 'Elodie',
 'Booker',
 'B',
 'Jedd',
 'Kim',
 'A',
 'Annabel',
 'Davies',
 'A',
 'Adnan',
 

In [45]:
# Requiring a colon right after the letters keeps only the word directly in front of each ":"
re.findall("[a-zA-Z]{1,100}:", Grades)

['Mayr:',
 'Kassulke:',
 'Rupp:',
 'Zeller:',
 'Denk:',
 'Loidl:',
 'Jovanovic:',
 'Weninger:',
 'Peer:',
 'Botros:',
 'Lorentsen:',
 'Karlsen:',
 'Delacruz:',
 'Hartman:',
 'Wang:',
 'Mcdowell:',
 'Wood:',
 'Akhtar:',
 'Weston:',
 'Beatty:',
 'Williams:',
 'Surana:',
 'Mccormack:',
 'Dar:',
 'Donnelly:',
 'Upandhye:',
 'Hilpert:',
 'Kaufman:',
 'Page:',
 'Patel:',
 'Lesch:',
 'Booker:',
 'Kim:',
 'Davies:',
 'Chen:',
 'Berg:',
 'Spinka:',
 'Schneider:',
 'Green:',
 'Coates:',
 'Christiansen:',
 'Hintz:',
 'Bayer:']

In [49]:
# Colon, one whitespace character, then the grade letter.
# Raw string: "\s" is not a valid Python string escape, so a plain literal triggers an
# invalid-escape warning; r"..." hands the backslash to the regex engine untouched.
re.findall(r":\s[A-Z]", Grades)

[': A',
 ': B',
 ': A',
 ': C',
 ': C',
 ': B',
 ': B',
 ': A',
 ': C',
 ': B',
 ': B',
 ': C',
 ': C',
 ': C',
 ': A',
 ': C',
 ': C',
 ': B',
 ': B',
 ': B',
 ': A',
 ': C',
 ': C',
 ': C',
 ': B',
 ': A',
 ': B',
 ': A',
 ': B',
 ': B',
 ': A',
 ': C',
 ': B',
 ': A',
 ': A',
 ': B',
 ': C',
 ': B',
 ': C',
 ': A',
 ': C',
 ': C',
 ': C',
 ': B']

In [50]:
# The parentheses form a group, so findall() returns only the grade letter, not the ": " prefix.
# Raw string avoids the invalid "\s" string escape.
re.findall(r":\s([A-Z])", Grades)

['A',
 'B',
 'A',
 'C',
 'C',
 'B',
 'B',
 'A',
 'C',
 'B',
 'B',
 'C',
 'C',
 'C',
 'A',
 'C',
 'C',
 'B',
 'B',
 'B',
 'A',
 'C',
 'C',
 'C',
 'B',
 'A',
 'B',
 'A',
 'B',
 'B',
 'A',
 'C',
 'B',
 'A',
 'A',
 'B',
 'C',
 'B',
 'C',
 'A',
 'C',
 'C',
 'C',
 'B']

In [51]:
# Only the entries graded B. Raw string avoids the invalid "\s" string escape.
re.findall(r":\s(B)", Grades)

['B',
 'B',
 'B',
 'B',
 'B',
 'B',
 'B',
 'B',
 'B',
 'B',
 'B',
 'B',
 'B',
 'B',
 'B',
 'B']

In [61]:
# Read the whole log file into a single string for the searches below
with open("Data/LogData.txt", "r") as file:
    logData = file.read()

print(logData)

['1']


In [138]:
# Four dot-separated groups of digits (the host address).
# Raw string keeps "\d" and "\." intact for the regex engine instead of relying on
# Python passing unknown string escapes through (which raises a warning).
re.findall(r"\d+\.\d+\.\d+\.\d+", logData)

['146.204.224.152',
 '197.109.77.178',
 '156.127.178.177',
 '100.32.205.59',
 '168.95.156.240',
 '71.172.239.195',
 '180.95.121.94',
 '144.23.247.108',
 '2.179.103.97',
 '241.114.184.133',
 '224.188.38.4',
 '94.11.36.112',
 '126.196.238.197',
 '103.247.168.212',
 '57.86.153.68',
 '231.220.8.214',
 '219.133.7.154',
 '159.252.184.44',
 '40.167.172.66',
 '167.153.239.72',
 '115.214.173.248',
 '21.43.188.186',
 '86.187.99.249',
 '76.72.133.93',
 '73.162.151.229',
 '13.112.8.80',
 '159.253.153.40',
 '136.195.158.6',
 '219.194.113.255',
 '59.101.239.174',
 '233.187.15.207',
 '62.79.96.179',
 '217.70.194.150',
 '161.242.130.180',
 '155.127.24.96',
 '63.208.115.205',
 '5.10.80.69',
 '11.57.203.39',
 '124.137.187.175',
 '203.36.55.39',
 '175.5.52.40',
 '232.220.131.214',
 '87.234.209.125',
 '177.164.9.130',
 '71.17.229.20',
 '161.106.82.60',
 '118.244.60.25',
 '226.97.226.58',
 '153.60.64.187',
 '226.59.29.125',
 '180.234.185.134',
 '202.186.241.42',
 '130.240.69.236',
 '226.187.228.119',
 '169

In [152]:
# Collect every host address and count them.
# Raw string keeps "\d" and "\." intact for the regex engine (no invalid-escape warning).
host = re.findall(r"\d+\.\d+\.\d+\.\d+", logData)
print(len(host))

979


In [160]:
# NOTE(review): inside the set operator "+" is a literal character, not a quantifier, so this
# matches one lowercase letter, digit or "+" at a time (note the '+' entry in the output).
re.findall("[a-z+0-9+]", logData)

['1',
 '4',
 '6',
 '2',
 '0',
 '4',
 '2',
 '2',
 '4',
 '1',
 '5',
 '2',
 'f',
 'e',
 'e',
 's',
 't',
 '6',
 '8',
 '1',
 '1',
 '2',
 '1',
 'u',
 'n',
 '2',
 '0',
 '1',
 '9',
 '1',
 '5',
 '4',
 '5',
 '2',
 '4',
 '0',
 '7',
 '0',
 '0',
 'i',
 'n',
 'c',
 'e',
 'n',
 't',
 'i',
 'v',
 'i',
 'z',
 'e',
 '1',
 '1',
 '3',
 '0',
 '2',
 '4',
 '6',
 '2',
 '2',
 '1',
 '9',
 '7',
 '1',
 '0',
 '9',
 '7',
 '7',
 '1',
 '7',
 '8',
 'k',
 'e',
 'r',
 't',
 'z',
 'm',
 'a',
 'n',
 'n',
 '3',
 '1',
 '2',
 '9',
 '2',
 '1',
 'u',
 'n',
 '2',
 '0',
 '1',
 '9',
 '1',
 '5',
 '4',
 '5',
 '2',
 '5',
 '0',
 '7',
 '0',
 '0',
 'v',
 'i',
 'r',
 't',
 'u',
 'a',
 'l',
 's',
 'o',
 'l',
 'u',
 't',
 'i',
 'o',
 'n',
 's',
 't',
 'a',
 'r',
 'g',
 'e',
 't',
 'w',
 'e',
 'b',
 '+',
 's',
 'e',
 'r',
 'v',
 'i',
 'c',
 'e',
 's',
 '2',
 '0',
 '2',
 '0',
 '3',
 '2',
 '6',
 '5',
 '5',
 '4',
 '1',
 '5',
 '6',
 '1',
 '2',
 '7',
 '1',
 '7',
 '8',
 '1',
 '7',
 '7',
 'o',
 'k',
 'u',
 'n',
 'e',
 'v',
 'a',
 '5',
 '2',
 '2'

In [151]:
# The user name follows the " - " separator and is made of word characters and/or hyphens.
# Raw string avoids the invalid "\s" / "\w" string escapes. The "|" was dropped from the set:
# inside [] it is a literal pipe character, not an "or" operator.
users = re.findall(r"\s-\s([-\w]+)", logData)
print(len(users))

979


In [130]:
# Everything between the square brackets (the timestamp).
# Raw string: "\[" and "\]" are not valid Python string escapes, so a plain literal warns.
re.findall(r"\[(.+)\]", logData)


['21/Jun/2019:15:45:24 -0700',
 '21/Jun/2019:15:45:25 -0700',
 '21/Jun/2019:15:45:27 -0700',
 '21/Jun/2019:15:45:28 -0700',
 '21/Jun/2019:15:45:31 -0700',
 '21/Jun/2019:15:45:32 -0700',
 '21/Jun/2019:15:45:34 -0700',
 '21/Jun/2019:15:45:35 -0700',
 '21/Jun/2019:15:45:36 -0700',
 '21/Jun/2019:15:45:37 -0700',
 '21/Jun/2019:15:45:40 -0700',
 '21/Jun/2019:15:45:41 -0700',
 '21/Jun/2019:15:45:45 -0700',
 '21/Jun/2019:15:45:49 -0700',
 '21/Jun/2019:15:45:50 -0700',
 '21/Jun/2019:15:45:52 -0700',
 '21/Jun/2019:15:45:53 -0700',
 '21/Jun/2019:15:45:54 -0700',
 '21/Jun/2019:15:45:57 -0700',
 '21/Jun/2019:15:45:58 -0700',
 '21/Jun/2019:15:46:00 -0700',
 '21/Jun/2019:15:46:02 -0700',
 '21/Jun/2019:15:46:03 -0700',
 '21/Jun/2019:15:46:05 -0700',
 '21/Jun/2019:15:46:08 -0700',
 '21/Jun/2019:15:46:09 -0700',
 '21/Jun/2019:15:46:10 -0700',
 '21/Jun/2019:15:46:11 -0700',
 '21/Jun/2019:15:46:12 -0700',
 '21/Jun/2019:15:46:13 -0700',
 '21/Jun/2019:15:46:14 -0700',
 '21/Jun/2019:15:46:15 -0700',
 '21/Jun

In [131]:
# Collect every bracketed timestamp and count them.
# Raw string: "\[" and "\]" are not valid Python string escapes, so a plain literal warns.
time = re.findall(r"\[(.+)\]", logData)
print(len(time))

979


In [132]:
# Everything between the first and the last double quote on a line (the request).
# Single-quoted literal so the double quotes need no escaping.
re.findall('"(.+)"', logData)

['POST /incentivize HTTP/1.1',
 'DELETE /virtual/solutions/target/web+services HTTP/2.0',
 'DELETE /interactive/transparent/niches/revolutionize HTTP/1.1',
 'PATCH /architectures HTTP/1.0',
 'GET /engage HTTP/2.0',
 'PUT /cutting-edge HTTP/2.0',
 'PATCH /extensible/reinvent HTTP/1.1',
 'POST /extensible/infrastructures/one-to-one/enterprise HTTP/1.1',
 'POST /grow/front-end/e-commerce/robust HTTP/2.0',
 'GET /redefine/orchestrate HTTP/1.0',
 'PUT /orchestrate/out-of-the-box/unleash/syndicate HTTP/1.1',
 'POST /enhance/solutions/bricks-and-clicks HTTP/1.1',
 'DELETE /rich/reinvent HTTP/2.0',
 'HEAD /scale/global/leverage HTTP/1.0',
 'POST /innovative/roi/robust/systems HTTP/1.1',
 'HEAD /systems/sexy HTTP/1.1',
 'GET /incubate/incubate HTTP/1.1',
 'GET /convergence HTTP/2.0',
 'HEAD /convergence HTTP/2.0',
 'DELETE /bandwidth/reintermediate/engage HTTP/2.0',
 'PUT /optimize HTTP/1.1',
 'DELETE /bandwidth/turn-key/users HTTP/2.0',
 'POST /efficient/unleash HTTP/1.1',
 'POST /morph/optimi

In [140]:
# Collect every quoted request and count them.
# Single-quoted literal so the double quotes need no escaping.
request = re.findall('"(.+)"', logData)
print(len(request))

979


In [176]:
# One verbose pattern that captures all four fields of a log line as named groups.
# Raw string so that \d, \s, \w, \[ and \] reach the regex engine untouched
# (in a plain literal they are invalid string escapes and raise a warning).
pattern = r"""
(?P<host>\d+\.\d+\.\d+\.\d+)    # Host
(\s-\s)                         # Indicator of the user_name position
(?P<user_name>[-\w]+)           # User name: word characters and/or hyphens
(\s\[)                          # Indicator of the time position
(?P<time>.+(?=\]))              # Time
(\]\s")                         # Indicator of the request position
(?P<request>.+(?="))            # Request
"""

# re.VERBOSE makes the engine ignore the layout whitespace and the trailing comments above.
for item in re.finditer(pattern, logData, re.VERBOSE):
    print(item.groupdict())


{'host': '146.204.224.152', 'user_name': 'feest6811', 'time': '21/Jun/2019:15:45:24 -0700', 'request': 'POST /incentivize HTTP/1.1'}
{'host': '197.109.77.178', 'user_name': 'kertzmann3129', 'time': '21/Jun/2019:15:45:25 -0700', 'request': 'DELETE /virtual/solutions/target/web+services HTTP/2.0'}
{'host': '156.127.178.177', 'user_name': 'okuneva5222', 'time': '21/Jun/2019:15:45:27 -0700', 'request': 'DELETE /interactive/transparent/niches/revolutionize HTTP/1.1'}
{'host': '100.32.205.59', 'user_name': 'ortiz8891', 'time': '21/Jun/2019:15:45:28 -0700', 'request': 'PATCH /architectures HTTP/1.0'}
{'host': '168.95.156.240', 'user_name': 'stark2413', 'time': '21/Jun/2019:15:45:31 -0700', 'request': 'GET /engage HTTP/2.0'}
{'host': '71.172.239.195', 'user_name': 'dooley1853', 'time': '21/Jun/2019:15:45:32 -0700', 'request': 'PUT /cutting-edge HTTP/2.0'}
{'host': '180.95.121.94', 'user_name': 'mohr6893', 'time': '21/Jun/2019:15:45:34 -0700', 'request': 'PATCH /extensible/reinvent HTTP/1.1'}
{

In [204]:
import re
def logs(path="Data/LogData.txt"):
    """Parse a web-server log file into a list of dictionaries.

    Each log entry that matches the expected layout
    ``<host> - <user_name> [<time>] "<request>" ...`` yields one dictionary
    with the keys ``host``, ``user_name``, ``time`` and ``request``.
    Text that does not match the layout is skipped.

    Parameters
    ----------
    path : str, optional
        Location of the log file. Defaults to ``"Data/LogData.txt"``, the
        file the original zero-argument version always read.

    Returns
    -------
    list of dict
        One ``{"host", "user_name", "time", "request"}`` dictionary per
        matched entry, in file order. Empty when nothing matches.
    """
    with open(path, "r") as file:
        logdata = file.read()

    # Raw string so that \d, \s, \w, \[ and \] reach the regex engine untouched
    # (in a plain literal they are invalid string escapes and raise a warning).
    # The user-name set no longer contains "|": inside [] it is a literal pipe,
    # not an "or" operator.
    pattern = r"""
    (?P<host>\d+\.\d+\.\d+\.\d+)    # Host
    (\s-\s)                         # Indicator of the user_name position
    (?P<user_name>[-\w]+)           # User name: word characters and/or hyphens
    (\s\[)                          # Indicator of the time position
    (?P<time>.+(?=\]))              # Time
    (\]\s")                         # Indicator of the request position
    (?P<request>.+(?="))            # Request
    """

    # groupdict() only contains the named groups, so the positional
    # "indicator" groups never show up in the result.
    return [item.groupdict() for item in re.finditer(pattern, logdata, re.VERBOSE)]

In [205]:
# Print the full list of parsed log entries (one dictionary per matched log line)
print(logs())

[{'host': '146.204.224.152', 'user_name': 'feest6811', 'time': '21/Jun/2019:15:45:24 -0700', 'request': 'POST /incentivize HTTP/1.1'}, {'host': '197.109.77.178', 'user_name': 'kertzmann3129', 'time': '21/Jun/2019:15:45:25 -0700', 'request': 'DELETE /virtual/solutions/target/web+services HTTP/2.0'}, {'host': '156.127.178.177', 'user_name': 'okuneva5222', 'time': '21/Jun/2019:15:45:27 -0700', 'request': 'DELETE /interactive/transparent/niches/revolutionize HTTP/1.1'}, {'host': '100.32.205.59', 'user_name': 'ortiz8891', 'time': '21/Jun/2019:15:45:28 -0700', 'request': 'PATCH /architectures HTTP/1.0'}, {'host': '168.95.156.240', 'user_name': 'stark2413', 'time': '21/Jun/2019:15:45:31 -0700', 'request': 'GET /engage HTTP/2.0'}, {'host': '71.172.239.195', 'user_name': 'dooley1853', 'time': '21/Jun/2019:15:45:32 -0700', 'request': 'PUT /cutting-edge HTTP/2.0'}, {'host': '180.95.121.94', 'user_name': 'mohr6893', 'time': '21/Jun/2019:15:45:34 -0700', 'request': 'PATCH /extensible/reinvent HTTP/