In [1]:
#Collections Module
#built into python
#implments specialized container data types which are 
#alternatives to pythons built in containers for general purpose
#e.g of a container are a dictionary or a tuple
#counter has specialized dictionary objects
#or specialized tuple like objects

In [2]:
from collections import Counter

In [3]:
mylist = [1,1,1,1,1,2,2,2,2,3,3,3,3,3,3,3]

In [5]:
#get a count for each unique items in the list (eg. how many 1s,2s and 3s)
#use a for loop and keep a dictionary to keep if you seen the number already
#if you already seen it add a plus 1 to the count
#if you havent already create a new key and add 1 to it
#Counter can automatically do this for us in a single call

In [6]:
Counter(mylist)
#to get a specialized counter object that has now counted the instances
#of each item in that list

Counter({1: 5, 2: 4, 3: 7})

In [7]:
#works with strings as well
mylist = ['a', 'a', 10, 10, 10]

In [10]:
Counter(mylist)
#looks very similar to a dictionary
#bc counter is actually a a dictionary subclass
#that helps count hashable objects
#so inside of it elements are stored as dictionary keys (object)
#and the counts of the objects are stored as values (count)

Counter({'a': 2, 10: 3})

In [11]:
#also works with strings
Counter('aaaabbbbshshsjs')

Counter({'a': 4, 'b': 4, 's': 4, 'h': 2, 'j': 1})

In [12]:
#count the words in a sentece
sentence = "How many times does each word show up in this sentece with a word"

In [13]:
sentence.split() #split the string in the whitespace to get a list

['How',
 'many',
 'times',
 'does',
 'each',
 'word',
 'show',
 'up',
 'in',
 'this',
 'sentece',
 'with',
 'a',
 'word']

In [14]:
Counter(sentence.split())

Counter({'How': 1,
         'many': 1,
         'times': 1,
         'does': 1,
         'each': 1,
         'word': 2,
         'show': 1,
         'up': 1,
         'in': 1,
         'this': 1,
         'sentece': 1,
         'with': 1,
         'a': 1})

In [16]:
Counter(sentence.lower().split()) #to check for capitalized versions

Counter({'how': 1,
         'many': 1,
         'times': 1,
         'does': 1,
         'each': 1,
         'word': 2,
         'show': 1,
         'up': 1,
         'in': 1,
         'this': 1,
         'sentece': 1,
         'with': 1,
         'a': 1})

In [17]:
#common patterns when using the counter object

In [18]:
letters = 'aaabbbccccccccccdddddddddddddd'

In [19]:
c = Counter(letters)

In [20]:
c

Counter({'a': 3, 'b': 3, 'c': 10, 'd': 14})

In [21]:
#use tab for more options
c.most_common() #returns the most common in the list as a tuple

[('d', 14), ('c', 10), ('a', 3), ('b', 3)]

In [22]:
c.most_common(2)

[('d', 14), ('c', 10)]

In [25]:
list(c) #list unique elements (keys)

['a', 'b', 'c', 'd']

In [26]:
from collections import defaultdict

In [28]:
#normal dictionary
d = {'a':10}

In [29]:
d['a']

10

In [31]:
d['WRONG'] #key error(this doesnt exist in this dictionary)

KeyError: 'WRONG'

In [32]:
#to use a for loop to add keys in the dictionary that are not already present
#in the dictionary
#use a default dictionary
#it will assign a default value if there is an instance where a key error 
#would have occured

#essentially if you as for a key that is not present in a default dict
#it will asign it with some default value

In [33]:
#we do this with a simple lambda expression (simply say lambda colon and whatever default value you want)
d = defaultdict(lambda: 0)

In [34]:
d['correct'] = 100

In [35]:
d['correct']

100

In [36]:
d['WRONG KEY!'] #default value automatically assigned

0

In [37]:
d

defaultdict(<function __main__.<lambda>()>, {'correct': 100, 'WRONG KEY!': 0})

In [38]:
#last speicalized container object from collections module
#named tuple
#trying to expand the normal tuple object by having named indices

In [39]:
#standard tuple
mytuple = (10, 20, 30)

In [40]:
mytuple[0] #index 0

10

In [41]:
#in certain cases you have a large tuple
#or you dont know wich value is in which index

In [43]:
#so named tuple is goin to have not only a numeric connection to the values
#but it will also have essentially a named index for that value
#instead of calling it by 0 we could call it with some string code

from collections import namedtuple

In [46]:
#almost like constructing a new object
#two main parameters where typename (what type it is is what it wil be reported as)
#fild_names passed in as a list

Dog = namedtuple('Dog', ['age', 'breed', 'name'])#amost like attributes in a class call

In [47]:
#create instance of a dog object

In [48]:
Dog

__main__.Dog

In [50]:
#create a dog named sammy that is 5 yers old

sammy = Dog(age=5, breed='Husky', name='Sam')

In [52]:
type(sammy) #reports back that it is a type dog not a named tuple

__main__.Dog

In [54]:
#kind of like a mixed b/w an object that you would create in OOP
#and a tuple itself (but there is an association with each value)

In [55]:
sammy

Dog(age=5, breed='Husky', name='Sam')

In [57]:
sammy.age #called like an attribute in an object

5

In [58]:
sammy.breed

'Husky'

In [59]:
sammy.name

'Sam'

In [61]:
sammy[0] #same as calling sammy.age

5

In [1]:
#Opening and Reading Files and Folders (Python OS Module)
#Shell utilities module (Shutil) and OS Modules

In [6]:
pwd 

'C:\\Users\\Halim\\Desktop\\2020Files\\python-notes\\Laptop'

In [7]:
#reports current working directory for jupyter
#windows is \\

In [10]:
#create a practice file 
#since it doesnt exist it will create it
#and want to write to it so set mode to w+
f = open('practice.txt', 'w+')
f.write('This is a test string')
f.close

<function TextIOWrapper.close()>

In [12]:
#how to use the OS module
#it is useful bc you could get the current working directory
#or list all the files in a working directory
#these commands work in all operating systems

In [3]:
import os

In [16]:
os.getcwd() #gives back the 'current working directory'
#works with any python script

'C:\\Users\\Halim\\Desktop\\2020Files\\python-notes\\Laptop'

In [17]:
#lists everything in the current working directory
os.listdir()

['.ipynb_checkpoints',
 'BlackJack Milestone Project.ipynb',
 'cap.py',
 'Eigth Decorators.ipynb',
 'Fifth OOP.ipynb',
 'MyMainPackage',
 'mymodule.py',
 'myprogram.py',
 'Ninth Advanced Python Modules.ipynb',
 'one.py',
 'practice.txt',
 'Seventh - 2nd Milestone Project.ipynb',
 'simple1.py',
 'Sixth.ipynb',
 'testfile',
 'test_cap.py',
 'two.py',
 'War Card Game Milestone Project.ipynb',
 '__pycache__']

In [30]:
#for the files and folders under users
os.listdir('C:\\Users')

['All Users',
 'Default',
 'Default User',
 'Default.migrated',
 'desktop.ini',
 'Halim',
 'HP Laptop',
 'Public']

In [31]:
#how to move files around
#import the shell utilities module which is just shutil

In [4]:
#could be used to boot files in defferent locations
import shutil

In [33]:
#how to move a file
#need source and move it to a destination
shutil.move('practice.txt', 'C:\\Users\\Halim' )

PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'practice.txt'

In [34]:
os.listdir('C:\\Users\\Halim')

['.android',
 '.AndroidStudio3.6',
 '.bash_history',
 '.conda',
 '.condarc',
 '.config',
 '.emulator_console_auth_token',
 '.gitconfig',
 '.gradle',
 '.ipynb_checkpoints',
 '.ipython',
 '.jupyter',
 '.mongorc.js',
 '.node_repl_history',
 '.pylint.d',
 '.viminfo',
 '.VirtualBox',
 '.vscode',
 '3D Objects',
 'anaconda3',
 'AndroidStudioProjects',
 'AppData',
 'Application Data',
 'BrawlhallaReplays',
 'Contacts',
 'Cookies',
 'Desktop',
 'Documents',
 'Downloads',
 'Favorites',
 'Links',
 'Local Settings',
 'MicrosoftEdgeBackups',
 'Music',
 'My Documents',
 'My Online Documents',
 'NetHood',
 'NTUSER.DAT',
 'ntuser.dat.LOG1',
 'ntuser.dat.LOG2',
 'NTUSER.DAT{33a11152-60a4-11ea-9bba-9ac472c3cb59}.TM.blf',
 'NTUSER.DAT{33a11152-60a4-11ea-9bba-9ac472c3cb59}.TMContainer00000000000000000001.regtrans-ms',
 'NTUSER.DAT{33a11152-60a4-11ea-9bba-9ac472c3cb59}.TMContainer00000000000000000002.regtrans-ms',
 'ntuser.ini',
 'OneDrive',
 'Pictures',
 'Postman',
 'practice.txt',
 'PrintHood',
 'Recent'

In [36]:
#how to delete a file
#os.unlink(path) which deletes a file at the path you provided
#os.rmdir(path) which deletes a folder (folder must be empty) at the path you provided
#os.rmtree(path) this is the most dangerous, as itl will remove all files and folders contianed in the path
# All of these methods can not be reversed! WHich means if you make a mistake you won't be able to recover the file.
# Instead we will use the send2trash module. A safer alternatice that sends deleted files to the trash bin instead of permanent removal
#pip install send2trash

In [5]:
import send2trash

In [8]:
os.listdir()

['.ipynb_checkpoints',
 'BlackJack Milestone Project.ipynb',
 'cap.py',
 'Eigth Decorators.ipynb',
 'Fifth OOP.ipynb',
 'MyMainPackage',
 'mymodule.py',
 'myprogram.py',
 'Ninth Advanced Python Modules.ipynb',
 'one.py',
 'Seventh - 2nd Milestone Project.ipynb',
 'simple1.py',
 'Sixth.ipynb',
 'testfile',
 'test_cap.py',
 'two.py',
 'War Card Game Milestone Project.ipynb',
 '__pycache__']

In [9]:
shutil.move('C:\\Users\\Halim\\practice.txt', os.getcwd())

'C:\\Users\\Halim\\Desktop\\2020Files\\python-notes\\Laptop\\practice.txt'

In [10]:
os.listdir()

['.ipynb_checkpoints',
 'BlackJack Milestone Project.ipynb',
 'cap.py',
 'Eigth Decorators.ipynb',
 'Fifth OOP.ipynb',
 'MyMainPackage',
 'mymodule.py',
 'myprogram.py',
 'Ninth Advanced Python Modules.ipynb',
 'one.py',
 'practice.txt',
 'Seventh - 2nd Milestone Project.ipynb',
 'simple1.py',
 'Sixth.ipynb',
 'testfile',
 'test_cap.py',
 'two.py',
 'War Card Game Milestone Project.ipynb',
 '__pycache__']

In [11]:
send2trash.send2trash('practice.txt')

In [15]:
os.listdir()

['.ipynb_checkpoints',
 'BlackJack Milestone Project.ipynb',
 'cap.py',
 'Eigth Decorators.ipynb',
 'Example_Top_Level',
 'Fifth OOP.ipynb',
 'MyMainPackage',
 'mymodule.py',
 'myprogram.py',
 'Ninth Advanced Python Modules.ipynb',
 'one.py',
 'Seventh - 2nd Milestone Project.ipynb',
 'simple1.py',
 'Sixth.ipynb',
 'testfile',
 'test_cap.py',
 'two.py',
 'War Card Game Milestone Project.ipynb',
 '__pycache__']

In [13]:
os.getcwd()

'C:\\Users\\Halim\\Desktop\\2020Files\\python-notes\\Laptop'

In [14]:
file_path = 'C:\\Users\\Halim\\Desktop\\2020Files\\python-notes\\Laptop\\Example_Top_Level'

In [18]:
#directory tree generator
#use tuple unpacking
for folder, sub_folders, files in os.walk(file_path):
    
    print(f"Currently looking at {folder}")
    print('\n')
    print('The subfolders are: ')
    for sub_fold in sub_folders:
        print(f"\t Subfolder: {sub_fold}")
    
    print('\n')
    print("The files are:")
    for f in files:
        print(f"\t File: {f}")
    print('\n')

Currently looking at C:\Users\Halim\Desktop\2020Files\python-notes\Laptop\Example_Top_Level


The subfolders are: 
	 Subfolder: Mid-Example-One


The files are:
	 File: Mid-Example.txt


Currently looking at C:\Users\Halim\Desktop\2020Files\python-notes\Laptop\Example_Top_Level\Mid-Example-One


The subfolders are: 
	 Subfolder: Bottom-Level-One
	 Subfolder: Bottom-Level-Two


The files are:
	 File: Mid-Level-Doc.txt


Currently looking at C:\Users\Halim\Desktop\2020Files\python-notes\Laptop\Example_Top_Level\Mid-Example-One\Bottom-Level-One


The subfolders are: 


The files are:
	 File: One_Text.txt


Currently looking at C:\Users\Halim\Desktop\2020Files\python-notes\Laptop\Example_Top_Level\Mid-Example-One\Bottom-Level-Two


The subfolders are: 


The files are:
	 File: Bottom-Text-Two.txt




In [21]:
os.walk(file_path) #generator object

<generator object walk at 0x000002BCDA6B33C8>

In [23]:
#PYTHON DATETIME MODULE
#create objects tha have information on date and time
#as well as timezones, operations between datetime objects
#such as how many seconds have passed or days have passed

In [24]:
#explore datetime factor
import datetime

In [26]:
mytime = datetime.time(2, 20)
#its 2 am, 20 minutes pass, using a 24hr clock
#if you dont provide certain info it will automatically fill it in

In [28]:
mytime.minute #attribute minute

20

In [29]:
mytime.hour

2

In [30]:
print(mytime) #automatically fills in seconds with 00

02:20:00


In [32]:
mytime.microsecond

0

In [33]:
mytime = datetime.time(13,20,1,20)

In [35]:
print(mytime)

13:20:01.000020


In [36]:
mytime.microsecond

20

In [38]:
type(mytime)
#it is a datetime.time object
#so only holds values of time
#and has no date associated with this

datetime.time

In [39]:
#we could add in date information using a date object
#or a combined datetime object

In [40]:
today = datetime.date.today()

In [41]:
print(today)

2020-08-04


In [42]:
#all of these are attributes
today.year

2020

In [43]:
today.month

8

In [44]:
today.day

4

In [45]:
#c time formatting
today.ctime()

'Tue Aug  4 00:00:00 2020'

In [46]:
#want datetime info
#the object is called the samething as a the module
from datetime import datetime

In [48]:
mydatetime = datetime(2021, 10, 3, 14, 20, 1)

In [49]:
print(mydatetime)

2021-10-03 14:20:01


In [50]:
#if you made a mistake use the replace functionality
mydatetime.replace(year =2020)

datetime.datetime(2020, 10, 3, 14, 20, 1)

In [51]:
mydatetime = mydatetime.replace(year =2020)

In [52]:
print(mydatetime)

2020-10-03 14:20:01


In [53]:
#common to perform with datetime objects is simple arithmatic
#example someone logs in to a website on a certain day and log back out on a certain date
#want to figure out how long did they spend on my website or logged in
#we can perform simple arithmatic on a date object or a datetime object

In [54]:
# DATE
from datetime import date

In [55]:
date1 = date(2021, 11, 3)
date2 = date(2020, 11, 3)

In [56]:
#how much time between date 1 and date 2
date1 - date2

datetime.timedelta(days=365)

In [57]:
#if you date the difference betweem two date objects 
#it reports back the difference in number of days
result = date1 - date2

In [59]:
type(result) #the type is a special timedelta object

datetime.timedelta

In [60]:
result.days

365

In [61]:
#can perform arithmatic on a datetime object
datetime1 = datetime(2021, 11, 3, 22, 0)

In [68]:
datetime2 = datetime(2020, 11, 3, 12, 0)

In [69]:
datetime1 - datetime2

datetime.timedelta(days=365, seconds=36000)

In [70]:
#10 hrs apart
36000/60/60

10.0

In [71]:
mydiff = datetime1 - datetime2

In [72]:
mydiff.seconds

36000

In [74]:
mydiff.total_seconds() #inclusive of everything reported as seconds

31572000.0

In [75]:
#Python Math and Random Modules

In [76]:
#math module holds a ton of useful math functions
#random module that contains mathematical random functions
#as well as random functions for grabbing a random item from a python list
#eg. random.shuffle()

In [77]:
import math

In [78]:
help(math)

Help on built-in module math:

NAME
    math

DESCRIPTION
    This module provides access to the mathematical functions
    defined by the C standard.

FUNCTIONS
    acos(x, /)
        Return the arc cosine (measured in radians) of x.
    
    acosh(x, /)
        Return the inverse hyperbolic cosine of x.
    
    asin(x, /)
        Return the arc sine (measured in radians) of x.
    
    asinh(x, /)
        Return the inverse hyperbolic sine of x.
    
    atan(x, /)
        Return the arc tangent (measured in radians) of x.
    
    atan2(y, x, /)
        Return the arc tangent (measured in radians) of y/x.
        
        Unlike atan(y/x), the signs of both x and y are considered.
    
    atanh(x, /)
        Return the inverse hyperbolic tangent of x.
    
    ceil(x, /)
        Return the ceiling of x as an Integral.
        
        This is the smallest integer >= x.
    
    copysign(x, y, /)
        Return a float with the magnitude (absolute value) of x but the sign of y.
   

In [79]:
value = 4.35

In [80]:
#round a value
math.floor(value)

4

In [82]:
math.ceil(value) #rounds up

5

In [83]:
round(4.35)

4

In [84]:
round(4.5)#round to all even or all odds

4

In [86]:
round(5.5) #if you round down all the time then all your estimates will be lower than they should be

6

In [87]:
math.pi

3.141592653589793

In [88]:
from math import pi

In [89]:
pi

3.141592653589793

In [90]:
math.e

2.718281828459045

In [91]:
math.inf

inf

In [92]:
math.nan

nan

In [93]:
# Numpy library which is a library specific to numerical processing
#it is highly efficient compared to Python's built in math module

In [94]:
math.e

2.718281828459045

In [95]:
math.log(math.e)#base to the power to some n is equal to the variable you provide

1.0

In [96]:
#custom log base
math.log(100,10) #x base 10

2.0

In [97]:
#it is solving for what number do I have to take 10 to the power of
#in order to get 100

In [98]:
10**2

100

In [99]:
math.sin(10)

-0.5440211108893698

In [100]:
math.degrees(pi/2)

90.0

In [101]:
math.radians(180)

3.141592653589793

In [102]:
#random module allows us to create random numbers
#we could also set a a seed to produce the same random set every time
#psuedo random number generators

In [103]:
#A seed

In [104]:
import random

In [109]:
random.randint(0, 100) #to produce a random integer

82

In [110]:
#however if you are testing code scripts and you want to make adjustments to this script of code
#but I want to test it on random numbers but with the same batch of random numbers
#this is where the seed comes to play, setting the seed allows to start with a seeded
#psuedo random number generator, the same random numbers will show up in a series

In [117]:
#seed needs to be in the same cell as whatever function we are using to generate random numbers
random.seed(101)
#the seed starts the same series of random integers
random.randint(0,100)

74

In [118]:
random.randint(0,100)

24

In [128]:
random.seed(101) #set a seed for any sequence of infinite random numbers
print(random.randint(0,100)) # 74
print(random.randint(0,100)) # 24
print(random.randint(0,100)) # 69
print(random.randint(0,100))
print(random.randint(0,100))

74
24
69
45
59


In [129]:
mylist = list(range(0,20))

In [130]:
mylist

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [132]:
random.choice(mylist) #chooses a random item from a list

1

In [133]:
mylist

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [134]:
#what if you want to grab multiple items from this list

In [135]:
#1. you could allow yourself to pick the same number twice
#i.e. sampling with replacement

#2. or sample without replacement
#once you pick out the item from this list you cant pick it again

In [136]:
# SAMPLE WITH REPLACMENT

In [138]:
random.choices(population=mylist, k=10)
#I am going to randomly pick a number from this list
#and will be doing this 10 times and it will return a list with these numbers
#should see some numbers repeated

[15, 7, 9, 6, 12, 8, 4, 8, 6, 2]

In [139]:
# SAMPLE WITHOUT REPLACEMENT
#once i chosen a number I dont get to pick it again
random.sample(population=mylist, k=10)

[19, 10, 8, 12, 16, 3, 6, 1, 13, 18]

In [140]:
#how to shuffle the list
#will permanently affect the list so you dont need to assign it to anything

In [141]:
random.shuffle(mylist)

In [142]:
mylist

[8, 10, 16, 7, 1, 15, 9, 2, 4, 19, 11, 17, 18, 12, 5, 3, 6, 14, 0, 13]

In [144]:
#probability distributions
#uniform distribution (which will randomly pick a value bw a and b)
#this is a continous ditribution which means floating point numbers are allowed
#can pick any number bw 0 and 100 to a very large floating point precision
#called uniform bc every number bw 0 and 100 has the same likely hood to be chosen
random.uniform(a=0, b=100)

99.08921325823393

In [145]:
#also a normal or gaussing distribution
#this also takes in a mean and standard deviation
random.gauss(mu=0, sigma=1)

1.0038291308472342

In [147]:
#SHOULD REALLY USE NUMPY IF YOU ARE USING UNIFORM OR GAUSS
#ESPECIALLY MACHINE LEARNING AND DATA SCIENCE

In [148]:
#PYTHON DEBUGGER

In [151]:
#instead of using print statments
#x is a list
x = [1,2,3]
y = 2
z = 3

result = y + z
print(result)
result2 = x + y #cannot add a list to an integer
print(y)
print(result2)

5


TypeError: can only concatenate list (not "int") to list

In [154]:
import pdb

In [156]:
#use a python debugger to set a trace
#trace will allow you to pause operation mid script
#and allow you to play with variables to understand what is going on
#that is why you don't need to keep on adding in print functions for x, y z etc
#set a python debugger before the error
#type errors will report what line the error occurs on
#set trace before the error line
x = [1,2,3]
y = 2
z = 3

result_one = y + z

pdb.set_trace()

result_two = y + x

--Return--
> <ipython-input-156-2df44408cc57>(14)<module>()->None
-> pdb.set_trace()
(Pdb) x
[1, 2, 3]
(Pdb) y
2
(Pdb) result_one
5
(Pdb) q


BdbQuit: 

In [None]:
#what pdb allows us to explore and call variables at this point in time
#so in the line before result2
#a variable can be None when you try to interact with it
#and wont be able to perform operations bc no values
#PDB only helps you find what variables are assigned to what in mid operation

In [157]:
#type q to quite the debugger

In [158]:
#Python Regular Expressions Part One

In [159]:
#We could seach for substrings within a larger string with the in operator
"dog" in "my dog is great"

True

In [161]:
#Regular Expressions (regex) allow us to search for general patterns in text data!
#for example email format: user@email.com
#in this case we are looking gor a pattern "text" + "@" + "text" + ".com"
#the re library allows us to create specialized pattern strings and then search for matches within text
#Phone Number (555)-555-5555
#Regex Pattern r"(\d\d\d)-\d\d\d-\d\d\d\d"
#r means dont treat this as a normal string bc there are actually identifiers in the string
#\ backslashes corresponds with the indivual identifiers
# these identifiers are placeholders or wildcards waiting for a match based in a particular datatype
# eg \d means digit
# () and - are format strings (looking for an exact string)
# they dont have \ backslashes bc they are not identifiers in the regular expression
#r"(\d{3})-\d{3}-\d{4}"
# uses quantifiers in order to reduce amount of text (find 3 digits)

In [162]:
text = "The agent's phone number is 408-555-1234. Call soon!"

In [163]:
#can search for simple strings in the text
'phone' in text

True

In [164]:
import re

In [165]:
pattern = 'phone'

In [166]:
#say re and call the search function off the reg exp module
#and pass in the pattern and text
re.search(pattern,text)

<re.Match object; span=(12, 17), match='phone'>

In [167]:
# you get this matched object
# this reports back whether there is a match to the phone
# but also where the actual index location spans to 
# so starts at index 12 and ends and 17

In [168]:
pattern = 'NOT IN TEXT'

In [170]:
re.search(pattern,text)

In [175]:
#we get none bc there is no match
#run it again

In [176]:
pattern = 'phone'

In [177]:
match = re.search(pattern,text)

In [178]:
match

<re.Match object; span=(12, 17), match='phone'>

In [179]:
match.span()#to get index location of the span

(12, 17)

In [181]:
match.start()

12

In [182]:
match.end()

17

In [183]:
#if we have multiple matches it would only return 1 match

In [184]:
text = 'my phone once, my phone twice'

In [185]:
match = re.search('phone', text)

In [186]:
match

<re.Match object; span=(3, 8), match='phone'>

In [187]:
#if you want to find all the matches use the findAll func instead

In [188]:
matches = re.findall('phone',text)

In [189]:
matches #list of all the matches

['phone', 'phone']

In [190]:
#to check how many matches get the length of this list
len(matches)

2

In [192]:
#if you want to get back actual match objects use an iterator
#it iterates through this "text" and returns each match object that is found
for match in re.finditer('phone', text):
    print(match)

<re.Match object; span=(3, 8), match='phone'>
<re.Match object; span=(18, 23), match='phone'>


In [194]:
for match in re.finditer('phone', text):
    print(match.span())

(3, 8)
(18, 23)


In [196]:
#if you wanted the actual text that matched
#use the group method
#essentially what you get back if you use findall
for match in re.finditer('phone', text):
    print(match.group())

phone
phone


In [197]:
#above we realized the critical functions of the regular expressions module
#re.search() input pattern and text and returns match object
#re.findall() returns the list of strings that match

#re.finditer where we go through the entire string and return back
#match objects for the actual pattern you are searching for in the text
#then you could call any methods you want off of that match object

In [198]:
#HOW TO BUILD THE REG EXP IDENTIFIER CODES

In [199]:
#Python Regular Expressions Part Two

In [200]:
#use special pattern codes to build pattern sequences

In [204]:
#character identifiers
#start off with backslash code and a letter to mean which character you are referencing
#eg.
#\d is a placeholder for any digit
#file_\d\d would match file_25
#alphanumeric means letters or numbers (alphabet or numerical) \w
#any letter or number followed by a dash and any 3 letters or numbers
#eg. A-b_1
#white space is \s
# a\sb\sc is a b c matches this
# A non digit \D
# Non-alphanumeric \W\W\W\W\W *-_=)
# Non-whitespace \S\S\S\S Yoyo

In [205]:
text = 'My phone number is 408-555-1234'

In [209]:
phone = re.search('408-555-1234',text)

In [210]:
phone

<re.Match object; span=(19, 31), match='408-555-1234'>

In [211]:
#but we may not know the actual number but only the pattern

In [215]:
text = 'My phone number is 408-555-7777'

In [231]:
phone = re.search(r'\d\d\d-\d\d\d-\d\d\d\d',text)
#note these backslashes indicate special excape chracters
#such as \n for newline or \t for tab
#to tell python that we are using a reg exp we add r in the front of the string

In [232]:
phone #we are searching for the pattern itself

<re.Match object; span=(19, 31), match='408-555-7777'>

In [233]:
#if you want the actual number we call the group method
phone.group()

'408-555-7777'

In [219]:
#what if we wanted to include a pattern that included alot of digits'
#we could use quantifiers to indicate character repetition
#use the character identifier then immediately after use the quantifiers
#to indicate a certain quantity

<table ><tr><th>Character</th><th>Description</th><th>Example Pattern Code</th><th >Exammple Match</th></tr>

<tr ><td><span >+</span></td><td>Occurs one or more times</td><td>	Version \w-\w+</td><td>Version A-b1_1</td></tr>

<tr ><td><span >{3}</span></td><td>Occurs exactly 3 times</td><td>\D{3}</td><td>abc</td></tr>



<tr ><td><span >{2,4}</span></td><td>Occurs 2 to 4 times</td><td>\d{2,4}</td><td>123</td></tr>



<tr ><td><span >{3,}</span></td><td>Occurs 3 or more</td><td>\w{3,}</td><td>anycharacters</td></tr>

<tr ><td><span >\*</span></td><td>Occurs zero or more times</td><td>A\*B\*C*</td><td>AAACC</td></tr>

<tr ><td><span >?</span></td><td>Once or none</td><td>plurals?</td><td>plural</td></tr></table>

In [227]:
#does s? occur once or none
#match could be plural or plurals

In [228]:
#A*B*C
#does A occur zero or more times
#followed by does B occur zero or more times
#followed by does C occur zero or more times
#matches alot of patterns

#egs. AAACC 
#A occured 3 times
#B occured zero or more times
#C occured 2 times

In [236]:
phone = re.search(r'\d{3}-\d{3}-\d{4}',text)

In [237]:
phone

<re.Match object; span=(19, 31), match='408-555-7777'>

In [239]:
#now lets imagine if we want to dp tasks
#one to find phone numbers
#but also be able to extract their area code (first 3 digits of the phone number)

#we could use groups
#for any general tasks that involves grouping together regular expression
#which allows us later to break them down
#use the compile function
#compile does is compiles together different reg exps pattern codes
#r'\d{3}-\d{3}-\d{4}' this is a single pattern code
#but we could think of it as 3 pattern codes connected by the dash

#() indicate that is a group of a pattern

In [241]:
phone_pattern = re.compile(r'(\d{3})-(\d{3})-(\d{4})')
#takes multiple pattern codes and each code is seperated as a group with ()
#and compiles them into a single expression
#what is cool with compile is that it still understands it is 3 seperate groupings
#can call the groupings individually

In [242]:
results = re.search(phone_pattern, text)

In [243]:
results.group() #returns all the patterns

'408-555-7777'

In [244]:
#can call the group postion denoted by the parenthese ()
#group ordering starts at 1
results.group(1)

'408'

In [245]:
results.group(2)

'555'

In [246]:
results.group(3)

'7777'

In [247]:
results.group(4)

IndexError: no such group

In [248]:
#therefore we could extract parts of the information while looking for a complete match

In [249]:
#COMPILE AND GROUP FUNCTIONS ABOVE

In [250]:
# Python Regular Expressions Part Three

In [251]:
#can use the OR Operator to search for multiple terms
re.search(r'cat', 'The cat is here')

<re.Match object; span=(4, 7), match='cat'>

In [253]:
#search for cat or dog using the pipe operator |
re.search(r'cat|dog', 'The dog is here')

<re.Match object; span=(4, 7), match='dog'>

In [254]:
#wildcard operator where it acts as a placement that will match any character there

In [255]:
re.findall(r'at', 'The cat in the hat sat there.')

['at', 'at', 'at']

In [256]:
#if you want to find the character before at
#. is a wildcard
re.findall(r'.at', 'The cat in the hat sat there.')

['cat', 'hat', 'sat']

In [261]:
re.findall(r'...at', 'The cat in the hat went splat.')
#. also counting spaces
#want more control use character idenifier before

['e cat', 'e hat', 'splat']

In [262]:
#want to find everything that starts with a number use caret ^
re.findall(r'^\d', '1 is a number')

['1']

In [263]:
#this search only the entire text itself that starts with a number
re.findall(r'^\d', 'The 2 is a number')

[]

In [265]:
#$ means ends with
re.findall(r'\d$', 'The number is 2')

['2']

In [266]:
#to exclude characters we use enjuction with a set of brackets
phrase = 'there are 3 numbers 34 inside 5 this sentence'

In [267]:
#want to get everything that is not a number
#exclude digits
#use inside square brackets whatever you want to exclude
pattern = r'[^\d]' #exclude any digits (anything that matched the character identifier)

In [268]:
#get back a list of the characters that are not a number
re.findall(pattern, phrase)

['t',
 'h',
 'e',
 'r',
 'e',
 ' ',
 'a',
 'r',
 'e',
 ' ',
 ' ',
 'n',
 'u',
 'm',
 'b',
 'e',
 'r',
 's',
 ' ',
 ' ',
 'i',
 'n',
 's',
 'i',
 'd',
 'e',
 ' ',
 ' ',
 't',
 'h',
 'i',
 's',
 ' ',
 's',
 'e',
 'n',
 't',
 'e',
 'n',
 'c',
 'e']

In [269]:
#if you want to get all the words back together
#add a + sign to occur one or more times
pattern = r'[^\d]+'
re.findall(pattern, phrase)

['there are ', ' numbers ', ' inside ', ' this sentence']

In [270]:
#we use the exlusion syntax
# a common way to get rid of puctuation in a sentence
test_phrase = 'This is a string! But it has punctuation. How can we remove it?'

In [272]:
re.findall(r'[^!.?]+',test_phrase)
#want to exclude those wherever they occur (!.?)

['This is a string', ' But it has punctuation', ' How can we remove it']

In [273]:
#it is split on any punctuation
#we could add a space to remove the spaces to get a list of all the words
clean = re.findall(r'[^!.? ]+',test_phrase)

In [274]:
clean

['This',
 'is',
 'a',
 'string',
 'But',
 'it',
 'has',
 'punctuation',
 'How',
 'can',
 'we',
 'remove',
 'it']

In [275]:
' '.join(clean)

'This is a string But it has punctuation How can we remove it'

In [276]:
#always break down reg exp pattern codes to undestand what is happening

In [278]:
#square brackets are to group things
#fore example [^ ] is group these things for exclusion

In [280]:
text = 'Only find the hypen-words in this sentence. But you do not know how long-ish they are'

In [281]:
#don't know how many letters are before or after the -
#try to find words that have a - in the middle of them
#alphanumeric group - alphanumeric group

In [284]:
#+ this indicates a group of \w (alphanumerics) that occures one or more times
pattern = r'[\w]+'

In [285]:
re.findall(pattern, text)

['Only',
 'find',
 'the',
 'hypen',
 'words',
 'in',
 'this',
 'sentence',
 'But',
 'you',
 'do',
 'not',
 'know',
 'how',
 'long',
 'ish',
 'they',
 'are']

In [287]:
#now find the the hypenated words
pattern = r'[\w]+-[\w]+'
re.findall(pattern, text)

['hypen-words', 'long-ish']

In [289]:
#now find the the hypenated words
#this gets the same result but it is hard to read
#use the brace notation to seperate groups
pattern = r'\w+-\w+'
re.findall(pattern, text)

['hypen-words', 'long-ish']

In [None]:
#can use parenthese for multiple options

In [296]:
#can use parenthese for multiple options
text = 'Hello, wpuld you like some catfish?'
texttwo = "Hello, would you like to take a catnap?"
textthree = "Hello, have you seen this caterpillar?"

In [297]:
re.search(r'cat(fish|nap|claw)',text)

<re.Match object; span=(27, 34), match='catfish'>

In [298]:
re.search(r'cat(fish|nap|claw)',texttwo)

<re.Match object; span=(32, 38), match='catnap'>

In [300]:
re.search(r'cat(fish|nap|erpillar)',textthree)

In [301]:
#() can combine that OR statment from earlier 
#with other pieces of text and provide multiple options

In [302]:
#TIMING YOUR PYTHON CODE

In [None]:
#want to choose the fastest code bu timing the code
#Simply tracking time elapsed
#Using the timeit module
#Special %%timeit "magic" for Jupyter Notebooks

In [1]:
#if you give a number it gives a sitring list up to the number
def func_one(n):
    return [str(num) for num in range(n)]#list comprehension

In [2]:
func_one(10)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

In [3]:
def func_two(n): #use the mapping function
    return list(map(str, range(n)))

#we get the same values from range(n)
#we convert all them to a string by mapping 
#the string function for each number that results in range
#we convert that whole thing to a list by calling list on it

In [4]:
func_two(10)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

In [6]:
m = range(10)

In [10]:
m[1]

1

In [12]:
m[9]

9

In [13]:
m

range(0, 10)

In [14]:
list(m)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [15]:
#func_one and func_two do the same thing but which is more efficient
#could time the code by marking a start time and calculating the difference
import time #import the time library

In [18]:
# CURRENT TIME BEFORE
start_time = time.time() #grabs time from OS
# RUN CODE
result = func_one(1000000)
# CURRENT TIME AFTER RUNNING CODE
end_time = time.time()
# ELAPSED TIME 
elapsed_time = end_time - start_time

print(elapsed_time)

1.0820188522338867


In [19]:
# CURRENT TIME BEFORE
start_time = time.time() #grabs time from OS
# RUN CODE
result = func_two(1000000)
# CURRENT TIME AFTER RUNNING CODE
end_time = time.time()
# ELAPSED TIME 
elapsed_time = end_time - start_time

print(elapsed_time)

0.9919641017913818


In [29]:
#function two is slightly faster
#there are definitley limitations here
#if the function is faster for a tenth of a second




# CURRENT TIME BEFORE
start_time = time.time() #grabs time from OS
# RUN CODE
result = func_one(10000)
# CURRENT TIME AFTER RUNNING CODE
end_time = time.time()
# ELAPSED TIME 
elapsed_time = end_time - start_time

print(elapsed_time)

0.008780717849731445


In [30]:
# CURRENT TIME BEFORE
start_time = time.time() #grabs time from OS
# RUN CODE
result = func_two(10000)
# CURRENT TIME AFTER RUNNING CODE
end_time = time.time()
# ELAPSED TIME 
elapsed_time = end_time - start_time

print(elapsed_time)

0.007993936538696289


In [31]:
#the precision is not enough to show anything on the elapsed time
#running so fast it is hard to compare the performance 
#bw func_one and func_two

In [32]:
#use the timeit module which is specifically designed to test code
import timeit

In [33]:
#in order to time the code with the timeit module use this
#3 main things to look at
#statement, setup and number (i.e. number of times you want to run the code)
#timeit is going to run the statment code over and over again
#statement and setup are actually passed in as strings
timeit.timeit

<function timeit.timeit(stmt='pass', setup='pass', timer=<built-in function perf_counter>, number=1000000, globals=None)>

In [34]:
stmt = '''
func_one(100)
'''

In [38]:
#setup is what code you need to run before you call statement over and over again
setup = '''
def func_one(n):
    return [str(num) for num in range(n)]
'''

In [39]:
timeit.timeit(stmt,setup,number=100000)

6.698519100000112

In [43]:
stmt2 = '''
func_two(100)
'''

In [44]:
setup2 = '''
def func_two(n): #use the mapping function
    return list(map(str, range(n)))
'''

In [45]:
timeit.timeit(stmt2,setup2,number=100000)

5.711154000000079

In [46]:
#THEREFORE FUNCTION TWO PERFORMS FASTER
#RUN FOR MORE TIMES TO CONFIRM

In [49]:
#jupyter notebooks magic method %%timeit
%%timeit
func_one(100)

UsageError: Line magic function `%%timeit` not found.


In [48]:
%%timeit
func_two(100)

61.6 µs ± 2.57 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [50]:
%%timeit
func_one(100)

71.3 µs ± 3.51 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [51]:
#ZIPPING AND UNZIPPING FILES WITH PYTHON

In [52]:
f = open('fileone.txt','w+')
f.write('ONE FILE')
f.close()

In [53]:
f = open('filetwo.txt','w+')
f.write('TWO FILE')
f.close()

In [None]:
#want to compress the two files to send in an email

In [54]:
import zipfile

In [55]:
#create the zipfile first
comp_file = zipfile.ZipFile('comp_file.zip','w')

In [56]:
#compress the two text files and insert them in the zipfile
#specify the compression type
#the most standard one is ZIP_DEFLATED
comp_file.write('fileone.txt', compress_type=zipfile.ZIP_DEFLATED)

In [57]:
comp_file.write('filetwo.txt', compress_type=zipfile.ZIP_DEFLATED)

In [58]:
#once done simply close it
comp_file.close()

In [59]:
#how to extract times from zipfile
zip_obj = zipfile.ZipFile('comp_file.zip', 'r')

In [60]:
#have two options
#if you want to extract one specific file
zip_obj.extract('fileone.txt')
#or extract all and specify the folder for it to be
zip_obj.extractall('extracted_content')

In [61]:
pwd

'C:\\Users\\Halim\\Desktop\\2020Files\\python-notes\\Laptop'

In [62]:
#often compress and entire folder and extract an entire folder
#not usually compressing single text items
#use the shell utility library is better
import shutil
#can point out a directory folder into a zipfile

In [63]:
dir_to_zip = 'C:\\Users\\Halim\\Desktop\\2020Files\\python-notes\\Laptop\\extracted_content'

In [64]:
output_filename = 'example'

In [65]:
shutil.make_archive(output_filename, 'zip', dir_to_zip)
#accepts the output file name
#format such as a zip file or tar file
#choose the directory to zip

'C:\\Users\\Halim\\Desktop\\2020Files\\python-notes\\Laptop\\example.zip'

In [68]:
#how to extract the contents from a folder
shutil.unpack_archive('example.zip','final_unzip','zip')
#take filename you wish to unpack
#what should be the extracted directory be called
#what actual file type is this

In [69]:
#Advanced Python Module Puzzle - Overview