# Files

# os module
- many low level operating system operations, including file status and manipulation
- [doc](https://docs.python.org/3/library/os.html#module-os)

# tempfile module
- will create a valid unique temporary pathname on any OS
- [doc](https://docs.python.org/3/library/tempfile.html)
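- `NamedTemporaryFile()` does create the file, but it is deleted as soon as the object is closed or garbage-collected, so keeping only its `.name` (as below) leaves a unique path with no file behind it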

In [1]:
import os
import tempfile

# NamedTemporaryFile() creates a real file; leaving the 'with' block
# closes it, which deletes it, so only the unique pathname is kept.
# Closing explicitly avoids depending on garbage collection to
# release the handle and remove the file.
with tempfile.NamedTemporaryFile() as tf:
    tp = tf.name
with tempfile.NamedTemporaryFile() as tf:
    tp2 = tf.name

# os.path.exists(path) - True if file path exists

[tp, tp2, os.path.exists(tp), os.path.exists(tp2)]

['C:\\Users\\justi\\AppData\\Local\\Temp\\tmpwg3q5flf',
 'C:\\Users\\justi\\AppData\\Local\\Temp\\tmpvd_17zou',
 False,
 False]

# Getting file status

In [2]:
# os.path.exists and os.access report 
# file status without throwing errors
# os.stat throws an error if the path doesn't exist. 

# this is similar to linux 'touch' command 
# make an empty file

open(tp, 'w').close()

def ac(p):
    """Report the access status of path p.

    Returns a list of four booleans, in this order:
    exists, readable, writeable, executable.
    Uses os.access, so a missing path gives all False
    instead of raising an error.
    """
    modes = (os.F_OK, os.R_OK, os.W_OK, os.X_OK)
    return [os.access(p, mode) for mode in modes]

ac(tp)


[True, True, True, True]

In [3]:
# last accessed time, last modified time

[os.path.getatime(tp), os.path.getmtime(tp)]

[1538003903.941263, 1538003903.941263]

In [4]:
# does a path refer to a file or a directory?

[os.path.isfile(tp), os.path.isdir(tp)]

[True, False]

In [5]:
# gets several pieces of info in one call

sr = os.stat(tp)
sr

os.stat_result(st_mode=33206, st_ino=34339947158772568, st_dev=2322009872, st_nlink=1, st_uid=0, st_gid=0, st_size=0, st_atime=1538003903, st_mtime=1538003903, st_ctime=1538003896)

In [6]:
# get attributes

[sr.st_mode, sr.st_atime]

[33206, 1538003903.941263]

In [7]:
# removes a file, but raises error if it doesn't exist

os.remove(tp)
ac(tp)

[False, False, False, False]

In [8]:
# file is gone

os.path.exists(tp)

False

In [9]:
# stat gets upset and throws an error if the file doesn't exist

os.stat(tp)

FileNotFoundError: [WinError 2] The system cannot find the file specified: 'C:\\Users\\justi\\AppData\\Local\\Temp\\tmpwg3q5flf'

In [10]:
# Returns list of files and dirs in a directory
# can use isfile and isdir to figure out 
# which is which

fds = os.listdir(os.path.expanduser('~/anaconda3'))
fds

['.nonadmin',
 'api-ms-win-core-console-l1-1-0.dll',
 'api-ms-win-core-datetime-l1-1-0.dll',
 'api-ms-win-core-debug-l1-1-0.dll',
 'api-ms-win-core-errorhandling-l1-1-0.dll',
 'api-ms-win-core-file-l1-1-0.dll',
 'api-ms-win-core-file-l1-2-0.dll',
 'api-ms-win-core-file-l2-1-0.dll',
 'api-ms-win-core-handle-l1-1-0.dll',
 'api-ms-win-core-heap-l1-1-0.dll',
 'api-ms-win-core-interlocked-l1-1-0.dll',
 'api-ms-win-core-libraryloader-l1-1-0.dll',
 'api-ms-win-core-localization-l1-2-0.dll',
 'api-ms-win-core-memory-l1-1-0.dll',
 'api-ms-win-core-namedpipe-l1-1-0.dll',
 'api-ms-win-core-processenvironment-l1-1-0.dll',
 'api-ms-win-core-processthreads-l1-1-0.dll',
 'api-ms-win-core-processthreads-l1-1-1.dll',
 'api-ms-win-core-profile-l1-1-0.dll',
 'api-ms-win-core-rtlsupport-l1-1-0.dll',
 'api-ms-win-core-string-l1-1-0.dll',
 'api-ms-win-core-synch-l1-1-0.dll',
 'api-ms-win-core-synch-l1-2-0.dll',
 'api-ms-win-core-sysinfo-l1-1-0.dll',
 'api-ms-win-core-timezone-l1-1-0.dll',
 'api-ms-win-core-

# 'walk' - gets all the files and dirs under a start dir
- very easy to use

In [14]:
# deeply nested list - use pretty printer

import pprint

# build the path from its components so the correct separator is
# used on every OS (a hard-coded '\\' only works on Windows)
e = os.path.join(os.path.expanduser('~'), 'anaconda3', 'bin')
print(e)
# os.walk is lazy - it returns a generator; list() runs it to the end.
# a start dir that doesn't exist just produces an empty list
g = os.walk(e)
ld = list(g)

pprint.pprint(ld, depth=2)

C:\Users\justi\anaconda3\bin
[]


# open function
- used to open files for reading and writing

# Writing files 
- no automatic newlines - you must write them out explicitly if you want them

In [15]:
# open file, write to the file object, close the file object
# can be error prone - easy to forget to close. also, if there
# is an error before the close call, the close could be skipped
# not closing files leaks OS file descriptors, which can cause
# a server to crash
# 'w' is the 'open mode' - tells 'open' to 
# open the file for writing

fd = open(tp, 'w')
for e in ['one', 'two', 'three', 'four']:
    fd.write(e + '\n')
fd.close()

# with 
- 'with' is a statement that runs its block under a 'context manager' (here, the file object returned by open)
- binds return value from open to 'fd'
- note ':' and indenting defines a statement block over which 'fd' will be bound
- 'with' will automatically close the file when the 'with' block is exited, even if by error

In [22]:
with open(tp, 'w') as fd:
    for e in ['one', 'two', 'three', 'four']:
        fd.write(e + '\n')

In [23]:
# could do one write with join (note: no newline after the last item)

with open(tp, 'w') as fd:
    fd.write('\n'.join(['one', 'two', 'three', 'four']))

In [24]:
# or write out the string with newlines

with open(tp, 'w') as fd:
    fd.write("one\ntwo\nthree\nfour\n")

In [25]:
# before append

os.stat(tp)

os.stat_result(st_mode=33206, st_ino=41939771529924184, st_dev=2322009872, st_nlink=1, st_uid=0, st_gid=0, st_size=23, st_atime=1538004149, st_mtime=1538004313, st_ctime=1538004149)

In [26]:
# can append(open mode 'a') to an existing file

with open(tp, 'a') as f:
    for l in ['five', 'six']:
        f.write(l + '\n')

In [27]:
# file is longer now

os.stat(tp)

os.stat_result(st_mode=33206, st_ino=41939771529924184, st_dev=2322009872, st_nlink=1, st_uid=0, st_gid=0, st_size=34, st_atime=1538004149, st_mtime=1538004315, st_ctime=1538004149)

# print function output can go to a file

In [28]:

with open(tp2, "w") as f:
    print(1,2,3,4, sep='\n', file=f)

with open(tp2, 'r') as f:
    print(f.read())

1
2
3
4



# Reading files - eager
- read the entire file immediately

In [29]:
# eager read - read the entire file into one string
# 'r' tells 'open' to open the file for reading

with open(tp, 'r') as fd:    
    print(fd.read())

one
two
three
four
five
six



In [30]:
# eager read - get a list of all the lines 

with open(tp,'r') as fd:
    print(fd.readlines())

['one\n', 'two\n', 'three\n', 'four\n', 'five\n', 'six\n']


# Reading files - lazy
- suppose you are looking for a substring in a huge unsorted file of text lines
    - lazy read probably wins
    - don't have to read in entire file before you can start searching
    - don't have to allocate memory to hold the whole file
    - once you find the substring, you don't have to read the rest of the file

In [31]:
# read one line at a time 

with open(tp, 'r') as fd:
    # prime the loop with the first line; readline() returns
    # an empty string once the end of the file is reached
    x = fd.readline()
    while x != '':
        print(x)
        x = fd.readline()

one

two

three

four

five

six



In [32]:
# note double spacing
# each line in the file has a newline,  
# plus print is adding one
# can turn off the print newline 
# with keyword arg 'end'

with open(tp, 'r') as fd:
    # prime the loop with the first line; readline() returns
    # an empty string once the end of the file is reached
    x = fd.readline()
    while x != '':
        # end='' stops print from adding a second newline
        print(x, end='')
        x = fd.readline()

one
two
three
four
five
six


In [33]:
fd = open(tp, 'r')
fd

<_io.TextIOWrapper name='C:\\Users\\justi\\AppData\\Local\\Temp\\tmpwg3q5flf' mode='r' encoding='cp949'>

In [34]:
# the file object returned by 'open' is its own
# iterator over the file lines

[fd, iter(fd), fd is iter(fd)]

[<_io.TextIOWrapper name='C:\\Users\\justi\\AppData\\Local\\Temp\\tmpwg3q5flf' mode='r' encoding='cp949'>,
 <_io.TextIOWrapper name='C:\\Users\\justi\\AppData\\Local\\Temp\\tmpwg3q5flf' mode='r' encoding='cp949'>,
 True]

In [35]:
next(fd)

'one\n'

In [36]:
# don't have to finish iterator...

next(fd)

'two\n'

In [37]:
# note with readline and readlines 
# each line has a trailing '\n', 
# which you usually don't want
# use strip() to remove
# can this cause a problem?

'one\n'.strip()

'one'

In [38]:
# read N chars at a time

with open(tp, 'r') as f:
    # read(3) returns up to 3 characters, and an empty
    # string once the end of the file is reached
    s = f.read(3)
    while s != '':
        print(s)
        s = f.read(3)
        

one

tw
o
t
hre
e
f
our

fi
ve

six




In [39]:
# ... or can finish iterator later on

[next(fd), next(fd), next(fd), next(fd)]

['three\n', 'four\n', 'five\n', 'six\n']

# Can do I/O in unicode or binary
- 'open' defaults to 'str' (unicode)
- pass 'b' flag to 'open' for 'bytes'(binary)


In [40]:
uni = '\U00002119\u01b4\u2602\u210c\xf8\u1f24'

# encode the same text three ways; each result is a 'bytes' object
utf8, utf16, utf32 = map(uni.encode, ('utf-8', 'utf-16', 'utf-32'))

[uni, utf8, utf16, utf32]

['ℙƴ☂ℌøἤ',
 b'\xe2\x84\x99\xc6\xb4\xe2\x98\x82\xe2\x84\x8c\xc3\xb8\xe1\xbc\xa4',
 b'\xff\xfe\x19!\xb4\x01\x02&\x0c!\xf8\x00$\x1f',
 b'\xff\xfe\x00\x00\x19!\x00\x00\xb4\x01\x00\x00\x02&\x00\x00\x0c!\x00\x00\xf8\x00\x00\x00$\x1f\x00\x00']

In [41]:
# won't work - file stream expects a
# 'str' by default, but utf32 is type 'bytes'

import tempfile

# the 'with' block closes (and so deletes) the temporary file
# deterministically; only its unique pathname is kept
with tempfile.NamedTemporaryFile() as tf:
    path = tf.name

with open(path, "w") as f:
    f.write(utf32)

TypeError: write() argument must be str, not bytes

In [42]:
# make a binary stream by adding 'b' flag to 'open'

with open(path, 'bw') as f:
    f.write(utf32)

In [43]:
# reading in 'str' mode decodes with the platform's default
# (locale) encoding - cp949 on this machine, not utf-8 -
# but the file we wrote is utf-32
# so, this read fails

# but, sometimes if you give open the 
# wrong encoding, it will read
# w/o error and give you garbage!

with open(path, "r") as f:
    print(f.read())

UnicodeDecodeError: 'cp949' codec can't decode byte 0xff in position 0: illegal multibyte sequence

In [44]:
# tell 'open' the right unicode encoding

with open(path, "r" , encoding='utf-32') as f:
    print(f.read())

ℙƴ☂ℌøἤ


In [45]:
# can read file bytes

with open(path, "rb") as f:
    b = f.read()
b

b'\xff\xfe\x00\x00\x19!\x00\x00\xb4\x01\x00\x00\x02&\x00\x00\x0c!\x00\x00\xf8\x00\x00\x00$\x1f\x00\x00'

In [46]:
utf32

b'\xff\xfe\x00\x00\x19!\x00\x00\xb4\x01\x00\x00\x02&\x00\x00\x0c!\x00\x00\xf8\x00\x00\x00$\x1f\x00\x00'

# In memory "files"
- very useful 
- [doc](https://docs.python.org/3.5/library/io.html#io.StringIO)

In [47]:
import io

ios = io.StringIO()

print('one', file=ios)
ios.write('two')

ios.getvalue()

'one\ntwo'

In [48]:
ios = io.StringIO('asdfasdf')

ios.read()

'asdfasdf'


# shutil module 
- move, copy, delete file trees
- [doc](https://docs.python.org/3.5/library/shutil.html)

# glob - linux style filename matching
- [doc](https://docs.python.org/3.5/library/glob.html)

# modules that read/write archive formats, like zip and tar
- [doc](https://docs.python.org/3.5/library/archiving.html)
