---
### 2. System Programming with Python
---

In [1]:
# Operating system interfaces (os)
import os

# Current working directory

# Get the current working directory
cwd = os.getcwd()
print(cwd)

# Set the current working directory
os.chdir(cwd)

# Temporary directory
import tempfile
tmpdir = tempfile.gettempdir()

# Join paths
mytmpdir = os.path.join(tmpdir, "foobar")
print(mytmpdir)

# list containing the names of the entries in the directory given by path.
os.listdir(tmpdir)

# Create a directory
if not os.path.exists(mytmpdir):
    os.mkdir(mytmpdir)
os.makedirs(os.path.join(tmpdir, "foobar", "plop", "toto"), exist_ok=True)

c:\Users\Daniel\Documents\DataSci-101
C:\Users\Daniel\AppData\Local\Temp\foobar


In [2]:
# File input/output
filename = os.path.join(mytmpdir, "myfile.txt")
print(filename)

# Write
lines = ["Dans python tout est bon", "Enfin, presque"]

## write line by line
fd = open(filename, "w")
fd.write(lines[0] + "\n")
fd.write(lines[1]+ "\n")
fd.close()

## use a context manager to automatically close your file
with open(filename, 'w') as f:
    for line in lines:
        f.write(line + '\n')

# Read

## read one line at a time (entire file does not have to fit into memory)
f = open(filename, "r")
f.readline() # one string per line (including newlines)
f.readline() # next line
f.close()

## read one line at a time (entire file does not have to fit into memory)
f = open(filename, 'r')
f.readline() # one string per line (including newlines)
f.readline() # next line
f.close()

## read the whole file at once, return a list of lines
f = open(filename, 'r')
f.readlines() # one list, each line is one string
f.close()

## use list comprehension to duplicate readlines without reading entire file at once
f = open(filename, 'r')
[line for line in f]
f.close()

## use a context manager to automatically close your file
with open(filename, 'r') as f:
    lines = [line for line in f]

C:\Users\Daniel\AppData\Local\Temp\foobar\myfile.txt


In [4]:
# Explore, list directories

# Walk
WD = os.path.join(tmpdir, "foobar")
for dirpath, dirnames, filenames in os.walk(WD):
    print(dirpath, dirnames, filenames)

# glob, basename and file extension
import tempfile
import glob
tmpdir = tempfile.gettempdir()
filenames = glob.glob(os.path.join(tmpdir, "*", "*.txt"))
print(filenames)

# take basename then remove extension
basenames = [os.path.splitext(os.path.basename(f))[0] for f in filenames]
print(basenames)

C:\Users\Daniel\AppData\Local\Temp\foobar ['plop'] ['myfile.txt']
C:\Users\Daniel\AppData\Local\Temp\foobar\plop ['toto'] []
C:\Users\Daniel\AppData\Local\Temp\foobar\plop\toto [] []
['C:\\Users\\Daniel\\AppData\\Local\\Temp\\acrobat_sbx\\acroNGLLog.txt', 'C:\\Users\\Daniel\\AppData\\Local\\Temp\\foobar\\myfile.txt']
['acroNGLLog', 'myfile']


In [6]:
# shutil - High-level file operations
import shutil
src = os.path.join(tmpdir, "foobar", "myfile.txt")
dst = os.path.join(tmpdir, "foobar", "plop", "myfile.txt")
print("copy %s to %s" % (src, dst))

shutil.copy(src, dst)
print("File %s exists ?" % dst, os.path.exists(dst))

src = os.path.join(tmpdir, "foobar", "plop")
dst = os.path.join(tmpdir, "plop2")
print("copy tree %s under %s" % (src, dst))

try:
    shutil.copytree(src, dst)
    shutil.rmtree(dst)
    shutil.move(src, dst)
except (FileExistsError, FileNotFoundError) as e:
    pass

copy C:\Users\Daniel\AppData\Local\Temp\foobar\myfile.txt to C:\Users\Daniel\AppData\Local\Temp\foobar\plop\myfile.txt
File C:\Users\Daniel\AppData\Local\Temp\foobar\plop\myfile.txt exists ? True
copy tree C:\Users\Daniel\AppData\Local\Temp\foobar\plop under C:\Users\Daniel\AppData\Local\Temp\plop2


In [11]:
# Multiprocessing and multithreading

'''
Process
- A process is a name given to a program instance that has been loaded into memory and managed by the operating system.
- Process = address space + execution context (thread of control)

Threads
- Threads share the same address space (Data registers): access to code, heap and (global) data.

In Python
- The threading module uses threads.
- The multiprocessing module uses processes.
'''

# Multithreading
import time
import threading

def list_append(count, sign=1, out_list=None):
    if out_list is None:
        out_list = list()
    for i in range(count):
        out_list.append(sign * i)
        sum(out_list) # do some computation
    return out_list

size = 10000 # Number of numbers to add
out_list = list() # result is a simple list
thread1 = threading.Thread(target=list_append, args=(size, 1, out_list, ))
thread2 = threading.Thread(target=list_append, args=(size, -1, out_list, ))
startime = time.time()

# Will execute both in parallel
thread1.start()
thread2.start()

# Joins threads back to the parent process
thread1.join()
thread2.join()
print("Threading ellapsed time ", time.time() - startime)
print(out_list[:10])

Threading ellapsed time  2.0287418365478516
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [12]:
# Multiprocessing
import multiprocessing

# Sharing requires specific mecanism
out_list1 = multiprocessing.Manager().list()
p1 = multiprocessing.Process(target=list_append, args=(size, 1, None))
out_list2 = multiprocessing.Manager().list()
p2 = multiprocessing.Process(target=list_append, args=(size, -1, None))

startime = time.time()

p1.start()
p2.start()
p1.join()
p2.join()

print("Multiprocessing ellapsed time ", time.time() - startime)
print(out_list[:10]) 

Multiprocessing ellapsed time  0.09081721305847168
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [14]:
# Managers
'''
Managers provide a way to create data which can be shared between different processes, including
sharing over a network between processes running on different machines. A manager
object controls a server process which manages shared objects.
'''
size = int(size / 100) # Number of numbers to add

# Sharing requires specific mecanism
out_list = multiprocessing.Manager().list()
p1 = multiprocessing.Process(target=list_append, args=(size, 1, out_list))
p2 = multiprocessing.Process(target=list_append, args=(size, -1, out_list))

startime = time.time()

p1.start()
p2.start()
p1.join()
p2.join()

print(out_list[:10])
print("Multiprocessing with shared object ellapsed time ", time.time() - startime)

[]
Multiprocessing with shared object ellapsed time  0.08770084381103516


In [15]:
# Scripts and argument parsing
import os
import os.path
import argparse
import re
import pandas as pd
if __name__ == "__main__":
    # parse command line options
    output = "word_count.csv"
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input',
        help='list of input files.',
        nargs='+', type=str)
    parser.add_argument('-o', '--output',
        help='output csv file (default %s)' % output,
        type=str, default=output)
    options = parser.parse_args()
    
    if options.input is None :
        parser.print_help()
        raise SystemExit("Error: input files are missing")
    else:
        filenames = [f for f in options.input if os.path.isfile(f)]
    
    # Match words
    regex = re.compile("[a-zA-Z]+")
    
    count = dict()
    for filename in filenames:
        fd = open(filename, "r")
        for line in fd:
            for word in regex.findall(line.lower()):
                if not word in count:
                    count[word] = 1
                else:
                    count[word] += 1
    fd = open(options.output, "w")
    
    # Pandas
    df = pd.DataFrame([[k, count[k]] for k in count], columns=["word", "count"])
    df.to_csv(options.output, index=False)

usage: ipykernel_launcher.py [-h] [-i INPUT [INPUT ...]] [-o OUTPUT]
ipykernel_launcher.py: error: unrecognized arguments: --ip=127.0.0.1 --stdin=9003 --control=9001 --hb=9000 --Session.signature_scheme="hmac-sha256" --Session.key=b"5649d682-1af8-459c-9e6e-5aedd0fe3c3e" --shell=9002 --transport="tcp" --iopub=9004 --f=C:\Users\Daniel\AppData\Local\Temp\tmp-11832feQVlC9DlZu2.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
