In [None]:
# A generator is a special type of iterator that lets
# you iterate over a sequence of values one at a time,
# without storing the entire sequence in memory.
# This makes generators more memory-efficient, especially
# with large datasets or streams of data.

In [9]:
def my_generator(n):
    """Lazily yield the integers 0, 1, ..., n - 1, one per iteration.

    Nothing is computed until the caller asks for the next value, so no
    list of n items is ever built.
    """
    upcoming = range(n)
    for value in upcoming:
        yield value
gen = my_generator(10)
# Pull the first three values by hand; each next() resumes the generator
# exactly where the previous call left off.
print(next(gen), next(gen), next(gen), sep="\n")
# list() drains whatever has not been consumed yet.
print(list(gen))

0
1
2
[3, 4, 5, 6, 7, 8, 9]


In [20]:
# A generator expression: same laziness as a generator function,
# written inline with comprehension-like syntax.
my_gen = (x ** 2 for x in range(5))
# Consume the first two squares one at a time...
print(next(my_gen), next(my_gen), sep="\n")
# ...then collect the remaining ones in a single step.
print(list(my_gen))


0
1
[4, 9, 16]


In [8]:
# Create a large test file (~1 GB; change size_in_gb below for a different size)
import time
import tracemalloc
import os

# Measure elapsed time and Python-level allocations while the file is written.
tracemalloc.start()
start_time = time.perf_counter()  # monotonic clock, suited to measuring intervals

file_path = "large_file.txt"
size_in_gb = 1
line = "This is test line for large file .\n"
# Bytes one line occupies on disk, assuming UTF-8 and a single-byte "\n".
line_size = len(line.encode("utf-8"))

# Number of whole lines that fit into the requested size.
num_lines = (size_in_gb * 1024 * 1024 * 1024) // line_size

try:
    # Pin encoding and newline so the bytes written match line_size on every
    # platform; the defaults could translate "\n" to "\r\n" (Windows) or pick
    # a non-UTF-8 encoding, making the file size differ from the target.
    with open(file_path, "w", encoding="utf-8", newline="\n") as f:
        # One line per write call: the full content is never held in memory.
        for _ in range(num_lines):
            f.write(line)

    end_time = time.perf_counter()
    current, peak = tracemalloc.get_traced_memory()
finally:
    # Stop tracing even if the write fails, so tracing is not left running.
    tracemalloc.stop()

print(f"Execution time :{end_time-start_time:.2f} Seconds")
print(f"Peak memory used :{peak / 1024 / 1024:.2f} MB")
print(f"{file_path} create with size ~{size_in_gb}")

Execution time :119.93 Seconds
Peak memory used :0.04 MB
large_file.txt create with size ~1


In [5]:
# without generator
import time
import tracemalloc

file_path = "large_file.txt"  # the file this cell reads


def process(line):
    """Placeholder for real per-line work: return the number of characters in `line`."""
    char_count = len(line)
    return char_count

# memory trace
tracemalloc.start()
start_time = time.perf_counter()  # monotonic clock, suited to measuring intervals

try:
    # Deliberately eager: readlines() builds a list holding every line of the
    # file, which is what drives the peak memory reported below.
    with open(file_path, "r") as f:
        lines = f.readlines()

    for line in lines:
        process(line)

    end_time = time.perf_counter()
    current, peak = tracemalloc.get_traced_memory()
finally:
    # Stop tracing even if reading fails (missing file, out of memory), so
    # tracing is not left running for whatever is executed next.
    tracemalloc.stop()

print(f"Execution time :{end_time-start_time:.2f} Seconds")
print(f"Peak memory used :{peak / 1024 / 1024:.2f} MB")

Execution time :27.97 Seconds
Peak memory used :2468.83 MB


In [3]:
# with generator
import time
import tracemalloc

file_path = "large_file.txt"  # the file this cell reads


def process(line):
    """Placeholder for real per-line work: return the number of characters in `line`."""
    char_count = len(line)
    return char_count

def file_read_generator(file_path):
    """Lazily yield the lines of a text file, one per iteration.

    The file is opened when iteration starts and closed when the generator
    finishes or is closed. Each yielded line keeps its trailing newline,
    if it has one.
    """
    with open(file_path, "r") as handle:
        # A file object iterates line by line, so delegate to it directly.
        yield from handle
    
# memory trace
tracemalloc.start()
start_time = time.perf_counter()  # monotonic clock, suited to measuring intervals

try:
    # Lines are pulled from the generator one at a time and handed straight
    # to process(); the loop itself never collects them into a list.
    for line in file_read_generator(file_path):
        process(line)

    end_time = time.perf_counter()
    current, peak = tracemalloc.get_traced_memory()
finally:
    # Stop tracing even if reading fails, so tracing is not left running
    # for whatever is executed next.
    tracemalloc.stop()

print(f"Execution time :{end_time-start_time:.2f} Seconds")
print(f"Peak memory used :{peak / 1024 / 1024:.2f} MB")

Execution time :28.24 Seconds
Peak memory used :0.03 MB


In [6]:
# with a generator, reading the file character by character
import time
import tracemalloc

file_path = "large_file.txt"  # the file this cell reads


def process(line):
    """Placeholder for real work: return the length of the string passed in."""
    char_count = len(line)
    return char_count

def read_file(file_path):
    """Lazily yield the lines of a text file, one per iteration.

    Each yielded line keeps its trailing newline, if it has one. The file
    is closed when the generator finishes or is closed.
    """
    with open(file_path, "r") as handle:
        # readline() returns "" only at end of file, so "" is the stop sentinel.
        for text_line in iter(handle.readline, ""):
            yield text_line

def file_read_generator(file_path, chunk_size=1024):
    """Lazily yield the characters of a text file, one per iteration.

    The file is read in blocks of up to `chunk_size` characters, so only
    one block is held at a time.

    Args:
        file_path: path of the text file to read.
        chunk_size: size passed to each read() call.
    """
    with open(file_path, "r") as handle:
        while True:
            block = handle.read(chunk_size)
            if not block:
                # An empty read means the end of the file was reached.
                break
            # Iterating a string produces its characters one by one.
            yield from block
    
# memory trace
tracemalloc.start()
start_time = time.perf_counter()  # monotonic clock, suited to measuring intervals

try:
    # Pass 1: stream the whole file one character at a time. This generator
    # yields single characters, not lines, hence the loop variable name.
    for char in file_read_generator(file_path):
        process(char)

    # Pass 2: preview the first 10 lines, then stop reading early.
    for i, line in enumerate(read_file(file_path)):
        if i >= 10:
            break
        print(line.strip())

    end_time = time.perf_counter()
    current, peak = tracemalloc.get_traced_memory()
finally:
    # Stop tracing even if reading fails, so tracing is not left running
    # for whatever is executed next.
    tracemalloc.stop()

print(f"Execution time :{end_time-start_time:.2f} Seconds")
print(f"Peak memory used :{peak / 1024 / 1024:.2f} MB")

This is test line for large file .
This is test line for large file .
This is test line for large file .
This is test line for large file .
This is test line for large file .
This is test line for large file .
This is test line for large file .
This is test line for large file .
This is test line for large file .
This is test line for large file .
Execution time :121.18 Seconds
Peak memory used :0.05 MB
