---
## Create big data with faker and use generator
### Tuesday: 27-02-2024

* Read the data normally

* Read the data using generator 

* Compare time and memory usage

---

In [20]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join('..')))
import pandas as pd
from cProfile import Profile
from pstats import SortKey, Stats
from faker import Faker
from fakerHandler import FakerHandler
from Utils import Utils as utils

fake = Faker()
dataSize = 1000000
fake_user_filepath = "../data/csv/fake_users_"+str(dataSize)+".csv"

def createFakeData(filepath, size):
    if not os.path.exists(filepath):
        data = {
            'Name': [fake.name() for _ in range(size)],
            'Address': [fake.address().replace('\n', ', ') for _ in range(dataSize)],
            'Email': [fake.email() for _ in range(size)]
        }
        df = pd.DataFrame(data)
        df.to_csv(filepath, index=False)

def fakerHandler_read_csv():
    fH = FakerHandler()
    lines = fH.readCSV(fake_user_filepath)
    #print(lines[0])
    #print(lines[1])

def fakerHandler_read_csv_generator():
    fH = FakerHandler()
    line = fH.readCSVGenerator(fake_user_filepath)
    #print(next(line))
    #print(next(line))

# Create fake data
createFakeData(fake_user_filepath, dataSize)

# Measure the functions using own utility
print("-Measuring Data using own utility")
print(">>>Measuring fakerHandler_read_csv")
utils.measure_function(fakerHandler_read_csv)
print(">>>Measuring fakerHandler_read_csv_generator")
utils.measure_function(fakerHandler_read_csv_generator)

# Measure the functions with cProfile
print("-Measuring Data using cProfile")
print(">>>Measuring fakerHandler_read_csv")
with Profile() as pr:
    fakerHandler_read_csv()
    print(f"{fakerHandler_read_csv = }")
    (
        Stats(pr)
        .strip_dirs()
        .sort_stats('cumulative')
        .print_stats(10)
    )

print("\n"+">>>Measuring fakerHandler_read_csv_generator")
with Profile() as pr:
    fakerHandler_read_csv_generator()
    print(f"{fakerHandler_read_csv_generator = }")
    (
        Stats(pr)
        .strip_dirs()
        .sort_stats('cumulative')
        .print_stats(10)
    )


-Measuring Data using own utility
>>>Measuring fakerHandler_read_csv
Memory used: 2.3203125 MB
Time taken: 0.53125 seconds

>>>Measuring fakerHandler_read_csv_generator
Memory used: 0.0 MB
Time taken: 0.0 seconds

-Measuring Data using cProfile
>>>Measuring fakerHandler_read_csv
fakerHandler_read_csv = <function fakerHandler_read_csv at 0x00000226FEEE77E0>
         400 function calls (398 primitive calls) in 0.496 seconds

   Ordered by: cumulative time
   List reduced from 139 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.036    0.018    0.495    0.247 base_events.py:1908(_run_once)
        1    0.000    0.000    0.458    0.458 3361915950.py:25(fakerHandler_read_csv)
        1    0.005    0.005    0.458    0.458 fakerHandler.py:5(readCSV)
        1    0.168    0.168    0.326    0.326 {method 'read' of '_io.TextIOWrapper' objects}
        1    0.000    0.000    0.158    0.158 cp1252.py:22(decode)
        1    0.158

---
## Use big data and "Do something with it"
### Tuesday: 28-02-2024

* Count names normally

* Count names using generator

---

In [21]:
from Utils import Utils as utils

def countNames(name):
    fH = FakerHandler()
    count = 0
    lines = fH.getNameData(fake_user_filepath, name)
    for _ in lines:
        count += 1
    print("Number of "+name+": ", count)

def countNamesGenerator(name):
    fH = FakerHandler()
    count = 0
    lines = fH.getNameDataGenerator(fake_user_filepath, name)
    for _ in lines:
        count += 1
    print("Number of "+name+": ", count)


# Measure the functions using own utility
print("-Measuring Data using own utility")
print(">>>Measuring countNames")
utils.measure_function(countNames, "John")
print(">>>Measuring countNamesGenerator")
utils.measure_function(countNamesGenerator, "John")

# Measure the functions with cProfile
print("-Measuring Data using cProfile")
print(">>>Measuring fakerHandler_read_csv")
with Profile() as pr:
    fakerHandler_read_csv()
    print(f"{fakerHandler_read_csv = }")
    (
        Stats(pr)
        .strip_dirs()
        .sort_stats('cumulative')
        .print_stats(10)
    )

print("\n"+">>>Measuring fakerHandler_read_csv_generator")
with Profile() as pr:
    fakerHandler_read_csv_generator()
    print(f"{fakerHandler_read_csv_generator = }")
    (
        Stats(pr)
        .strip_dirs()
        .sort_stats('cumulative')
        .print_stats(10)
    )


-Measuring Data using own utility
>>>Measuring countNames
Number of John:  63698
Memory used: 0.1015625 MB
Time taken: 0.578125 seconds

>>>Measuring countNamesGenerator
Number of John:  63698
Memory used: 0.04296875 MB
Time taken: 0.421875 seconds

-Measuring Data using cProfile
>>>Measuring fakerHandler_read_csv
fakerHandler_read_csv = <function fakerHandler_read_csv at 0x00000226FEEE77E0>
         401 function calls (399 primitive calls) in 0.604 seconds

   Ordered by: cumulative time
   List reduced from 139 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.000    0.000    0.342    0.171 events.py:86(_run)
        2    0.170    0.085    0.342    0.171 {method 'run' of '_contextvars.Context' objects}
        2    0.000    0.000    0.170    0.085 base_events.py:1908(_run_once)
        1    0.000    0.000    0.135    0.135 ioloop.py:742(_run_callback)
        1    0.000    0.000    0.135    0.135 iostream.py:616(_flu

---
## Lets test a multithreaded example
### Tuesday: 28-02-2024

* Count names from 2 files normally

* Count names from 2 files using generators and threaded

---

In [22]:
from Utils import Utils as utils
from concurrent.futures import ThreadPoolExecutor

def countNames(name):
    fH = FakerHandler()
    fake_user_filepath = "../data/csv/fake_users_1000000.csv"
    fake_user_filepath_2 = "../data/csv/fake_users_100000.csv"
    count = 0
    lines = fH.getNameData(fake_user_filepath, name)
    for _ in lines:
        count += 1
    lines = fH.getNameData(fake_user_filepath_2, name)
    for _ in lines:
        count += 1
    print("Number of "+name+": ", count)

def countThreadedNamesGenerator(name):
    fH = FakerHandler()
    fake_user_filepath = "../data/csv/fake_users_1000000.csv"
    fake_user_filepath_2 = "../data/csv/fake_users_100000.csv"

    with ThreadPoolExecutor(max_workers=2) as executor:
        future1 = executor.submit(fH.countNames, fake_user_filepath, name)
        future2 = executor.submit(fH.countNames, fake_user_filepath_2, name)

        count1 = future1.result()
        count2 = future2.result()

    total_count = count1 + count2
    print("Number of "+name+": ", total_count)

# Measure the functions using own utility
print("-Measuring Data using own utility")
print(">>>Measuring countNames")
utils.measure_function(countNames, "John")
print(">>>Measuring countThreadedNamesGenerator")
utils.measure_function(countThreadedNamesGenerator, "John")

# Measure the functions with cProfile
print("-Measuring Data using cProfile")
print(">>>Measuring countNames")
with Profile() as pr:
    countNames("John")
    print(f"{countNames = }")
    (
        Stats(pr)
        .strip_dirs()
        .sort_stats('cumulative')
        .print_stats(10)
    )

print(">>>Measuring countNames multithreaded")
with Profile() as pr:
    countThreadedNamesGenerator("John")
    print(f"{countThreadedNamesGenerator = }")
    (
        Stats(pr)
        .strip_dirs()
        .sort_stats('cumulative')
        .print_stats(10)
    )

-Measuring Data using own utility
>>>Measuring countNames
Number of John:  70112
Memory used: 0.9609375 MB
Time taken: 0.5 seconds

>>>Measuring countThreadedNamesGenerator
Number of John:  70112
Memory used: -1.6640625 MB
Time taken: 0.359375 seconds

-Measuring Data using cProfile
>>>Measuring countNames
Number of John:  70112
countNames = <function countNames at 0x00000226DEE43D80>
         70628 function calls (70624 primitive calls) in 0.749 seconds

   Ordered by: cumulative time
   List reduced from 160 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        4    0.000    0.000    0.548    0.137 events.py:86(_run)
        4    0.167    0.042    0.416    0.104 {method 'run' of '_contextvars.Context' objects}
        9    0.101    0.011    0.333    0.037 socket.py:621(send)
        3    0.000    0.000    0.204    0.068 selectors.py:319(select)
        3    0.001    0.000    0.204    0.068 selectors.py:313(_select)
        2   

---
## Lets combine what we learned
### Tuesday: 28-02-2024

* Count names from all csv files efficiently

* Use generators

* Use threads

---

In [24]:
import os
from Utils import Utils as utils
from concurrent.futures import ThreadPoolExecutor

def countThreadedNamesGenerator(name):
    fH = FakerHandler()
    directory = "../data/csv/"
    file_paths = [os.path.join(directory, file) for file in os.listdir(directory)]

    num_workers = min(os.cpu_count(), len(file_paths)) # Limit workers to number of files or number of cores

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = [executor.submit(fH.countNames, file_path, name) for file_path in file_paths]

    total_count = sum(future.result() for future in futures)
    print("Number of "+name+": ", total_count)

# Measure the functions using own utility
print("-Measuring Data using own utility")
print(">>>Measuring countThreadedNamesGenerator")
utils.measure_function(countThreadedNamesGenerator, "John")

# Measure the functions with cProfile
print("-Measuring Data using cProfile")
print(">>>Measuring countNames multithreaded")
with Profile() as pr:
    countThreadedNamesGenerator("John")
    print(f"{countThreadedNamesGenerator = }")
    (
        Stats(pr)
        .strip_dirs()
        .sort_stats('cumulative')
        .print_stats(10)
    )

-Measuring Data using own utility
>>>Measuring countThreadedNamesGenerator
Number of John:  70851
Memory used: 0.01171875 MB
Time taken: 0.375 seconds

-Measuring Data using cProfile
>>>Measuring countNames multithreaded
Number of John:  70851
countThreadedNamesGenerator = <function countThreadedNamesGenerator at 0x00000226FEEE7C40>
         95582 function calls (82205 primitive calls) in 0.564 seconds

   Ordered by: cumulative time
   List reduced from 215 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
    59/46    0.000    0.000    0.468    0.010 {method 'acquire' of '_thread.lock' objects}
        5    0.000    0.000    0.468    0.094 threading.py:1115(join)
      7/6    0.000    0.000    0.468    0.078 threading.py:1153(_wait_for_tstate_lock)
      2/1    0.000    0.000    0.461    0.461 1394957030.py:5(countThreadedNamesGenerator)
        1    0.000    0.000    0.461    0.461 _base.py:646(__exit__)
        1    0.000    0.00