---
## Create big data with faker and use generator
### Tuesday: 27-02-2024

* Read the data normally

* Read the data using generator 

* Compare time and memory usage

---

In [33]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join('..')))
import pandas as pd
from cProfile import Profile
from pstats import Stats
from faker import Faker
from handlers.fakerHandler import FakerHandler
from Utils import Utils as utils

fake = Faker()
dataSize = 10000000
fake_user_filepath = "../data/csv/fake_users_"+str(dataSize)+".csv"

def createFakeData(filepath, size):
    if not os.path.exists(filepath):
        data = {
            'Name': [fake.name() for _ in range(size)],
            'Address': [fake.address().replace('\n', ', ') for _ in range(dataSize)],
            'Email': [fake.email() for _ in range(size)]
        }
        df = pd.DataFrame(data)
        df.to_csv(filepath, index=False)

def fakerHandler_read_csv():
    fH = FakerHandler()
    lines = fH.readCSV(fake_user_filepath)
    #print(lines[0])
    #print(lines[1])

def fakerHandler_read_csv_generator():
    fH = FakerHandler()
    line = fH.readCSVGenerator(fake_user_filepath)
    #print(next(line))
    #print(next(line))

# Create fake data
createFakeData(fake_user_filepath, dataSize)

# Measure the functions using own utility
print("-Measuring Data using own utility")
print(">>>Measuring fakerHandler_read_csv")
utils.measure_function(fakerHandler_read_csv)
print(">>>Measuring fakerHandler_read_csv_generator")
utils.measure_function(fakerHandler_read_csv_generator)

# Measure the functions with cProfile
print("-Measuring Data using cProfile")
print(">>>Measuring fakerHandler_read_csv")
with Profile() as pr:
    fakerHandler_read_csv()
    print(f"{fakerHandler_read_csv = }")
    (
        Stats(pr)
        .strip_dirs()
        .sort_stats('cumulative')
        .print_stats(10)
    )

print("\n"+">>>Measuring fakerHandler_read_csv_generator")
with Profile() as pr:
    fakerHandler_read_csv_generator()
    print(f"{fakerHandler_read_csv_generator = }")
    (
        Stats(pr)
        .strip_dirs()
        .sort_stats('cumulative')
        .print_stats(10)
    )


-Measuring Data using own utility
>>>Measuring fakerHandler_read_csv
Memory used: 0.0 MB
Time taken: 3.70639705657959 seconds

>>>Measuring fakerHandler_read_csv_generator
Memory used: 0.0 MB
Time taken: 0.0 seconds

-Measuring Data using cProfile
>>>Measuring fakerHandler_read_csv
fakerHandler_read_csv = <function fakerHandler_read_csv at 0x0000019C2E65D630>
         50 function calls in 3.080 seconds

   Ordered by: cumulative time
   List reduced from 37 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    3.079    3.079 4191684558.py:25(fakerHandler_read_csv)
        1    0.026    0.026    3.079    3.079 fakerHandler.py:5(readCSV)
        1    1.729    1.729    1.729    1.729 {method 'splitlines' of 'str' objects}
        1    1.165    1.165    1.324    1.324 {method 'read' of '_io.TextIOWrapper' objects}
        1    0.000    0.000    0.159    0.159 codecs.py:319(decode)
        1    0.159    0.159   

---
## Use big data and "Do something with it"
### Tuesday: 27-02-2024

* Count names normally

* Count names using generator

---

In [28]:
from Utils import Utils as utils

def countNames(name):
    fH = FakerHandler()
    count = 0
    lines = fH.getNameData(fake_user_filepath, name)
    for _ in lines:
        count += 1
    print("Number of "+name+": ", count)

def countNamesGenerator(name):
    fH = FakerHandler()
    count = 0
    lines = fH.getNameDataGenerator(fake_user_filepath, name)
    for _ in lines:
        count += 1
    print("Number of "+name+": ", count)


# Measure the functions using own utility
print("-Measuring Data using own utility")
print(">>>Measuring countNames")
utils.measure_function(countNames, "John")
print(">>>Measuring countNamesGenerator")
utils.measure_function(countNamesGenerator, "John")

# Measure the functions with cProfile
print("-Measuring Data using cProfile")
print(">>>Measuring fakerHandler_read_csv")
with Profile() as pr:
    fakerHandler_read_csv()
    print(f"{fakerHandler_read_csv = }")
    (
        Stats(pr)
        .strip_dirs()
        .sort_stats('cumulative')
        .print_stats(10)
    )

print("\n"+">>>Measuring fakerHandler_read_csv_generator")
with Profile() as pr:
    fakerHandler_read_csv_generator()
    print(f"{fakerHandler_read_csv_generator = }")
    (
        Stats(pr)
        .strip_dirs()
        .sort_stats('cumulative')
        .print_stats(10)
    )


-Measuring Data using own utility
>>>Measuring countNames
Number of John:  637201
Memory used: 0.01953125 MB
Time taken: 4.594434976577759 seconds

>>>Measuring countNamesGenerator
Number of John:  637201
Memory used: 0.0 MB
Time taken: 2.551888942718506 seconds

-Measuring Data using cProfile
>>>Measuring fakerHandler_read_csv
fakerHandler_read_csv = <function fakerHandler_read_csv at 0x0000019C2E65D7E0>
         50 function calls in 3.131 seconds

   Ordered by: cumulative time
   List reduced from 37 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    3.131    3.131 4191684558.py:25(fakerHandler_read_csv)
        1    0.027    0.027    3.131    3.131 fakerHandler.py:5(readCSV)
        1    1.800    1.800    1.800    1.800 {method 'splitlines' of 'str' objects}
        1    1.153    1.153    1.303    1.303 {method 'read' of '_io.TextIOWrapper' objects}
        1    0.000    0.000    0.150    0.150 codec

---
## Lets test a multithreaded example
### Wednesday: 28-02-2024

* Count names from 2 files normally

* Count names from 2 files using generators and threaded

---

In [31]:
from Utils import Utils as utils
from concurrent.futures import ThreadPoolExecutor

def countNames(name):
    fH = FakerHandler()
    fake_user_filepath = "../data/csv/fake_users_1000000.csv"
    fake_user_filepath_2 = "../data/csv/fake_users_100000.csv"
    count = 0
    lines = fH.getNameData(fake_user_filepath, name)
    for _ in lines:
        count += 1
    lines = fH.getNameData(fake_user_filepath_2, name)
    for _ in lines:
        count += 1
    print("Number of "+name+": ", count)

def countThreadedNamesGenerator(name):
    fH = FakerHandler()
    fake_user_filepath = "../data/csv/fake_users_1000000.csv"
    fake_user_filepath_2 = "../data/csv/fake_users_100000.csv"

    with ThreadPoolExecutor(max_workers=2) as executor:
        future1 = executor.submit(fH.countNames, fake_user_filepath, name)
        future2 = executor.submit(fH.countNames, fake_user_filepath_2, name)

        count1 = future1.result()
        count2 = future2.result()

    total_count = count1 + count2
    print("Number of "+name+": ", total_count)

# Measure the functions using own utility
print("-Measuring Data using own utility")
print(">>>Measuring countNames")
utils.measure_function(countNames, "John")
print(">>>Measuring countThreadedNamesGenerator")
utils.measure_function(countThreadedNamesGenerator, "John")

# Measure the functions with cProfile
print("-Measuring Data using cProfile")
print(">>>Measuring countNames")
with Profile() as pr:
    countNames("John")
    print(f"{countNames = }")
    (
        Stats(pr)
        .strip_dirs()
        .sort_stats('cumulative')
        .print_stats(10)
    )

print(">>>Measuring countNames multithreaded")
with Profile() as pr:
    countThreadedNamesGenerator("John")
    print(f"{countThreadedNamesGenerator = }")
    (
        Stats(pr)
        .strip_dirs()
        .sort_stats('cumulative')
        .print_stats(10)
    )

-Measuring Data using own utility
>>>Measuring countNames
Number of John:  69995
Memory used: 1.29296875 MB
Time taken: 0.36900949478149414 seconds

>>>Measuring countThreadedNamesGenerator
Number of John:  69995
Memory used: 0.0 MB
Time taken: 0.29004859924316406 seconds

-Measuring Data using cProfile
>>>Measuring countNames
Number of John:  69995
countNames = <function countNames at 0x0000019C56B53130>
         70099 function calls in 0.384 seconds

   Ordered by: cumulative time
   List reduced from 38 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.027    0.027    0.384    0.384 1114827413.py:4(countNames)
        2    0.091    0.046    0.357    0.179 fakerHandler.py:28(getNameData)
        2    0.126    0.063    0.144    0.072 {method 'read' of '_io.TextIOWrapper' objects}
        2    0.117    0.059    0.117    0.059 {method 'splitlines' of 'str' objects}
        2    0.000    0.000    0.018    0.009 codecs.py

---
## Lets combine what we learned
### Wednesday: 28-02-2024

* Count names from all csv files efficiently

* Use generators

* Use threads

---

In [30]:
import os
from Utils import Utils as utils
from concurrent.futures import ThreadPoolExecutor

def countThreadedNamesGenerator(name):
    fH = FakerHandler()
    directory = "../data/csv/"
    file_paths = [os.path.join(directory, file) for file in os.listdir(directory)]
    
    # Print file names
    file_names = [os.path.basename(file_path) for file_path in file_paths]
    print("File names to count from: ", file_names)

    num_workers = min(os.cpu_count(), len(file_paths)) # Limit workers to number of files or number of cores
    print("Number of workers: ", num_workers)

    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = [executor.submit(fH.countNames, file_path, name) for file_path in file_paths]

    total_count = sum(future.result() for future in futures)
    print("Number of "+name+": ", total_count)

# Measure the functions using own utility
print("-Measuring Data using own utility")
print(">>>Measuring countThreadedNamesGenerator")
utils.measure_function(countThreadedNamesGenerator, "John")

# Measure the functions with cProfile
print("-Measuring Data using cProfile")
print(">>>Measuring countNames multithreaded")
with Profile() as pr:
    countThreadedNamesGenerator("John")
    print(f"{countThreadedNamesGenerator = }")
    (
        Stats(pr)
        .strip_dirs()
        .sort_stats('cumulative')
        .print_stats(10)
    )

-Measuring Data using own utility
>>>Measuring countThreadedNamesGenerator
File names to count from:  ['fake_users', 'fake_users_10.csv', 'fake_users_100.csv', 'fake_users_1000.csv', 'fake_users_10000.csv', 'fake_users_100000.csv', 'fake_users_1000000.csv', 'fake_users_10000000.csv']
Number of workers:  8
Number of John:  707883
Memory used: 0.0 MB
Time taken: 3.173654794692993 seconds

-Measuring Data using cProfile
>>>Measuring countNames multithreaded
File names to count from:  ['fake_users', 'fake_users_10.csv', 'fake_users_100.csv', 'fake_users_1000.csv', 'fake_users_10000.csv', 'fake_users_100000.csv', 'fake_users_1000000.csv', 'fake_users_10000000.csv']
Number of workers:  8
Number of John:  707883
countThreadedNamesGenerator = <function countThreadedNamesGenerator at 0x0000019C56B53C70>
         905 function calls in 2.976 seconds

   Ordered by: cumulative time
   List reduced from 94 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(