---
## Create big data with faker and use generator
### Tuesday: 27-02-2024

* Read the data normally

* Read the data using generator 

* Compare time and memory usage

---

In [1]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join('..')))
import pandas as pd
from cProfile import Profile
from pstats import SortKey, Stats
from faker import Faker
from fakerHandler import FakerHandler
from Utils import Utils as utils

fake = Faker()
dataSize = 1000000
fake_user_filepath = "../data/csv/fake_users_"+str(dataSize)+".csv"

def createFakeData(filepath, size):
    if not os.path.exists(filepath):
        data = {
            'Name': [fake.name() for _ in range(size)],
            'Address': [fake.address().replace('\n', ', ') for _ in range(dataSize)],
            'Email': [fake.email() for _ in range(size)]
        }
        df = pd.DataFrame(data)
        df.to_csv(filepath, index=False)

def fakerHandler_read_csv():
    fH = FakerHandler()
    lines = fH.readCSV(fake_user_filepath)
    #print(lines[0])
    #print(lines[1])

def fakerHandler_read_csv_generator():
    fH = FakerHandler()
    line = fH.readCSVGenerator(fake_user_filepath)
    #print(next(line))
    #print(next(line))

# Create fake data
createFakeData(fake_user_filepath, dataSize)

# Measure the functions using own utility
print("-Measuring Data using own utility")
print(">>>Measuring fakerHandler_read_csv")
utils.measure_function(fakerHandler_read_csv)
print(">>>Measuring fakerHandler_read_csv_generator")
utils.measure_function(fakerHandler_read_csv_generator)

# Measure the functions with cProfile
print("-Measuring Data using cProfile")
print(">>>Measuring fakerHandler_read_csv")
with Profile() as pr:
    fakerHandler_read_csv()
    print(f"{fakerHandler_read_csv = }")
    (
        Stats(pr)
        .strip_dirs()
        .sort_stats('cumulative')
        .print_stats(10)
    )

print("\n"+">>>Measuring fakerHandler_read_csv_generator")
with Profile() as pr:
    fakerHandler_read_csv_generator()
    print(f"{fakerHandler_read_csv_generator = }")
    (
        Stats(pr)
        .strip_dirs()
        .sort_stats('cumulative')
        .print_stats(10)
    )


-Measuring Data using own utility
>>>Measuring fakerHandler_read_csv
Memory used: 0.67578125 MB
Time taken: 0.46875 seconds

>>>Measuring fakerHandler_read_csv_generator
Memory used: 0.0 MB
Time taken: 0.0 seconds

-Measuring Data using cProfile
>>>Measuring fakerHandler_read_csv
fakerHandler_read_csv = <function fakerHandler_read_csv at 0x00000226FED7B420>
         400 function calls (398 primitive calls) in 0.468 seconds

   Ordered by: cumulative time
   List reduced from 139 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.039    0.020    0.467    0.233 base_events.py:1908(_run_once)
        1    0.000    0.000    0.427    0.427 3361915950.py:25(fakerHandler_read_csv)
        1    0.006    0.006    0.427    0.427 fakerHandler.py:5(readCSV)
        1    0.166    0.166    0.293    0.293 {method 'read' of '_io.TextIOWrapper' objects}
        1    0.129    0.129    0.129    0.129 {method 'splitlines' of 'str' objects}

---
## Use big data and "Do something with it"
### Tuesday: 28-02-2024

* Count names normally

* Count names using generator

---

In [6]:
from Utils import Utils as utils

def countNames(name):
    fH = FakerHandler()
    count = 0
    lines = fH.getNameData(fake_user_filepath, name)
    for _ in lines:
        count += 1
    print("Number of "+name+": ", count)

def countNamesGenerator(name):
    fH = FakerHandler()
    count = 0
    lines = fH.getNameDataGenerator(fake_user_filepath, name)
    for _ in lines:
        count += 1
    print("Number of "+name+": ", count)


# Measure the functions using own utility
print("-Measuring Data using own utility")
print(">>>Measuring countNames")
utils.measure_function(countNames, "John")
print(">>>Measuring countNamesGenerator")
utils.measure_function(countNamesGenerator, "John")

# Measure the functions with cProfile
print("-Measuring Data using cProfile")
print(">>>Measuring fakerHandler_read_csv")
with Profile() as pr:
    fakerHandler_read_csv()
    print(f"{fakerHandler_read_csv = }")
    (
        Stats(pr)
        .strip_dirs()
        .sort_stats('cumulative')
        .print_stats(10)
    )

print("\n"+">>>Measuring fakerHandler_read_csv_generator")
with Profile() as pr:
    fakerHandler_read_csv_generator()
    print(f"{fakerHandler_read_csv_generator = }")
    (
        Stats(pr)
        .strip_dirs()
        .sort_stats('cumulative')
        .print_stats(10)
    )


-Measuring Data using own utility
>>>Measuring countNames
Number of John:  63698
Memory used: 1.3125 MB
Time taken: 0.5 seconds

>>>Measuring countNamesGenerator
Number of John:  63698
Memory used: -1.2265625 MB
Time taken: 0.359375 seconds

-Measuring Data using cProfile
>>>Measuring fakerHandler_read_csv
fakerHandler_read_csv = <function fakerHandler_read_csv at 0x00000226FED7B420>
         401 function calls (399 primitive calls) in 0.716 seconds

   Ordered by: cumulative time
   List reduced from 139 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.000    0.000    0.493    0.247 events.py:86(_run)
        2    0.222    0.111    0.493    0.247 {method 'run' of '_contextvars.Context' objects}
        9    0.006    0.001    0.270    0.030 socket.py:621(send)
        2    0.000    0.000    0.186    0.093 base_events.py:1908(_run_once)
        1    0.183    0.183    0.183    0.183 {built-in method _codecs.charmap_deco

---
## Lets test a multithreaded example
### Tuesday: 28-02-2024

* Count names from 2 files normally

* Count names from 2 files using generators and threaded

---

In [7]:
from Utils import Utils as utils
from concurrent.futures import ThreadPoolExecutor

def countNames(name):
    fH = FakerHandler()
    fake_user_filepath = "../data/csv/fake_users_1000000.csv"
    fake_user_filepath_2 = "../data/csv/fake_users_100000.csv"
    count = 0
    lines = fH.getNameData(fake_user_filepath, name)
    for _ in lines:
        count += 1
    lines = fH.getNameData(fake_user_filepath_2, name)
    for _ in lines:
        count += 1
    print("Number of "+name+": ", count)

def countThreadedNamesGenerator(name):
    fH = FakerHandler()
    fake_user_filepath = "../data/csv/fake_users_1000000.csv"
    fake_user_filepath_2 = "../data/csv/fake_users_100000.csv"

    with ThreadPoolExecutor(max_workers=2) as executor:
        future1 = executor.submit(fH.countNames, fake_user_filepath, name)
        future2 = executor.submit(fH.countNames, fake_user_filepath_2, name)

        count1 = future1.result()
        count2 = future2.result()

    total_count = count1 + count2
    print("Number of "+name+": ", total_count)

# Measure the functions using own utility
print("-Measuring Data using own utility")
print(">>>Measuring countNames")
utils.measure_function(countNames, "John")
print(">>>Measuring countThreadedNamesGenerator")
utils.measure_function(countThreadedNamesGenerator, "John")

# Measure the functions with cProfile
print("-Measuring Data using cProfile")
print(">>>Measuring countNames")
with Profile() as pr:
    countNames("John")
    print(f"{countNames = }")
    (
        Stats(pr)
        .strip_dirs()
        .sort_stats('cumulative')
        .print_stats(10)
    )

print(">>>Measuring countNames multithreaded")
with Profile() as pr:
    countThreadedNamesGenerator("John")
    print(f"{countThreadedNamesGenerator = }")
    (
        Stats(pr)
        .strip_dirs()
        .sort_stats('cumulative')
        .print_stats(10)
    )

-Measuring Data using own utility
>>>Measuring countNames
Number of John:  70112
Memory used: 0.0 MB
Time taken: 0.375 seconds

>>>Measuring countThreadedNamesGenerator
Number of John:  70112
Memory used: -0.01171875 MB
Time taken: 0.34375 seconds

-Measuring Data using cProfile
>>>Measuring countNames
Number of John:  70112
countNames = <function countNames at 0x00000226FED7BEC0>
         93604 function calls (93602 primitive calls) in 0.442 seconds

   Ordered by: cumulative time
   List reduced from 137 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
    70114    0.310    0.000    0.433    0.000 fakerHandler.py:44(getNameDataGenerator)
        2    0.000    0.000    0.328    0.164 base_events.py:1908(_run_once)
        2    0.000    0.000    0.328    0.164 selectors.py:319(select)
        1    0.007    0.007    0.328    0.328 480917615.py:4(countNames)
    11523    0.003    0.000    0.123    0.000 cp1252.py:22(decode)
    11523 