---
## Create big data with faker and use generator
### Tuesday: 27-02-2024

* Read the data normally

* Read the data using generator 

* Compare time and memory usage

---

In [3]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join('..')))
import pandas as pd
from cProfile import Profile
from pstats import SortKey, Stats
from faker import Faker
from fakerHandler import FakerHandler
from Utils import Utils as utils

fake = Faker()
dataSize = 1000000
fake_user_filepath = "../data/csv/fake_users_"+str(dataSize)+".csv"

def createFakeData(filepath, size):
    if not os.path.exists(filepath):
        data = {
            'Name': [fake.name() for _ in range(size)],
            'Address': [fake.address().replace('\n', ', ') for _ in range(dataSize)],
            'Email': [fake.email() for _ in range(size)]
        }
        df = pd.DataFrame(data)
        df.to_csv(filepath, index=False)

def fakerHandler_read_csv():
    fH = FakerHandler()
    lines = fH.readCSV(fake_user_filepath)
    #print(lines[0])
    #print(lines[1])

def fakerHandler_read_csv_generator():
    fH = FakerHandler()
    line = fH.readCSVGenerator(fake_user_filepath)
    #print(next(line))
    #print(next(line))

# Create fake data
createFakeData(fake_user_filepath, dataSize)

# Measure the functions using own utility
print("-Measuring Data using own utility")
print(">>>Measuring fakerHandler_read_csv")
utils.measure_function(fakerHandler_read_csv)
print(">>>Measuring fakerHandler_read_csv_generator")
utils.measure_function(fakerHandler_read_csv_generator)

# Measure the functions with cProfile
print("-Measuring Data using cProfile")
print(">>>Measuring fakerHandler_read_csv")
with Profile() as pr:
    fakerHandler_read_csv()
    print(f"{fakerHandler_read_csv = }")
    (
        Stats(pr)
        .strip_dirs()
        .sort_stats('cumulative')
        .print_stats(10)
    )

print("\n"+">>>Measuring fakerHandler_read_csv_generator")
with Profile() as pr:
    fakerHandler_read_csv_generator()
    print(f"{fakerHandler_read_csv_generator = }")
    (
        Stats(pr)
        .strip_dirs()
        .sort_stats('cumulative')
        .print_stats(10)
    )


-Measuring Data using own utility
>>>Measuring fakerHandler_read_csv
Memory used: 1.875 MB
Time taken: 0.40625 seconds

>>>Measuring fakerHandler_read_csv_generator
Memory used: 0.0 MB
Time taken: 0.0 seconds

-Measuring Data using cProfile
>>>Measuring fakerHandler_read_csv
fakerHandler_read_csv = <function fakerHandler_read_csv at 0x000001F28E007240>
         400 function calls (398 primitive calls) in 0.452 seconds

   Ordered by: cumulative time
   List reduced from 139 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.037    0.018    0.451    0.226 base_events.py:1908(_run_once)
        1    0.000    0.000    0.414    0.414 3361915950.py:25(fakerHandler_read_csv)
        1    0.006    0.006    0.414    0.414 fakerHandler.py:8(readCSV)
        1    0.156    0.156    0.283    0.283 {method 'read' of '_io.TextIOWrapper' objects}
        1    0.000    0.000    0.127    0.127 cp1252.py:22(decode)
        1    0.127    

---
## Use big data and "Do something with it"
### Tuesday: 28-02-2024

* Count names normally

* Count names using generator

* Count names using generators and threading

---

In [4]:
from Utils import Utils as utils

def countNames(name):
    fH = FakerHandler()
    count = 0
    lines = fH.getNameData(fake_user_filepath, name)
    for l in lines:
        count += 1
    print("Number of "+name+": ", count)

def countNamesGenerator(name):
    fH = FakerHandler()
    count = 0
    lines = fH.getNameDataGenerator(fake_user_filepath, name)
    for l in lines:
        count += 1
    print("Number of "+name+": ", count)

# Measure the functions using own utility
print("-Measuring Data using own utility")
print(">>>Measuring countNames")
utils.measure_function(countNames, "John")
print(">>>Measuring countNamesGenerator")
utils.measure_function(countNamesGenerator, "John")

# Measure the functions with cProfile
print("-Measuring Data using cProfile")
print(">>>Measuring fakerHandler_read_csv")
with Profile() as pr:
    fakerHandler_read_csv()
    print(f"{fakerHandler_read_csv = }")
    (
        Stats(pr)
        .strip_dirs()
        .sort_stats('cumulative')
        .print_stats(10)
    )

print("\n"+">>>Measuring fakerHandler_read_csv_generator")
with Profile() as pr:
    fakerHandler_read_csv_generator()
    print(f"{fakerHandler_read_csv_generator = }")
    (
        Stats(pr)
        .strip_dirs()
        .sort_stats('cumulative')
        .print_stats(10)
    )

-Measuring Data using own utility
>>>Measuring countNames
Number of John:  63698
Memory used: 0.1171875 MB
Time taken: 0.484375 seconds

>>>Measuring countNamesGenerator
Number of John:  63698
Memory used: -1.56640625 MB
Time taken: 0.390625 seconds

>>>Measuring countNamesThread
Number of John:  63698
Memory used: 0.04296875 MB
Time taken: 0.65625 seconds

-Measuring Data using cProfile
>>>Measuring fakerHandler_read_csv
fakerHandler_read_csv = <function fakerHandler_read_csv at 0x000001F28E007240>
         401 function calls (399 primitive calls) in 0.577 seconds

   Ordered by: cumulative time
   List reduced from 139 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.000    0.000    0.322    0.161 events.py:86(_run)
        2    0.165    0.083    0.322    0.161 {method 'run' of '_contextvars.Context' objects}
        2    0.000    0.000    0.168    0.084 base_events.py:1908(_run_once)
        2    0.000    0.000    