In [None]:
import csv
from time import time
from datetime import timedelta

import inflect

def create_data(num=None):
    p = inflect.engine()

    filename=["first", "second", "third", "fourth", "fifth","sixth", "seventh", "eight", "ninth", "tenth"]
    if num:
        filename = filename[:num]

    for index, file in enumerate(filename):
        with open(f"{file}.csv", "w", newline="") as csvfile:
            csvwriter = csv.writer(csvfile)
            columns = 1_0
            rows = 1_0
            st = time()
            csvwriter.writerow([f"Header {p.number_to_words(c)}" for c in range(columns)])
            csvwriter.writerows([[file]+[f"{p.number_to_words(r+1)}_{p.number_to_words(c+1)}" for c in range(columns)] for r in range(rows)])
            et = time()
            print(f"[HH:MM:SS]{timedelta(seconds=et-st)}")
            print(f"{(index+1)*100/len(filename)}%") # percent complete

In [None]:
create_data()

In [75]:
%%file remove_cumulative.py
import os 

def remove_cumulative():
    output_file = "cumulative.csv"
    if os.path.exists(output_file):
        os.remove(output_file)
    #     print("Done")
    # else:
    #     print("Nothing to delete")

if __name__ == "__main__":
    remove_cumulative()

Writing remove_cumulative.py


In [33]:
%reload_ext memory_profiler
%reload_ext filprofiler

In [70]:
%%file reducer1.py
import os
from glob import glob
from functools import reduce

def reducer(num=None):
    output_file = "cumulative.csv"
    received_csv_files = [file for file in glob("*.csv") if file != output_file]

    if not num or 0 > num > len(received_csv_files):
        num = len(received_csv_files)
        
    received_csv_files = received_csv_files[:num]
    
    with open(output_file, "wb") as outfile:
        def combine_files(out_file, in_file_path:str):
            with open(in_file_path, "rb") as in_file:
                if out_file.tell() !=0:
                    next(in_file)
                out_file.writelines(in_file)
                return out_file
        
        reduce(combine_files, received_csv_files, outfile)
        
if __name__ == "__main__":
    reducer()

Overwriting reducer1.py


In [44]:
%%timeit -r 10 -n 10_000
reducer()
remove_cumulative()

187 µs ± 2.38 µs per loop (mean ± std. dev. of 10 runs, 10,000 loops each)


In [29]:
%memit reducer()
remove_cumulative()

peak memory: 138.07 MiB, increment: 0.00 MiB


In [30]:
%%filprofile
reducer()
remove_cumulative()

=fil-profile= Preparing to write to fil-result/tmp2a_1bbum
=fil-profile= Wrote flamegraph to "fil-result/tmp2a_1bbum/peak-memory.svg"
=fil-profile= Wrote flamegraph to "fil-result/tmp2a_1bbum/peak-memory-reversed.svg"


In [71]:
%%file reducer2.py
# Opening the product file in append mode
import os
from glob import glob
from functools import reduce

def reducer(num=None):
    output_file = "cumulative.csv"
    received_csv_files = [file for file in glob("*.csv") if file != output_file]

    if not num or 0 > num > len(received_csv_files):
        num = len(received_csv_files)
        
    received_csv_files = received_csv_files[:num]
    
    with open(output_file, "ab") as outfile:
        def combine_files(out_file, in_file_path:str):
            with open(in_file_path, "rb") as in_file:
                if out_file.tell() !=0:
                    next(in_file)
                out_file.writelines(in_file)
                return out_file
        
        reduce(combine_files, received_csv_files, outfile)

if __name__ == "__main__":
    reducer()

Overwriting reducer2.py


In [8]:
%%timeit -r 10 -n 10_000
reducer()
remove_cumulative()

176 µs ± 1.84 µs per loop (mean ± std. dev. of 10 runs, 10,000 loops each)


In [9]:
%memit reducer()
remove_cumulative()

peak memory: 111.08 MiB, increment: 0.11 MiB


In [10]:
%%filprofile
reducer()
remove_cumulative()

=fil-profile= Preparing to write to fil-result/tmpsa51n43z
=fil-profile= Wrote flamegraph to "fil-result/tmpsa51n43z/peak-memory.svg"
=fil-profile= Wrote flamegraph to "fil-result/tmpsa51n43z/peak-memory-reversed.svg"


In [11]:
# from glob import glob
# from functools import reduce

# import pandas as pd

# def reducer(num=None):
#     CHUNK_SIZE = 1
#     output_file = "cumulative.csv"
#     received_csv_files = [file for file in glob("*.csv") if file != output_file]

#     if not num or 0 > num > len(received_csv_files):
#         num = len(received_csv_files)
        
#     received_csv_files = received_csv_files[:num]
#     first_one = True
#     for csv_file_name in received_csv_files:
#         if not first_one: # if it is not the first csv file then skip the header row (row 0) of that file
#             skip_row = [0]
#         else:
#             skip_row = []
        
#         chunk_container = pd.read_csv(csv_file_name, chunksize=CHUNK_SIZE,skiprows = skip_row)
#         first_first_one = True
#         for chunk in chunk_container:
#             chunk.to_csv(output_file, mode="a", index=False, header=first_one and first_first_one)
#             first_first_one = False
#         first_one = False

In [12]:
# %%timeit -r 10 -n 10_000
# reducer()
# remove_cumulative()

In [13]:
# %memit reducer()
# remove_cumulative()

In [72]:
%%file reducer3.py
from glob import glob
import shutil

def reducer(num=None):
    output_file = "cumulative.csv"
    received_csv_files = [file for file in glob("*.csv") if file != output_file]
    with open(output_file, 'wb') as outfile:
        for i, filename in enumerate(received_csv_files):
            if filename == output_file:
                continue
            with open(filename, 'rb') as readfile:
                if i != 0:
                    readfile.readline()
                shutil.copyfileobj(readfile, outfile)
                
if __name__ == "__main__":
    reducer()

Overwriting reducer3.py


In [15]:
%%timeit -r 10 -n 10_000
reducer()
remove_cumulative()

171 µs ± 1.5 µs per loop (mean ± std. dev. of 10 runs, 10,000 loops each)


In [16]:
%memit reducer()
remove_cumulative()

peak memory: 111.71 MiB, increment: 0.10 MiB


In [17]:
%%filprofile
reducer()
remove_cumulative()

=fil-profile= Preparing to write to fil-result/tmpp9f4dp86
=fil-profile= Wrote flamegraph to "fil-result/tmpp9f4dp86/peak-memory.svg"
=fil-profile= Wrote flamegraph to "fil-result/tmpp9f4dp86/peak-memory-reversed.svg"


In [73]:
%%file reducer4.py
# Opening the files in text mode instead of binary
import os
from glob import glob
from functools import reduce

def reducer(num=None):
    output_file = "cumulative.csv"
    received_csv_files = [file for file in glob("*.csv") if file != output_file]

    if not num or 0 > num > len(received_csv_files):
        num = len(received_csv_files)
        
    received_csv_files = received_csv_files[:num]
    
    with open(output_file, "wt") as outfile:
        def combine_files(out_file, in_file_path:str):
            with open(in_file_path, "rt") as in_file:
                if out_file.tell() !=0:
                    next(in_file)
                out_file.writelines(in_file)
                return out_file
        
        reduce(combine_files, received_csv_files, outfile)

if __name__ == "__main__":
    reducer()

Overwriting reducer4.py


In [19]:
%%timeit -r 10 -n 10_000
reducer()
remove_cumulative()

269 µs ± 1.78 µs per loop (mean ± std. dev. of 10 runs, 10,000 loops each)


In [20]:
%memit reducer()
remove_cumulative()

peak memory: 112.84 MiB, increment: 0.00 MiB


In [21]:
%%filprofile
reducer()
remove_cumulative()

=fil-profile= Preparing to write to fil-result/tmp9js_0m9w
=fil-profile= Wrote flamegraph to "fil-result/tmp9js_0m9w/peak-memory.svg"
=fil-profile= Wrote flamegraph to "fil-result/tmp9js_0m9w/peak-memory-reversed.svg"


In [74]:
%%file reducer5.py
import os
from glob import glob
from functools import reduce

def reducer(num=None):
    output_file = "cumulative.csv"
    received_csv_files = [file for file in glob("*.csv") if file != output_file]

    if not num or 0 > num > len(received_csv_files):
        num = len(received_csv_files)
        
    received_csv_files = received_csv_files[:num]
    
    with open(output_file, "wb") as outfile:
        with open(received_csv_files[0],"rb") as one:
            with open(received_csv_files[1],"rb") as two:
                with open(received_csv_files[2],"rb") as three:
                    with open(received_csv_files[3],"rb") as four:
                        with open(received_csv_files[4],"rb") as five:
                            with open(received_csv_files[5],"rb") as six:
                                with open(received_csv_files[6],"rb") as seven:
                                    with open(received_csv_files[7],"rb") as eight:
                                        with open(received_csv_files[8],"rb") as nine:
                                            with open(received_csv_files[9],"rb") as ten:
                                                outfile.writelines(one)
                                                outfile.writelines(two)
                                                outfile.writelines(three)
                                                outfile.writelines(four)
                                                outfile.writelines(five)
                                                outfile.writelines(six)
                                                outfile.writelines(seven)
                                                outfile.writelines(eight)
                                                outfile.writelines(nine)
                                                outfile.writelines(ten)
                                
                    
if __name__ == "__main__":
    reducer()

Overwriting reducer5.py


In [61]:
%%timeit -r 10 -n 10_000
reducer()
remove_cumulative()

173 µs ± 2.78 µs per loop (mean ± std. dev. of 10 runs, 10,000 loops each)


In [62]:
%memit reducer()
remove_cumulative()

peak memory: 142.78 MiB, increment: 0.00 MiB


In [63]:
%%filprofile
reducer()
remove_cumulative()

=fil-profile= Preparing to write to fil-result/tmpregwv0cg
=fil-profile= Wrote flamegraph to "fil-result/tmpregwv0cg/peak-memory.svg"
=fil-profile= Wrote flamegraph to "fil-result/tmpregwv0cg/peak-memory-reversed.svg"
