First method "using pandas.read_csv(chunk size)

In [1]:
import pandas as pd
import time
import psutil 
csv_path = r"D:\data\amazon_all.csv"

chunksize = 500000
process = psutil.Process()
memory_before = process.memory_info().rss / (1024 ** 2)
start_time = time.time()
total_rows = 0
first_chunk = None
max_memory = 0

for chunk in pd.read_csv(csv_path, chunksize=chunksize):
    total_rows += len(chunk)

    if first_chunk is None:
        first_chunk = chunk

    memory = chunk.memory_usage(deep=True).sum() / (1024**2)
    if memory > max_memory:
        max_memory = memory

end_time = time.time()
memory_after = process.memory_info().rss / (1024 ** 2)
memory_used = memory_after - memory_before 
print(f"The time taken to read the file is: {end_time - start_time:.2f} seconds")
print(f"Total rows read: {total_rows}")
print(f"Maximum memory used by a chunk: {max_memory:.2f} MB")
print(f"Memory used by using pandas: {memory_used:.2f} MB")
first_chunk.head()

  for chunk in pd.read_csv(csv_path, chunksize=chunksize):


The time taken to read the file is: 332.68 seconds
Total rows read: 25363163
Maximum memory used by a chunk: 539.90 MB
Memory used by using pandas: 1080.43 MB


Unnamed: 0,BehaviouralFeatureResult,label,summary,reviewText,overall,reviewerName,helpful,reviewTime,asin,category,reviewerID,unixReviewTime,_id,class
0,,,A++++,Best phone case ever . Everywhere I go I get a...,5.0,BiancaNicole,"[4, 4]","01 13, 2013",110400550,Cell_Phones_and_Accessories,A3HVRXV0LVJN7,1358035200,{'$oid': '5a1321d5741a2384e802c552'},1.0
1,,,ITEM NOT SENT!!,ITEM NOT SENT from Blue Top Company in Hong Ko...,1.0,"cf ""t""","[0, 3]","01 30, 2013",110400550,Cell_Phones_and_Accessories,A1BJGDS0L1IO6I,1359504000,{'$oid': '5a1321d5741a2384e802c557'},0.0
2,,,Great product,Saw this same case at a theme park store for 2...,5.0,Andrea Busch,"[0, 0]","11 22, 2012",110400550,Cell_Phones_and_Accessories,A1YX2RBMS1L9L,1353542400,{'$oid': '5a1321d5741a2384e802c550'},1.0
3,,,Perfect,case fits perfectly and I always gets complime...,5.0,Aniya pennington,"[3, 3]","07 18, 2013",110400550,Cell_Phones_and_Accessories,A180NNPPKWCCU0,1374105600,{'$oid': '5a1321d5741a2384e802c553'},1.0
4,,,Cool purchase.,I got this for my 14 year old sister. She lov...,4.0,Gene,"[1, 1]","03 20, 2013",110400550,Cell_Phones_and_Accessories,A30P2CYOUYAJM8,1363737600,{'$oid': '5a1321d5741a2384e802c559'},1.0


Second method "using Dask

In [None]:
!pip install dask psutil

In [None]:
import dask.dataframe as dd
import psutil

start_time = time.time()
process = psutil.Process()
memory_before = process.memory_info().rss / (1024 ** 2)

df = dd.read_csv(r"D:\data\amazon_all.csv", dtype={'asin': 'object', 'BehaviouralFeatureResult': 'object'})
df.compute()
memory_after = process.memory_info().rss / (1024 ** 2)
end_time = time.time()

print(f"The time taken to read the file with Dask is: {end_time - start_time:.2f} seconds")
print(f"Memory used by Dask: {memory_after - memory_before:.2f} MB")
df.head()

In [None]:
import dask.dataframe as dd
import psutil

start_time = time.time()
process = psutil.Process()
memory_before = process.memory_info().rss / (1024 ** 2)

df = dd.read_csv(r"D:\data\amazon_all.csv",dtype={'asin': 'object', 'BehaviouralFeatureResult': 'object'})
first_part = df.head(1000)  
memory_after = process.memory_info().rss / (1024 ** 2)
end_time = time.time()

print(f"The time taken to read the file with Dask is: {end_time - start_time:.2f} seconds")
print(f"Memory used by Dask: {memory_after - memory_before:.2f} MB")

The time taken to read the file with Dask is: 2.97 seconds
Memory used by Dask: 7.85 MB


Third method "with Compression"

In [7]:
import pandas as pd
csv_path = r"D:\data\amazon_all.csv.gz"
chunksize = 500000 
process = psutil.Process()
 
memory_before = process.memory_info().rss / (1024 ** 2)
start_time = time.time()

csv_iterator = pd.read_csv(csv_path, compression='gzip', chunksize=chunksize)
total_rows = 0
first_chunk = None
max_memory = 0

for chunk in csv_iterator:
    total_rows += len(chunk)

    if first_chunk is None:
        first_chunk = chunk

    memory = chunk.memory_usage(deep=True).sum() / (1024 ** 2)
    if memory > max_memory:
        max_memory = memory

end_time = time.time()
memory_after = process.memory_info().rss / (1024 ** 2)
memory_used = memory_after - memory_before

print(f"The time taken to read compressed CSV (gzip): {end_time - start_time:.2f} seconds")
print(f"Total rows read: {total_rows:,}")
print(f"Maximum memory used by a chunk: {max_memory:.2f} MB")
print(f"memory used by using compression: {memory_used:.2f} MB")
first_chunk.head()

  for chunk in csv_iterator:


The time taken to read compressed CSV (gzip): 431.96 seconds
Total rows read: 25,363,163
Maximum memory used by a chunk: 539.90 MB
memory used by using compression: 824.30 MB


Unnamed: 0,BehaviouralFeatureResult,label,summary,reviewText,overall,reviewerName,helpful,reviewTime,asin,category,reviewerID,unixReviewTime,_id,class
0,,,A++++,Best phone case ever . Everywhere I go I get a...,5.0,BiancaNicole,"[4, 4]","01 13, 2013",110400550,Cell_Phones_and_Accessories,A3HVRXV0LVJN7,1358035200,{'$oid': '5a1321d5741a2384e802c552'},1.0
1,,,ITEM NOT SENT!!,ITEM NOT SENT from Blue Top Company in Hong Ko...,1.0,"cf ""t""","[0, 3]","01 30, 2013",110400550,Cell_Phones_and_Accessories,A1BJGDS0L1IO6I,1359504000,{'$oid': '5a1321d5741a2384e802c557'},0.0
2,,,Great product,Saw this same case at a theme park store for 2...,5.0,Andrea Busch,"[0, 0]","11 22, 2012",110400550,Cell_Phones_and_Accessories,A1YX2RBMS1L9L,1353542400,{'$oid': '5a1321d5741a2384e802c550'},1.0
3,,,Perfect,case fits perfectly and I always gets complime...,5.0,Aniya pennington,"[3, 3]","07 18, 2013",110400550,Cell_Phones_and_Accessories,A180NNPPKWCCU0,1374105600,{'$oid': '5a1321d5741a2384e802c553'},1.0
4,,,Cool purchase.,I got this for my 14 year old sister. She lov...,4.0,Gene,"[1, 1]","03 20, 2013",110400550,Cell_Phones_and_Accessories,A30P2CYOUYAJM8,1363737600,{'$oid': '5a1321d5741a2384e802c559'},1.0


In [None]:
data = {
    "Method": ["Pandas (chunksize)", "Dask", "Compressed (Gzip)"],
    "Time (seconds)": [332.68, 2.97, 431.96],
    "Max Memory per Chunk (MB)": [539.90, None, 539.90],
    "Total Memory Used (MB)": [1080.43, 7.85, 824.30]
}
df_comparison = pd.DataFrame(data)
print("\nThe comparison:\n")
print(df_comparison.to_string(index=False))


The comparison:

            Method  Time (seconds)  Max Memory per Chunk (MB)  Total Memory Used (MB)
Pandas (chunksize)          332.68                      539.9                 1080.43
              Dask            2.97                        NaN                    7.85
 Compressed (Gzip)          431.96                      539.9                  824.30
