In [1]:
import numpy as np
import pandas as pd

In [4]:
rows = 100000
data = {
    'culumn_name' : np.random.randint(1,100,size=rows),
    'other_column' : np.random.choice(['A','B','C'],size=rows)
}

large_data = pd.DataFrame(data)

large_data.to_csv('large_dataset.csv',index=False)
print("Dataset created...")


Dataset created...


In [6]:
chunk_size = 10000
chunks = pd.read_csv('large_dataset.csv',chunksize=chunk_size)

total_sum = 0

for i,chunk in enumerate(chunks):
    chunk_sum = chunk['culumn_name'].sum()
    total_sum += chunk_sum
    print(f"Chunk {i+1} sum: {chunk_sum}")

print(f"Total sum: {total_sum}")

Chunk 1 sum: 507915
Chunk 2 sum: 500004
Chunk 3 sum: 501125
Chunk 4 sum: 497148
Chunk 5 sum: 498545
Chunk 6 sum: 500013
Chunk 7 sum: 501738
Chunk 8 sum: 497660
Chunk 9 sum: 504006
Chunk 10 sum: 506370
Total sum: 5014524


In [8]:
output_file = 'processed_dataset.csv'

for i, chunk in enumerate(pd.read_csv('large_dataset.csv', chunksize=chunk_size)):
    # Add a new column
    chunk['new_column'] = chunk['culumn_name'] * 2

    # Save the processed chunk to a new CSV file
    if i == 0:
        chunk.to_csv(output_file, index=False, mode='w')  # Write header for the first chunk
    else:
        chunk.to_csv(output_file, index=False, mode='a', header=False)  # Append without header

    print(f"Processed and saved chunk {i+1}")

print(f"Processed dataset saved as '{output_file}'.")


Processed and saved chunk 1
Processed and saved chunk 2
Processed and saved chunk 3
Processed and saved chunk 4
Processed and saved chunk 5
Processed and saved chunk 6
Processed and saved chunk 7
Processed and saved chunk 8
Processed and saved chunk 9
Processed and saved chunk 10
Processed dataset saved as 'processed_dataset.csv'.


In [9]:
# Load the processed dataset
processed_data = pd.read_csv('processed_dataset.csv')
print(processed_data.head())


   culumn_name other_column  new_column
0            6            B          12
1           93            A         186
2           22            C          44
3            4            C           8
4            5            B          10


In [10]:
import pandas as pd
import numpy as np

# Generate a large sample dataset
rows = 200000
data = {
    'ID': np.arange(1, rows + 1),
    'Category': np.random.choice(['Electronics', 'Books', 'Clothing', 'Groceries'], size=rows),
    'Sales': np.random.uniform(10, 1000, size=rows).round(2),
    'Quantity': np.random.randint(1, 20, size=rows),
    'Region': np.random.choice(['North', 'South', 'East', 'West'], size=rows),
    'Date': pd.date_range(start='2023-01-01', periods=rows, freq='T')  # Minute intervals
}

large_data = pd.DataFrame(data)

# Save to a CSV file
large_data.to_csv('large_advanced_dataset.csv', index=False)
print("Advanced dataset created and saved as 'large_advanced_dataset.csv'.")


  'Date': pd.date_range(start='2023-01-01', periods=rows, freq='T')  # Minute intervals


Advanced dataset created and saved as 'large_advanced_dataset.csv'.


In [11]:
chunk_size = 20000
filtered_output_file = 'filtered_dataset.csv'

chunks = pd.read_csv('large_advanced_dataset.csv', chunksize=chunk_size)

for i, chunk in enumerate(chunks):
    # Apply filtering conditions
    filtered_chunk = chunk[(chunk['Sales'] > 500) & (chunk['Category'] == 'Electronics')]

    # Save filtered results
    if i == 0:
        filtered_chunk.to_csv(filtered_output_file, index=False, mode='w')  # Write header
    else:
        filtered_chunk.to_csv(filtered_output_file, index=False, mode='a', header=False)

    print(f"Filtered and saved chunk {i+1}")

print("Filtered dataset saved.")


Filtered and saved chunk 1
Filtered and saved chunk 2
Filtered and saved chunk 3
Filtered and saved chunk 4
Filtered and saved chunk 5
Filtered and saved chunk 6
Filtered and saved chunk 7
Filtered and saved chunk 8
Filtered and saved chunk 9
Filtered and saved chunk 10
Filtered dataset saved.


In [12]:
chunks = pd.read_csv('large_advanced_dataset.csv', chunksize=chunk_size)

aggregated_data = []

for i, chunk in enumerate(chunks):
    # Group by category and aggregate
    group_agg = chunk.groupby('Category').agg(
        total_sales=('Sales', 'sum'),
        average_quantity=('Quantity', 'mean')
    )
    aggregated_data.append(group_agg)

# Combine results from all chunks
final_aggregation = pd.concat(aggregated_data).groupby('Category').sum()

print("Final Aggregated Data:")
print(final_aggregation)


Final Aggregated Data:
             total_sales  average_quantity
Category                                  
Books        25285160.63         99.882859
Clothing     25438742.18         99.972522
Electronics  25474910.56         99.819337
Groceries    25131238.28         99.812194


In [13]:
chunks = pd.read_csv('large_advanced_dataset.csv', chunksize=chunk_size, parse_dates=['Date'])

monthly_orders = []

for i, chunk in enumerate(chunks):
    # Add a 'Month' column
    chunk['Month'] = chunk['Date'].dt.to_period('M')

    # Group by region and month
    monthly_agg = chunk.groupby(['Region', 'Month']).size().reset_index(name='Order_Count')
    monthly_orders.append(monthly_agg)

# Combine all chunks and aggregate results
final_monthly_orders = pd.concat(monthly_orders).groupby(['Region', 'Month']).sum().reset_index()

print("Monthly Orders by Region:")
print(final_monthly_orders.head())


Monthly Orders by Region:
  Region    Month  Order_Count
0   East  2023-01        11327
1   East  2023-02         9925
2   East  2023-03        11133
3   East  2023-04        10910
4   East  2023-05         6815


In [14]:
processed_output_file = 'processed_advanced_dataset.csv'

chunks = pd.read_csv('large_advanced_dataset.csv', chunksize=chunk_size)

for i, chunk in enumerate(chunks):
    # Create new calculated columns
    chunk['Revenue'] = chunk['Sales'] * chunk['Quantity']  # Calculate revenue
    chunk['High_Sales'] = chunk['Sales'] > 750             # Add a flag for high sales

    # Save processed data
    if i == 0:
        chunk.to_csv(processed_output_file, index=False, mode='w')  # Write header
    else:
        chunk.to_csv(processed_output_file, index=False, mode='a', header=False)

    print(f"Processed and saved chunk {i+1}")

print(f"Processed dataset saved as '{processed_output_file}'.")


Processed and saved chunk 1
Processed and saved chunk 2
Processed and saved chunk 3
Processed and saved chunk 4
Processed and saved chunk 5
Processed and saved chunk 6
Processed and saved chunk 7
Processed and saved chunk 8
Processed and saved chunk 9
Processed and saved chunk 10
Processed dataset saved as 'processed_advanced_dataset.csv'.


In [15]:
# Load and display processed data
processed_data = pd.read_csv('processed_advanced_dataset.csv')
print(processed_data.head())


   ID  Category   Sales  Quantity Region                 Date  Revenue  \
0   1  Clothing  420.92         2   East  2023-01-01 00:00:00   841.84   
1   2     Books   67.03        11   East  2023-01-01 00:01:00   737.33   
2   3  Clothing  739.17         7  South  2023-01-01 00:02:00  5174.19   
3   4  Clothing  572.20         7  South  2023-01-01 00:03:00  4005.40   
4   5  Clothing  198.44         2  South  2023-01-01 00:04:00   396.88   

   High_Sales  
0       False  
1       False  
2       False  
3       False  
4       False  
