In [16]:
import pandas as pd
import numpy as np
import sys

# Define the file path for the uploaded CSV
file_path = 'covid_project_dataset.csv'

print("# Section 4: Handling Large Datasets (Python Code and Output)")
print("This section demonstrates techniques for handling large datasets, including chunk-based loading, memory optimization, and vectorized operations.")

print("\n## 1. Chunk-based Loading and Processing")
print("This method is useful when your dataset is too large to fit into memory. We process it in smaller chunks.")
print(f"\n*Note: The file '{file_path}' might not be 'large' enough to strictly require chunking, but this code demonstrates the principle.*")

chunk_size = 50000  # Define a chunk size

total_rows_processed = 0
total_confirmed_cases_chunked = 0

try:
    print(f"\nReading '{file_path}' in chunks of {chunk_size} rows...")
    for i, chunk in enumerate(pd.read_csv(file_path, chunksize=chunk_size)):
        print(f"\n--- Processing Chunk {i+1} ---")
        print(f"Chunk shape: {chunk.shape}")

        # Rename columns for consistency, assuming OWID-like structure as before
        chunk.rename(columns={
            'location': 'Country/Region',
            'date': 'Date',
            'total_cases': 'Confirmed',
            'total_deaths': 'Deaths',
            'total_recoveries': 'Recovered',
            'population': 'Population' # Include population if available for later use
        }, inplace=True)

        # Basic cleaning for chunk: Convert date, fill NAs for key columns
        if 'Date' in chunk.columns:
            chunk['Date'] = pd.to_datetime(chunk['Date'], errors='coerce')
            chunk.dropna(subset=['Date'], inplace=True)
        for col in ['Confirmed', 'Deaths', 'Recovered']:
            if col in chunk.columns:
                chunk[col] = pd.to_numeric(chunk[col], errors='coerce').fillna(0)

        # Example processing: sum confirmed cases in this chunk
        if 'Confirmed' in chunk.columns:
            chunk_confirmed_sum = chunk['Confirmed'].sum()
            total_confirmed_cases_chunked += chunk_confirmed_sum
            print(f"Confirmed cases in this chunk: {chunk_confirmed_sum:.0f}")

        total_rows_processed += len(chunk)
        print(f"First 5 rows of Chunk {i+1}:")
        print(chunk.head())

    print(f"\nFinished processing all chunks. Total rows processed: {total_rows_processed}")
    print(f"Total confirmed cases aggregated from chunks: {total_confirmed_cases_chunked:.0f}")

except FileNotFoundError:
    print(f"Error: '{file_path}' not found. Please ensure the CSV file is in the correct directory.")
except Exception as e:
    print(f"An error occurred during chunked loading: {e}")

# --- Load the full dataset for memory optimization demonstration ---
print("\n## 2. Optimize Memory Usage")
print("We'll load the full dataset and then apply data type optimizations to reduce memory footprint.")

try:
    df_full = pd.read_csv(file_path)
    # Rename columns consistently for memory optimization part
    df_full.rename(columns={
        'location': 'country', # Corrected from 'Country/Region' to 'country' based on prior inspection
        'date': 'Date',
        'total_cases': 'Confirmed',
        'total_deaths': 'Deaths',
        'total_recoveries': 'Recovered',
        'population': 'Population'
    }, inplace=True)

    # Convert Date column for consistency
    if 'Date' in df_full.columns:
        df_full['Date'] = pd.to_datetime(df_full['Date'], errors='coerce')
        df_full.dropna(subset=['Date'], inplace=True)

    # Initial memory usage
    initial_memory_mb = df_full.memory_usage(deep=True).sum() / (1024**2)
    print(f"\nInitial memory usage of DataFrame: {initial_memory_mb:.2f} MB")

    # Apply memory optimization
    for col in df_full.columns:
        col_type = df_full[col].dtype

        # Convert object columns to 'category' if suitable
        if col_type == 'object':
            num_unique_values = df_full[col].nunique()
            num_total_values = len(df_full[col])
            # A common heuristic: if unique values are less than 50% of total rows and less than 500
            if num_unique_values / num_total_values < 0.5 and num_unique_values < 500:
                df_full[col] = df_full[col].astype('category')
                print(f"  Converted '{col}' (object) to 'category' (Unique: {num_unique_values}).")

        # Downcast numerical columns
        elif pd.api.types.is_numeric_dtype(col_type):
            if pd.api.types.is_integer_dtype(col_type):
                # Attempt to downcast integers
                df_full[col] = pd.to_numeric(df_full[col], downcast='integer', errors='ignore')
                if df_full[col].dtype != col_type:
                    print(f"  Downcasted '{col}' (int) from {col_type} to {df_full[col].dtype}.")
            elif pd.api.types.is_float_dtype(col_type):
                # Attempt to downcast floats
                df_full[col] = pd.to_numeric(df_full[col], downcast='float', errors='ignore')
                if df_full[col].dtype != col_type:
                    print(f"  Downcasted '{col}' (float) from {col_type} to {df_full[col].dtype}.")

    # Final memory usage
    final_memory_mb = df_full.memory_usage(deep=True).sum() / (1024**2)
    print(f"\nFinal memory usage of DataFrame after optimization: {final_memory_mb:.2f} MB")
    print(f"Memory reduction: {((initial_memory_mb - final_memory_mb) / initial_memory_mb) * 100:.2f}%")
    print("\nDataFrame dtypes after optimization:")
    print(df_full.dtypes)

except FileNotFoundError:
    print(f"Error: '{file_path}' not found for memory optimization. Skipping this section.")
except Exception as e:
    print(f"An error occurred during memory optimization: {e}")

print("\n## 3. Implement Vectorized Operations using NumPy for Better Performance")
print("Pandas and NumPy are designed for vectorized operations, which are significantly faster than explicit Python loops for large datasets.")
print("Many operations in previous sections (like `.diff()`, `.groupby().agg()`, arithmetic operations on columns) already leverage vectorization.")

try:
    # Ensure 'Confirmed' and 'Population' columns exist and are numeric
    if 'Confirmed' in df_full.columns and 'Population' in df_full.columns:
        df_full['Confirmed'] = pd.to_numeric(df_full['Confirmed'], errors='coerce').fillna(0)
        df_full['Population'] = pd.to_numeric(df_full['Population'], errors='coerce').fillna(1) # Fill 1 to avoid div by zero

        print("\nDemonstrating vectorized operation: Calculating 'Cases per Million' (approximate, using total population).")
        # Vectorized operation
        df_full['Cases_Per_Million'] = (df_full['Confirmed'] / df_full['Population']) * 1_000_000
        print("\nFirst 5 rows with 'Cases_Per_Million' calculated:")
        # Corrected column name: use 'country' instead of 'Country/Region'
        print(df_full[['country', 'Date', 'Confirmed', 'Population', 'Cases_Per_Million']].head())

        # Compare with a hypothetical loop (not executed for performance, but for illustration)
        print("\n*Conceptual illustration of why vectorized operations are preferred over loops:*")
        print("```python")
        print("  # Slow loop-based approach (AVOID for large datasets):")
        print("  # cases_per_million_list = []")
        print("  # for index, row in df_full.iterrows():")
        print("  #     if row['Population'] > 0:")
        print("  #         cases_per_million_list.append((row['Confirmed'] / row['Population']) * 1_000_000)")
        print("  #     else:")
        print("  #         cases_per_million_list.append(0)")
        print("  # df_full['Cases_Per_Million_Loop'] = cases_per_million_list")
        print("```")
        print("\nThe direct column operation `(df_full['Confirmed'] / df_full['Population']) * 1_000_000` is highly optimized by Pandas/NumPy and executes much faster.")
    else:
        print("Skipping 'Cases per Million' calculation: 'Confirmed' or 'Population' column not found in DataFrame or are not numeric after type conversion.")

except Exception as e:
    print(f"An error occurred during vectorized operations demonstration: {e}")

# Section 4: Handling Large Datasets (Python Code and Output)
This section demonstrates techniques for handling large datasets, including chunk-based loading, memory optimization, and vectorized operations.

## 1. Chunk-based Loading and Processing
This method is useful when your dataset is too large to fit into memory. We process it in smaller chunks.

*Note: The file 'covid_project_dataset.csv' might not be 'large' enough to strictly require chunking, but this code demonstrates the principle.*

Reading 'covid_project_dataset.csv' in chunks of 50000 rows...

--- Processing Chunk 1 ---
Chunk shape: (50000, 10)
Confirmed cases in this chunk: 305996741461
First 5 rows of Chunk 1:
        Date      country  Confirmed  new_cases  Deaths  new_deaths  \
0 2020-01-05  Afghanistan        0.0        0.0     0.0         0.0   
1 2020-01-06  Afghanistan        0.0        0.0     0.0         0.0   
2 2020-01-07  Afghanistan        0.0        0.0     0.0         0.0   
3 2020-01-08  Afghanistan    

  df_full[col] = pd.to_numeric(df_full[col], downcast='float', errors='ignore')
  df_full[col] = pd.to_numeric(df_full[col], downcast='float', errors='ignore')
  df_full[col] = pd.to_numeric(df_full[col], downcast='float', errors='ignore')
  df_full[col] = pd.to_numeric(df_full[col], downcast='float', errors='ignore')
  df_full[col] = pd.to_numeric(df_full[col], downcast='integer', errors='ignore')
  df_full[col] = pd.to_numeric(df_full[col], downcast='float', errors='ignore')
  df_full[col] = pd.to_numeric(df_full[col], downcast='integer', errors='ignore')
