In [6]:
import os
import gzip
import shutil

# Path to the data directory
data_dir = "data/airnow/country=us/"

# Get total number of .csv.gz files
total_files = sum(1 for root, _, files in os.walk(data_dir) 
                 for f in files if f.endswith('.csv.gz'))

print(f"Found {total_files} .csv.gz files to process")

# Walk through all subdirectories in the data directory
processed = 0
for root, dirs, files in os.walk(data_dir):
    for filename in files:
        if filename.endswith('.csv.gz'):
            processed += 1
            gz_path = os.path.join(root, filename)
            csv_path = os.path.join(root, filename[:-3])  # Remove .gz extension
            
            print(f"Processing file {processed}/{total_files}: {filename}")
            
            # Open gzipped file and extract to csv
            with gzip.open(gz_path, 'rb') as f_in:
                with open(csv_path, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
            
            # Optionally remove the original .gz file after extraction
            os.remove(gz_path)

print("\nAll .csv.gz files have been extracted!")

Found 3604292 .csv.gz files to process
Processing file 1/3604292: location-1015-20181011.csv.gz
Processing file 2/3604292: location-1015-20181012.csv.gz
Processing file 3/3604292: location-1015-20181013.csv.gz
Processing file 4/3604292: location-1015-20181121.csv.gz
Processing file 5/3604292: location-1015-20181122.csv.gz
Processing file 6/3604292: location-1015-20181123.csv.gz
Processing file 7/3604292: location-1015-20181124.csv.gz
Processing file 8/3604292: location-1015-20181125.csv.gz
Processing file 9/3604292: location-1015-20181126.csv.gz
Processing file 10/3604292: location-1015-20181127.csv.gz
Processing file 11/3604292: location-1015-20181128.csv.gz
Processing file 12/3604292: location-1015-20181129.csv.gz
Processing file 13/3604292: location-1015-20181130.csv.gz
Processing file 14/3604292: location-1015-20181201.csv.gz
Processing file 15/3604292: location-1015-20181202.csv.gz
Processing file 16/3604292: location-1015-20181203.csv.gz
Processing file 17/3604292: location-1015-

KeyboardInterrupt: 

In [7]:
import pandas as pd
import os

# Initialize an empty list to store dataframes
dfs = []

# Get total number of CSV files first
total_files = sum(1 for root, _, files in os.walk(data_dir) 
                 for f in files if f.endswith('.csv'))

print(f"Found {total_files} CSV files to process")

# Walk through all subdirectories in the data directory
processed = 0
for root, dirs, files in os.walk(data_dir):
    for filename in files:
        if filename.endswith('.csv'):
            file_path = os.path.join(root, filename)
            df = pd.read_csv(file_path)
            dfs.append(df)
            processed += 1
            percentage = (processed / total_files) * 100
            print(f"Processing file {processed}/{total_files} ({percentage:.4f}%): {filename}")

# Concatenate all dataframes
if dfs:
    combined_df = pd.concat(dfs, ignore_index=True)
    print(f"\nCombined DataFrame shape: {combined_df.shape}")
    print("\nFirst few rows:")
    display(combined_df.head())
else:
    print("No CSV files found in the data directory.")


Found 30412 CSV files to process
Processing file 1/30412 (0.0033%): location-1000-20160306.csv
Processing file 2/30412 (0.0066%): location-1000-20160307.csv
Processing file 3/30412 (0.0099%): location-1000-20160310.csv
Processing file 4/30412 (0.0132%): location-1000-20160311.csv
Processing file 5/30412 (0.0164%): location-1000-20160312.csv
Processing file 6/30412 (0.0197%): location-1000-20160313.csv
Processing file 7/30412 (0.0230%): location-1000-20160314.csv
Processing file 8/30412 (0.0263%): location-1000-20160315.csv
Processing file 9/30412 (0.0296%): location-1000-20160316.csv
Processing file 10/30412 (0.0329%): location-1000-20160317.csv
Processing file 11/30412 (0.0362%): location-1000-20160318.csv
Processing file 12/30412 (0.0395%): location-1000-20160319.csv
Processing file 13/30412 (0.0427%): location-1000-20160320.csv
Processing file 14/30412 (0.0460%): location-1000-20160321.csv
Processing file 15/30412 (0.0493%): location-1000-20160322.csv
Processing file 16/30412 (0.052

KeyboardInterrupt: 