In [None]:
import os
import shutil
import sqlite3
import tempfile
import zipfile

import pandas as pd

In [None]:
def extract_sqlite_to_temp(zip_path, sqlite_filename, temp_dir="Data/Temp"):
    """Extract one member of a zip archive to a fixed file inside ``temp_dir``.

    The member is streamed to disk through a 1 MiB buffer, so the database
    never has to be held in memory as a whole.

    Args:
        zip_path: Path to the zip archive.
        sqlite_filename: Name of the archive member to extract.
        temp_dir: Directory that receives the extracted file; it is created
            if it does not exist.

    Returns:
        Path of the extracted file, always
        ``<temp_dir>/temp_extracted.sqlite``. A file already present at that
        path is overwritten.

    Raises:
        FileNotFoundError: If ``zip_path`` does not exist.
        KeyError: If ``sqlite_filename`` is not a member of the archive.
        zipfile.BadZipFile: If the archive is invalid or the member fails its
            integrity check while being read.
    """
    os.makedirs(temp_dir, exist_ok=True)
    temp_file_path = os.path.join(temp_dir, "temp_extracted.sqlite")

    with zipfile.ZipFile(zip_path, 'r') as archive:
        # The member is opened before the output file so that a missing
        # member does not truncate a previously extracted file.
        with archive.open(sqlite_filename) as zipped_db:
            try:
                with open(temp_file_path, 'wb') as out_file:
                    shutil.copyfileobj(zipped_db, out_file, 1024 * 1024)
            except BaseException:
                # The output handle is already closed here. Drop the partial
                # file so a truncated database is never mistaken for a
                # complete one, then let the original error propagate.
                try:
                    os.remove(temp_file_path)
                except OSError:
                    pass
                raise

    return temp_file_path


In [None]:

def get_sqlite_table_names(sqlite_path):
    """Return the names of all tables stored in a SQLite database file.

    Args:
        sqlite_path: Path to the SQLite database file.

    Returns:
        List of table names read from ``sqlite_master`` (entries whose type
        is ``'table'``).

    Raises:
        sqlite3.Error: If the file cannot be opened or queried as a SQLite
            database.
    """
    conn = sqlite3.connect(sqlite_path)
    try:
        rows = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table';"
        ).fetchall()
    finally:
        # Close even when the query raises, so the file handle is released.
        conn.close()
    return [name for (name,) in rows]

In [None]:
def fetch_sqlite_chunk(conn, table_name, chunk_size, offset):
    """Read one page of rows from a SQLite table into a DataFrame.

    Args:
        conn: Open ``sqlite3`` connection.
        table_name: Bare table name as stored in the database (not
            schema-qualified and not already quoted). It is quoted here, so
            names containing spaces or punctuation work.
        chunk_size: Maximum number of rows to return (SQL ``LIMIT``).
        offset: Number of rows to skip before the page starts (SQL
            ``OFFSET``).

    Returns:
        DataFrame with every column of the table and at most ``chunk_size``
        rows; empty when ``offset`` is past the end of the table.

    Note:
        The query has no ``ORDER BY``, so rows arrive in whatever order
        SQLite yields for an unordered scan.
    """
    # Identifiers cannot be bound as parameters, so quote the name and double
    # any embedded quote characters instead of splicing it in raw.
    quoted_table = '"' + table_name.replace('"', '""') + '"'
    query = f"SELECT * FROM {quoted_table} LIMIT ? OFFSET ?"
    # int() keeps values such as numpy integers or numeric strings working as
    # they did when they were formatted straight into the SQL text.
    return pd.read_sql_query(query, conn, params=(int(chunk_size), int(offset)))

In [None]:
def write_chunk_to_csv(df, output_folder, part_num):
    """Save one chunk as ``reddit_comments_part_<part_num>.csv``.

    The output folder is created when missing, the frame is written without
    its index column, and a one-line confirmation is printed.

    Args:
        df: DataFrame holding the rows of this chunk.
        output_folder: Directory that receives the CSV file.
        part_num: Value embedded in the file name and in the printed message.
    """
    os.makedirs(output_folder, exist_ok=True)

    csv_name = f"reddit_comments_part_{part_num}.csv"
    destination = os.path.join(output_folder, csv_name)
    df.to_csv(destination, index=False)

    row_count = len(df)
    print(f"✅ Saved chunk {part_num} with {row_count} rows → {destination}")


In [None]:
def extract_chunks_to_csv(zip_path, sqlite_filename, table_name, total_rows, chunk_size, output_folder):
    """Export up to ``total_rows`` rows of a zipped SQLite table as CSV parts.

    The database is extracted from the archive to a temporary file, read in
    pages of ``chunk_size`` rows, and each page is written as its own
    numbered CSV file. The temporary database is removed afterwards, whether
    or not the export succeeded.

    Args:
        zip_path: Path to the zip archive containing the database.
        sqlite_filename: Name of the database file inside the archive.
        table_name: Table to export.
        total_rows: Upper bound on the number of rows to export. The export
            stops earlier if the table runs out of rows.
        chunk_size: Maximum number of rows per CSV part; must be positive.
        output_folder: Directory that receives the CSV parts.

    Raises:
        ValueError: If ``chunk_size`` is not positive (the paging loop could
            never advance).
    """
    # Validate before the (potentially very large) extraction starts.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive number, got {chunk_size!r}")

    sqlite_path = extract_sqlite_to_temp(zip_path, sqlite_filename)
    print(f"📁 Temp SQLite file created at: {sqlite_path}")

    try:
        conn = sqlite3.connect(sqlite_path)
        try:
            offset = 0
            part = 1
            while offset < total_rows:
                # Cap the final page so the export never exceeds total_rows.
                rows_wanted = min(chunk_size, total_rows - offset)
                df = fetch_sqlite_chunk(conn, table_name, rows_wanted, offset)
                if df.empty:
                    # Table exhausted: do not emit header-only CSV files.
                    break
                write_chunk_to_csv(df, output_folder, part)
                if len(df) < rows_wanted:
                    # A short page means there is nothing left to read.
                    break
                offset += rows_wanted
                part += 1
        finally:
            # Release the file handle before the temp file is deleted.
            conn.close()
    finally:
        # Always remove the extracted database, even when a page fails.
        os.remove(sqlite_path)

    print("🧹 Temp file deleted. Extraction complete.")


In [None]:
# Main script entry point
if __name__ == "__main__":
    # Machine-specific absolute path to the downloaded archive; adjust it
    # before running this notebook on another machine.
    ZIP_PATH = "D:/Portfolio/reddit-analytics-pipeline/data/raw/reddit-comments-may-2015.zip"
    SQLITE_FILENAME = "database.sqlite"
    OUTPUT_FOLDER = "Data/Partitions"
    TABLE_NAME = "May2015"  # Replace with the table selected from the printed list
    TOTAL_ROWS = 250000     # Upper bound on the number of rows to export
    CHUNK_SIZE = 50000      # Rows per CSV partition

    # Step 1: Extract to temp and inspect table names
    temp_sqlite_path = extract_sqlite_to_temp(ZIP_PATH, SQLITE_FILENAME)
    print(temp_sqlite_path)
    table_names = get_sqlite_table_names(temp_sqlite_path)
    print("📋 Tables inside database:", table_names)

    # Fail fast on a mistyped table name: the export below extracts the
    # archive again, so it is cheaper to stop here. The comparison ignores
    # case, matching how SQLite resolves table names.
    known_tables = {name.lower() for name in table_names}
    if TABLE_NAME.lower() not in known_tables:
        raise ValueError(
            f"Table {TABLE_NAME!r} not found in {SQLITE_FILENAME!r}; "
            f"available tables: {table_names}"
        )

    # Step 2: Export the selected table as CSV partitions.
    # OPTIONAL: comment out the call below if you only want to inspect the
    # table names printed above.
    extract_chunks_to_csv(
        zip_path=ZIP_PATH,
        sqlite_filename=SQLITE_FILENAME,
        table_name=TABLE_NAME,
        total_rows=TOTAL_ROWS,
        chunk_size=CHUNK_SIZE,
        output_folder=OUTPUT_FOLDER
    )

   