<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [None]:
!python3 -m pip install "dask[complete]"

In [12]:
from dask.distributed import Client, LocalCluster
from dask import dataframe as dd
import pandas as pd
import time

In [None]:
# Benchmark: read the complaints CSV, keep a few columns, filter by date,
# join with the zip lookup table and write the result back to disk.
# The whole pipeline is repeated N_REPS times and each duration (seconds)
# is appended to `all_times`.

# ---- Run configuration ----
N_REPS = 1  # number of timed repetitions of the full pipeline
DATA_DIR = "../data/"
COMPLAINTS_NAME = "complaints_python_script.csv"
ZIP_NAME = "zip_zcta_python_script.csv"
OUTPUT_NAME = "311_reduced_python_script.csv"
USE_DASK = False  # True: Dask on a local cluster; False: plain pandas
TESTING = False  # True: only work on the first 1000 complaint rows

# Must be defined before `read_params`, which builds `usecols` from its keys
# (otherwise the cell raises NameError on a fresh kernel).
rename_dict = {  # We are only interested in some columns
    "Created Date": "date",
    "Complaint Type": "type",
    "Descriptor": "descriptor",
    "Incident Zip": "zip"
}

# Setup general parameters
save_params = dict(index=False, na_rep="null", date_format="%m/%d/%Y %I:%M:%S %p")
read_params = dict(
        usecols=list(rename_dict.keys()),
        parse_dates=["Created Date"],
        dtype={'Incident Zip': 'object'},  # read zips as strings, not numbers
        low_memory=False,
)

# Get around different reading/merging functions later on
custom_read_func = dd.read_csv if USE_DASK else pd.read_csv
custom_merge_func = dd.merge if USE_DASK else pd.merge

# Dask/Pandas specific parameters
cluster = None
client = None
if USE_DASK:
    cluster = LocalCluster()
    client = Client(cluster)
    save_params["single_file"] = True  # write one CSV instead of one per partition
elif TESTING:
    read_params["nrows"] = 1000

all_times = []
try:
    for i in range(N_REPS):
        # Setup
        start_time = time.time()
        print(f"Start time {i}:", start_time)

        # Read data
        df = custom_read_func(DATA_DIR+COMPLAINTS_NAME, **read_params)
        if USE_DASK and TESTING:  # Dask doesn't have nrows parameter
            df = df.head(n=1000)
        zips = custom_read_func(DATA_DIR+ZIP_NAME, dtype={'zip': 'object'})

        # Rename, filter valid dates, join and save.
        df = df.rename(columns=rename_dict)
        # Assign the filtered frame instead of using inplace=True: Dask
        # DataFrames are not modified in place, so the in-place form only
        # works on the pandas path.
        df = df.query("date >= '12/01/2010' and date < '11/01/2020'")
        merged = custom_merge_func(df, zips, on="zip", how="inner").drop(columns=["zip"])
        merged.to_csv(DATA_DIR+OUTPUT_NAME, **save_params)

        # Log
        end_time = time.time()
        all_times.append(end_time-start_time)
        print(f"End time {i}:", end_time)
        print("\n")
finally:
    # Always release the Dask resources, even when a repetition fails, so no
    # worker processes are left running behind the notebook.
    if client is not None:
        client.close()
    if cluster is not None:
        cluster.close()

Start time 0: 1607205125.9379709


In [None]:
# Uses 30 GB RAM

In [7]:
# Dask run: peak RAM was noted as "at most ~GB" — the figure is missing from
# the original note (TODO: fill in the measured value).
# The 1810 below is presumably the measured duration in seconds, converted to minutes.
print("Dask took {:.2f} minutes".format(1810 / 60))

Dask took 30.17 minutes
