# "Big Pandas" - Dask from the Inside
## Part 3 - Pandas with many large CSVs
## PyData Berlin tutorial, 30 June 2017
## Stephen Simmons

In [None]:
# Standard modules
import io
import logging
import lzma
import multiprocessing
import os
import ssl
import time
import urllib.request
import zipfile

# Third-party modules
import fastparquet      # Needs python-snappy
import graphviz         # To visualize Dask graphs 
import numpy as np
import pandas as pd
import psutil           # Memory stats
import dask
import dask.dataframe as dd

# Display the value of every expression statement in a cell, not just the last
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Table display limits: show at most 20 rows and 20 columns, and allow
# 300 characters of width before pandas wraps a wide table
pd.options.display.max_rows = 20
pd.options.display.max_columns = 20
pd.options.display.width = 300

# Show matplotlib graphs inline in Jupyter notebook
%matplotlib inline

# Report the library versions this notebook is running with
np.__version__, pd.__version__, dask.__version__

In [None]:
def memory_usage():
    """Report this process's resident memory as a string, in MB.

    Uses ``psutil`` to read the resident set size (RSS) of the current
    process and formats it together with the process id.
    """
    this_pid = os.getpid()
    process = psutil.Process(this_pid)
    rss_mb = process.memory_info().rss / 1024.0 / 1024.0
    return "[Process %s uses %0.1fMB]" % (this_pid, rss_mb)

# Baseline: process memory before any CSV has been read in this notebook
memory_usage()

In [None]:
%%time
# Peek at only the first 4 rows so the columns can be inspected cheaply.
# Must go through the pandas alias `pd`; no name `d` exists in this notebook.
df = pd.read_csv('flights-2016-01.xz', nrows=4, dialect="excel")

In [None]:
# Transpose the 4-row sample so each column is displayed as a row
df.T

In [None]:
# Process memory after reading only the 4-row sample
memory_usage()

In [None]:
%%time
# Read the complete month (no nrows limit); pandas' default
# compression='infer' is expected to pick xz from the file extension
df = pd.read_csv('flights-2016-01.xz', dialect="excel")

In [None]:
# Process memory after loading the full month
memory_usage()

In [None]:
# Print pandas' summary of the one-month DataFrame
df.info()

In [None]:
# Total DataFrame footprint in MB; deep=True asks pandas to also measure
# the contents of object-dtype columns rather than just their pointers
df.memory_usage(deep=True).sum() / 1024 / 1024 

In [None]:
import textwrap
# List every column name, comma-separated, wrapped at 60 characters per line
print(textwrap.fill(', '.join(df.columns), width=60))

In [None]:
%%time
def load_months(months):
    """Read one ``flights-<month>.xz`` CSV per month and stack them.

    Parameters
    ----------
    months : iterable of str
        Month labels substituted into the file name, e.g. ``'2016-01'``.

    Returns
    -------
    pandas.DataFrame
        The rows of all files concatenated in the order given; each
        file keeps its own row index, so index values can repeat.
    """
    frames = []
    for month in months:
        path = 'flights-%s.xz' % month
        frames.append(pd.read_csv(path, dialect="excel"))
    return pd.concat(frames)

In [None]:
# Load three consecutive months (Dec 2015 - Feb 2016) into one DataFrame
df = load_months(['2015-12','2016-01','2016-02'])

In [None]:
# Process memory after loading three months
memory_usage()

In [None]:
# Print pandas' summary of the combined three-month DataFrame
df.info()

In [None]:
# Footprint of the combined DataFrame in MB (deep=True: measure object columns too)
df.memory_usage(deep=True).sum() / 1024 / 1024 