# Code to parallelize read_csv

# To be used with split files generated by script_to_split.sh

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from scipy.stats import norm

import calendar

import matplotlib
matplotlib.use('Agg')

import os
from multiprocessing import Pool

import matplotlib.pyplot as plt
import seaborn as sns

import copy

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
%load_ext autotime

In [2]:
def read_csv(filename):
    """Load a single CSV file into a pandas DataFrame.

    The 'Created Date' and 'Closed Date' columns are handed to pandas as
    date columns to parse.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The contents of ``filename``.
    """
    # Columns pandas should try to parse as dates; further pd.read_csv
    # options can be added to the call below.
    date_columns = ['Created Date', 'Closed Date']
    frame = pd.read_csv(filename, parse_dates=date_columns)
    return frame


def read_parallel(directory, processes=24):
    """Read every ``.csv`` file in ``directory`` in parallel and combine them.

    Each file is loaded with ``read_csv`` in a worker process and the
    resulting frames are concatenated in sorted-path order.

    Parameters
    ----------
    directory : str
        Directory in which the split files are located. A trailing path
        separator is optional.
    processes : int or None, optional
        Upper bound on the number of worker processes. Defaults to 24, the
        value that used to be hard-coded. ``None`` means ``os.cpu_count()``.
        No more workers than files are started.

    Returns
    -------
    pandas.DataFrame
        The concatenated frames with a fresh integer index, minus the first
        row of the combined frame (see the note in the body).

    Raises
    ------
    ValueError
        If ``directory`` contains no ``.csv`` files.
    """
    # Collect the file names before starting any workers, so that a bad
    # directory cannot leave a pool of idle processes behind.
    # os.path.splitext copes with names that have no dot or several dots;
    # os.path.join makes the trailing slash on `directory` optional.
    file_list = sorted(
        os.path.join(directory, filename)
        for filename in os.listdir(directory)
        if os.path.splitext(filename)[1] == '.csv'
    )
    if not file_list:
        raise ValueError("No .csv files found in {!r}".format(directory))

    if processes is None:
        processes = os.cpu_count() or 1
    n_workers = min(processes, len(file_list))

    # The context manager shuts the workers down once the blocking map()
    # has returned all frames (or if reading any file raises).
    with Pool(processes=n_workers) as pool:
        df_list = pool.map(read_csv, file_list)

    # Reduce the list of dataframes to a single dataframe.
    combined_df = pd.concat(df_list, ignore_index=True)

    # NOTE(review): the first row of the combined frame is discarded, as in
    # the original code. Presumably it is an artefact of how the split script
    # writes the first file -- confirm against script_to_split.sh.
    combined_df = combined_df.drop(0).reset_index(drop=True)

    return combined_df

time: 5.43 ms


In [3]:
# Parallel load: read all split CSV files under ./311_split/ via read_parallel
# and keep the combined DataFrame for comparison with the serial read.
data_orig = read_parallel('./311_split/')

time: 54.1 s


In [4]:
# Serial baseline: read the unsplit CSV with one pd.read_csv call, parsing the
# same two date columns as read_csv does, so the timings can be compared.
data_serial_run = pd.read_csv('./311_Service_Requests_from_2018_to_Present.csv', parse_dates = ['Created Date', 'Closed Date'])

time: 10min 36s


As the timings above show, the serial read (10 min 36 s) takes more than 10 times as long as the parallel read (54.1 s) — roughly 11.8× — to load the same data!