In [2]:
import pandas as pd

In [27]:
months = [201710, 201711, 201712]
template = 'consumption_{month}.csv'
chunksize = 3

chunks = []
for month in months:
    filename = template.format(month=month)
    print(filename)

    chunk_iterator = pd.read_csv(filename, sep=',', chunksize=chunksize)
    for chunk in chunk_iterator:
        chunk.columns = chunk.columns.str.replace(str(month) + '_', '')
        
        chunk_sum = pd.DataFrame(
            data={
                'MORNING': chunk['TV_M'] + chunk['VOD_M'],
                'AFTERNOON': chunk['TV_A'] + chunk['VOD_A'],
                'NIGHT': chunk['TV_N'] + chunk['VOD_N'],
                'USER_ID': chunk['USER_ID'],
                'YEAR_MONTH': month,
            }
        )
        chunks.append(chunk_sum)

df = pd.concat(chunks)

consumption_201710.csv
consumption_201711.csv
consumption_201712.csv


In [28]:
df

Unnamed: 0,AFTERNOON,MORNING,NIGHT,USER_ID,YEAR_MONTH
0,98,834,6000,1,201710
0,98,834,6000,1,201711
0,98,834,6000,1,201712


1. Computation is easy, testing is hard: need to create files, etc.
2. Suppose we want to aggregate over TV, VOD => lots of redundant code
3. Extract iteration over chunks
4. Extract removal of month: show how to chain iterators, leave as exercise
5. We want to do better and remove all redundancy: generator that takes a list of filenames and return chunks. Need to introduce zip and other built-in generators
6. `dask` version

In [32]:
def iterate_files_in_chunks(filenames, chunksize):
    for filename in filenames:
        chunk_iterator = pd.read_csv(filename, sep=',', chunksize=chunksize)
        for chunk in chunk_iterator:
            yield chunk

In [33]:
def months_to_filename(months):
    for month in months:
        yield template.format(month=month)

In [47]:
# ??? clean_columns needs `month`
for month, chunk in zip(months, clean_columns(iterate_chunks(months_to_filename(months), chunksize=chunksize)):
    chunk.columns = chunk.columns.str.replace(str(month) + '_', '')


SyntaxError: invalid syntax (<ipython-input-47-50fcb5f621c3>, line 2)

In [44]:
months = [201710, 201711, 201712]
template = 'consumption_{month}.csv'
chunksize = 3

chunks = []
for month, chunk in zip(months, iterate_files_in_chunks(months_to_filename(months), chunksize=chunksize)):
    chunk.columns = chunk.columns.str.replace(str(month) + '_', '')

    chunk_sum = pd.DataFrame(
        data={
            'MORNING': chunk['TV_M'] + chunk['VOD_M'],
            'AFTERNOON': chunk['TV_A'] + chunk['VOD_A'],
            'NIGHT': chunk['TV_N'] + chunk['VOD_N'],
            'USER_ID': chunk['USER_ID'],
            'YEAR_MONTH': month,
        }
    )
    chunks.append(chunk_sum)

df = pd.concat(chunks).set_index(['USER_ID', 'YEAR_MONTH'])

In [45]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,AFTERNOON,MORNING,NIGHT
USER_ID,YEAR_MONTH,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,201710,98,834,6000
1,201711,98,834,6000
1,201712,98,834,6000
