In [None]:
import os
import sys

from time import time, strftime, gmtime
from datetime import datetime
from dateutil import parser

from pathlib import Path
import warnings

# Definitions
- Define your start and end dates in format: `MM/DD/YYYY HH:MM:SS`
- Define your destination file (you can use a full or relative path)

*Note: If your destination file already exists, it will be deleted.*

In [None]:
# Time window to extract, written as MM/DD/YYYY HH:MM:SS.
DATE_START = '10/11/2018 15:00:00'
DATE_END = '02/03/2019 23:59:00'

# CSV file that receives the extracted rows.
DEST_FILE = './filtered.csv'

# Tell the user up front when the destination is already present on disk.
tmp_file = Path(DEST_FILE)
if tmp_file.is_file():
    print('WARNING: File {} already exists. It will be deleted!'.format(DEST_FILE))


- Define the list of all TMCs that should be extracted. You may write the list explicitly or build it with your own code.
- If you want to extract all TMCs, just set `TMC_LIST = None`

In [None]:
# TMC identifiers to keep; set to None to keep every TMC.
TMC_LIST = [
    '110+04099',
    '110+04103',
    '110+05792',
]


# Code
- Just run the cells below to extract your data

In [None]:
def getFileList(date_start = DATE_START, date_end = DATE_END,
                file_template = '/hdd3/Volumes/MD{}/estimates/estimates_{}.csv'):
    """Return the existing monthly source files that cover a date range.

    One candidate path is built for every calendar month from the month of
    ``date_start`` through the month of ``date_end`` (inclusive). Candidates
    that are not regular files are left out and reported with a warning.

    Parameters
    ----------
    date_start, date_end : str or None
        Range bounds in any format ``dateutil.parser.parse`` accepts.
        ``None`` stands for '01/01/2018' and '12/31/2019' respectively.
        Bounds outside 2018-01-01 .. 2019-12-31 are clamped to that window.
        NOTE: the defaults are bound when this ``def`` is executed; after
        editing DATE_START / DATE_END either re-run this cell or pass the
        dates explicitly.
    file_template : str
        ``str.format`` template that receives the year and the zero-padded
        two-digit month, in that order.

    Returns
    -------
    list of str
        Paths of the monthly files that exist, in chronological order.
    """
    earliest = datetime(2018, 1, 1)
    latest = datetime(2019, 12, 31)

    if date_start is None:
        date_start = '01/01/2018'
    if date_end is None:
        date_end = '12/31/2019'

    # Clamp the requested range to the supported window.
    date_start = max(parser.parse(date_start), earliest)
    date_end = min(parser.parse(date_end), latest)

    # Months are enumerated as "months since year 0" so that ranges spanning
    # a year boundary need no special casing.
    first_month = date_start.year * 12 + date_start.month - 1
    last_month = date_end.year * 12 + date_end.month - 1

    files = []
    for month_index in range(first_month, last_month + 1):
        # Integer arithmetic throughout (no float division).
        year, month_zero_based = divmod(month_index, 12)
        path = file_template.format(year, str(month_zero_based + 1).zfill(2))
        if Path(path).is_file():
            files.append(path)
        else:
            warnings.warn("File {} does not exist.".format(path))
    return files

def getFileSize(files):
    """Return the combined on-disk size, in bytes, of all paths in ``files``."""
    return sum(os.path.getsize(path) for path in files)

In [None]:
t = time()

# Number of input lines between two progress reports.
PROGRESS_EVERY = 500000

# Resolve the filter bounds; a bound configured as None falls back to the
# widest supported range.
date_start = DATE_START if DATE_START is not None else '01/01/2018'
date_end = DATE_END if DATE_END is not None else '12/31/2019'

# Rows are filtered by comparing their timestamp column, as text, against
# str(datetime), i.e. 'YYYY-MM-DD HH:MM:SS'.
# NOTE(review): this assumes column 4 of the source files uses that same
# format, so that string order equals chronological order - confirm.
date_start = str(parser.parse(date_start))
date_end = str(parser.parse(date_end))

# Pass the configured dates explicitly: the defaults of getFileList() were
# bound when its cell was executed and may be stale after a config change.
files = getFileList(DATE_START, DATE_END)
bytes_processed = 0
lines_all = 0
lines_saved = 0
filesize = getFileSize(files)

# Set lookup keeps the per-row membership test cheap for long TMC lists.
tmc_filter = frozenset(TMC_LIST) if TMC_LIST is not None else None

# The header row is copied once, from the first source file that has one.
header_written = False

with open(DEST_FILE, 'w') as dest:
    for filename_source in files:
        with open(filename_source, 'r') as source:
            print ('Start reading from {}.                                           '.format(filename_source))

            # The first line of every source file is treated as its header.
            header = source.readline()
            if header and not header_written:
                dest.write(header)
                header_written = True
            bytes_processed += len(header)
            lines_all += 1

            for line in source:
                lines_all += 1
                bytes_processed += len(line)

                if lines_all % PROGRESS_EVERY == 0:
                    # Estimate the remaining time from the share of input
                    # consumed so far (character count vs. size on disk).
                    remaining = max(0, filesize - bytes_processed)
                    time_to_finish = (time() - t) * remaining / bytes_processed
                    sys.stdout.write('{:,} lines processed, {:,} lines saved. {}s to finish.                 \r'.
                                     format(lines_all, lines_saved, strftime('%H:%M:%S', gmtime(time_to_finish))))

                fields = line.split(',')
                if len(fields) < 5:
                    # Blank or truncated row: it has no timestamp column.
                    continue
                tmc = fields[0]
                dt = fields[4]

                if tmc_filter is not None and tmc not in tmc_filter:
                    continue

                if dt < date_start or dt > date_end:
                    continue

                dest.write(line)
                lines_saved += 1


print ('Done in {}s. {:,} lines processed, {:,} lines saved in {}'.format(
    strftime('%H:%M:%S', gmtime(time() - t)),
    lines_all, lines_saved,
    DEST_FILE
))