In [None]:
# The data processing is computationally expensive, so it was run in Google Colab
# with the data stored in Google Drive. To process the data locally, or to read it
# from a different source, the paths used in this notebook must be changed.

from google.colab import drive
# Mount Google Drive at /content/drive so its folders can be read like a local filesystem
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#**********************************************************************************************
# @Name: Demand aggregation by day, route, hour and link
# @Author: Team 21
# @Date: 2020/10/28 20:40:47
# @Help: 
#**********************************************************************************************

# Libraries Import ----------------------------------------------------------------------------
import datetime as dt                 # Date manipulation
import pandas as pd                   # Data manipulation
import numpy as np                    # Numeric manipulation
import os, glob                       # System management
from google.colab import data_table

# Transformation Process -----------------------------------------------------------------------
#1. Filtering by the FAR column == 'False' (NOTE: this filter is currently disabled in the processing loop)
#2. Aggregating the data:
#    - Datekey.
#    - CodigoRuta.
#    - Hour.
#    - Link.
#3. Summary:
#    - PaxUp sum
#    - PaxDw sum

# Development ----------------------------------------------------------------------------------

# Folder (on the mounted Drive) where the original CSV files are stored
path = "/content/drive/My Drive/ds4a-project/outputs/"

# Folder where the transformed files will be written
path_to = "/content/drive/My Drive/ds4a-project/stats/pax_by_link/"

# List of files to process. os.listdir returns entries in arbitrary order, so the
# names are sorted to make the processing order (and the log) deterministic.
lista_files = sorted(f for f in os.listdir(path) if f.endswith('.csv'))

# Make sure the output folder exists so that writing the first result cannot fail
# on a missing directory (no effect when the folder is already there).
os.makedirs(path_to, exist_ok = True)

In [None]:
# Data transformation process

# Counter columns that are converted to numeric and added up into PAXUP / PAXDW
COUNT_COLS = ['SUBENDELANTERA', 'SUBENTRASERA', 'BAJANDELANTERA', 'BAJANTRASERA']
# Aggregation keys: date, route, hour and link
GROUP_COLS = ['DATEKEY', 'CODIGORUTA', 'HOUR', 'LINK']


def _aggregate_pax_by_link(raw):
    """Sum PAXUP and PAXDW per (DATEKEY, CODIGORUTA, HOUR, LINK).

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame containing at least the GROUP_COLS and COUNT_COLS columns.

    Returns
    -------
    pandas.DataFrame
        One row per key combination with the columns GROUP_COLS + ['PAXUP', 'PAXDW'],
        where PAXUP = SUBENDELANTERA + SUBENTRASERA and
        PAXDW = BAJANDELANTERA + BAJANTRASERA, each summed within the group.

    Raises
    ------
    ValueError
        If a counter column holds a value that pd.to_numeric cannot parse.
    """
    # The file is read with keep_default_na = False, so counters may arrive as text
    numeric = {col: pd.to_numeric(raw[col]) for col in COUNT_COLS}
    pax = raw[GROUP_COLS].assign(
        PAXUP = numeric['SUBENDELANTERA'] + numeric['SUBENTRASERA'],
        PAXDW = numeric['BAJANDELANTERA'] + numeric['BAJANTRASERA'],
    )
    return pax.groupby(GROUP_COLS, as_index = False).agg(PAXUP = ('PAXUP', 'sum'), PAXDW = ('PAXDW', 'sum'))


for file_name in lista_files:
    print("Cargando: " + file_name) # prints when the file is loaded
    # Read only the columns the aggregation needs. FAR stays in usecols so a file
    # without that column is still rejected exactly as before.
    raw = pd.read_csv(os.path.join(path, file_name),
                      decimal = ".",
                      usecols = GROUP_COLS + COUNT_COLS + ['FAR'],
                      keep_default_na = False)
    # NOTE(review): the process description lists a FAR == False filter as step 1,
    # but that filter is not applied here (it was commented out); confirm whether
    # it should be re-enabled:
    #     raw = raw[raw['FAR'] == False]

    # A file with a header but no rows has no DATEKEY to name the output after;
    # skip it instead of aborting the whole batch.
    if raw.empty:
        print("Omitido (sin datos): " + file_name)
        continue

    pax_by_link = _aggregate_pax_by_link(raw)

    # The output file is named after the first DATEKEY of the aggregated result
    out_name = str(pax_by_link['DATEKEY'].iloc[0]) + '_pax_link.csv'
    pax_by_link.to_csv(os.path.join(path_to, out_name), index = False)
    print("Guardado: " + file_name) # print when the file is saved after transforming it

Cargando: 20191101.csv
Guardado: 20191101.csv
Cargando: 20191102.csv
Guardado: 20191102.csv
Cargando: 20191103.csv
Guardado: 20191103.csv
Cargando: 20191104.csv
Guardado: 20191104.csv
Cargando: 20191105.csv
Guardado: 20191105.csv
Cargando: 20191106.csv
Guardado: 20191106.csv
Cargando: 20191107.csv
Guardado: 20191107.csv
Cargando: 20191108.csv
Guardado: 20191108.csv
Cargando: 20191109.csv
Guardado: 20191109.csv
Cargando: 20191110.csv
Guardado: 20191110.csv
Cargando: 20191111.csv
Guardado: 20191111.csv
Cargando: 20191112.csv
Guardado: 20191112.csv
Cargando: 20191113.csv
Guardado: 20191113.csv
Cargando: 20191114.csv
Guardado: 20191114.csv
Cargando: 20191115.csv
Guardado: 20191115.csv
Cargando: 20191116.csv
Guardado: 20191116.csv
Cargando: 20191117.csv
Guardado: 20191117.csv
Cargando: 20191118.csv
Guardado: 20191118.csv
Cargando: 20191119.csv
Guardado: 20191119.csv
Cargando: 20191120.csv
Guardado: 20191120.csv
Cargando: 20191121.csv
Guardado: 20191121.csv
Cargando: 20191122.csv
Guardado: 2