# Transforming & Normalizing

In [1]:
import yaml
from pathlib import Path
import json
import pandas as pd
import os

# Project root, taken as the parent of the current working directory.
# NOTE(review): this assumes the kernel is started from a folder one level
# below the project root (e.g. a notebooks/ directory) — confirm.
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Pipeline settings. The encoding is pinned to UTF-8, like the JSON reads
# below, so decoding does not depend on the platform's default locale.
with open(f"{ROOT_DIR}/configs/config.yml", "r", encoding="utf-8") as f:
    configs = yaml.safe_load(f)

# Identifier of the line being processed; it names both the raw-data folder
# and the files inside it.
line_id = configs['api']['line_id']

line_folder = Path(ROOT_DIR) / "data" / "raw" / str(line_id)

# Raw payloads for this line, one JSON file each: the line itself, its stops,
# and the estimated arrivals.
with open(line_folder / f"line_{line_id}.json", "r", encoding="utf-8") as f:
    lines_data = json.load(f)

with open(line_folder / f"stop_{line_id}.json", "r", encoding="utf-8") as f:
    stops_data = json.load(f)

with open(line_folder / f"estimated_arrival_{line_id}.json", "r", encoding="utf-8") as f:
    arrival_data = json.load(f)

## Transform .json into dataframes

In [2]:
# Tabular views of the line and stop payloads.
lines_df = pd.DataFrame(lines_data)
stops_df = pd.DataFrame(stops_data)

# arrival_df is deliberately not built here. The arrivals payload is nested
# (stop -> lines -> vehicles) and is flattened with json_normalize in the
# cleaning step, which assigns arrival_df from arrival_data directly. A plain
# pd.DataFrame(arrival_data) would be discarded unread, and may fail outright
# on some payload shapes.

## Initial Cleaning

In [3]:
from pandas import json_normalize

# Short raw keys -> descriptive column names, one mapping per table.
line_columns = {
    "cl": "codigo_linha",
    "lc": "linha_circular",
    "lt": "numero_linha",
    "tl": "tipo_linha",
    "sl": "sentido",
    "tp": "terminal_principal",
    "ts": "terminal_secundario",
}
stop_columns = {
    "cp": "codigo_parada",
    "np": "nome_parada",
    "py": "ponto_y",
    "px": "ponto_x",
}
# Vehicle-level keys are bare; stop- and line-level keys arrive prefixed with
# their path ("p." / "p.l.") because json_normalize names meta columns that way.
# NOTE(review): "tp" is labelled terminal_principal for lines_df above, but
# "p.l.tp" is labelled tipo_linha here — confirm which meaning is intended.
arrival_columns = {
    "p": "prefixo_veiculo",
    "t": "hora_chegada",
    "a": "tem_acessibilidade",
    "ta": "data_captura",
    "py": "latitude",
    "px": "longitude",
    "p.cp": "codigo_parada",
    "p.np": "nome_parada",
    "p.py": "latitude_parada",
    "p.px": "longitude_parada",
    "p.l.cl": "codigo_linha",
    "p.l.lt": "numero_linha",
    "p.l.tp": "tipo_linha",
    "p.l.sl": "sentido_linha",
}

# Lines and stops: drop exact duplicate rows, then apply the readable names.
lines_df = lines_df.drop_duplicates().rename(columns=line_columns)
stops_df = stops_df.drop_duplicates().rename(columns=stop_columns)

# Arrivals: one output row per vehicle record found under p -> l -> vs, with
# the enclosing stop and line fields repeated on each row as meta columns.
vehicle_path = ['p', 'l', 'vs']
stop_meta = [['p', key] for key in ('cp', 'np', 'py', 'px')]
line_meta = [['p', 'l', key] for key in ('cl', 'lt', 'tp', 'sl')]

arrival_df = (
    json_normalize(
        arrival_data,
        record_path=vehicle_path,
        meta=stop_meta + line_meta,
        errors='ignore',  # a missing meta key yields NaN instead of raising
    )
    .drop_duplicates()
    .rename(columns=arrival_columns)
    .reset_index(drop=True)
)

## Joining Tables

In [4]:
# Attach every arrival record to its stop, then to its line. Both joins use
# the pandas default (inner), so rows without a match on both sides are dropped.
# NOTE(review): non-key columns that share a name across the frames (e.g.
# nome_parada appears in both stops_df and arrival_df) come out with pandas'
# default "_x"/"_y" suffixes — confirm downstream consumers expect that.
stops_with_arrivals = stops_df.merge(arrival_df, on="codigo_parada")
merged_df = stops_with_arrivals.merge(lines_df, on="codigo_linha")

## Save Processed Data

In [5]:
# Output folder for the cleaned tables; created on first run, reused afterwards.
processed_dir = f"{ROOT_DIR}/data/processed"
Path(processed_dir).mkdir(parents=True, exist_ok=True)

# Each frame is written to its own Parquet file, without the index column.
outputs = (
    (lines_df, "lines.parquet"),
    (stops_df, "stops.parquet"),
    (arrival_df, "arrivals.parquet"),
    (merged_df, "merged.parquet"),
)
for frame, filename in outputs:
    frame.to_parquet(f"{processed_dir}/{filename}", index=False)
