# About
* **Author**: Adil Rashitov (adil.rashitov.98@gmail.com)
* **Created at**: 08.12.2022


In [None]:
# Imports / Configs / Global vars

# Import of native python tools
import os
import json
from functools import reduce

# Import of base ML stack libs
import numpy as np
import sklearn as sc

# Visualization libraries
# import plotly.express as px

# Logging configuration
import logging
logging.basicConfig(format='[ %(asctime)s ][ %(levelname)s ]: %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
logger = logging.getLogger()
logger.setLevel(logging.INFO)


# IPython configs
# `IPython.display` is the public import location for `display` / `HTML`;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
from IPython.core.interactiveshell import InteractiveShell
# Inject CSS that forces the `.container` element to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Show the result of every top-level expression in a cell, not only the last.
InteractiveShell.ast_node_interactivity = 'all'

# Pandas configs
import pandas as pd
import geopandas as gpd
pd.options.display.max_rows = 350
pd.options.display.max_columns = 250

# Jupyter configs
%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = False

# Configure project PATH
from pathlib import Path
import sys

# The project root is the parent of the current working directory.
# `Path.cwd().parent` is used instead of splitting on "/" so the result is
# correct on any OS and never collapses to an empty string when the working
# directory sits directly under the filesystem root.
PROJECT_PATH = str(Path.cwd().parent)

# Make project-local packages importable without duplicating the entry
# when the cell is re-run.
if PROJECT_PATH not in sys.path:
    sys.path.append(PROJECT_PATH)

# Data

In [None]:
# Resolving paths to GPS records & Route plans
from src import path
from src import models


def resolve_files_path(directory: str) -> list:
    """List the entries of `directory` as paths prefixed with `directory`.

    Entries are returned in sorted order because `os.listdir` gives no
    ordering guarantee; sorting makes the result reproducible across
    machines and runs.

    Args:
        directory: Directory to list, with or without a trailing slash.

    Returns:
        One "<directory>/<entry>" string per entry reported by
        `os.listdir(directory)`, with any "//" collapsed to "/".
        An empty directory yields an empty list.

    Raises:
        OSError: Propagated from `os.listdir` (e.g. missing directory).
    """
    return [
        f"{directory}/{entry}".replace("//", "/")
        for entry in sorted(os.listdir(directory))
    ]


# List the raw GPS report files and the raw route plan files, in that order.
gps_records_reports_paths, route_plans_paths = map(
    resolve_files_path,
    (path.DIR_01_RAW_GPS, path.DIR_01_RAW_PLANS),
)

In [None]:
# Reading & reformatting GPS records
from src.gps_formatting_pipeline import factory_raw_gps_formatter_pipeline


gps_formatting_pipeline = factory_raw_gps_formatter_pipeline()


# Load each report (skipping its first 5 rows) and run it through the
# formatting pipeline; a report that fails at either step is logged and skipped.
gps_records = []
for gps_report_path in gps_records_reports_paths:
    try:
        gps_report = gps_formatting_pipeline.fit_transform(
            pd.read_excel(gps_report_path, skiprows=5)
        )
    except Exception as exc:
        logging.error(f"{gps_report_path}: {exc}")
    else:
        gps_records.append(gps_report)

# Stack all reports under a fresh 0..n-1 index, then keep a single row per
# (datetime, plate_no) pair.
gps_records = pd.concat(gps_records, ignore_index=True)
gps_records = gps_records.drop_duplicates(subset=["datetime", "plate_no"])
gps_records = gps_records.reset_index(drop=True)

In [None]:
# Route plan
# Read every plan file as ";"-separated text with all columns kept as strings;
# a file that cannot be read is logged and skipped.
route_plans = []
for route_plan_path in route_plans_paths:
    try:
        # Append the frame directly: no temporary variable is left behind, so
        # nothing has to be deleted afterwards (deleting a loop temporary
        # fails with NameError when no file could be read).
        route_plans.append(
            pd.read_csv(route_plan_path, delimiter=";", dtype="str")
        )
    except Exception as exc:
        logging.error(f"{route_plan_path}: {exc}")

# Fail with an explicit message instead of pandas' generic
# "No objects to concatenate" when nothing was loaded.
if not route_plans:
    raise ValueError(
        f"No route plan could be read from {len(route_plans_paths)} candidate file(s)"
    )

route_plans = pd.concat(route_plans).reset_index()
route_plans = models.raw_plan.validate(route_plans)
# Dates arrive as YYYYMMDD strings; parse them and store the result as str.
route_plans["date"] = pd.to_datetime(route_plans["date"], format='%Y%m%d').astype(str)
route_plans = route_plans.rename(columns={"地址": "address"})
# Drop the helper column produced by `reset_index()` above, if still present.
route_plans = route_plans.drop(columns=["index"], errors="ignore")
route_plans = route_plans.drop_duplicates(subset=["CRN#", "truck code", "date", "address"])

In [None]:
# Geocodes: table read from the raw geocodes CSV (path relative to the cwd)
geocodes = pd.read_csv(
    filepath_or_buffer="../data/01_raw/geocodes/locations.csv",
)

# Main

In [None]:
# Join geocodes & route plan
# No `on=` is given, so pandas joins on the columns the two frames share;
# the left join keeps every route plan row even without a geocode match.
route_plans = route_plans.merge(right=geocodes, how="left")

In [None]:
# Persist both intermediate tables as parquet files, without the frame index.
for frame, destination in (
    (route_plans, "../data/02_intermediate/route_plan.parquet"),
    (gps_records, "../data/02_intermediate/gps_records.parquet"),
):
    frame.to_parquet(destination, index=False)