# Exploratory Data Analysis

First, we need to explore the data in ./data/raw to see what types of information we will be working with.

In [57]:
import yaml
import os
import pandas as pd

# Project root is taken as the parent of the current working directory.
# NOTE(review): this presumes the notebook is launched from a direct
# sub-folder of the project (e.g. notebooks/) — confirm.
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Folder that receives the generated text reports; created when missing.
REPORTS_DIR = os.path.join(ROOT_DIR, "reports")
os.makedirs(REPORTS_DIR, exist_ok=True)

# Build the config path with os.path.join so it resolves on every OS:
# a hard-coded backslash is not a path separator on POSIX systems.
# The file is read as UTF-8 (YAML's default encoding) instead of relying on
# the platform's locale encoding.
with open(os.path.join(ROOT_DIR, "configs", "config.yml"), encoding="utf-8") as f:
    configs = yaml.safe_load(f)

In [58]:
# Relative location of the raw data folder, as declared in the config file.
raw_path = configs['paths']['raw_data']

# Resolve the raw data directory once, using os.path.join so the separator
# matches the host OS (string concatenation with '\\' only works on Windows).
raw_dir = os.path.join(ROOT_DIR, raw_path)

# Full path of each input file; file names come from the config's 'files' map.
agency_file = os.path.join(raw_dir, configs['files']['agency'])
stops_file = os.path.join(raw_dir, configs['files']['stops'])
trips_file = os.path.join(raw_dir, configs['files']['trips'])
calendar_file = os.path.join(raw_dir, configs['files']['calendar'])
routes_file = os.path.join(raw_dir, configs['files']['routes'])

# Load every file into a DataFrame with pandas' default CSV settings.
stops = pd.read_csv(stops_file)
trips = pd.read_csv(trips_file)
calendar = pd.read_csv(calendar_file)
routes = pd.read_csv(routes_file)
agency = pd.read_csv(agency_file)

# Name -> DataFrame mapping used to iterate over all loaded tables.
models = {
    "stops": stops,
    "trips": trips,
    "calendar": calendar,
    "routes": routes,
    "agency": agency,
}

## General Overview Reports

In [None]:
# Write one plain-text exploration report per loaded table into REPORTS_DIR.
# Each section is computed right before it is written, so the file always
# reflects the sections completed so far.
for name, model in models.items():
    report_path = os.path.join(REPORTS_DIR, name + "_exploration.txt")
    banner = "=" * 20

    with open(report_path, mode="w", encoding="utf-8") as f:
        # Title line, then the table's dimensions and column names.
        f.write(f"{banner} {name.upper()} {banner}\n")
        f.write("[INFO] Shape: " + str(model.shape) + "\n")
        f.write("[COLUMNS] " + str(list(model.columns)) + "\n\n")

        # First and last rows of the table.
        first_rows = model.head().to_string()
        f.write(f"[HEAD]\n{first_rows}\n\n")
        last_rows = model.tail().to_string()
        f.write(f"[TAIL]\n{last_rows}\n\n")

        # Column dtypes and per-column null counts.
        f.write(f"[DTYPES]\n{model.dtypes!s}\n\n")
        null_counts = model.isnull().sum()
        f.write(f"[NULLS]\n{null_counts!s}\n\n")

        # Number of fully duplicated rows.
        duplicated_rows = model.duplicated().sum()
        f.write("[DUPLICATED] " + str(duplicated_rows) + "\n\n")

        # Summary statistics over every column, numeric or not.
        summary = model.describe(include="all").to_string()
        f.write(f"[DESCRIBE]\n{summary}\n\n")

        # Most frequent values for the first three columns only.
        f.write("[VALUE COUNTS - Top 3 colunas]\n")
        for col in model.columns[:3]:
            f.write("\nColuna: " + str(col) + "\n")
            top_values = model[col].value_counts().head()
            f.write(f"{top_values!s}\n")

    print("✅ Relatório gerado: " + report_path)