## Setup


In [None]:
#!pip install -r requirements.txt
import os
import io
import time
import json
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# Artifact layout, rooted at the current working directory:
#   artifacts/           - root for everything this notebook writes
#   artifacts/datasets/  - cleaned CSV outputs
#   artifacts/logs/      - run log (JSON lines)
BASE = '.'
ART_DIR = os.path.join(BASE, 'artifacts')
DATA_DIR, LOG_DIR = (
    os.path.join(ART_DIR, sub) for sub in ('datasets', 'logs')
)

# Create the folders up front; exist_ok makes re-running the cell harmless.
for folder in (ART_DIR, DATA_DIR, LOG_DIR):
    os.makedirs(folder, exist_ok=True)

## Load — From File


In [None]:
# Input CSV, resolved relative to the working directory.
file_path = 'sample_edges.csv'  # replace with your CSV path if needed

# Read the raw data; `df` is the untouched source frame used by later cells.
df = pd.read_csv(file_path)

# Cell output: (row count, preview of the first rows).
(df.shape[0], df.head())

## Load — From URL (optional)


In [None]:
# url = 'https://raw.githubusercontent.com/plotly/datasets/master/2014_usa_states.csv'  # example CSV
# df = pd.read_csv(url)  # uncomment to test if internet is available
# df.head()

## Exploration


In [None]:
# Summary statistics, transposed so each described column becomes a row.
df.describe().transpose()

In [None]:
# Histogram of one column to inspect its distribution.
col = 'distance_km'

plt.figure()
# Series.hist draws on the current axes and returns them, so the title can
# be set on that same Axes object.
hist_axes = df[col].hist(bins=30)
hist_axes.set_title(f'Distribution of {col}')
plt.show()

In [None]:
# Scatter plot of two columns against each other.
x = 'distance_km'
y = 'travel_time_est'
x_values, y_values = df[x], df[y]

plt.figure()
plt.scatter(x_values, y_values)
plt.gca().set_title(f'{x} vs {y}')
plt.show()

## Cleaning & Preprocessing


In [None]:
# Build a cleaned copy of `df`; the raw frame itself is never modified.
clean = df.copy()

# Stage toggles:
#   drop_na   - drop every row that contains a missing value
#   cap_iqr   - clip outliers in the listed columns to the 1.5*IQR fences
#   normalize - rescale every numeric column with MinMaxScaler
drop_na, cap_iqr, normalize = False, True, True

if drop_na:
    clean = clean.dropna()

# Previously `cap_iqr` was assigned but never read, so capping could not be
# turned off; the flag now actually controls this stage.
if cap_iqr:
    for c in ['distance_km', 'travel_time_est', 'fuel_rate']:
        # Columns missing from the input are skipped rather than failing.
        if c in clean.columns:
            q1, q3 = clean[c].quantile([0.25, 0.75])
            iqr = q3 - q1
            lo, hi = q1 - 1.5 * iqr, q3 + 1.5 * iqr
            clean[c] = clean[c].clip(lo, hi)

# Every numeric column is scaled, not only the three capped above.
num_cols = [
    c for c in clean.columns if pd.api.types.is_numeric_dtype(clean[c])]

# MinMaxScaler raises on a zero-row input, so scaling is skipped when the
# frame is empty (empty CSV, or drop_na removed every row).
if normalize and num_cols and not clean.empty:
    scaler = MinMaxScaler()
    clean[num_cols] = scaler.fit_transform(clean[num_cols])

clean.head()

## Save & Log


In [None]:
# Persist the cleaned frame and append a one-line JSON record of this run.

# Take the timestamp once: the original called time.time() separately for the
# filename and for the log entry, so the two could differ by a second and the
# log record would no longer identify the file it describes.
run_ts = int(time.time())

out_path = os.path.join(DATA_DIR, f'cleaned_{run_ts}.csv')
clean.to_csv(out_path, index=False)

# Run metadata: timestamp, input path, and row counts before/after cleaning.
log = {'ts': run_ts, 'source': file_path,
       'rows_in': int(len(df)), 'rows_out': int(len(clean))}

# Append mode keeps the history of earlier runs; the explicit encoding makes
# the log file independent of the platform's default.
with open(os.path.join(LOG_DIR, 'runs.jsonl'), 'a', encoding='utf-8') as f:
    f.write(json.dumps(log) + '\n')

# Cell output: where the cleaned CSV was written.
out_path