In [1]:
import pandas as pd
import pyarrow as pa
import matplotlib.pyplot as plt
import seaborn as sns
import json
import plotly.express as px
import numpy as np
import os
import plotly.graph_objects as go
from datetime import datetime
import glob
import pyarrow

In [2]:
def getDayData(df, stations_to_keep=None):
    """Flatten one raw day file into a per-station DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame read with ``header=None``. Column ``0`` holds the timestamp
        and column ``2`` holds one JSON object per row, serialised as a string.
        The frame is left untouched; all work happens on a copy.
    stations_to_keep : iterable of str, optional
        Station codes to retain. When omitted, the twelve stations originally
        hard-coded in this notebook are used.

    Returns
    -------
    pandas.DataFrame
        One row per retained record, with the raw/bookkeeping columns removed,
        column ``0`` parsed to a UTC datetime and renamed ``"time"``,
        ``"extra.rentalBikes"`` cast to float, and a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If column ``2`` is absent, or the JSON payloads yield no
        ``"extra.rentalBikes"`` / ``"stationCode"`` field.

    Notes
    -----
    Rows whose ``extra.rentalBikes`` equals the maximum among the retained
    stations are dropped. NOTE(review): the original comment describes this as
    removing outliers / missing data (presumably a sentinel value); it also
    removes genuine peak readings -- confirm this is intended.
    """
    if stations_to_keep is None:
        stations_to_keep = [
            "ASD", "DT", "GVC", "EHV", "UT", "MT",
            "RTD", "MDB", "NM", "ZL", "ES", "GN",
        ]

    # Work on an index-reset copy: the caller's frame is never mutated, and the
    # positional index guarantees the column-wise concat below lines up row by
    # row even when the input carries a non-default index.
    raw = df.reset_index(drop=True)

    # Column 2 is one JSON string per row; decode and flatten nested keys into
    # dotted column names (e.g. "extra.rentalBikes").
    records = raw[2].apply(json.loads).tolist()
    flat = pd.json_normalize(records)

    combined = pd.concat([raw, flat], axis=1)
    combined["extra.rentalBikes"] = combined["extra.rentalBikes"].astype(float)
    # Unparseable timestamps become NaT rather than raising.
    combined[0] = pd.to_datetime(combined[0], format="ISO8601", errors="coerce", utc=True)
    combined = combined.rename(columns={0: "time"})

    # Columns that are not needed downstream; only those present are dropped.
    drop_cols = [
        1, 2, 3, "link.uri", "thumbnail.uri", "extra.serviceType",
        "extra.type", "extraInfo", "apps", "sites", "houseNumber",
        "street", "openingHours", "open", "ovFiets",
        "nearbyMeLocationId.type", "nearbyMeLocationId.value",
        "extra.locationCode", "name", "extra.fetchTime",
        "parsed", "infoImages", "postalCode",
    ]
    combined = combined.drop(columns=[c for c in drop_cols if c in combined.columns])

    kept = combined[combined["stationCode"].isin(list(stations_to_keep))]

    # Drop every row sitting at the maximum bike count (see Notes). On an empty
    # selection max() is NaN, the comparison is all-True and nothing is lost.
    bikes = kept["extra.rentalBikes"]
    kept = kept[bikes != bikes.max()]

    return kept.reset_index(drop=True)

In [4]:
# Convert a window of raw daily files (*.csv.xz) into one parquet file per day.

in_folder = 'ovfiets_2023'       # folder holding the raw daily files
out_folder = "ovfiets_parquet"   # destination folder for the parquet files

# Window of files to convert, by file name within the sorted folder listing.
# start_from is inclusive, end_at is exclusive. Set either one to None to leave
# that side of the window open.
start_from = "OVFiets_2023-04-01.csv.xz"  # change this to the file you want to start from
end_at = "OVFiets_2023-07-01.csv.xz"

# Get sorted list of files
all_files = sorted(os.listdir(in_folder))

# Fail loudly on a mistyped bound. Previously a missing start file left
# files_to_process undefined (or, on a warm kernel, silently stale from an
# earlier run).
if start_from and start_from not in all_files:
    raise FileNotFoundError(f"start_from file {start_from!r} not found in {in_folder!r}")
if end_at and end_at not in all_files:
    raise FileNotFoundError(f"end_at file {end_at!r} not found in {in_folder!r}")

start_index = all_files.index(start_from) if start_from else 0
end_index = all_files.index(end_at) if end_at else len(all_files)
files_to_process = all_files[start_index:end_index]

# to_parquet does not create missing directories.
os.makedirs(out_folder, exist_ok=True)

# Convert each selected day file
for name in files_to_process:
    path = os.path.join(in_folder, name)
    open_df = pd.read_csv(path, header=None)

    print (f'Opened file -> {name}')

    reworked_df = getDayData(open_df)

    # Output name: same base name as the input, with a .parquet extension
    base = os.path.basename(name).replace(".csv.xz", "")
    out_path = os.path.join(out_folder, f"{base}.parquet")

    # save as parquet (use 'pyarrow' or 'fastparquet' backend if available)
    reworked_df.to_parquet(out_path, index=False, engine='pyarrow')
    print("Saved:", out_path)

Opened file -> OVFiets_2023-04-01.csv.xz
Saved: ovfiets_parquet\OVFiets_2023-04-01.parquet
Opened file -> OVFiets_2023-04-02.csv.xz
Saved: ovfiets_parquet\OVFiets_2023-04-02.parquet
Opened file -> OVFiets_2023-04-03.csv.xz
Saved: ovfiets_parquet\OVFiets_2023-04-03.parquet
Opened file -> OVFiets_2023-04-04.csv.xz
Saved: ovfiets_parquet\OVFiets_2023-04-04.parquet
Opened file -> OVFiets_2023-04-05.csv.xz
Saved: ovfiets_parquet\OVFiets_2023-04-05.parquet
Opened file -> OVFiets_2023-04-06.csv.xz
Saved: ovfiets_parquet\OVFiets_2023-04-06.parquet
Opened file -> OVFiets_2023-04-07.csv.xz
Saved: ovfiets_parquet\OVFiets_2023-04-07.parquet
Opened file -> OVFiets_2023-04-08.csv.xz
Saved: ovfiets_parquet\OVFiets_2023-04-08.parquet
Opened file -> OVFiets_2023-04-09.csv.xz
Saved: ovfiets_parquet\OVFiets_2023-04-09.parquet
Opened file -> OVFiets_2023-04-10.csv.xz
Saved: ovfiets_parquet\OVFiets_2023-04-10.parquet
Opened file -> OVFiets_2023-04-11.csv.xz
Saved: ovfiets_parquet\OVFiets_2023-04-11.parquet

KeyboardInterrupt: 