# Analyze to create dataset

# 1. Imports

## 1.1 Packages

In [149]:
import gc
import os
import sys
from pathlib import Path

import hopsworks
import pandas as pd


## 1.2 Options

In [150]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [151]:

# Make the project's `src` directory importable from this notebook.
# The project root is taken to be the parent of the current working directory.
project_root: Path = Path().absolute() / ".."
src_dir = os.path.join(project_root, "src")
# Only append once: re-running this cell must not pile up duplicate entries
# in sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

## 1.3 Functions

In [152]:
from velib_prediction.pipelines.data_engineering.nodes import (
    create_feature_description,
    create_idx,
    list_parquet_files,
)


# 2. Data

## 2.1 Load data

In [153]:
# Collect the paths of the raw parquet snapshots.
list_files = list_parquet_files(path="../data/01_raw/")
# NOTE(review): the first entry is discarded; the reason is not visible in
# this notebook -- confirm this is intentional.
del list_files[0]
list_files

['../data/01_raw/velib_2024_10_20T01_01_10_463z.parquet',
 '../data/01_raw/velib_2024_10_19T16_12_14_282z.parquet',
 '../data/01_raw/velib_2024_10_20T08_15_24_155z.parquet',
 '../data/01_raw/velib_2024_10_20T09_12_38_781z.parquet',
 '../data/01_raw/velib_2024_10_19T23_12_13_598z.parquet',
 '../data/01_raw/velib_2024_10_20T10_13_15_125z.parquet',
 '../data/01_raw/velib_2024_10_20T02_26_01_849z.parquet',
 '../data/01_raw/velib_2024_10_19T22_12_24_379z.parquet',
 '../data/01_raw/velib_2024_10_19T17_11_41_188z.parquet',
 '../data/01_raw/velib_2024_10_20T05_11_57_356z.parquet',
 '../data/01_raw/velib_2024_10_19T18_15_02_973z.parquet',
 '../data/01_raw/velib_2024_10_19T20_12_28_942z.parquet',
 '../data/01_raw/velib_2024_10_20T13_18_05_871z.parquet',
 '../data/01_raw/velib_2024_10_20T04_15_11_668z.parquet',
 '../data/01_raw/velib_2024_10_20T12_22_05_300z.parquet',
 '../data/01_raw/velib_2024_10_20T06_16_33_549z.parquet',
 '../data/01_raw/velib_2024_10_19T19_10_45_042z.parquet',
 '../data/01_r

In [154]:
# Load a single snapshot (the first remaining path) on its own.
# `df` is reassigned further down with the merge of all files.
first_path = list_files[0]
df = pd.read_parquet(first_path)

In [155]:
def merge_datasets(list_files: list[str]) -> pd.DataFrame:
    """Read every parquet file and stack the rows into one dataframe.

    All files are read first and concatenated with a single ``pd.concat``
    call, instead of growing a dataframe inside the loop (which re-copies
    the accumulated rows on every iteration). The row index of each file is
    kept as-is; it is not reset.

    Args:
        list_files (list[str]): List of paths to all datasets in parquet files.
    Returns:
        df_final (pd.DataFrame): Rows of all files stacked in input order.
            An empty dataframe when ``list_files`` is empty.
    """
    # pd.concat raises ValueError on an empty sequence, so keep the
    # "no files -> empty dataframe" behavior explicit.
    if not list_files:
        return pd.DataFrame()
    frames = [pd.read_parquet(file) for file in list_files]
    return pd.concat(frames)

In [156]:
# Stack every listed parquet file into one dataframe; this replaces the
# single-file `df` loaded earlier.
df = merge_datasets(list_files)

In [157]:
# Presumably adds the `idx` column seen in the sample below -- confirm in
# create_idx.
# NOTE(review): `df` is overwritten with a transform of itself, so re-running
# only this cell applies create_idx twice -- confirm create_idx is idempotent.
df = create_idx(df)

In [158]:
# Show five rows; the seed is fixed so the same rows come back on re-run.
df.sample(n=5, random_state=12)

Unnamed: 0,idx,stationcode,name,is_installed,capacity,numdocksavailable,numbikesavailable,mechanical,ebike,is_renting,is_returning,duedate,coordonnees_geo,nom_arrondissement_communes,code_insee_commune
1,140141729352667,14014,Jourdan - Stade Charléty,OUI,60,34,22,15,7,OUI,OUI,2024-10-19 15:44:27+00:00,"{'lat': 48.819428333369, 'lon': 2.3433353751898}",Paris,75056
9,170261729424762,17026,Jouffroy d'Abbans - Wagram,OUI,40,38,2,1,1,OUI,OUI,2024-10-20 11:46:02+00:00,"{'lat': 48.881973298352, 'lon': 2.301132157445}",Paris,75056
5,110251729403194,11025,Froment - Bréguet,OUI,43,0,40,28,12,OUI,OUI,2024-10-20 05:46:34+00:00,"{'lat': 48.8570414504784, 'lon': 2.37289470306...",Paris,75056
2,140141729410389,14014,Jourdan - Stade Charléty,OUI,60,28,28,17,11,OUI,OUI,2024-10-20 07:46:29+00:00,"{'lat': 48.819428333369, 'lon': 2.3433353751898}",Paris,75056
1,111041729406695,11104,Charonne - Robert et Sonia Delaunay,OUI,20,18,2,0,2,OUI,OUI,2024-10-20 06:44:55+00:00,"{'lat': 48.855907555969, 'lon': 2.3925706744194}",Paris,75056


## 2.2 Log to Hopsworks

In [159]:
# Log in to Hopsworks and get a handle on the project's feature store.
# NOTE(review): no credentials are passed here, so login presumably relies on
# an API key from the environment or an interactive prompt -- confirm.
project = hopsworks.login()
fs = project.get_feature_store()

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/24732
Connected. Call `.close()` to terminate connection gracefully.


In [160]:
# Quick check that a feature-store object was returned.
print(fs)

<hsfs.feature_store.FeatureStore object at 0x17f944f10>


In [161]:
# Fetch version 1 of the "velib_prediction" feature group, or define it if it
# does not exist yet.
data_fg = fs.get_or_create_feature_group(
    # Identity of the feature group
    name="velib_prediction",
    version=1,
    description="Predict velib availability",
    # Keys: `idx` identifies a row, `duedate` is the event-time column
    primary_key=["idx"],
    event_time="duedate",
    # Storage
    online_enabled=True,
)

In [162]:
# Upload the merged dataframe to the feature group.
# NOTE(review): re-running this cell inserts the same rows again; how rows
# with an already-present `idx` are handled is not visible here -- confirm.
data_fg.insert(df)

Uploading Dataframe: 0.00% |          | Rows 0/210 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: velib_prediction_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/24732/jobs/named/velib_prediction_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x17f957a90>, None)

In [163]:
# Build the feature descriptions; the output below shows a list of
# {"name": ..., "description": ...} dicts.
feature_descriptions = create_feature_description()
feature_descriptions

[{'name': 'idx',
  'description': 'Idx based on the station code and datetime as timestamp'},
 {'name': 'stationcode', 'description': 'Code of the velib station'},
 {'name': 'name', 'description': 'Name of the velib station'},
 {'name': 'is_installed', 'description': 'Is the velib station available'},
 {'name': 'capacity', 'description': 'Capacity of the velib station'},
 {'name': 'numdocksavailable',
  'description': 'Number of docks available at the velib station'},
 {'name': 'numbikesavailable',
  'description': 'Number of bikes available at the velib station'},
 {'name': 'mechanical',
  'description': 'Number of mechanical bikes available at the station'},
 {'name': 'ebike', 'description': 'Number of ebikes available at the station'},
 {'name': 'is_renting', 'description': 'Bikes available for renting'},
 {'name': 'is_returning', 'description': 'Places available to return bikes'},
 {'name': 'duedate', 'description': 'Date of the data info'},
 {'name': 'coordonnees_geo',
  'descript

In [164]:
# Attach each description to the feature of the same name in the feature group.
for feature in feature_descriptions:
    feature_name = feature["name"]
    feature_text = feature["description"]
    data_fg.update_feature_description(feature_name, feature_text)

In [165]:
# Turn on statistics for the feature group, including histograms and
# correlations.
data_fg.statistics_config = {
    "enabled": True,
    "histograms": True,
    "correlations": True,
}

# NOTE(review): the two calls below are commented out, so the config above is
# presumably only set on the local object and no statistics are computed --
# confirm whether they should be re-enabled.
# data_fg.update_statistics_config()
# data_fg.compute_statistics()