In [1]:
import pandas as pd
import os
import glob
from datetime import datetime
from collections import defaultdict

# === Config ===
# Root of the project's data tree. Set the TSI_DATA_OUTPUT environment variable
# to run the notebook on another machine; the default is the original location.
DATA_OUTPUT_ROOT = os.environ.get(
    "TSI_DATA_OUTPUT", "/home/timeworid/Documents/TSI Project/data_output"
)
# Folder scanned for the input CSV files.
DATA_FOLDER = os.path.join(DATA_OUTPUT_ROOT, "divvydata")
# Destination of the aggregated (station, date, net_flow) CSV.
OUTPUT_FILE = os.path.join(DATA_OUTPUT_ROOT, "flux", "flux_par_station_par_jour.csv")
# Timestamp patterns tried, in this order, by parse_date().
DATE_FORMATS = ["%Y-%m-%d %H:%M:%S", "%m/%d/%Y %H:%M", "%Y-%m-%d %H:%M", "%m/%d/%Y %H:%M:%S"]

# === Error log ===
errors = []      # one message per file that was skipped or raised while processing
loaded_rows = 0  # running total of rows read across all files

# === Candidate column names ===
# The input files do not share one schema: for each logical field, the first
# name of the list that exists in a file is the one that gets used.
start_time_cols = ['started_at', 'starttime', 'start_time']
end_time_cols = ['ended_at', 'stoptime', 'end_time']
start_station_cols = ['start_station_name', 'from_station_name']
end_station_cols = ['end_station_name', 'to_station_name']
bike_type_cols = ['rideable_type']
allowed_bike_values = ['electric_bike']
station_id_start_cols = ['start_station_id', 'from_station_id']
station_id_end_cols = ['end_station_id', 'to_station_id']

# === Net flow per (station, date) ===
# defaultdict(int) so that an unseen key starts at 0 before it is incremented
# (arrival) or decremented (departure).
net_flow = defaultdict(int)

# === Fonctions utilitaires ===
def parse_date(date_str):
    """Return the calendar date of a timestamp, or None when it cannot be parsed.

    The value is converted with ``str()`` and matched against every pattern of
    ``DATE_FORMATS`` in order; the first pattern that matches wins.

    Args:
        date_str: Timestamp value of any type (it is passed through ``str()``,
            so a missing value such as NaN simply fails every pattern).

    Returns:
        A ``datetime.date``, or ``None`` if no pattern matches.
    """
    text = str(date_str)  # loop-invariant: convert once, not once per pattern
    for fmt in DATE_FORMATS:
        try:
            return datetime.strptime(text, fmt).date()
        except (ValueError, TypeError):
            # A mismatch only means "try the next pattern". A bare ``except``
            # would also swallow KeyboardInterrupt/SystemExit, making the
            # row-by-row loop that calls this function impossible to interrupt.
            continue
    return None

def get_first_available(df, columns):
    """Return the column of ``df`` for the first candidate name that exists.

    Args:
        df: DataFrame to look into.
        columns: Candidate column names, in order of preference.

    Returns:
        ``df[name]`` for the first ``name`` of ``columns`` found in
        ``df.columns``, or ``None`` when none of them is present.
    """
    present = [name for name in columns if name in df.columns]
    if not present:
        return None
    return df[present[0]]

# === Read the CSV files ===
# Sorted so that files are processed (and errors reported) in a reproducible
# order; glob.glob() makes no ordering guarantee.
csv_files = sorted(glob.glob(os.path.join(DATA_FOLDER, "*.csv")))

for file in csv_files:
    try:
        # low_memory=False parses each column in one pass, which avoids the
        # mixed-type inference that chunked parsing can produce.
        df = pd.read_csv(file, low_memory=False)
        loaded_rows += len(df)  # counts rows read, before any filtering

        # Keep only the allowed bike types when the file has that information.
        # This must run BEFORE the columns are extracted: each Series below
        # belongs to whichever `df` it is taken from, so filtering `df`
        # afterwards would leave the counted rows unfiltered.
        bike_type = get_first_available(df, bike_type_cols)
        if bike_type is not None:
            df = df[bike_type.isin(allowed_bike_values)]

        # Standardised columns (None when the file has no matching column)
        start_time = get_first_available(df, start_time_cols)
        end_time = get_first_available(df, end_time_cols)
        start_station = get_first_available(df, start_station_cols)
        end_station = get_first_available(df, end_station_cols)

        # Skip files that carry no timestamp column at all
        if start_time is None and end_time is None:
            errors.append(f"{file}: pas de colonnes de date valides")
            continue

        # A departure removes one bike from the station (-1), an arrival adds
        # one (+1). Rows with a missing station or an unparseable date are
        # ignored.
        for stations, times, delta in (
            (start_station, start_time, -1),
            (end_station, end_time, +1),
        ):
            if stations is None or times is None:
                continue
            for s, t in zip(stations, times):
                date = parse_date(t)
                if pd.notna(s) and date:
                    net_flow[(str(s).strip(), date)] += delta

    except Exception as e:
        # Best effort: record the failure and carry on with the next file.
        errors.append(f"{file}: {str(e)}")

# === Final DataFrame ===
# One row per (station, date) key accumulated in net_flow.
records = [
    dict(station_name=station, date=day, net_flow=flow)
    for (station, day), flow in net_flow.items()
]
result = pd.DataFrame(records)

# === CSV export ===
# Create the destination folder if it does not exist yet, then write the file.
output_dir = os.path.dirname(OUTPUT_FILE)
os.makedirs(output_dir, exist_ok=True)
result.to_csv(OUTPUT_FILE, index=False)

# === Run summary ===
summary_lines = [
    f"CSV traités : {len(csv_files)}",
    f"Lignes lues : {loaded_rows}",
    f"Erreurs : {len(errors)}",
]
print("\n".join(summary_lines))
if len(errors) > 0:
    print("Quelques erreurs rencontrées :")
    # Only the first five messages are shown to keep the output short.
    for message in errors[:5]:
        print(f" - {message}")


  df = pd.read_csv(file)


CSV traités : 97
Lignes lues : 47961198
Erreurs : 10
Quelques erreurs rencontrées :
 - /home/timeworid/Documents/TSI Project/data_output/divvydata/Divvy_Stations_2014-Q3Q4.csv: pas de colonnes de date valides
 - /home/timeworid/Documents/TSI Project/data_output/divvydata/Divvy_Stations_2016_Q3.csv: pas de colonnes de date valides
 - /home/timeworid/Documents/TSI Project/data_output/divvydata/Divvy_Stations_2016_Q4.csv: pas de colonnes de date valides
 - /home/timeworid/Documents/TSI Project/data_output/divvydata/Divvy_Stations_2017_Q3Q4.csv: pas de colonnes de date valides
 - /home/timeworid/Documents/TSI Project/data_output/divvydata/Divvy_Trips_2019_Q2.csv: pas de colonnes de date valides


In [None]:
#V2_code

import pandas as pd
import os
import glob
from datetime import datetime
from collections import defaultdict

# === Config ===
# Root of the project's data tree. Set the TSI_DATA_OUTPUT environment variable
# to run the notebook on another machine; the default is the original location.
DATA_OUTPUT_ROOT = os.environ.get(
    "TSI_DATA_OUTPUT", "/home/timeworid/Documents/TSI Project/data_output"
)
# Folder scanned for the input CSV files.
DATA_FOLDER = os.path.join(DATA_OUTPUT_ROOT, "divvydata")
# Destination of the pivoted CSV (one row per date, one column per station).
OUTPUT_FILE = os.path.join(DATA_OUTPUT_ROOT, "flux", "flux_par_station_par_jour_pivot.csv")
# Timestamp patterns tried, in this order, by parse_date().
DATE_FORMATS = ["%Y-%m-%d %H:%M:%S", "%m/%d/%Y %H:%M", "%Y-%m-%d %H:%M", "%m/%d/%Y %H:%M:%S"]

# === Error log ===
errors = []      # one message per file that was skipped or raised while processing
loaded_rows = 0  # running total of rows read across all files

# === Candidate column names ===
# The input files do not share one schema: for each logical field, the first
# name of the list that exists in a file is the one that gets used.
start_time_cols = ['started_at', 'starttime', 'start_time']
end_time_cols = ['ended_at', 'stoptime', 'end_time']
start_station_cols = ['start_station_name', 'from_station_name']
end_station_cols = ['end_station_name', 'to_station_name']
bike_type_cols = ['rideable_type']
allowed_bike_values = ['electric_bike']

# === Net flow per (date, station) ===
# defaultdict(int) so that an unseen key starts at 0 before it is incremented
# (arrival) or decremented (departure).
net_flow = defaultdict(int)

# === Fonctions utilitaires ===
def parse_date(date_str):
    """Return the calendar date of a timestamp, or None when it cannot be parsed.

    The value is converted with ``str()`` and matched against every pattern of
    ``DATE_FORMATS`` in order; the first pattern that matches wins.

    Args:
        date_str: Timestamp value of any type (it is passed through ``str()``,
            so a missing value such as NaN simply fails every pattern).

    Returns:
        A ``datetime.date``, or ``None`` if no pattern matches.
    """
    text = str(date_str)  # loop-invariant: convert once, not once per pattern
    for fmt in DATE_FORMATS:
        try:
            return datetime.strptime(text, fmt).date()
        except (ValueError, TypeError):
            # A mismatch only means "try the next pattern". A bare ``except``
            # would also swallow KeyboardInterrupt/SystemExit, making the
            # row-by-row loop that calls this function impossible to interrupt.
            continue
    return None

def get_first_available(df, columns):
    """Return the column of ``df`` for the first candidate name that exists.

    Args:
        df: DataFrame to look into.
        columns: Candidate column names, in order of preference.

    Returns:
        ``df[name]`` for the first ``name`` of ``columns`` found in
        ``df.columns``, or ``None`` when none of them is present.
    """
    present = [name for name in columns if name in df.columns]
    if not present:
        return None
    return df[present[0]]

# === Read the CSV files ===
# Sorted so that files are processed (and errors reported) in a reproducible
# order; glob.glob() makes no ordering guarantee.
csv_files = sorted(glob.glob(os.path.join(DATA_FOLDER, "*.csv")))

for file in csv_files:
    try:
        df = pd.read_csv(file, low_memory=False)
        loaded_rows += len(df)  # counts rows read, before any filtering

        # Keep only the allowed bike types when the file has that information.
        # This must run BEFORE the columns are extracted: each Series below
        # belongs to whichever `df` it is taken from, so filtering `df`
        # afterwards would leave the counted rows unfiltered.
        bike_type = get_first_available(df, bike_type_cols)
        if bike_type is not None:
            df = df[bike_type.isin(allowed_bike_values)]

        # Standardised columns (None when the file has no matching column)
        start_time = get_first_available(df, start_time_cols)
        end_time = get_first_available(df, end_time_cols)
        start_station = get_first_available(df, start_station_cols)
        end_station = get_first_available(df, end_station_cols)

        # Skip files that carry no timestamp column at all
        if start_time is None and end_time is None:
            errors.append(f"{file}: pas de colonnes de date valides")
            continue

        # A departure removes one bike from the station (-1), an arrival adds
        # one (+1). Rows with a missing station or an unparseable date are
        # ignored. Keys are (date, station) in this version.
        for stations, times, delta in (
            (start_station, start_time, -1),
            (end_station, end_time, +1),
        ):
            if stations is None or times is None:
                continue
            for s, t in zip(stations, times):
                date = parse_date(t)
                if pd.notna(s) and date:
                    net_flow[(date, str(s).strip())] += delta

    except Exception as e:
        # Best effort: record the failure and carry on with the next file.
        errors.append(f"{file}: {str(e)}")

# === Raw (long) DataFrame ===
# One row per (date, station) key accumulated in net_flow.
records = [
    dict(date=day, station_name=station, net_flow=flow)
    for (day, station), flow in net_flow.items()
]
raw_df = pd.DataFrame(records)

# === Pivot: one row per date, one column per station ===
# (date, station) pairs that never occurred become 0, and "date" is turned
# back into a regular column.
pivot_df = (
    raw_df
    .pivot(index="date", columns="station_name", values="net_flow")
    .fillna(0)
    .astype(int)
    .reset_index()
)

# === CSV export ===
# Create the destination folder if it does not exist yet, then write the file.
output_dir = os.path.dirname(OUTPUT_FILE)
os.makedirs(output_dir, exist_ok=True)
pivot_df.to_csv(OUTPUT_FILE, index=False)

# === Run summary ===
summary_lines = [
    f"CSV traités : {len(csv_files)}",
    f"Lignes lues : {loaded_rows}",
    f"Erreurs : {len(errors)}",
]
print("\n".join(summary_lines))
if len(errors) > 0:
    print("Quelques erreurs rencontrées :")
    # Only the first five messages are shown to keep the output short.
    for message in errors[:5]:
        print(f" - {message}")


In [1]:
import pandas as pd

# === Load the pivoted file ===
pivot_path = "/home/timeworid/Documents/TSI Project/data_output/flux/flux_par_station_par_jour_pivot.csv"
df = pd.read_csv(pivot_path)

# === Station names: every column except 'date' ===
station_names = [column for column in df.columns if column != "date"]

# === Name -> ID mapping ===
# IDs are 1-based and follow the column order of the pivoted file.
station_id_map = dict(zip(station_names, range(1, len(station_names) + 1)))

# === Rename the station columns to their IDs ===
df_renamed = df.rename(columns=station_id_map)

# === Export the ID-keyed file ===
output_path = "/home/timeworid/Documents/TSI Project/data_output/flux/flux_par_station_IDs.csv"
df_renamed.to_csv(output_path, index=False)

# === Also save the mapping so IDs can be translated back to names ===
mapping_df = pd.DataFrame({
    "station_name": list(station_id_map.keys()),
    "station_id": list(station_id_map.values()),
})
mapping_df.to_csv("/home/timeworid/Documents/TSI Project/data_output/flux/station_name_to_id.csv", index=False)

print("✅ Fichier exporté avec ID + mapping sauvegardé.")


✅ Fichier exporté avec ID + mapping sauvegardé.


In [None]:
import pandas as pd
import numpy as np

# === Load the per-ID file ===
df = pd.read_csv("/home/timeworid/Documents/TSI Project/data_output/flux/data_clean_variation.csv")

# === Total absolute flow per station, largest first ===
station_values = df.drop(columns="date")
flux_totaux = station_values.abs().sum().sort_values(ascending=False)

# === The 30 stations with the largest total absolute flow ===
top_30_ids = flux_totaux.index[:30].tolist()

# === Split into two DataFrames ===
# Both keep the 'date' column: one holds the top-30 stations, the other
# everything else.
df_top30 = df.loc[:, ["date", *top_30_ids]]
df_others = df.drop(columns=top_30_ids)

# === Export ===
df_top30.to_csv("/home/timeworid/Documents/TSI Project/data_output/flux/flux_top30.csv", index=False)
df_others.to_csv("/home/timeworid/Documents/TSI Project/data_output/flux/flux_other_stations.csv", index=False)

print("✅ Fichiers exportés : top 30 et autres stations.")


✅ Fichiers exportés : top 30 et autres stations.
