# Upload data to Google Cloud Storage (GCS)

In [2]:
import os
# Tell the Google client libraries which credentials file to use
# (the path is relative to the current working directory).
os.environ.update({'GOOGLE_APPLICATION_CREDENTIALS': 'gcs-key.json'})

In [3]:
from google.cloud import storage

In [4]:
def upload_partitioned_dataset_skip_existing(local_root, bucket_name, gcs_root):
    """Upload every file under ``local_root`` to a bucket, skipping existing objects.

    The directory tree below ``local_root`` is mirrored under the ``gcs_root``
    prefix, so partition folders such as ``year=2019/month=1/`` keep their
    layout in the bucket. One status line is printed per file.

    Args:
        local_root: Local directory that is walked recursively.
        bucket_name: Name of the destination bucket (without ``gs://``).
        gcs_root: Object-name prefix inside the bucket.

    Raises:
        NotADirectoryError: If ``local_root`` is not an existing directory.
    """
    # os.walk() yields nothing for a missing path, so a typo in local_root
    # would look like a successful run that had nothing to upload.
    if not os.path.isdir(local_root):
        raise NotADirectoryError(f"local_root is not a directory: {local_root!r}")

    client = storage.Client()
    bucket = client.bucket(bucket_name)

    for root, _dirs, files in os.walk(local_root):
        for file in files:
            local_path = os.path.join(root, file)
            relative_path = os.path.relpath(local_path, local_root)
            gcs_path = os.path.join(gcs_root, relative_path)
            # Object names use "/". Only translate the platform separator:
            # on POSIX a backslash is an ordinary file-name character and
            # must not be turned into an extra "folder" level.
            if os.sep != "/":
                gcs_path = gcs_path.replace(os.sep, "/")

            blob = bucket.blob(gcs_path)

            # One existence check per file keeps re-runs from re-uploading.
            if blob.exists():
                print(f"⏩ Skipped existing file: gs://{bucket_name}/{gcs_path}")
            else:
                blob.upload_from_filename(local_path)
                print(f"✅ Uploaded {local_path} to gs://{bucket_name}/{gcs_path}")

In [5]:
import pandas as pd
import requests

In [6]:
# Read only the rows with 2022 <= year <= 2024 from the partitioned
# Parquet dataset stored in the bucket.
df = pd.read_parquet(
    "gs://bernacho-ecobici-datahub/partitioned_historical_data",
    filters=[
        ("year", ">=", 2022),
        ("year", "<=", 2024),
    ],
)

In [7]:
# Print the column dtypes and the in-memory size of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39503816 entries, 0 to 39503815
Data columns (total 15 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   Genero_Usuario         object        
 1   Edad_Usuario           Int64         
 2   Bici                   object        
 3   Ciclo_Estacion_Retiro  object        
 4   Fecha_Retiro           datetime64[ns]
 5   Hora_Retiro            object        
 6   Ciclo_Estacion_Arribo  object        
 7   Fecha_Arribo           datetime64[ns]
 8   Hora_Arribo            object        
 9   date_start             datetime64[ns]
 10  date_end               datetime64[ns]
 11  duration               int32         
 12  file                   object        
 13  year                   category      
 14  month                  category      
dtypes: Int64(1), category(2), datetime64[ns](4), int32(1), object(7)
memory usage: 3.8+ GB


In [8]:
# `year` and `month` are loaded as `category`; cast both to integers so the
# numeric comparisons in the following cells (e.g. `df.month < 9`) work.
for partition_col in ('year', 'month'):
    df[partition_col] = df[partition_col].astype(int)

In [9]:
# Number of rows per (year, month), ordered chronologically.
df.value_counts(subset=['year', 'month']).sort_index()

year  month
2022  1         393115
      2         434262
      3         557931
      4         523923
      5         594329
      6         527685
      7         405082
      8         358892
      9         203064
      10        232276
      11        422208
      12        431484
2023  1         586402
      2         664948
      3         817979
      4         813608
      5         945162
      6        1009992
      7         999706
      8        1109194
      9        1189627
      10       1401777
      11       1410644
      12       1228319
2024  1        1558095
      2        1699393
      3        1829741
      4        1961618
      5        2053545
      6        1893513
      7        1825635
      8        1891298
      9        1852147
      10       2054419
      11       1942324
      12       1680479
Name: count, dtype: int64

In [10]:
# Count of the un-padded station id "2" as pick-up station, Jan–Aug 2022.
(
    df.loc[(df['year'] == 2022) & (df['month'] < 9), 'Ciclo_Estacion_Retiro']
    .value_counts()
    .loc[["2"]]
    .head(20)
)

Ciclo_Estacion_Retiro
2    9249
Name: count, dtype: int64

In [12]:
# Count of the zero-padded station id "002" as pick-up station from 2023 on.
(
    df.loc[df['year'] >= 2023, 'Ciclo_Estacion_Retiro']
    .value_counts()
    .loc[["002"]]
    .head(20)
)

Ciclo_Estacion_Retiro
002    76436
Name: count, dtype: int64

In [13]:
# Monthly row counts for pick-up station "2" under either spelling of its id.
(
    df.loc[df['Ciclo_Estacion_Retiro'].isin(["2", "002"])]
    .groupby(['year', 'month'])
    .size()
    .sort_index()
)

year  month
2022  1        1066
      2        1146
      3        1302
      4        1242
      5        1433
      6         930
      7        1103
      8        1027
      9         489
      10        389
      11       2020
      12       1834
2023  1        2381
      2        2412
      3        2591
      4        2515
      5        2930
      6        2483
      7        2497
      8        2925
      9        2685
      10       3172
      11       3380
      12       2605
2024  1        3594
      2        3714
      3        3642
      4        4022
      5        4141
      6        3872
      7        3895
      8        3653
      9        3479
      10       3543
      11       3397
      12       2908
dtype: int64

In [38]:
# Pick-up station ids that are exactly four characters long, Jan–Jul 2022.
(
    df.loc[(df['year'] == 2022) & (df['month'] < 8), 'Ciclo_Estacion_Retiro']
    .value_counts()
    .loc[lambda counts: counts.index.str.len() == 4]
)

Ciclo_Estacion_Retiro
3002    1
Name: count, dtype: int64

In [30]:
# Pick-up station ids that are exactly four characters long in 2024.
(
    df.loc[df['year'] == 2024, 'Ciclo_Estacion_Retiro']
    .value_counts()
    .loc[lambda counts: counts.index.str.len() == 4]
)

Ciclo_Estacion_Retiro
1000    8
1002    1
Name: count, dtype: int64

In [23]:
# Ten most frequent arrival stations for rows whose pick-up id is "2".
(
    df.loc[df['Ciclo_Estacion_Retiro'] == "2", 'Ciclo_Estacion_Arribo']
    .value_counts()
    .head(10)
)

Ciclo_Estacion_Arribo
2      366
1      319
16     312
35     294
34     293
32     292
9      258
24     251
103    199
27     177
Name: count, dtype: int64

In [24]:
# Ten most frequent arrival stations for rows whose pick-up id is "002".
(
    df.loc[df['Ciclo_Estacion_Retiro'] == "002", 'Ciclo_Estacion_Arribo']
    .value_counts()
    .head(10)
)

Ciclo_Estacion_Arribo
002        2629
001        2264
271-272    2181
016        1806
027        1659
032        1428
014        1344
056        1175
007        1096
019        1006
Name: count, dtype: int64

In [16]:
# Monthly row counts for the pick-up station id "271-272".
(
    df.loc[df['Ciclo_Estacion_Retiro'] == "271-272"]
    .groupby(['year', 'month'])
    .size()
    .sort_index()
)

year  month
2022  10        1434
      11        7829
      12        9576
2023  1        10306
      2         8537
      3         9231
      4         9789
      5        13228
      6        13968
      7        13723
      8        13208
      9        15884
      10       17622
      11       17139
      12       17132
2024  1        18905
      2        16639
      3        18124
      4        19293
      5        24203
      6        22869
      7        22951
      8        22421
      9        22284
      10       24460
      11       20731
      12       12317
dtype: int64

In [5]:
# Local root folder of the partitioned dataset to upload
# (relative to the current working directory).
data_path = "data/partitioned_historical_data"

In [7]:
# Mirror the local partition tree into the bucket; objects that are already
# present are left untouched.
upload_partitioned_dataset_skip_existing(
    bucket_name="bernacho-ecobici-datahub",
    gcs_root="partitioned_historical_data",
    local_root=data_path,
)


✅ Uploaded data/partitioned_historical_data\year=2019\month=1\786eac27676e413bb2fa0bbe0b2e6579-0.parquet to gs://bernacho-ecobici-datahub/ecobici_partitioned_data/year=2019/month=1/786eac27676e413bb2fa0bbe0b2e6579-0.parquet
✅ Uploaded data/partitioned_historical_data\year=2019\month=10\b992a1416dde40468076a636d0a80afd-0.parquet to gs://bernacho-ecobici-datahub/ecobici_partitioned_data/year=2019/month=10/b992a1416dde40468076a636d0a80afd-0.parquet
✅ Uploaded data/partitioned_historical_data\year=2019\month=11\9ea28ccbdfd74f409e90254c51d1b2aa-0.parquet to gs://bernacho-ecobici-datahub/ecobici_partitioned_data/year=2019/month=11/9ea28ccbdfd74f409e90254c51d1b2aa-0.parquet
✅ Uploaded data/partitioned_historical_data\year=2019\month=12\775d14e8e5a74e0791c905939c748aa5-0.parquet to gs://bernacho-ecobici-datahub/ecobici_partitioned_data/year=2019/month=12/775d14e8e5a74e0791c905939c748aa5-0.parquet
✅ Uploaded data/partitioned_historical_data\year=2019\month=2\76edf059dc134ad581276ca077913aba-0.p