# Dim-taulujen luonti ja fact-taulun päivitys

In [0]:
from pyspark.sql import functions as F
from delta.tables import DeltaTable
from datetime import datetime
from pyspark.sql import Window

In [0]:
# Pipeline control parameters are stored as name/value rows in a table.
parameter_table = spark.table('vr_hopea.pipeline_parameters')

# Fetch the row before indexing into it: first() returns None when no row
# matches, and indexing None would fail with an unhelpful
# "'NoneType' object is not subscriptable" error.
first_run_row = (
    parameter_table
    .filter(parameter_table.param_name == 'is_first_run')
    .select('param_value')
    .first()
)

if first_run_row is None:
    raise ValueError(
        "Parameter 'is_first_run' was not found in vr_hopea.pipeline_parameters"
    )

# NOTE(review): later cells branch on the truthiness of this value. That only
# behaves as intended when param_value is numeric/boolean (a string '0' would
# be truthy) -- confirm the column type of pipeline_parameters.param_value.
is_first_run = first_run_row[0]

print(f'Parametri: {is_first_run = }')


Parametri: is_first_run = 1


In [0]:
# Load the processed (silver layer) event table; every dimension and the fact
# table below are derived from it. Only a five-row preview is rendered.
silver_df = spark.read.table('vr_hopea.vr_processed')
display(silver_df.limit(5))

cancelled,departure_date,train_number,train_category,train_type,commuter_line_id,station_short_code,event_type,scheduled_time,actual_time,difference_in_minutes,operator_uic_code,operator_short_code,difference_in_seconds,event_nk,actual_hour
False,2025-10-17,9693,Commuter,HL,D,PUR,DEPARTURE,2025-10-17T13:41:36.000Z,,2,10,vr,-120.0,1495f1d5bdc5d409fb21bbd5f4492175a2aa29f94c056d7dc580959890b27362,13
False,2025-10-17,9693,Commuter,HL,D,JK,ARRIVAL,2025-10-17T13:45:00.000Z,2025-10-17T13:47:22.000Z,2,10,vr,-142.0,228564789a72189ffde29c7e78872dc65a74d4a2bfdf32f134df4522efdd10f3,13
False,2025-10-17,9693,Commuter,HL,D,JK,DEPARTURE,2025-10-17T13:45:00.000Z,2025-10-17T13:47:22.000Z,2,10,vr,-142.0,754bd29c25ea4e415938d4035dc1776759513dbf18c9c9ad784f327ee5838a61,13
False,2025-10-17,9693,Commuter,HL,D,PLP,ARRIVAL,2025-10-17T13:47:54.000Z,2025-10-17T13:49:48.000Z,2,10,vr,-114.0,3c8dc45bd169070dae33029ff6d24e5d277545d9d096d874408bcf06f5a60464,13
False,2025-10-17,9693,Commuter,HL,D,PLP,DEPARTURE,2025-10-17T13:47:54.000Z,2025-10-17T13:49:48.000Z,2,10,vr,-114.0,b70dc2648e9d2170c0039a602fd0621a20561015b010336686e4776fd44ae75c,13


In [0]:
# The date dimension already exists as a table; it is only read here and is
# joined to the events when the fact rows are built.
dim_date = spark.read.table('workspace.vr_hopea.dim_date')

## Luodaan dim-taulut
### Luetaan asemien nimet ja lyhenteet CSV-tiedostosta ja yhdistetään nimet alkuperäiseen asemadataan → muodostetaan dim_stations

In [0]:
# Station reference data (station name + short code) read from a CSV file.
stations_df = spark.read.csv('/Volumes/workspace/vr_hopea/files/VR_traffic_points.csv', header=True)
display(stations_df.limit(5))

# Distinct station codes seen in the silver events.
station_codes = silver_df.select('station_short_code').dropDuplicates()

# Surrogate keys must stay stable between runs: fact rows written earlier
# store station_id, so renumbering every station on each run (which a plain
# overwrite with row_number() does as soon as a new code sorts in between the
# old ones) would make those rows point at the wrong station.
if is_first_run:
    # First run: number every station from 1 and (re)create the table.
    stations_to_add = station_codes
    station_id_offset = 0
    station_write_mode = 'overwrite'
else:
    # Later runs: keep existing ids untouched and only append unseen codes,
    # numbered after the current maximum id.
    existing_stations = spark.table('vr_hopea.dim_stations')
    stations_to_add = (
        station_codes.alias('new')
        .join(
            existing_stations.alias('old'),
            # Null-safe so a NULL code is not re-added on every run.
            F.col('new.station_short_code').eqNullSafe(F.col('old.station_short_code')),
            'left_anti',
        )
    )
    # max() of an empty table is NULL -> start from 0 in that case.
    station_id_offset = existing_stations.agg(F.max('station_id')).first()[0] or 0
    station_write_mode = 'append'

stations_dim_df = (
    stations_to_add
    .withColumn(
        'station_id',
        F.row_number().over(Window.orderBy('station_short_code')) + F.lit(station_id_offset),
    )
    .withColumn('updated_at', F.current_timestamp())
)

# Attach the human-readable station name; the left join keeps codes that are
# missing from the CSV (their stationName is NULL).
# NOTE(review): names of already stored stations are not refreshed from the
# CSV on later runs -- confirm that is acceptable.
new_station_rows = (
    stations_dim_df
    .join(stations_df, stations_dim_df.station_short_code == stations_df.stationShortCode, 'left')
    .drop('stationShortCode')
)
new_station_rows.write.mode(station_write_mode).saveAsTable('vr_hopea.dim_stations')

# Read the persisted dimension back so that downstream joins use exactly the
# ids that were stored.
dim_stations = spark.table('vr_hopea.dim_stations')

stationName,stationShortCode
Ahonpää,AHO
Ahvenus,AHV
Ainola,AIN
Airaksela,ARL
Aittaluoto,ATL


### Muodostetaan dim_operators

In [0]:
# Distinct operator codes seen in the silver events.
operator_codes = silver_df.select('operator_short_code').dropDuplicates()

# Surrogate keys must stay stable between runs: fact rows written earlier
# store operator_id, so regenerating all ids with row_number() on every run
# would silently re-point those rows when a new operator appears.
if is_first_run:
    # First run: number every operator from 1 and (re)create the table.
    operators_to_add = operator_codes
    operator_id_offset = 0
    operator_write_mode = 'overwrite'
else:
    # Later runs: keep existing ids and append only unseen operators,
    # numbered after the current maximum id.
    existing_operators = spark.table('vr_hopea.dim_operators')
    operators_to_add = (
        operator_codes.alias('new')
        .join(
            existing_operators.alias('old'),
            # Null-safe so a NULL code is not re-added on every run.
            F.col('new.operator_short_code').eqNullSafe(F.col('old.operator_short_code')),
            'left_anti',
        )
    )
    # max() of an empty table is NULL -> start from 0 in that case.
    operator_id_offset = existing_operators.agg(F.max('operator_id')).first()[0] or 0
    operator_write_mode = 'append'

(
    operators_to_add
    .withColumn(
        'operator_id',
        F.row_number().over(Window.orderBy('operator_short_code')) + F.lit(operator_id_offset),
    )
    .withColumn('updated_at', F.current_timestamp())
    .write.mode(operator_write_mode)
    .saveAsTable('vr_hopea.dim_operators')
)

# Read the persisted dimension back so that downstream joins use exactly the
# ids that were stored.
dim_operators = spark.table('vr_hopea.dim_operators')


### Muodostetaan dim_trains

In [0]:
# A train dimension member is identified by the combination of these columns.
train_key_cols = ['train_number', 'train_category', 'train_type', 'commuter_line_id']
train_candidates = silver_df.select(*train_key_cols).dropDuplicates()

# Surrogate keys must stay stable between runs: fact rows written earlier
# store train_id, so regenerating all ids with row_number() on every run
# would silently re-point those rows when a new train appears.
if is_first_run:
    # First run: number every train from 1 and (re)create the table.
    trains_to_add = train_candidates
    train_id_offset = 0
    train_write_mode = 'overwrite'
else:
    # Later runs: keep existing ids and append only unseen key combinations,
    # numbered after the current maximum id.
    existing_trains = spark.table('vr_hopea.dim_trains')
    trains_to_add = (
        train_candidates.alias('new')
        .join(
            existing_trains.alias('old'),
            # Null-safe comparison on every key column; with plain equality a
            # combination containing NULL would never match and would be
            # appended again on each run.
            [F.col(f'new.{c}').eqNullSafe(F.col(f'old.{c}')) for c in train_key_cols],
            'left_anti',
        )
    )
    # max() of an empty table is NULL -> start from 0 in that case.
    train_id_offset = existing_trains.agg(F.max('train_id')).first()[0] or 0
    train_write_mode = 'append'

(
    trains_to_add
    .withColumn(
        'train_id',
        # Order by the full key: the rows are unique on all four columns, so
        # ordering by train_number alone leaves ties whose numbering is not
        # deterministic.
        F.row_number().over(Window.orderBy(*train_key_cols)) + F.lit(train_id_offset),
    )
    .withColumn('updated_at', F.current_timestamp())
    .write.mode(train_write_mode)
    .saveAsTable('vr_hopea.dim_trains')
)

# Read the persisted dimension back so that downstream joins use exactly the
# ids that were stored (the lazy DataFrame would be recomputed otherwise).
dim_trains = spark.table('vr_hopea.dim_trains')



## Luodaan fact-taulu / lisätään uudet rivit Delta-tauluun

In [0]:
def _build_fact_rows(events_df):
    """Attach dimension surrogate keys to events and project the fact columns.

    Joins `events_df` to the notebook-level `dim_trains`, `dim_operators`,
    `dim_stations` and `dim_date` DataFrames (all inner joins) and returns a
    DataFrame with the fact table's columns.

    Args:
        events_df: DataFrame with the silver event columns used below.

    Returns:
        DataFrame with one row per input event that found a match in every
        dimension.
    """
    train_keys = ['train_number', 'train_category', 'train_type', 'commuter_line_id']

    # The train join is null-safe: with plain equality an event whose key
    # column is NULL never matches its dimension row and is silently dropped
    # by the inner join, even though dim_trains keeps such combinations.
    # NOTE(review): commuter_line_id may plausibly be NULL for non-commuter
    # trains -- confirm against the silver data.
    # Aliases keep the two sides distinguishable even when dim_trains was
    # derived from the same source DataFrame as the events.
    train_match = [
        F.col(f'ev.{key}').eqNullSafe(F.col(f'tr.{key}')) for key in train_keys
    ]

    return (
        events_df.alias('ev')
        .join(dim_trains.alias('tr'), train_match)
        .join(dim_operators, ['operator_short_code'])
        .join(dim_stations, ['station_short_code'])
        .join(dim_date, events_df['departure_date'] == dim_date['date'])
        .select(
            'event_nk',
            'cancelled',
            'event_type',
            # NOTE(review): train_stopping and commercial_stop do not appear in
            # the silver preview shown earlier in this notebook -- confirm they
            # exist in vr_hopea.vr_processed.
            'train_stopping',
            'commercial_stop',
            F.to_date('scheduled_time').alias('scheduled_date'),
            'scheduled_time',
            'actual_time',
            'difference_in_minutes',
            'actual_hour',
            'station_id',
            'operator_id',
            'train_id',
            'date_id'
        )
    )


# First pipeline run: build the fact table from all silver events.
if is_first_run:
    fact_trains = _build_fact_rows(silver_df)
    fact_trains.write.mode('overwrite').saveAsTable('vr_hopea.fact_trains')

    # Flip the pipeline parameter so later runs take the incremental branch.
    delta = DeltaTable.forName(spark, 'vr_hopea.pipeline_parameters')
    delta.update(
        condition = "param_name = 'is_first_run'", set = {'param_value': F.lit(0)}
    )

# Recurring runs after the first one: only add events that are not yet stored.
else:
    fact_event_nks = spark.table('vr_hopea.fact_trains').select('event_nk')

    # left_anti keeps the silver rows (left side) that have no counterpart in
    # the fact table, i.e. exactly the events that are new since the last run.
    new_fact_rows = (
        silver_df.alias('s')
        .join(fact_event_nks.alias('f'),
            F.col('s.event_nk') == F.col('f.event_nk'), 'left_anti')
        )

    fact_batch = _build_fact_rows(new_fact_rows)

    fact_table = DeltaTable.forName(spark, 'vr_hopea.fact_trains')

    # Upsert on the natural key. After the anti-join the batch should contain
    # only unseen keys, so the matched branch acts as a safety net.
    (
        fact_table.alias('target')
        .merge(
            fact_batch.alias('source'),
            'target.event_nk = source.event_nk'
        )
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )



In [0]:
# Final confirmation that the dimension and fact writes above have run.
completion_message = 'Data kirjoitettu dim- ja fact-tauluihin!'
print(completion_message)

Data kirjoitettu dim- ja fact-tauluihin!
