In [None]:
# This notebook builds the OMOP PROVIDER table.
# It is based on the following documentation:
#   https://ohdsi.github.io/CommonDataModel/cdm53.html#provider
#   https://documentation-snds.health-data-hub.fr/omop/documentation_etl/provider.html#description

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, crc32

# Start (or reuse) the Spark session shared by the cells of this notebook.
spark = (
    SparkSession.builder
    .appName("OMOP Provider Table Unified")
    .getOrCreate()
)

# Load the raw specialty reference file. The first line of the CSV is used as
# the header; no explicit schema is given and schema inference is not enabled.
ir_spe_v = spark.read.csv("../data/raw/ir_spe_v.csv", header=True)



In [None]:
# Preview the raw input: first 20 rows, long cell values truncated
# (these are the defaults of DataFrame.show, spelled out here).
ir_spe_v.show(n=20, truncate=True)

In [None]:
# Keep only the two source columns needed here and give them their OMOP
# PROVIDER names (source column -> OMOP column). Any other column of
# ir_spe_v is dropped by the select.
omop_column_names = {
    "PFS_SPE_COD": "provider_source_value",
    "LABEL": "specialty_source_value",
}

providers_df = ir_spe_v.select(
    *[
        col(source_name).alias(omop_name)
        for source_name, omop_name in omop_column_names.items()
    ]
)

providers_df.show()

In [None]:
# Map each source specialty label to an OMOP concept id. The ids were looked up
# in Athena: https://athena.ohdsi.org/search-terms/terms?domain=Provider&standardConcept=Standard&page=1&pageSize=15&query=
# To support a new specialty, add its exact source label and concept id below.
specialty_concept_ids = {
    "Médecin généraliste": 38004446,
    "Radiologue": 45756825,
}

# Build one chained when(...).when(...) expression from the lookup table.
source_specialty = col("specialty_source_value")
specialty_concept_expr = None
for specialty_label, concept_id in specialty_concept_ids.items():
    label_matches = source_specialty == specialty_label
    if specialty_concept_expr is None:
        specialty_concept_expr = when(label_matches, concept_id)
    else:
        specialty_concept_expr = specialty_concept_expr.when(label_matches, concept_id)

# Labels that are not in the table get NULL.
# NOTE(review): OMOP conventions commonly use concept id 0 for unmapped
# values -- confirm whether NULL is intended here.
providers_df = providers_df.withColumn(
    "specialty_concept_id",
    specialty_concept_expr.otherwise(None),
)

providers_df.show()

In [None]:
# Derive an integer provider_id from provider_source_value: hash the value with
# crc32, reduce it modulo MAX_INT (the largest signed 32-bit integer) and cast
# the result to int so it fits an integer column.
# NOTE(review): the remainder can be 0, and two different source values can
# produce the same id; uniqueness of provider_id is not checked in this cell.
MAX_INT = 2**31 - 1

source_value_as_string = col("provider_source_value").cast("string")
source_value_checksum = crc32(source_value_as_string)
bounded_checksum = source_value_checksum % MAX_INT

providers_df = providers_df.withColumn("provider_id", bounded_checksum.cast("int"))

providers_df.show()

In [None]:
# Print the column names and types of the table before it is written out.
providers_df.printSchema()

In [None]:
# Write the PROVIDER table as snappy-compressed parquet, replacing any previous
# output at the same path. coalesce(1) first reduces the data to one partition.
providers_single_partition = providers_df.coalesce(1)
providers_single_partition.write.parquet(
    "../data/processed/PROVIDER.parquet",
    mode="overwrite",
    compression="snappy",
)

# Release the Spark session now that the table has been saved.
spark.stop()