In [2]:
from pyspark.sql import SparkSession

In [3]:
# Build (or reuse, if one already exists) a Spark session named "App" that
# runs against the local[*] master.
spark = (
    SparkSession.builder
    .appName("App")
    .master("local[*]")
    .getOrCreate()
)

In [3]:
# Echo the session object as the cell's last expression so the notebook
# renders it.
spark

In [None]:
# Make sure the target database exists first; CREATE TABLE fails when `ds`
# is missing from the catalog.
spark.sql("CREATE DATABASE IF NOT EXISTS ds")

# Declare the account dimension table.
# - Column names and types mirror schema_md_account_d (currency_rk /
#   currency_code, integer surrogate keys) so the table and the Parquet data
#   written by this notebook agree.
# - USING PARQUET creates a data source table; the Hive-format
#   `STORED AS PARQUET` clause needs Hive support, which this notebook's
#   session builder does not enable.
spark.sql("""
CREATE TABLE IF NOT EXISTS ds.md_account_d (
    data_actual_date DATE,
    data_actual_end_date DATE,
    account_rk INT,
    account_number STRING,
    char_type STRING,
    currency_rk INT,
    currency_code STRING
)
USING PARQUET
""")

In [4]:
# Spark SQL types used to declare the schema of the MD_ACCOUNT_D table
from pyspark.sql.types import StructType, StructField, DateType, IntegerType, StringType



In [20]:
# Explicit schema applied when reading md_account_d.csv: two date columns,
# two integer keys and three string columns, all declared non-nullable.
schema_md_account_d = (
    StructType()
    .add("data_actual_date", DateType(), nullable=False)
    .add("data_actual_end_date", DateType(), nullable=False)
    .add("account_rk", IntegerType(), nullable=False)
    .add("account_number", StringType(), nullable=False)
    .add("char_type", StringType(), nullable=False)
    .add("currency_rk", IntegerType(), nullable=False)
    .add("currency_code", StringType(), nullable=False)
)

In [32]:
# Read the semicolon-separated CSV with the explicit schema (no type
# inference); header=True tells Spark the first row holds column names.
df = (
    spark.read
    .schema(schema_md_account_d)
    .options(header=True, sep=";")
    .csv("data/md_account_d.csv")
)

# Persist the same rows as Parquet, replacing any output left by a previous run.
df.write.mode("overwrite").parquet("data/md_account_d.parquet")

In [4]:
# Optional sanity check: load the Parquet output back into a DataFrame.
parquet_df = spark.read.format("parquet").load("data/md_account_d.parquet")

In [5]:
# Print the schema read back from the Parquet file.
# NOTE(review): the recorded output lists every column as nullable = true even
# though schema_md_account_d declares nullable=False — presumably Spark relaxes
# nullability for file-based sources; confirm before relying on NOT NULL here.
parquet_df.printSchema()

root
 |-- data_actual_date: date (nullable = true)
 |-- data_actual_end_date: date (nullable = true)
 |-- account_rk: integer (nullable = true)
 |-- account_number: string (nullable = true)
 |-- char_type: string (nullable = true)
 |-- currency_rk: integer (nullable = true)
 |-- currency_code: string (nullable = true)



In [7]:
# Preview the first five rows of the round-tripped data.
parquet_df.show(n=5, truncate=True)

+----------------+--------------------+----------+--------------------+---------+-----------+-------------+
|data_actual_date|data_actual_end_date|account_rk|      account_number|char_type|currency_rk|currency_code|
+----------------+--------------------+----------+--------------------+---------+-----------+-------------+
|      2018-01-01|          2018-01-31|  36237725|30425840700000583001|        А|         35|          840|
|      2018-01-01|          2018-01-31|     24656|30114840700000770002|        А|         35|          840|
|      2018-01-01|          2018-01-31|  18849846|30109810500000435003|        П|         34|          643|
|      2018-01-01|          2018-01-31|   1972647|30111810700000908001|        П|         34|          643|
|      2018-01-01|          2018-01-31|  34157174|30424810100000583001|        А|         34|          643|
+----------------+--------------------+----------+--------------------+---------+-----------+-------------+
only showing top 5 rows

