In [None]:
!pip install pyspark

In [None]:
from datetime import date
from pathlib import Path
from typing import Union
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import col, when, expr, lit, concat, to_date, get_json_object, count, desc, asc
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType, DoubleType
from uuid import uuid4
import glob
import os
from datetime import date, timedelta

In [None]:
def prepare_raw_logs(
        external_system_name: str,
        date: Union[str, date],
        raw_logs_path: Union[str, Path],
        prepared_logs_path: Union[str, Path]
) -> None:
    """
    Read one day of raw logs with PySpark, map them onto the common schema
    and write the result to the prepared-logs directory.

    Raw logs are read from
    ``<raw_logs_path>/external_system_<name>/application_id=<id>/date=<date>``.
    The prepared logs are grouped by internal_app_id and date: the internal
    application id is inserted before the last component of
    ``prepared_logs_path``, so ``prepared/2023-01-07`` becomes
    ``prepared/<internal_app_id>/2023-01-07``.

    The resulting columns are: external_did, event_name, event_datetime,
    event_json, date, push_token, ios_ifa, external_profile_id,
    external_app_id, external_system, internal_app_id.

    Args:
        external_system_name: Identifier of the external system; only
            ``'1'`` and ``'2'`` are supported.
        date: Day to process; it is interpolated into the ``date=<date>``
            directory name, so a ``datetime.date`` renders as ``YYYY-MM-DD``.
        raw_logs_path: Root directory that holds the raw logs.
        prepared_logs_path: Target path whose last component is the
            per-date directory.

    Raises:
        ValueError: If ``external_system_name`` is not supported.
        FileNotFoundError: If the external system directory is missing or
            contains no ``application_id=*`` directory.
    """
    internal_app_ids = {
        '1': '24a7a8f5-35f0-4c3a-9e51-02c7f62f7f06',
        '2': '86ff5d12-55db-4bdf-a849-1b685bdff00b',
    }
    # Validate before starting Spark or touching the file system, so that a
    # bad argument fails fast with a clear message.
    if external_system_name not in internal_app_ids:
        raise ValueError(
            f"unsupported external system {external_system_name!r}; "
            f"expected one of {sorted(internal_app_ids)}"
        )
    app_uuid = internal_app_ids[external_system_name]

    spark = (
        SparkSession.builder
        .master("local[*]")
        .appName('PrepareRawLogs')
        .getOrCreate()
    )

    system_path = f"{raw_logs_path}/external_system_{external_system_name}"
    partition_prefix = 'application_id='
    # Keep only real partition directories and sort them: os.listdir() returns
    # entries in arbitrary order and may contain unrelated directories.
    app_dirs = sorted(
        name for name in os.listdir(system_path)
        if name.startswith(partition_prefix)
        and os.path.isdir(os.path.join(system_path, name))
    )
    if not app_dirs:
        raise FileNotFoundError(
            f"no '{partition_prefix}*' directory found in {system_path}"
        )
    # NOTE(review): only the first application directory is processed, as
    # before - confirm that each external system holds a single application.
    app_dir = app_dirs[0]
    app_id = app_dir[len(partition_prefix):]
    df = spark.read.parquet(f"{system_path}/{app_dir}/date={date}")

    if external_system_name == '1':
        columns = [
            df["device_id"].alias("external_did"),
            df["event_type"].alias("event_name"),
            df["event_time"].alias("event_datetime"),
            # NOTE(review): this is plain string concatenation of two columns;
            # if both hold JSON documents the result is not valid JSON, and it
            # is NULL when either side is NULL - confirm this is intended.
            concat(df["user_properties"], df["event_properties"]).alias("event_json"),
            df["event_time"].cast("date").alias("date"),
            get_json_object(col("user_properties"), "$.registration_id").alias("push_token"),
            df["idfa"].alias("ios_ifa"),
            df["user_id"].alias("external_profile_id"),
            lit(app_id).alias("external_app_id"),
            df["external_system"],
            lit(app_uuid).alias("internal_app_id"),
        ]
    else:  # external_system_name == '2' (guaranteed by the validation above)
        columns = [
            df["uniq_device_id"].alias("external_did"),
            df["event_name"].alias("event_name"),
            df["event_datetime"].alias("event_datetime"),
            df["event_json"],
            df["event_datetime"].cast("date").alias("date"),
            df["uniq_device_id"].alias("push_token"),
            df["ios_ifa"].alias("ios_ifa"),
            df["profile_id"].alias("external_profile_id"),
            lit(app_id).alias("external_app_id"),
            df["external_system"],
            lit(app_uuid).alias("internal_app_id"),
        ]
    df_new = df.select(*columns)

    # Insert the internal app id before the last path component. Working on
    # str() keeps this valid for both str and Path arguments and for paths
    # with any number of components.
    parent, sep, leaf = str(prepared_logs_path).rstrip('/').rpartition('/')
    output_path = f'{parent}{sep}{app_uuid}/{leaf}'

    # The writer's default save mode is kept, so an existing target directory
    # is not overwritten silently.
    df_new.write.parquet(output_path)
    df_new.show(10)

In [None]:
# Example run: prepare the logs of external system '2' for 2023-01-07,
# reading raw logs from the current directory.
external_system, data_date, raw_logs_path = '2', date(2023, 1, 7), './'
prepare_raw_logs(
    external_system_name=external_system,
    date=f'{data_date.strftime("%Y-%m-%d")}',
    raw_logs_path=raw_logs_path,
    prepared_logs_path=f'prepared/{data_date.strftime("%Y-%m-%d")}',
)