In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat, lit, when, rand, current_date, date_add, date_format, udf
from pyspark.sql.types import StringType
import os

# Build (or reuse) the Spark session used by this cell.
# NOTE(review): with master "local" Spark runs inside a single JVM, so the
# executor.* settings below presumably have no effect -- confirm before relying on them.
spark = (
    SparkSession.builder
    .appName("SyntheticData")
    .config("spark.master", "local")
    .config('spark.driver.memory', '4g')
    .config('spark.executor.cores', '4')
    .config('spark.executor.memory', '8g')
    .getOrCreate()
)

# Low-level context; needed later to create the broadcast variable.
sc = spark.sparkContext

# Number of synthetic rows to generate.
numbers_row = 100

# Seed rows: a sequential id plus a name and a city derived from that id.
data = [
    (row_id, f"Names_{row_id}", f"Cityname_{row_id}")
    for row_id in range(1, numbers_row + 1)
]
df = spark.createDataFrame(data, ["id", "name", "city"])

# Column expressions for the derived fields.
# email: "<name>@example.ru" for even ids, "<name>@example.com" for odd ids.
email_domain = when(col("id") % 2 == 0, "ru").otherwise("com")
email_expr = concat(col("name"), lit("@example."), email_domain)
# age: integer in 18..95; salary: integer in 19242..169241.
age_expr = (rand() * 78 + 18).cast("int")
salary_expr = (rand() * 150000 + 19242).cast("int")
# registration_date: today minus a whole number of 365-day "years",
# where that number is drawn from 0 .. (age - 18).
years_back = (rand() * (col("age") - 18)).cast("int")
registration_expr = date_add(current_date(), -years_back * 365)

df = (
    df.withColumn("email", email_expr)
    .withColumn("age", age_expr)
    .withColumn("salary", salary_expr)
    .withColumn("registration_date", registration_expr)
)

# Cache the generated rows so later steps do not regenerate them.
df.cache()

# Independently replace each value with NULL with ~5% probability (id is never nulled).
columns = ["name", "email", "city", "age", "salary", "registration_date"]
masked_columns = [
    when(rand() <= 0.05, None).otherwise(col(field)).alias(field)
    for field in columns
]
df = df.select("id", *masked_columns)

# Substitution table: every supported character maps to a unique code string.
# The table is assembled group by group; insertion order is
# uppercase letters, lowercase letters, digits, then space and punctuation.
encryption_dict = {}

# Uppercase Latin letters.
encryption_dict.update({
    "A": "1212abb", "B": "ad", "C": "vbn345",
    "D": "xyz678", "E": "123eqr", "F": "890fgh",
    "G": "poi0ty", "H": "qwabc1", "I": "dfghe2",
    "J": "456aks", "K": "zxcvw3", "L": "mnb22",
    "M": "efg56", "N": "78nbc", "O": "hijk4",
    "P": "or890", "Q": "nbvc5", "R": "ytser6",
    "S": "vc123", "T": "gh78", "U": "pqws7",
    "V": "zxc14", "W": "tyui9", "X": "asqw35",
    "Y": "dert2", "Z": "qwert8",
})

# Lowercase Latin letters.
encryption_dict.update({
    "a": "2dff4", "b": "fa25gf", "c": "1rs351",
    "d": "14fs3f", "e": "7jgvw34", "f": "7gs3gd3",
    "g": "2fas3542", "h": "45dw24", "i": "967nd",
    "j": "3fa37h", "k": "asf1fw1", "l": "65ser2",
    "m": "1nak64", "n": "6jsl7", "o": "5gd35",
    "p": "34ts2f", "q": "47jdz", "r": "1d5gnn7",
    "s": "2vsf52", "t": "34sf33", "u": "126asv45",
    "v": "zex33ggg7", "w": "4ssffs4", "x": "fs5juo",
    "y": "der91ghy", "z": "552gvc",
})

# Decimal digits.
encryption_dict.update({
    "0": "zero1", "1": "one1", "2": "two1", "3": "three1", "4": "four4",
    "5": "five5", "6": "six6", "7": "seven7", "8": "eight8", "9": "nine9",
})

# Space and ASCII punctuation.
encryption_dict.update({
    " ": "space0", "!": "lk765i", "\"": "quote2",
    "#": "hash3", "$": "dollar4", "%": "percent5",
    "&": "and6", "'": "quote1", "(": "leftpar7",
    ")": "rightpar8", "*": "star9", "+": "plus10",
    ",": "comma11", "-": "dash12", ".": "dot13",
    "/": "slash14", ":": "colon15", ";": "semicolon16",
    "<": "lessthan17", "=": "equal18", ">": "greaterthan19",
    "?": "port489", "@": "atmark20", "[": "leftbracket21",
    "\\": "backslash22", "]": "rightbracket23", "^": "caret24",
    "_": "underscore25", "`": "backtick26", "{": "leftcurly27",
    "|": "pipe28", "}": "rightcurly29", "~": "tilde30",
})

# Ship the substitution table to the executors as a broadcast variable.
# Keep the Broadcast handle itself: unwrapping it with `.value` here on the
# driver would leave a plain dict that is pickled into the UDF closure,
# so the broadcast would never actually be used.
broadcast_dict = sc.broadcast(encryption_dict)


# Encryption function
def encrypt_string(input_string):
    """Encode a value character by character using the broadcast substitution table.

    Args:
        input_string: Any value; it is converted with ``str()`` before encoding.
            ``None`` is passed through untouched.

    Returns:
        The concatenation of the codes of all characters, or ``None`` when the
        input is ``None``. Characters that are missing from the table are
        copied to the output unchanged.
    """
    if input_string is None:
        return None
    # Resolve the table at call time so executors read it from the broadcast.
    table = broadcast_dict.value
    return ''.join(table.get(char, char) for char in str(input_string))

# Wrap the encryption function as a Spark UDF that returns strings.
encrypt_udf = udf(encrypt_string, StringType())

# Encrypt every data column in place; `id` stays readable.
# All encrypted columns become strings because of the UDF's return type.
for field_name in ("name", "email", "city", "age", "salary", "registration_date"):
    df = df.withColumn(field_name, encrypt_udf(col(field_name)))

# Put the author's name, unencrypted, into the `name` column of the row with id 1.
author_or_encrypted_name = when(col("id") == 1, "ATAGAEV").otherwise(col("name"))
df = df.withColumn("name", author_or_encrypted_name)

# File-name components: today's date as reported by Spark and the encrypted row count.
# The date is formatted directly in SQL and stored under its own name, so the
# imported `current_date` function is not rebound to a string, and nothing
# depends on Spark resolving the string "current_date" against a column that
# is actually called `current_date()`.
df_date = spark.sql("SELECT date_format(current_date(), 'yyyy-MM-dd') AS run_date")
run_date = df_date.first()[0]
name_csv = encrypt_string(numbers_row)

# Temporary Spark output directory and the final location of the single CSV file.
path = f"/home/jovyan/work/PySpark_test/config/temp/{run_date}-{name_csv}.csv"
path_rename = f"/home/jovyan/work/PySpark_test/config/{run_date}-{name_csv}.csv"
import shutil  # only this publishing step needs it

# Write the DataFrame as CSV. Spark creates a directory at `path`;
# coalesce(1) keeps the data in a single part file inside it.
df.coalesce(1).write.csv(path, header=True, mode="overwrite")

# Publish the generated part file under its final name.
# Fail loudly instead of silently deleting the output when the expected
# single part file is not there.
part_files = [entry for entry in os.listdir(path) if entry.startswith("part-")]
if len(part_files) != 1:
    raise RuntimeError(
        f"Expected exactly one part file in {path}, found {len(part_files)}"
    )
shutil.move(os.path.join(path, part_files[0]), path_rename)

# Remove the temporary directory together with whatever else Spark left in it
# (rmtree also copes with nested directories, unlike os.remove + os.rmdir).
shutil.rmtree(path)

df.show()

spark.stop()

+---+--------------------+--------------------+--------------------+------------+--------------------+--------------------+
| id|                name|               email|                city|         age|              salary|   registration_date|
+---+--------------------+--------------------+--------------------+------------+--------------------+--------------------+
|  1|             ATAGAEV|78nbc2dff41nak647...|vbn345967nd34sf33...|   six6five5|four4three1eight8...|one1nine9eight8se...|
|  2|78nbc2dff41nak647...|78nbc2dff41nak647...|vbn345967nd34sf33...| eight8nine9|eight8one1two1fiv...|two1zero1zero1fiv...|
|  3|78nbc2dff41nak647...|78nbc2dff41nak647...|vbn345967nd34sf33...|  two1seven7|one1four4zero1nin...|two1zero1one1seve...|
|  4|78nbc2dff41nak647...|78nbc2dff41nak647...|vbn345967nd34sf33...|   five5six6|nine9five5eight8e...|one1nine9eight8ei...|
|  5|78nbc2dff41nak647...|78nbc2dff41nak647...|vbn345967nd34sf33...| eight8five5|two1zero1five5zer...|one1nine9seven7th...|
|  6|78n

In [2]:
##########################################################################
# Блок кода для дешифрования
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, current_date, date_format
from pyspark.sql.types import StringType
import json
import re
import os

# Spark initialisation for the decryption cell.
# NOTE(review): with master "local" Spark runs inside a single JVM, so the
# executor.* settings below presumably have no effect -- confirm before relying on them.
spark = (
    SparkSession.builder
    .appName("SyntheticData")
    .config("spark.master", "local")
    .config('spark.driver.memory', '4g')
    .config('spark.executor.cores', '4')
    .config('spark.executor.memory', '8g')
    .getOrCreate()
)

# Low-level context; used below to create the broadcast variable.
sc = spark.sparkContext

# Row count of the encrypted file; its encrypted form is part of the file name,
# so it must match the value used when the file was generated.
numbers_row = 100

# Substitution table: every supported character maps to a unique code string.
# The table is assembled group by group; insertion order is
# uppercase letters, lowercase letters, digits, then space and punctuation.
encryption_dict = {}

# Uppercase Latin letters.
encryption_dict.update({
    "A": "1212abb", "B": "ad", "C": "vbn345",
    "D": "xyz678", "E": "123eqr", "F": "890fgh",
    "G": "poi0ty", "H": "qwabc1", "I": "dfghe2",
    "J": "456aks", "K": "zxcvw3", "L": "mnb22",
    "M": "efg56", "N": "78nbc", "O": "hijk4",
    "P": "or890", "Q": "nbvc5", "R": "ytser6",
    "S": "vc123", "T": "gh78", "U": "pqws7",
    "V": "zxc14", "W": "tyui9", "X": "asqw35",
    "Y": "dert2", "Z": "qwert8",
})

# Lowercase Latin letters.
encryption_dict.update({
    "a": "2dff4", "b": "fa25gf", "c": "1rs351",
    "d": "14fs3f", "e": "7jgvw34", "f": "7gs3gd3",
    "g": "2fas3542", "h": "45dw24", "i": "967nd",
    "j": "3fa37h", "k": "asf1fw1", "l": "65ser2",
    "m": "1nak64", "n": "6jsl7", "o": "5gd35",
    "p": "34ts2f", "q": "47jdz", "r": "1d5gnn7",
    "s": "2vsf52", "t": "34sf33", "u": "126asv45",
    "v": "zex33ggg7", "w": "4ssffs4", "x": "fs5juo",
    "y": "der91ghy", "z": "552gvc",
})

# Decimal digits.
encryption_dict.update({
    "0": "zero1", "1": "one1", "2": "two1", "3": "three1", "4": "four4",
    "5": "five5", "6": "six6", "7": "seven7", "8": "eight8", "9": "nine9",
})

# Space and ASCII punctuation.
encryption_dict.update({
    " ": "space0", "!": "lk765i", "\"": "quote2",
    "#": "hash3", "$": "dollar4", "%": "percent5",
    "&": "and6", "'": "quote1", "(": "leftpar7",
    ")": "rightpar8", "*": "star9", "+": "plus10",
    ",": "comma11", "-": "dash12", ".": "dot13",
    "/": "slash14", ":": "colon15", ";": "semicolon16",
    "<": "lessthan17", "=": "equal18", ">": "greaterthan19",
    "?": "port489", "@": "atmark20", "[": "leftbracket21",
    "\\": "backslash22", "]": "rightbracket23", "^": "caret24",
    "_": "underscore25", "`": "backtick26", "{": "leftcurly27",
    "|": "pipe28", "}": "rightcurly29", "~": "tilde30",
})
    
# Register the table as a Spark broadcast variable, then unwrap it right away:
# `broadcast_dict` ends up holding the broadcast's value (the mapping itself),
# which the code below uses like an ordinary dict (`.get`, `.items`).
table_broadcast = sc.broadcast(encryption_dict)
broadcast_dict = table_broadcast.value

# Encryption function (needed here only to rebuild the encrypted file name)
def encrypt_string(input_string):
    """Encode a value character by character with the substitution table.

    The input is converted with ``str()`` first; ``None`` yields ``None``.
    Characters without an entry in the table are copied through unchanged.
    """
    if input_string is not None:
        text = str(input_string)
        encoded_parts = [broadcast_dict.get(symbol, symbol) for symbol in text]
        return ''.join(encoded_parts)
    return None

# Rebuild the name of the encrypted file: "<today>-<encrypted row count>.csv".
# NOTE: `current_date` is rebound from the imported function to a date string
# here; the output paths further down read that string.
df_date = spark.sql("SELECT current_date()")
formatted_today = date_format("current_date", "yyyy-MM-dd")
today_row = df_date.select(formatted_today).first()
current_date = today_row[0]
name_csv = encrypt_string(numbers_row)
path_open = f"/home/jovyan/work/PySpark_test/config/{current_date}-{name_csv}.csv"

# Load the encrypted CSV; the first line holds the column names.
df = spark.read.csv(path_open, header=True)

# Inverse table: code string -> original character.
decryption_dict = dict((code, symbol) for symbol, code in broadcast_dict.items())

# Decryption function
def decrypt_string(encrypted):
    """Decode a string produced by ``encrypt_string`` back to plain text.

    Every occurrence of a known code is replaced by its original character.
    Text that is not part of any code is kept as-is, mirroring the encryption
    side, which copies characters without a table entry through unchanged.
    (Collecting only the regex matches would silently drop such characters.)

    Args:
        encrypted: The encoded string, or ``None``.

    Returns:
        The decoded string; ``None`` when the input is ``None``. Input that
        contains no known code is returned unchanged.
    """
    if encrypted is None:
        return None
    if not decryption_dict:
        # An empty table would yield an empty pattern that matches everywhere.
        return encrypted
    pattern = '|'.join(re.escape(code) for code in decryption_dict)
    # re.sub rewrites only the matched codes and leaves everything else in place.
    return re.sub(pattern, lambda match: decryption_dict[match.group(0)], encrypted)

# Wrap the decryption function as a Spark UDF that returns strings.
decrypt_udf = udf(decrypt_string, StringType())

# Decrypt every encrypted column in place; `id` was never encrypted.
for field_name in ("name", "email", "city", "age", "salary", "registration_date"):
    df = df.withColumn(field_name, decrypt_udf(col(field_name)))

import shutil  # only this publishing step needs it

# Temporary Spark output directory and the final location of the decrypted CSV.
path = f"/home/jovyan/work/PySpark_test/config/temp/{current_date}-dev.csv"
path_rename = f"/home/jovyan/work/PySpark_test/config/{current_date}-dev.csv"

# Write the DataFrame as CSV. Spark creates a directory at `path`;
# coalesce(1) keeps the data in a single part file inside it.
df.coalesce(1).write.csv(path, header=True, mode="overwrite")

# Publish the generated part file under its final name.
# Fail loudly instead of silently deleting the output when the expected
# single part file is not there.
part_files = [entry for entry in os.listdir(path) if entry.startswith("part-")]
if len(part_files) != 1:
    raise RuntimeError(
        f"Expected exactly one part file in {path}, found {len(part_files)}"
    )
shutil.move(os.path.join(path, part_files[0]), path_rename)

# Remove the temporary directory together with whatever else Spark left in it
# (rmtree also copes with nested directories, unlike os.remove + os.rmdir).
shutil.rmtree(path)

df.show()

spark.stop()

+---+--------+--------------------+-----------+----+------+-----------------+
| id|    name|               email|       city| age|salary|registration_date|
+---+--------+--------------------+-----------+----+------+-----------------+
|  1| ATAGAEV| Names_1@example.com| Cityname_1|  65| 43824|       1987-08-11|
|  2| Names_2|  Names_2@example.ru| Cityname_2|  89| 81252|       2005-08-06|
|  3| Names_3| Names_3@example.com| Cityname_3|  27|140990|       2017-08-03|
|  4| Names_4|  Names_4@example.ru| Cityname_4|  56| 95884|       1988-08-10|
|  5| Names_5| Names_5@example.com| Cityname_5|  85| 20506|       1973-08-14|
|  6| Names_6|  Names_6@example.ru| Cityname_6|  25| 35493|       2019-08-03|
|  7| Names_7| Names_7@example.com| Cityname_7|  67|112707|       1990-08-10|
|  8| Names_8|  Names_8@example.ru| Cityname_8|  47|138321|       2004-08-06|
|  9| Names_9| Names_9@example.com|       NULL|  25|149773|       2024-08-01|
| 10|Names_10| Names_10@example.ru|       NULL|  94| 56038|     