In [None]:
! pip install pyspark==3.0.3

In [1]:
# Initialise findspark ahead of the pyspark imports in the following cell.
# NOTE(review): presumably init() locates the Spark installation and makes pyspark
# importable — confirm against the findspark documentation.
import findspark
findspark.init()

In [2]:
import warnings
# Suppress every Python warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')
import pyspark
from pyspark.ml.feature import StringIndexer
# NOTE: importing `sum` by name rebinds the built-in `sum` in this notebook to the Spark aggregate.
from pyspark.sql.functions import col, sum
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, LongType, TimestampType, BooleanType
import logging

In [3]:
# Run-mode switch read by the session-setup cell: True builds the local[14] session,
# False builds the session that leaves the master unset.
# NOTE(review): the name starts with a lowercase "l" (probably meant LOCAL_RUN); it is kept
# as-is because the session-setup cell reads it under this exact spelling.
lOCAL_RUN = False

In [4]:
from pyspark.sql import SparkSession


def _build_spark_session(local_run):
    """Create (or fetch) the SparkSession for the selected run mode.

    Args:
        local_run: when truthy, build the "Spark ML Research" session pinned to
            ``local[14]``; otherwise build the "Spark ML Clean Data" session and
            leave the master unset.

    Returns:
        The SparkSession returned by ``getOrCreate()``.
    """
    # Settings applied in both modes.
    shared_options = {
        # Adaptive Query Execution
        "spark.sql.adaptive.enabled": "true",
        # Memory-friendlier hand-off of data to pandas
        "spark.sql.execution.arrow.pyspark.enabled": "true",
    }

    if local_run:
        # Local run on a recent Spark build.
        app_name = "Spark ML Research"
        # The machine has 16 cores; 1-2 are left for the system.
        master_url = "local[14]"
        mode_options = {
            # Driver memory is the main knob in local mode; 16-20 GB leaves room
            # for .toPandas() calls and model training.
            "spark.driver.memory": "18g",
            # Cap on the size of results collected on the driver (raised for heavy operations).
            "spark.driver.maxResultSize": "8g",
        }
    else:
        app_name = "Spark ML Clean Data"
        master_url = None
        # master_url = f"spark://{MASTER_CONN}"
        mode_options = {
            # Currently disabled tuning knobs, kept for reference:
            # "spark.executor.instances": "3",
            # "spark.executor.cores": "3",
            # "spark.executor.memory": "10g",
            # "spark.executor.memoryOverhead": "1500m",
            "spark.driver.memory": "12g",
            "spark.driver.cores": "3",
            # "spark.driver.maxResultSize": "2g",
            # "spark.sql.shuffle.partitions": "150",
            # "spark.default.parallelism": "150",
            # "spark.sql.files.maxPartitionBytes": "128m",  # 1GB # 134217728 128Mb
            # "spark.memory.fraction": "0.8",
            # "spark.network.timeout": "800s",
        }

    builder = SparkSession.builder.appName(app_name)
    if master_url is not None:
        builder = builder.master(master_url)
    for option_key, option_value in {**mode_options, **shared_options}.items():
        builder = builder.config(option_key, option_value)
    return builder.getOrCreate()


# If a session already exists from an earlier run of this cell, stop it first.
if 'spark' in globals() or 'spark' in locals():
    spark.stop()

spark = _build_spark_session(lOCAL_RUN)

# Evaluate DataFrames eagerly so they pretty-print as tables in Jupyter.
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

# Restrict Spark logging to errors.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [5]:
# Report the driver memory value recorded in the running SparkContext's configuration.
driver_memory = spark.sparkContext.getConf().get("spark.driver.memory")
print(driver_memory)

12g


In [12]:
# Alternative inputs used earlier, kept for reference:
#path = r"/media/rk/500гб/Обучение/MLOps/16 Валидация данных/2022-11-04.txt"
#path = "s3a://otus-bucket2-b1gukkncvsp3tvci7gp3/*.txt"
INPUT_PATH = "/user/ubuntu/data"

# =====================================================
# Read the data
# =====================================================

# Explicit schema, used instead of inferSchema; every column is declared nullable.
schema = (
    StructType()
        .add("transaction_id", LongType(), True)
        .add("tx_datetime", StringType(), True)
        .add("customer_id", IntegerType(), True)
        .add("terminal_id", StringType(), True)
        .add("tx_amount", DoubleType(), True)
        .add("tx_time_seconds", IntegerType(), True)
        .add("tx_time_days", IntegerType(), True)
        .add("tx_fraud", IntegerType(), True)
        .add("tx_fraud_scenario", IntegerType(), True)
)

# Header-less CSV input; '#' is configured as the comment character.
df = (
    spark.read
        .schema(schema)
        .options(header=False, comment='#')
        .csv(INPUT_PATH)
)

In [10]:
# Keep only the rows whose tx_fraud flag equals 1, then preview five of them.
df_fraud = df.where(col("tx_fraud") == 1)
df_fraud.limit(5)

transaction_id,tx_datetime,customer_id,terminal_id,tx_amount,tx_time_seconds,tx_time_days,tx_fraud,tx_fraud_scenario
1840838987,2022-11-09 21:30:28,137488,569,111.46,101597428,1175,1,2
1840838991,2022-11-09 08:35:20,137490,178,30.24,101550920,1175,1,2
1840838995,2022-11-09 17:59:58,137492,569,80.41,101584798,1175,1,2
1840839000,2022-11-09 03:41:58,137495,174,13.2,101533318,1175,1,2
1840839001,2022-11-09 11:50:03,137496,30,52.93,101562603,1175,1,2


# EDA

In [11]:
# Print the column names, types and nullability of the loaded DataFrame.
df.printSchema()

root
 |-- transaction_id: long (nullable = true)
 |-- tx_datetime: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- terminal_id: string (nullable = true)
 |-- tx_amount: double (nullable = true)
 |-- tx_time_seconds: integer (nullable = true)
 |-- tx_time_days: integer (nullable = true)
 |-- tx_fraud: integer (nullable = true)
 |-- tx_fraud_scenario: integer (nullable = true)



In [13]:
# Show summary statistics for the whole dataset
df.describe()

summary,transaction_id,tx_datetime,customer_id,terminal_id,tx_amount,tx_time_seconds,tx_time_days,tx_fraud,tx_fraud_scenario
count,1879794138.0,1879794138,1879794138.0,1879781078,1879794138.0,1879794138.0,1879794138.0,1879794138.0,1879794138.0
mean,939895891.839161,,500416.7652254882,24933.921642023033,54.23143669365993,51840456.457804486,599.5075215422339,0.0592292138534161,0.119304460773885
stddev,542649141.3830142,,288588.13040473574,1478655.8994933846,41.286941330491935,29929841.937784765,346.4094747447387,0.236053201862132,0.4774899609210523
min,0.0,2019-08-22 00:00:00,-999999.0,0,0.0,0.0,0.0,0.0,0.0
max,1879791584.0,2022-12-03 24:00:00,999999.0,Err,16539.04,103680000.0,1199.0,1.0,3.0


In [14]:
# List the (column name, type) pairs of the DataFrame.
df.dtypes

[('transaction_id', 'bigint'),
 ('tx_datetime', 'string'),
 ('customer_id', 'int'),
 ('terminal_id', 'string'),
 ('tx_amount', 'double'),
 ('tx_time_seconds', 'int'),
 ('tx_time_days', 'int'),
 ('tx_fraud', 'int'),
 ('tx_fraud_scenario', 'int')]

In [15]:
# Count the NULL values in every column: each column becomes the sum of its
# isNull() flags cast to int, all computed in one select.
# The aggregate is referenced as F.sum / F.col rather than through the bare `sum`
# name, so this cell does not depend on the built-in `sum` having been shadowed
# by the pyspark import earlier in the notebook.
null_counts = df.select([
    F.sum(F.col(c).isNull().cast("int")).alias(c)
    for c in df.columns
])
null_counts.show()

+--------------+-----------+-----------+-----------+---------+---------------+------------+--------+-----------------+
|transaction_id|tx_datetime|customer_id|terminal_id|tx_amount|tx_time_seconds|tx_time_days|tx_fraud|tx_fraud_scenario|
+--------------+-----------+-----------+-----------+---------+---------------+------------+--------+-----------------+
|             0|          0|          0|      13060|        0|              0|           0|       0|                0|
+--------------+-----------+-----------+-----------+---------+---------------+------------+--------+-----------------+

