In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, regexp_replace, trim, length, when

# Build (or reuse) the Spark session. The spark-excel connector is pulled in
# through spark.jars.packages, and driver/executor heap sizes are requested here.
# NOTE(review): the run recorded in this notebook still failed with
# java.lang.OutOfMemoryError (Java heap space) while spark-excel opened the
# workbook during query planning. Presumably the driver heap is the limiting
# factor, and spark.driver.memory may not apply if a JVM was already running
# when getOrCreate() was called — confirm on a freshly restarted kernel.
spark = (
    SparkSession.builder
    .appName("Excel to PySpark")
    .config("spark.jars.packages", "com.crealytics:spark-excel_2.12:0.14.0")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)

In [2]:
# Load the Excel data through the spark-excel data source.
# The dataAddress option selects sheet "Данные" starting at cell A3, and
# header="true" asks the reader to take column names from the data itself.
df = (
    spark.read.format("com.crealytics.spark.excel")
    .options(header="true", dataAddress="Данные!A3")
    .load("/home/jovyan/work/PySpark_test/config/Численность выбывших.xls")
)

In [3]:
# Replace the long Russian source headers with short ASCII column names.
# withColumnRenamed is a no-op for a header that is not present.
header_renames = {
    "Классификатор объектов административно-территориального деления (ОКАТО)": "region",
    # The trailing space is part of the original literal; presumably it
    # matches the header in the sheet — keep it as is.
    "Классификатор стран мира ": "world",
    "Единица измерения": "unit_name",
    "Период": "period_name",
    "Потоки миграции": "migration",
}
for source_header, short_name in header_renames.items():
    df = df.withColumnRenamed(source_header, short_name)

In [4]:
# Process region: split the raw value into a leading code and the name.
# Assumes the raw value looks like "<code> <name>" — the length-11 filter
# below suggests an 11-character code; confirm against the sheet.
#
# The value is split only ONCE (limit=2) on the first whitespace run, so the
# whole remainder becomes the name. Splitting on every space and taking item 1
# kept just the first word of a multi-word name.
# A value without any whitespace yields a NULL name, as before.
region_parts = split(trim(col("region")), "\\s+", 2)

df = (
    df.withColumn("region_okato", region_parts.getItem(0))
    .withColumn("region", region_parts.getItem(1))
    # Keep only rows whose code is exactly 11 characters long.
    .filter(length(col("region_okato")) == 11)
)

In [5]:
# Process world: split the raw value into a leading code and the name.
# Assumes the raw value looks like "<code> <name>", where the code may carry
# a "<prefix>:" part — TODO confirm against the sheet.
#
# The value is split only ONCE (limit=2) on the first whitespace run, so
# multi-word names are kept whole. Splitting on every space and taking
# item 1 kept just the first word.
world_parts = split(trim(col("world")), "\\s+", 2)
world_code = world_parts.getItem(0)

# If the code contains ":", keep the part after the first colon;
# otherwise keep the code unchanged.
code_after_colon = split(world_code, ":").getItem(1)

df = (
    df.withColumn(
        "world_okato",
        when(code_after_colon.isNotNull(), code_after_colon).otherwise(world_code),
    )
    .withColumn("world", world_parts.getItem(1))
)

In [6]:
# Clean unit_name, period_name and migration: remove every run of digits
# from the value, then trim the surrounding whitespace.
for digit_prefixed_col in ("unit_name", "period_name", "migration"):
    df = df.withColumn(
        digit_prefixed_col,
        trim(regexp_replace(col(digit_prefixed_col), "\\d+", "")),
    )

In [7]:
# Keep only the rows whose period is one of the twelve month names
# (lower-case Russian, exact match).
months = [
    "январь", "февраль", "март", "апрель", "май", "июнь",
    "июль", "август", "сентябрь", "октябрь", "ноябрь", "декабрь",
]
df = df.where(col("period_name").isin(*months))

In [8]:
from pyspark.sql import functions as F

# Melt (unpivot) the per-year columns into long format: each input row
# produces one output row per year, with the year label in "years" and that
# year's cell value in "leaving".
cols_to_melt = [str(year) for year in range(1993, 2024)]
# The loop variable is deliberately not called `col`, so it cannot be confused
# with the imported pyspark `col` function.
cols_to_keep = [name for name in df.columns if name not in cols_to_melt]

# Build the Spark SQL stack() generator. The row count comes from the list
# itself, so changing the year range above cannot leave a stale count behind.
stack_args = ", ".join(f"'{year}', `{year}`" for year in cols_to_melt)
unpivot_expr = F.expr(f"stack({len(cols_to_melt)}, {stack_args}) as (years, leaving)")

df_melted = df.select(*cols_to_keep, unpivot_expr)


In [9]:
from pyspark.sql.functions import monotonically_increasing_id

# Replace a missing "leaving" value with 0; any other value passes through.
leaving_or_zero = when(col("leaving").isNull(), 0).otherwise(col("leaving"))

# Prepare the final DataFrame: tag every row with a generated id, fill the
# missing counts, cast the year label to int, sort, and keep only the output
# columns (the helper "index" column is dropped by the final select).
# NOTE(review): the generated id is unique per row, so the secondary "years"
# sort key presumably never breaks a tie — confirm whether it is needed.
df_melted = (
    df_melted
    .withColumn("index", monotonically_increasing_id())
    .withColumn("leaving", leaving_or_zero)
    .withColumn("years", col("years").cast("int"))
    .orderBy("index", "years")
    .select(
        "region",
        "region_okato",
        "world",
        "world_okato",
        "unit_name",
        "period_name",
        "migration",
        "years",
        "leaving",
    )
)


In [10]:
# Save the final DataFrame as CSV with a header row, replacing any previous
# output. coalesce(1) reduces the data to a single partition before writing.
# Spark writes a directory at this path.
output_path = "/home/jovyan/work/PySpark_test/config/result.csv"
(
    df_melted
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv(output_path)
)

Py4JJavaError: An error occurred while calling o155.csv.
: java.lang.OutOfMemoryError: Java heap space
	at java.base/java.nio.HeapByteBuffer.<init>(HeapByteBuffer.java:64)
	at java.base/java.nio.ByteBuffer.allocate(ByteBuffer.java:363)
	at shadeio.poi.poifs.filesystem.POIFSFileSystem.<init>(POIFSFileSystem.java:297)
	at shadeio.poi.ss.usermodel.WorkbookFactory.create(WorkbookFactory.java:252)
	at shadeio.poi.ss.usermodel.WorkbookFactory.create(WorkbookFactory.java:221)
	at com.crealytics.spark.excel.DefaultWorkbookReader.$anonfun$openWorkbook$1(WorkbookReader.scala:49)
	at com.crealytics.spark.excel.DefaultWorkbookReader$$Lambda$1142/0x00007f89dc8264a0.apply(Unknown Source)
	at scala.Option.fold(Option.scala:251)
	at com.crealytics.spark.excel.DefaultWorkbookReader.openWorkbook(WorkbookReader.scala:49)
	at com.crealytics.spark.excel.WorkbookReader.withWorkbook(WorkbookReader.scala:14)
	at com.crealytics.spark.excel.WorkbookReader.withWorkbook$(WorkbookReader.scala:13)
	at com.crealytics.spark.excel.DefaultWorkbookReader.withWorkbook(WorkbookReader.scala:45)
	at com.crealytics.spark.excel.ExcelRelation.buildScan(ExcelRelation.scala:64)
	at org.apache.spark.sql.execution.datasources.DataSourceStrategy$.$anonfun$apply$6(DataSourceStrategy.scala:340)
	at org.apache.spark.sql.execution.datasources.DataSourceStrategy$$$Lambda$1997/0x00007f89dcb5c280.apply(Unknown Source)
	at org.apache.spark.sql.execution.datasources.DataSourceStrategy$.$anonfun$pruneFilterProject$1(DataSourceStrategy.scala:367)
	at org.apache.spark.sql.execution.datasources.DataSourceStrategy$$$Lambda$1998/0x00007f89dcb5c840.apply(Unknown Source)
	at org.apache.spark.sql.execution.datasources.DataSourceStrategy$.pruneFilterProjectRaw(DataSourceStrategy.scala:446)
	at org.apache.spark.sql.execution.datasources.DataSourceStrategy$.pruneFilterProject(DataSourceStrategy.scala:366)
	at org.apache.spark.sql.execution.datasources.DataSourceStrategy$.apply(DataSourceStrategy.scala:340)
	at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$1(QueryPlanner.scala:63)
	at org.apache.spark.sql.catalyst.planning.QueryPlanner$$Lambda$1993/0x00007f89dcb458a0.apply(Unknown Source)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:93)
	at org.apache.spark.sql.execution.SparkStrategies.plan(SparkStrategies.scala:70)
	at org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$3(QueryPlanner.scala:78)
	at org.apache.spark.sql.catalyst.planning.QueryPlanner$$Lambda$1996/0x00007f89dcb50000.apply(Unknown Source)
	at scala.collection.TraversableOnce$folder$1.apply(TraversableOnce.scala:196)
	at scala.collection.TraversableOnce$folder$1.apply(TraversableOnce.scala:194)
	at scala.collection.Iterator.foreach(Iterator.scala:943)


In [11]:
# Stop the Spark session
spark.stop()